From 7dafac8615ec46e926203210724f6467d4d99a38 Mon Sep 17 00:00:00 2001
From: Shane
Date: Sat, 10 May 2025 20:15:03 +1000
Subject: [PATCH] Rate Limit Handling

---
 foodie_automator_google.py | 25 +++++++++++------------
 foodie_automator_reddit.py | 41 ++++++++++++++++++++++++++++++--------
 foodie_utils.py            | 23 ++++++++-------------
 3 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/foodie_automator_google.py b/foodie_automator_google.py
index 8b93b94..e7b8f66 100644
--- a/foodie_automator_google.py
+++ b/foodie_automator_google.py
@@ -36,9 +36,21 @@ import fcntl
 
 load_dotenv()
 
+# Define constants at the top
+POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json'
+USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
+EXPIRATION_HOURS = 24
+IMAGE_EXPIRATION_DAYS = 7
+
 is_posting = False
 LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_google.lock"
 
+# Load JSON files after constants are defined
+posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
+posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in entry)
+used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
+used_images = set(entry["title"] for entry in used_images_data if "title" in entry)
+
 def signal_handler(sig, frame):
     logging.info("Received termination signal, checking if safe to exit...")
     if is_posting:
@@ -55,10 +67,6 @@ LOG_PRUNE_DAYS = 30
 MAX_RETRIES = 3
 RETRY_BACKOFF = 2
 
-posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
-posted_titles = set(entry["title"] for entry in posted_titles_data)
-used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)
-
 def setup_logging():
     if os.path.exists(LOG_FILE):
         with open(LOG_FILE, 'r') as f:
@@ -105,15 +113,6 @@ def setup_logging():
 
 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
-POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json'
-USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
-EXPIRATION_HOURS = 24
-IMAGE_EXPIRATION_DAYS = 7
-
-posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
-posted_titles = set(entry["title"] for entry in posted_titles_data)
-used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)
-
 def acquire_lock():
     os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
     lock_fd = open(LOCK_FILE, 'w')
diff --git a/foodie_automator_reddit.py b/foodie_automator_reddit.py
index 39e161c..e111cbe 100644
--- a/foodie_automator_reddit.py
+++ b/foodie_automator_reddit.py
@@ -125,7 +125,16 @@ def acquire_lock():
         sys.exit(0)
 
 def clean_reddit_title(title):
-    cleaned_title = re.sub(r'^\[.*?\]\s*', '', title).strip()
+    """Clean Reddit post title by removing prefixes, newlines, and special characters."""
+    if not title or not isinstance(title, str):
+        logging.warning(f"Invalid title received: {title}")
+        return ""
+    # Remove [prefixes], newlines, and excessive whitespace
+    cleaned_title = re.sub(r'^\[.*?\]\s*', '', title)  # Remove [prefix]
+    cleaned_title = re.sub(r'\n+', ' ', cleaned_title)  # Replace newlines with space
+    cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()  # Normalize spaces
+    # Remove special characters (keep alphanumeric, spaces, and basic punctuation)
+    cleaned_title = re.sub(r'[^\w\s.,!?-]', '', cleaned_title)
     logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
     return cleaned_title
 
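[Review note, not part of the patch] A minimal standalone sketch of what the new
cleaning pipeline produces; the regex steps are copied from the hunk above and the
sample title is invented. One quirk worth flagging: special characters are stripped
after whitespace normalization, so removing a character like '&' can reintroduce a
double space.

    import re

    def clean_title_sketch(title):
        cleaned = re.sub(r'^\[.*?\]\s*', '', title)     # strip leading [prefix]
        cleaned = re.sub(r'\n+', ' ', cleaned)          # newlines -> spaces
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()  # collapse whitespace runs
        return re.sub(r'[^\w\s.,!?-]', '', cleaned)     # drop other special chars

    print(clean_title_sketch("[Homemade]\nSmash burgers & fries!"))
    # -> 'Smash burgers  fries!' (the '&' is dropped, leaving a double space)
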
@@ -223,6 +232,7 @@ def fetch_duckduckgo_news_context(title, hours=24):
         return title
 
 def fetch_reddit_posts():
+    """Fetch Reddit posts from specified subreddits, filtering low-quality posts."""
     try:
         reddit = praw.Reddit(
             client_id=REDDIT_CLIENT_ID,
@@ -244,15 +254,26 @@ def fetch_reddit_posts():
                     logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
                     continue
                 cleaned_title = clean_reddit_title(submission.title)
+                if not cleaned_title or len(cleaned_title) < 5:
+                    logging.info(f"Skipping post with invalid or short title: {submission.title}")
+                    continue
+                # Filter out posts with empty or very short summaries
+                summary = submission.selftext.strip() if submission.selftext else ""
+                if len(summary) < 20 and not submission.url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
+                    logging.info(f"Skipping post with insufficient summary: {cleaned_title}")
+                    continue
+                # Fetch top comments for additional context
+                top_comments = get_top_comments(f"https://www.reddit.com{submission.permalink}", reddit)
                 articles.append({
                     "title": cleaned_title,
                     "raw_title": submission.title,
                     "link": f"https://www.reddit.com{submission.permalink}",
-                    "summary": submission.selftext,
+                    "summary": summary,
                     "feed_title": get_clean_source_name(subreddit_name),
                     "pub_date": pub_date,
                     "upvotes": submission.score,
-                    "comment_count": submission.num_comments
+                    "comment_count": submission.num_comments,
+                    "top_comments": top_comments
                 })
             logging.info(f"Fetched {len(articles)} posts from r/{subreddit_name}")
             break
@@ -283,16 +304,18 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
         title = post["title"]
         link = post.get("link", "")
         summary = post.get("summary", "")
-        source_name = post.get("source", "Reddit")
+        source_name = post.get("feed_title", "Reddit")
         original_source = f'{source_name}'
-        original_url = link  # Store for fallback
+        original_url = link
+        upvotes = post.get("upvotes", 0)
+        comment_count = post.get("comment_count", 0)
+        top_comments = post.get("top_comments", [])
 
         if title in posted_titles:
             logging.info(f"Skipping already posted Reddit post: {title}")
             attempts += 1
             continue
 
-        # Check author availability before GPT calls
         author = get_next_author_round_robin()
         if not author:
             logging.info(f"Skipping post '{title}' due to tweet rate limits for all authors")
@@ -317,8 +340,10 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
             continue
 
         ddg_context = fetch_duckduckgo_news_context(title)
-        scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
-        interest_score = is_interesting(scoring_content)
+        # Log full scoring content for debugging
+        scoring_content = f"Title: {title}\n\nContent: {summary}\n\nTop Comments: {top_comments}\n\nAdditional Context: {ddg_context}"
+        logging.debug(f"Scoring content for '{title}': {scoring_content}")
+        interest_score = is_interesting_reddit(title, summary, upvotes, comment_count, top_comments)
         logging.info(f"Interest score for '{title}': {interest_score}")
         if interest_score < 6:
             logging.info(f"Reddit Interest Too Low: {interest_score}")
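[Review note, not part of the patch] fetch_reddit_posts() now calls
get_top_comments(), whose definition is not included in this diff. One plausible
PRAW-based shape, for reference only; the limit parameter and list-of-strings
return type are assumptions:

    def get_top_comments(post_url, reddit, limit=3):
        """Return bodies of the highest-scoring top-level comments."""
        submission = reddit.submission(url=post_url)
        submission.comment_sort = "top"
        submission.comments.replace_more(limit=0)  # drop "load more" placeholders
        return [comment.body for comment in submission.comments[:limit]]

is_interesting_reddit(title, summary, upvotes, comment_count, top_comments) is
likewise assumed to return an integer score on the same scale as the old
is_interesting(), since the hunk above still rejects anything under 6.
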
diff --git a/foodie_utils.py b/foodie_utils.py
index 532016d..f222e86 100644
--- a/foodie_utils.py
+++ b/foodie_utils.py
@@ -1324,31 +1324,20 @@ def check_author_rate_limit(author, max_tweets=17, tweet_window_seconds=86400):
     author_info = rate_limit_info[username]
     script_run_id = author_info.get('script_run_id', 0)
 
-    # Check if quota has reset based on previous reset time
-    reset = author_info.get('tweet_reset', current_time + tweet_window_seconds)
-    if current_time >= reset:
-        logger.info(f"Quota reset for {username}, restoring to {max_tweets} tweets")
-        author_info['tweet_remaining'] = max_tweets
-        author_info['tweet_reset'] = current_time + tweet_window_seconds
-        author_info['tweets_posted_in_run'] = 0
-        author_info['script_run_id'] = check_author_rate_limit.script_run_id
-        rate_limit_info[username] = author_info
-        save_json_file(rate_limit_file, rate_limit_info)
-
     # If script restarted or new author, post a test tweet to sync quota
     if script_run_id != check_author_rate_limit.script_run_id:
         logger.info(f"Script restart detected for {username}, posting test tweet to sync quota")
         remaining, api_reset = get_x_rate_limit_status(author)
         if remaining is None or api_reset is None:
             # Fallback: Use last known quota or assume 0 remaining
-            if current_time < author_info.get('tweet_reset', 0):
+            if current_time < author_info.get('tweet_reset', current_time + tweet_window_seconds):
                 remaining = author_info.get('tweet_remaining', 0)
                 reset = author_info.get('tweet_reset', current_time + tweet_window_seconds)
                 logger.warning(f"Test tweet failed for {username}, using last known quota: {remaining} remaining")
             else:
-                remaining = max_tweets
+                remaining = 0  # Assume exhausted if API fails and reset time has passed
                 reset = current_time + tweet_window_seconds
-                logger.warning(f"Test tweet failed for {username}, resetting quota to {max_tweets}")
+                logger.warning(f"Test tweet failed for {username}, assuming quota exhausted")
         else:
             remaining = min(remaining, max_tweets)  # Ensure within Free tier limit
             reset = api_reset
@@ -1360,9 +1349,13 @@ def check_author_rate_limit(author, max_tweets=17, tweet_window_seconds=86400):
         author_info['script_run_id'] = check_author_rate_limit.script_run_id
         rate_limit_info[username] = author_info
         save_json_file(rate_limit_file, rate_limit_info)
+    else:
+        # Use existing quota without resetting
+        remaining = author_info.get('tweet_remaining', max_tweets)
+        reset = author_info.get('tweet_reset', current_time + tweet_window_seconds)
 
     # Calculate remaining tweets
-    remaining = author_info['tweet_remaining'] - author_info['tweets_posted_in_run']
+    remaining = remaining - author_info.get('tweets_posted_in_run', 0)
     can_post = remaining > 0
     if not can_post:
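[Review note, not part of the patch] The rewritten fallback now assumes the quota
is exhausted when get_x_rate_limit_status() fails after the stored reset time has
passed, instead of optimistically restoring max_tweets as before. The decision,
distilled into a pure function for illustration; the function name and shape here
are illustrative, not part of foodie_utils:

    def fallback_quota(author_info, current_time, window=86400):
        """Quota to assume when the rate-limit status call fails."""
        reset = author_info.get('tweet_reset', current_time + window)
        if current_time < reset:
            # Reset not yet reached: trust the last known remaining count.
            return author_info.get('tweet_remaining', 0), reset
        # Reset has passed but the API is unreachable: assume exhausted rather
        # than optimistically granting a fresh allowance.
        return 0, current_time + window

    print(fallback_quota({'tweet_remaining': 5, 'tweet_reset': 2000}, current_time=1000))
    # -> (5, 2000)
    print(fallback_quota({'tweet_remaining': 5, 'tweet_reset': 2000}, current_time=3000))
    # -> (0, 89400)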