From 7dafac8615ec46e926203210724f6467d4d99a38 Mon Sep 17 00:00:00 2001
From: Shane
Date: Sat, 10 May 2025 20:15:03 +1000
Subject: [PATCH] Rate Limit Handling

---
 foodie_automator_google.py | 25 +++++++++++------------
 foodie_automator_reddit.py | 41 ++++++++++++++++++++++++++++++--------
 foodie_utils.py            | 23 ++++++++-------------
 3 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/foodie_automator_google.py b/foodie_automator_google.py
index 8b93b94..e7b8f66 100644
--- a/foodie_automator_google.py
+++ b/foodie_automator_google.py
@@ -36,9 +36,21 @@ import fcntl
 
 load_dotenv()
 
+# Define constants at the top
+POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json'
+USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
+EXPIRATION_HOURS = 24
+IMAGE_EXPIRATION_DAYS = 7
+
 is_posting = False
 LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_google.lock"
 
+# Load JSON files after constants are defined
+posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
+posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in entry)
+used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
+used_images = set(entry["title"] for entry in used_images_data if "title" in entry)
+
 def signal_handler(sig, frame):
     logging.info("Received termination signal, checking if safe to exit...")
     if is_posting:
@@ -55,10 +67,6 @@ LOG_PRUNE_DAYS = 30
 MAX_RETRIES = 3
 RETRY_BACKOFF = 2
 
-posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
-posted_titles = set(entry["title"] for entry in posted_titles_data)
-used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)
-
 def setup_logging():
     if os.path.exists(LOG_FILE):
         with open(LOG_FILE, 'r') as f:
@@ -105,15 +113,6 @@ def setup_logging():
 
 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
-POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json'
-USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
-EXPIRATION_HOURS = 24
-IMAGE_EXPIRATION_DAYS = 7
-
-posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
-posted_titles = set(entry["title"] for entry in posted_titles_data)
-used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)
-
 def acquire_lock():
     os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
     lock_fd = open(LOCK_FILE, 'w')
diff --git a/foodie_automator_reddit.py b/foodie_automator_reddit.py
index 39e161c..e111cbe 100644
--- a/foodie_automator_reddit.py
+++ b/foodie_automator_reddit.py
@@ -125,7 +125,16 @@ def acquire_lock():
         sys.exit(0)
 
 def clean_reddit_title(title):
-    cleaned_title = re.sub(r'^\[.*?\]\s*', '', title).strip()
+    """Clean Reddit post title by removing prefixes, newlines, and special characters."""
+    if not title or not isinstance(title, str):
+        logging.warning(f"Invalid title received: {title}")
+        return ""
+    # Remove [prefixes], newlines, and excessive whitespace
+    cleaned_title = re.sub(r'^\[.*?\]\s*', '', title)  # Remove [prefix]
+    cleaned_title = re.sub(r'\n+', ' ', cleaned_title)  # Replace newlines with space
+    cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()  # Normalize spaces
+    # Remove special characters (keep alphanumeric, spaces, and basic punctuation)
+    cleaned_title = re.sub(r'[^\w\s.,!?-]', '', cleaned_title)
     logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
     return cleaned_title
 
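[Review note, not part of the patch] A minimal standalone sketch of what the new
cleaning pipeline produces; the regex steps are copied from the hunk above and the
sample title is invented. One quirk worth flagging: special characters are stripped
after whitespace normalization, so removing a character like '&' can reintroduce a
double space.

    import re

    def clean_title_sketch(title):
        cleaned = re.sub(r'^\[.*?\]\s*', '', title)     # strip leading [prefix]
        cleaned = re.sub(r'\n+', ' ', cleaned)          # newlines -> spaces
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()  # collapse whitespace runs
        return re.sub(r'[^\w\s.,!?-]', '', cleaned)     # drop other special chars

    print(clean_title_sketch("[Homemade]\nSmash burgers & fries!"))
    # -> 'Smash burgers  fries!' (the '&' is dropped, leaving a double space)
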
@@ -223,6 +232,7 @@ def fetch_duckduckgo_news_context(title, hours=24):
         return title
 
 def fetch_reddit_posts():
+    """Fetch Reddit posts from specified subreddits, filtering low-quality posts."""
     try:
         reddit = praw.Reddit(
             client_id=REDDIT_CLIENT_ID,
@@ -244,15 +254,26 @@ def fetch_reddit_posts():
                     logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
                     continue
                 cleaned_title = clean_reddit_title(submission.title)
+                if not cleaned_title or len(cleaned_title) < 5:
+                    logging.info(f"Skipping post with invalid or short title: {submission.title}")
+                    continue
+                # Filter out posts with empty or very short summaries
+                summary = submission.selftext.strip() if submission.selftext else ""
+                if len(summary) < 20 and not submission.url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
+                    logging.info(f"Skipping post with insufficient summary: {cleaned_title}")
+                    continue
+                # Fetch top comments for additional context
+                top_comments = get_top_comments(f"https://www.reddit.com{submission.permalink}", reddit)
                 articles.append({
                     "title": cleaned_title,
                     "raw_title": submission.title,
                     "link": f"https://www.reddit.com{submission.permalink}",
-                    "summary": submission.selftext,
+                    "summary": summary,
                     "feed_title": get_clean_source_name(subreddit_name),
                     "pub_date": pub_date,
                     "upvotes": submission.score,
-                    "comment_count": submission.num_comments
+                    "comment_count": submission.num_comments,
+                    "top_comments": top_comments
                 })
             logging.info(f"Fetched {len(articles)} posts from r/{subreddit_name}")
             break
@@ -283,16 +304,18 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
         title = post["title"]
         link = post.get("link", "")
         summary = post.get("summary", "")
-        source_name = post.get("source", "Reddit")
+        source_name = post.get("feed_title", "Reddit")
         original_source = f'{source_name}'
-        original_url = link  # Store for fallback
+        original_url = link
+        upvotes = post.get("upvotes", 0)
+        comment_count = post.get("comment_count", 0)
+        top_comments = post.get("top_comments", [])
 
         if title in posted_titles:
             logging.info(f"Skipping already posted Reddit post: {title}")
             attempts += 1
             continue
 
-        # Check author availability before GPT calls
         author = get_next_author_round_robin()
         if not author:
             logging.info(f"Skipping post '{title}' due to tweet rate limits for all authors")
@@ -317,8 +340,10 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
             continue
 
         ddg_context = fetch_duckduckgo_news_context(title)
-        scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
-        interest_score = is_interesting(scoring_content)
+        # Log full scoring content for debugging
+        scoring_content = f"Title: {title}\n\nContent: {summary}\n\nTop Comments: {top_comments}\n\nAdditional Context: {ddg_context}"
+        logging.debug(f"Scoring content for '{title}': {scoring_content}")
+        interest_score = is_interesting_reddit(title, summary, upvotes, comment_count, top_comments)
         logging.info(f"Interest score for '{title}': {interest_score}")
         if interest_score < 6:
             logging.info(f"Reddit Interest Too Low: {interest_score}")
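[Review note, not part of the patch] fetch_reddit_posts() now calls
get_top_comments(), whose definition is not included in this diff. One plausible
PRAW-based shape, for reference only; the limit parameter and list-of-strings
return type are assumptions:

    def get_top_comments(post_url, reddit, limit=3):
        """Return bodies of the highest-scoring top-level comments."""
        submission = reddit.submission(url=post_url)
        submission.comment_sort = "top"
        submission.comments.replace_more(limit=0)  # drop "load more" placeholders
        return [comment.body for comment in submission.comments[:limit]]

is_interesting_reddit(title, summary, upvotes, comment_count, top_comments) is
likewise assumed to return an integer score on the same scale as the old
is_interesting(), since the hunk above still rejects anything under 6.
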
diff --git a/foodie_utils.py b/foodie_utils.py
index 532016d..f222e86 100644
--- a/foodie_utils.py
+++ b/foodie_utils.py
@@ -1324,31 +1324,20 @@ def check_author_rate_limit(author, max_tweets=17, tweet_window_seconds=86400):
     author_info = rate_limit_info[username]
     script_run_id = author_info.get('script_run_id', 0)
 
-    # Check if quota has reset based on previous reset time
-    reset = author_info.get('tweet_reset', current_time + tweet_window_seconds)
-    if current_time >= reset:
-        logger.info(f"Quota reset for {username}, restoring to {max_tweets} tweets")
-        author_info['tweet_remaining'] = max_tweets
-        author_info['tweet_reset'] = current_time + tweet_window_seconds
-        author_info['tweets_posted_in_run'] = 0
-        author_info['script_run_id'] = check_author_rate_limit.script_run_id
-        rate_limit_info[username] = author_info
-        save_json_file(rate_limit_file, rate_limit_info)
-
     # If script restarted or new author, post a test tweet to sync quota
     if script_run_id != check_author_rate_limit.script_run_id:
         logger.info(f"Script restart detected for {username}, posting test tweet to sync quota")
         remaining, api_reset = get_x_rate_limit_status(author)
         if remaining is None or api_reset is None:
             # Fallback: Use last known quota or assume 0 remaining
-            if current_time < author_info.get('tweet_reset', 0):
+            if current_time < author_info.get('tweet_reset', current_time + tweet_window_seconds):
                 remaining = author_info.get('tweet_remaining', 0)
                 reset = author_info.get('tweet_reset', current_time + tweet_window_seconds)
                 logger.warning(f"Test tweet failed for {username}, using last known quota: {remaining} remaining")
             else:
-                remaining = max_tweets
+                remaining = 0  # Assume exhausted if API fails and reset time has passed
                 reset = current_time + tweet_window_seconds
-                logger.warning(f"Test tweet failed for {username}, resetting quota to {max_tweets}")
+                logger.warning(f"Test tweet failed for {username}, assuming quota exhausted")
         else:
             remaining = min(remaining, max_tweets)  # Ensure within Free tier limit
             reset = api_reset
@@ -1360,9 +1349,13 @@ def check_author_rate_limit(author, max_tweets=17, tweet_window_seconds=86400):
         author_info['script_run_id'] = check_author_rate_limit.script_run_id
         rate_limit_info[username] = author_info
         save_json_file(rate_limit_file, rate_limit_info)
+    else:
+        # Use existing quota without resetting
+        remaining = author_info.get('tweet_remaining', max_tweets)
+        reset = author_info.get('tweet_reset', current_time + tweet_window_seconds)
 
     # Calculate remaining tweets
-    remaining = author_info['tweet_remaining'] - author_info['tweets_posted_in_run']
+    remaining = remaining - author_info.get('tweets_posted_in_run', 0)
     can_post = remaining > 0
     if not can_post:
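[Review note, not part of the patch] The rewritten fallback now assumes the quota
is exhausted when get_x_rate_limit_status() fails after the stored reset time has
passed, instead of optimistically restoring max_tweets as before. The decision,
distilled into a pure function for illustration; the function name and shape here
are illustrative, not part of foodie_utils:

    def fallback_quota(author_info, current_time, window=86400):
        """Quota to assume when the rate-limit status call fails."""
        reset = author_info.get('tweet_reset', current_time + window)
        if current_time < reset:
            # Reset not yet reached: trust the last known remaining count.
            return author_info.get('tweet_remaining', 0), reset
        # Reset has passed but the API is unreachable: assume exhausted rather
        # than optimistically granting a fresh allowance.
        return 0, current_time + window

    print(fallback_quota({'tweet_remaining': 5, 'tweet_reset': 2000}, current_time=1000))
    # -> (5, 2000)
    print(fallback_quota({'tweet_remaining': 5, 'tweet_reset': 2000}, current_time=3000))
    # -> (0, 89400)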