Rate Limit Handling

2025-05-10 20:15:03 +10:00
parent 903dbf21d0
commit 7dafac8615
3 changed files with 53 additions and 36 deletions
@@ -125,7 +125,16 @@ def acquire_lock():
        sys.exit(0)

 def clean_reddit_title(title):
-    cleaned_title = re.sub(r'^\[.*?\]\s*', '', title).strip()
+    """Clean Reddit post title by removing prefixes, newlines, and special characters."""
+    if not title or not isinstance(title, str):
+        logging.warning(f"Invalid title received: {title}")
+        return ""
+    # Remove [prefixes], newlines, and excessive whitespace
+    cleaned_title = re.sub(r'^\[.*?\]\s*', '', title)  # Remove [prefix]
+    cleaned_title = re.sub(r'\n+', ' ', cleaned_title)  # Replace newlines with space
+    cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()  # Normalize spaces
+    # Remove special characters (keep alphanumeric, spaces, and basic punctuation)
+    cleaned_title = re.sub(r'[^\w\s.,!?-]', '', cleaned_title)
    logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
    return cleaned_title

@@ -223,6 +232,7 @@ def fetch_duckduckgo_news_context(title, hours=24):
    return title

 def fetch_reddit_posts():
+    """Fetch Reddit posts from specified subreddits, filtering low-quality posts."""
    try:
        reddit = praw.Reddit(
            client_id=REDDIT_CLIENT_ID,
@@ -244,15 +254,26 @@ def fetch_reddit_posts():
                            logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
                            continue
                        cleaned_title = clean_reddit_title(submission.title)
+                        if not cleaned_title or len(cleaned_title) < 5:
+                            logging.info(f"Skipping post with invalid or short title: {submission.title}")
+                            continue
+                        # Filter out posts with empty or very short summaries
+                        summary = submission.selftext.strip() if submission.selftext else ""
+                        if len(summary) < 20 and not submission.url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
+                            logging.info(f"Skipping post with insufficient summary: {cleaned_title}")
+                            continue
+                        # Fetch top comments for additional context
+                        top_comments = get_top_comments(f"https://www.reddit.com{submission.permalink}", reddit)
                        articles.append({
                            "title": cleaned_title,
                            "raw_title": submission.title,
                            "link": f"https://www.reddit.com{submission.permalink}",
-                            "summary": submission.selftext,
+                            "summary": summary,
                            "feed_title": get_clean_source_name(subreddit_name),
                            "pub_date": pub_date,
                            "upvotes": submission.score,
-                            "comment_count": submission.num_comments
+                            "comment_count": submission.num_comments,
+                            "top_comments": top_comments
                        })
                    logging.info(f"Fetched {len(articles)} posts from r/{subreddit_name}")
                    break
@@ -283,16 +304,18 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
            title = post["title"]
            link = post.get("link", "")
            summary = post.get("summary", "")
-            source_name = post.get("source", "Reddit")
+            source_name = post.get("feed_title", "Reddit")
            original_source = f'<a href="{link}">{source_name}</a>'
-            original_url = link  # Store for fallback
+            original_url = link
+            upvotes = post.get("upvotes", 0)
+            comment_count = post.get("comment_count", 0)
+            top_comments = post.get("top_comments", [])

            if title in posted_titles:
                logging.info(f"Skipping already posted Reddit post: {title}")
                attempts += 1
                continue

-            # Check author availability before GPT calls
            author = get_next_author_round_robin()
            if not author:
                logging.info(f"Skipping post '{title}' due to tweet rate limits for all authors")
@@ -317,8 +340,10 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
                continue

            ddg_context = fetch_duckduckgo_news_context(title)
-            scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
-            interest_score = is_interesting(scoring_content)
+            # Log full scoring content for debugging
+            scoring_content = f"Title: {title}\n\nContent: {summary}\n\nTop Comments: {top_comments}\n\nAdditional Context: {ddg_context}"
+            logging.debug(f"Scoring content for '{title}': {scoring_content}")
+            interest_score = is_interesting_reddit(title, summary, upvotes, comment_count, top_comments)
            logging.info(f"Interest score for '{title}': {interest_score}")
            if interest_score < 6:
                logging.info(f"Reddit Interest Too Low: {interest_score}")