|
|
|
@ -125,7 +125,16 @@ def acquire_lock(): |
|
|
|
sys.exit(0) |
|
|
|
sys.exit(0) |
|
|
|
|
|
|
|
|
|
|
|
def clean_reddit_title(title):
    """Clean a Reddit post title for downstream use.

    Strips one or more leading "[tag]" prefixes (e.g. "[Serious][Update] ..."),
    replaces newlines with spaces, collapses runs of whitespace, and removes
    special characters (keeping word characters, whitespace, and basic
    punctuation: . , ! ? -).

    Args:
        title: Raw submission title; may be None or a non-string value.

    Returns:
        The cleaned title string, or "" when the input is missing or not a str.
    """
    if not title or not isinstance(title, str):
        logging.warning(f"Invalid title received: {title}")
        return ""
    # Remove ALL leading [prefix] tags, not just the first one — titles like
    # "[Serious][Update] foo" should lose both tags.
    cleaned_title = re.sub(r'^(?:\[.*?\]\s*)+', '', title)
    cleaned_title = re.sub(r'\n+', ' ', cleaned_title)  # Replace newlines with space
    cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()  # Normalize spaces
    # Remove special characters (keep alphanumeric, spaces, and basic punctuation)
    cleaned_title = re.sub(r'[^\w\s.,!?-]', '', cleaned_title)
    logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
    return cleaned_title
|
|
|
|
|
|
|
|
|
|
|
@ -223,6 +232,7 @@ def fetch_duckduckgo_news_context(title, hours=24): |
|
|
|
return title |
|
|
|
return title |
|
|
|
|
|
|
|
|
|
|
|
def fetch_reddit_posts(): |
|
|
|
def fetch_reddit_posts(): |
|
|
|
|
|
|
|
"""Fetch Reddit posts from specified subreddits, filtering low-quality posts.""" |
|
|
|
try: |
|
|
|
try: |
|
|
|
reddit = praw.Reddit( |
|
|
|
reddit = praw.Reddit( |
|
|
|
client_id=REDDIT_CLIENT_ID, |
|
|
|
client_id=REDDIT_CLIENT_ID, |
|
|
|
@ -244,15 +254,26 @@ def fetch_reddit_posts(): |
|
|
|
logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})") |
|
|
|
logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})") |
|
|
|
continue |
|
|
|
continue |
|
|
|
cleaned_title = clean_reddit_title(submission.title) |
|
|
|
cleaned_title = clean_reddit_title(submission.title) |
|
|
|
|
|
|
|
if not cleaned_title or len(cleaned_title) < 5: |
|
|
|
|
|
|
|
logging.info(f"Skipping post with invalid or short title: {submission.title}") |
|
|
|
|
|
|
|
continue |
|
|
|
|
|
|
|
# Filter out posts with empty or very short summaries |
|
|
|
|
|
|
|
summary = submission.selftext.strip() if submission.selftext else "" |
|
|
|
|
|
|
|
if len(summary) < 20 and not submission.url.endswith(('.jpg', '.jpeg', '.png', '.gif')): |
|
|
|
|
|
|
|
logging.info(f"Skipping post with insufficient summary: {cleaned_title}") |
|
|
|
|
|
|
|
continue |
|
|
|
|
|
|
|
# Fetch top comments for additional context |
|
|
|
|
|
|
|
top_comments = get_top_comments(f"https://www.reddit.com{submission.permalink}", reddit) |
|
|
|
articles.append({ |
|
|
|
articles.append({ |
|
|
|
"title": cleaned_title, |
|
|
|
"title": cleaned_title, |
|
|
|
"raw_title": submission.title, |
|
|
|
"raw_title": submission.title, |
|
|
|
"link": f"https://www.reddit.com{submission.permalink}", |
|
|
|
"link": f"https://www.reddit.com{submission.permalink}", |
|
|
|
"summary": submission.selftext, |
|
|
|
"summary": summary, |
|
|
|
"feed_title": get_clean_source_name(subreddit_name), |
|
|
|
"feed_title": get_clean_source_name(subreddit_name), |
|
|
|
"pub_date": pub_date, |
|
|
|
"pub_date": pub_date, |
|
|
|
"upvotes": submission.score, |
|
|
|
"upvotes": submission.score, |
|
|
|
"comment_count": submission.num_comments |
|
|
|
"comment_count": submission.num_comments, |
|
|
|
|
|
|
|
"top_comments": top_comments |
|
|
|
}) |
|
|
|
}) |
|
|
|
logging.info(f"Fetched {len(articles)} posts from r/{subreddit_name}") |
|
|
|
logging.info(f"Fetched {len(articles)} posts from r/{subreddit_name}") |
|
|
|
break |
|
|
|
break |
|
|
|
@ -283,16 +304,18 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used |
|
|
|
title = post["title"] |
|
|
|
title = post["title"] |
|
|
|
link = post.get("link", "") |
|
|
|
link = post.get("link", "") |
|
|
|
summary = post.get("summary", "") |
|
|
|
summary = post.get("summary", "") |
|
|
|
source_name = post.get("source", "Reddit") |
|
|
|
source_name = post.get("feed_title", "Reddit") |
|
|
|
original_source = f'<a href="{link}">{source_name}</a>' |
|
|
|
original_source = f'<a href="{link}">{source_name}</a>' |
|
|
|
original_url = link # Store for fallback |
|
|
|
original_url = link |
|
|
|
|
|
|
|
upvotes = post.get("upvotes", 0) |
|
|
|
|
|
|
|
comment_count = post.get("comment_count", 0) |
|
|
|
|
|
|
|
top_comments = post.get("top_comments", []) |
|
|
|
|
|
|
|
|
|
|
|
if title in posted_titles: |
|
|
|
if title in posted_titles: |
|
|
|
logging.info(f"Skipping already posted Reddit post: {title}") |
|
|
|
logging.info(f"Skipping already posted Reddit post: {title}") |
|
|
|
attempts += 1 |
|
|
|
attempts += 1 |
|
|
|
continue |
|
|
|
continue |
|
|
|
|
|
|
|
|
|
|
|
# Check author availability before GPT calls |
|
|
|
|
|
|
|
author = get_next_author_round_robin() |
|
|
|
author = get_next_author_round_robin() |
|
|
|
if not author: |
|
|
|
if not author: |
|
|
|
logging.info(f"Skipping post '{title}' due to tweet rate limits for all authors") |
|
|
|
logging.info(f"Skipping post '{title}' due to tweet rate limits for all authors") |
|
|
|
@ -317,8 +340,10 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used |
|
|
|
continue |
|
|
|
continue |
|
|
|
|
|
|
|
|
|
|
|
ddg_context = fetch_duckduckgo_news_context(title) |
|
|
|
ddg_context = fetch_duckduckgo_news_context(title) |
|
|
|
scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}" |
|
|
|
# Log full scoring content for debugging |
|
|
|
interest_score = is_interesting(scoring_content) |
|
|
|
scoring_content = f"Title: {title}\n\nContent: {summary}\n\nTop Comments: {top_comments}\n\nAdditional Context: {ddg_context}" |
|
|
|
|
|
|
|
logging.debug(f"Scoring content for '{title}': {scoring_content}") |
|
|
|
|
|
|
|
interest_score = is_interesting_reddit(title, summary, upvotes, comment_count, top_comments) |
|
|
|
logging.info(f"Interest score for '{title}': {interest_score}") |
|
|
|
logging.info(f"Interest score for '{title}': {interest_score}") |
|
|
|
if interest_score < 6: |
|
|
|
if interest_score < 6: |
|
|
|
logging.info(f"Reddit Interest Too Low: {interest_score}") |
|
|
|
logging.info(f"Reddit Interest Too Low: {interest_score}") |
|
|
|
|