@@ -6,6 +6,7 @@ import os
 import json
 import signal
 import sys
+import re
 from datetime import datetime, timedelta, timezone
 from openai import OpenAI
 from urllib.parse import quote
@@ -83,7 +84,14 @@ posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in
 used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
 used_images = set(entry["title"] for entry in used_images_data if "title" in entry)
 
-client = OpenAI(api_key="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")
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+def clean_reddit_title(title):
+    """Remove Reddit flairs like [pro/chef] or [homemade] from the title."""
+    # Match patterns like [pro/chef], [homemade], etc. at the start of the title
+    cleaned_title = re.sub(r'^\[.*?\]\s*', '', title).strip()
+    logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
+    return cleaned_title
 
 def is_interesting_reddit(title, summary, upvotes, comment_count, top_comments):
     try:
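
The hardcoded key removed above is the point of this hunk: with os.getenv("OPENAI_API_KEY"), the script now expects the key in the environment (e.g. export OPENAI_API_KEY=...). A quick sanity check of the new flair-stripping helper, as a minimal sketch: the function body is copied from the hunk above (logging omitted), and the sample titles are invented.

    import re

    def clean_reddit_title(title):
        """Remove Reddit flairs like [pro/chef] or [homemade] from the title."""
        # The non-greedy match strips exactly one leading [bracketed] tag
        # plus any spaces that follow it.
        return re.sub(r'^\[.*?\]\s*', '', title).strip()

    assert clean_reddit_title("[homemade] Detroit-style pizza") == "Detroit-style pizza"
    assert clean_reddit_title("[pro/chef] Duck confit") == "Duck confit"
    # Brackets that are not at the start of the title are left intact:
    assert clean_reddit_title("Duck confit [pro/chef]") == "Duck confit [pro/chef]"
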
@@ -160,8 +168,11 @@ def fetch_reddit_posts():
             if pub_date < cutoff_date:
                 logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
                 continue
+            # Clean the title before storing
+            cleaned_title = clean_reddit_title(submission.title)
             articles.append({
-                "title": submission.title,
+                "title": cleaned_title,  # Use cleaned title
+                "raw_title": submission.title,  # Store raw title for reference if needed
                 "link": f"https://www.reddit.com{submission.permalink}",
                 "summary": submission.selftext,
                 "feed_title": get_clean_source_name(subreddit_name),
@@ -196,15 +207,16 @@ def curate_from_reddit():
     max_attempts = 10
     while attempts < max_attempts and articles:
         article = articles.pop(0)  # Take highest-upvote post
-        title = article["title"]
+        title = article["title"]  # Use cleaned title
+        raw_title = article["raw_title"]  # Use raw title for deduplication
         link = article["link"]
         summary = article["summary"]
         source_name = "Reddit"
         original_source = '<a href="https://www.reddit.com/">Reddit</a>'
 
-        if title in posted_titles:
-            print(f"Skipping already posted post: {title}")
-            logging.info(f"Skipping already posted post: {title}")
+        if raw_title in posted_titles:  # Check against raw title
+            print(f"Skipping already posted post: {raw_title}")
+            logging.info(f"Skipping already posted post: {raw_title}")
             attempts += 1
             continue
 
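
Deduplicating on raw_title rather than the cleaned title is deliberate: runs prior to this change saved submission.title verbatim to POSTED_TITLES_FILE, flair included, so a cleaned-title lookup would treat those old entries as unseen and re-post them. A minimal sketch with invented titles:

    posted_titles = {"[homemade] Detroit-style pizza"}  # as persisted by earlier runs

    raw_title = "[homemade] Detroit-style pizza"
    title = "Detroit-style pizza"  # cleaned form used for display and summarization

    assert raw_title in posted_titles   # duplicate correctly detected
    assert title not in posted_titles   # the cleaned title alone would slip through
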
@@ -220,7 +232,7 @@ def curate_from_reddit():
 
         top_comments = get_top_comments(link, reddit, limit=3)
         interest_score = is_interesting_reddit(
-            title,
+            title,  # Use cleaned title
             summary,
             article["upvotes"],
             article["comment_count"],
@@ -241,7 +253,7 @@ def curate_from_reddit():
             "Do NOT introduce unrelated concepts unless in the content or comments. "
             "If brief, expand on the core idea with relevant context about its appeal or significance."
         )
-        content_to_summarize = f"{title}\n\n{summary}"
+        content_to_summarize = f"{title}\n\n{summary}"  # Use cleaned title
         if top_comments:
             content_to_summarize += f"\n\nTop Comments:\n{'\n'.join(top_comments)}"
 
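
One portability note on the unchanged context line above: a backslash inside an f-string replacement field, as in {'\n'.join(top_comments)}, is only valid syntax on Python 3.12+ (PEP 701); older interpreters raise SyntaxError at parse time. A version-agnostic equivalent, if the script must run on 3.11 or earlier:

    joined_comments = "\n".join(top_comments)
    content_to_summarize += f"\n\nTop Comments:\n{joined_comments}"
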
@@ -259,7 +271,7 @@ def curate_from_reddit():
 
         final_summary = insert_link_naturally(final_summary, source_name, link)
 
-        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
+        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)  # Use cleaned title
         if not post_data:
             attempts += 1
             continue
@@ -314,9 +326,9 @@ def curate_from_reddit():
         is_posting = False
 
         timestamp = datetime.now(timezone.utc).isoformat()
-        save_json_file(POSTED_TITLES_FILE, title, timestamp)
-        posted_titles.add(title)
-        logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE} with timestamp {timestamp}")
+        save_json_file(POSTED_TITLES_FILE, raw_title, timestamp)  # Save raw title
+        posted_titles.add(raw_title)  # Add raw title to set
+        logging.info(f"Successfully saved '{raw_title}' to {POSTED_TITLES_FILE} with timestamp {timestamp}")
 
         if image_url:
             save_json_file(USED_IMAGES_FILE, image_url, timestamp)
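
For reference, the load path near the top of the file builds posted_titles from entry["title"] on each record, so save_json_file presumably appends records shaped roughly like the sketch below. This is an assumption; the helper's body is outside this diff.

    # Hypothetical shape of a record appended by save_json_file
    # (assumption: its implementation is not shown in this diff):
    # {"title": raw_title, "timestamp": timestamp}
    #
    # load_json_file(FILE, days) presumably uses the timestamp to drop entries
    # older than the given window before the set comprehension rebuilds
    # posted_titles from the surviving "title" values.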