From 5b4b4c0253f8faca10b40ee3575ad49f27e09b16 Mon Sep 17 00:00:00 2001 From: Shane Date: Sat, 26 Apr 2025 14:47:25 +1000 Subject: [PATCH] update reddit --- foodie_automator_google.py | 4 +++- foodie_automator_reddit.py | 36 ++++++++++++++++++++++++------------ foodie_automator_rss.py | 2 ++ foodie_utils.py | 6 ++++-- 4 files changed, 33 insertions(+), 15 deletions(-) diff --git a/foodie_automator_google.py b/foodie_automator_google.py index 56c614a..cf007f8 100644 --- a/foodie_automator_google.py +++ b/foodie_automator_google.py @@ -28,7 +28,9 @@ from foodie_utils import ( prepare_post_data, smart_image_and_filter, insert_link_naturally, get_flickr_image_via_ddg ) from foodie_hooks import get_dynamic_hook, select_best_cta +from dotenv import load_dotenv +load_dotenv() # Flag to indicate if we're in the middle of posting is_posting = False @@ -53,7 +55,7 @@ console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %( logger.addHandler(console_handler) logging.info("Logging initialized for foodie_automator_google.py") -client = OpenAI(api_key="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") +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json' USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json' diff --git a/foodie_automator_reddit.py b/foodie_automator_reddit.py index 3f28e1b..0b0f244 100644 --- a/foodie_automator_reddit.py +++ b/foodie_automator_reddit.py @@ -6,6 +6,7 @@ import os import json import signal import sys +import re from datetime import datetime, timedelta, timezone from openai import OpenAI from urllib.parse import quote @@ -83,7 +84,14 @@ posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) used_images = set(entry["title"] for entry in used_images_data if "title" in entry) -client = OpenAI(api_key="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") +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +def clean_reddit_title(title): + """Remove Reddit flairs like [pro/chef] or [homemade] from the title.""" + # Match patterns like [pro/chef], [homemade], etc. at the start of the title + cleaned_title = re.sub(r'^\[.*?\]\s*', '', title).strip() + logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'") + return cleaned_title def is_interesting_reddit(title, summary, upvotes, comment_count, top_comments): try: @@ -160,8 +168,11 @@ def fetch_reddit_posts(): if pub_date < cutoff_date: logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})") continue + # Clean the title before storing + cleaned_title = clean_reddit_title(submission.title) articles.append({ - "title": submission.title, + "title": cleaned_title, # Use cleaned title + "raw_title": submission.title, # Store raw title for reference if needed "link": f"https://www.reddit.com{submission.permalink}", "summary": submission.selftext, "feed_title": get_clean_source_name(subreddit_name), @@ -196,15 +207,16 @@ def curate_from_reddit(): max_attempts = 10 while attempts < max_attempts and articles: article = articles.pop(0) # Take highest-upvote post - title = article["title"] + title = article["title"] # Use cleaned title + raw_title = article["raw_title"] # Use raw title for deduplication link = article["link"] summary = article["summary"] source_name = "Reddit" original_source = 'Reddit' - if title in posted_titles: - print(f"Skipping already posted post: {title}") - logging.info(f"Skipping already posted post: {title}") + if raw_title in posted_titles: # Check against raw title + print(f"Skipping already posted post: {raw_title}") + logging.info(f"Skipping already posted post: {raw_title}") attempts += 1 continue @@ -220,7 +232,7 @@ def curate_from_reddit(): top_comments = get_top_comments(link, reddit, limit=3) interest_score = is_interesting_reddit( - title, + title, # Use cleaned title summary, article["upvotes"], article["comment_count"], @@ -241,7 +253,7 @@ def curate_from_reddit(): "Do NOT introduce unrelated concepts unless in the content or comments. " "If brief, expand on the core idea with relevant context about its appeal or significance." ) - content_to_summarize = f"{title}\n\n{summary}" + content_to_summarize = f"{title}\n\n{summary}" # Use cleaned title if top_comments: content_to_summarize += f"\n\nTop Comments:\n{'\n'.join(top_comments)}" @@ -259,7 +271,7 @@ def curate_from_reddit(): final_summary = insert_link_naturally(final_summary, source_name, link) - post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title) + post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title) # Use cleaned title if not post_data: attempts += 1 continue @@ -314,9 +326,9 @@ def curate_from_reddit(): is_posting = False timestamp = datetime.now(timezone.utc).isoformat() - save_json_file(POSTED_TITLES_FILE, title, timestamp) - posted_titles.add(title) - logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE} with timestamp {timestamp}") + save_json_file(POSTED_TITLES_FILE, raw_title, timestamp) # Save raw title + posted_titles.add(raw_title) # Add raw title to set + logging.info(f"Successfully saved '{raw_title}' to {POSTED_TITLES_FILE} with timestamp {timestamp}") if image_url: save_json_file(USED_IMAGES_FILE, image_url, timestamp) diff --git a/foodie_automator_rss.py b/foodie_automator_rss.py index 2de6b0f..91554cf 100644 --- a/foodie_automator_rss.py +++ b/foodie_automator_rss.py @@ -24,7 +24,9 @@ from foodie_hooks import get_dynamic_hook, select_best_cta import feedparser from concurrent.futures import ThreadPoolExecutor, as_completed from typing import List, Dict, Any, Optional +from dotenv import load_dotenv +load_dotenv() # Flag to indicate if we're in the middle of posting is_posting = False diff --git a/foodie_utils.py b/foodie_utils.py index 8c8b0a2..6c2d053 100644 --- a/foodie_utils.py +++ b/foodie_utils.py @@ -10,6 +10,8 @@ import io import tempfile import requests import time +from dotenv import load_dotenv +import os from datetime import datetime, timezone, timedelta from openai import OpenAI from urllib.parse import quote @@ -21,8 +23,8 @@ from foodie_config import ( RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS, SUMMARY_PERSONA_PROMPTS, get_clean_source_name, AUTHORS, LIGHT_TASK_MODEL, SUMMARY_MODEL ) -#test -client = OpenAI(api_key="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") +load_dotenv() +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) def load_json_file(filename, expiration_days=None): data = []