import requests import random import time import logging import os import json import signal import sys import re from datetime import datetime, timedelta, timezone from openai import OpenAI from urllib.parse import quote from requests.packages.urllib3.util.retry import Retry from requests.adapters import HTTPAdapter import praw from foodie_config import ( AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS, SUMMARY_PERSONA_PROMPTS, CATEGORIES, CTAS, get_clean_source_name, REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, LIGHT_TASK_MODEL ) from foodie_utils import ( load_json_file, save_json_file, get_image, generate_image_query, upload_image_to_wp, determine_paragraph_count, insert_link_naturally, summarize_with_gpt4o, generate_category_from_summary, post_to_wp, prepare_post_data, select_best_author, smart_image_and_filter, get_flickr_image_via_ddg ) from foodie_hooks import get_dynamic_hook, select_best_cta # Flag to indicate if we're in the middle of posting is_posting = False def signal_handler(sig, frame): logging.info("Received termination signal, checking if safe to exit...") if is_posting: logging.info("Currently posting, will exit after completion.") else: logging.info("Safe to exit immediately.") sys.exit(0) signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGINT, signal_handler) LOG_FILE = "/home/shane/foodie_automator/foodie_automator_reddit.log" LOG_PRUNE_DAYS = 30 def setup_logging(): if os.path.exists(LOG_FILE): with open(LOG_FILE, 'r') as f: lines = f.readlines() cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS) pruned_lines = [] for line in lines: try: timestamp = datetime.strptime(line[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc) if timestamp > cutoff: pruned_lines.append(line) except ValueError: logging.warning(f"Skipping malformed log line: {line.strip()[:50]}...") continue with open(LOG_FILE, 'w') as f: f.writelines(pruned_lines) logging.basicConfig( filename=LOG_FILE, level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ) logging.getLogger("requests").setLevel(logging.WARNING) logging.getLogger("prawcore").setLevel(logging.WARNING) console_handler = logging.StreamHandler() console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) logging.getLogger().addHandler(console_handler) logging.info("Logging initialized for foodie_automator_reddit.py") setup_logging() POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_reddit_titles.json' USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json' EXPIRATION_HOURS = 24 IMAGE_EXPIRATION_DAYS = 7 posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS) posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in entry) used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) used_images = set(entry["title"] for entry in used_images_data if "title" in entry) client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) def clean_reddit_title(title): """Remove Reddit flairs like [pro/chef] or [homemade] from the title.""" # Match patterns like [pro/chef], [homemade], etc. at the start of the title cleaned_title = re.sub(r'^\[.*?\]\s*', '', title).strip() logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'") return cleaned_title def is_interesting_reddit(title, summary, upvotes, comment_count, top_comments): try: content = f"Title: {title}\n\nContent: {summary}" if top_comments: content += f"\n\nTop Comments:\n{'\n'.join(top_comments)}" response = client.chat.completions.create( model=LIGHT_TASK_MODEL, messages=[ {"role": "system", "content": ( "Rate this Reddit post from 0-10 based on rarity, buzzworthiness, and engagement potential for food lovers, covering food topics (skip recipes). " "Score 8-10 for rare, highly shareable ideas (e.g., unique dishes or restaurant trends). " "Score 5-7 for fresh, engaging updates with broad appeal. Score below 5 for common or unremarkable content. " "Consider comments for added context (e.g., specific locations or unique details). " "Return only a number." )}, {"role": "user", "content": content} ], max_tokens=5 ) base_score = int(response.choices[0].message.content.strip()) if response.choices[0].message.content.strip().isdigit() else 0 engagement_boost = 0 if upvotes >= 500: engagement_boost += 3 elif upvotes >= 100: engagement_boost += 2 elif upvotes >= 50: engagement_boost += 1 if comment_count >= 100: engagement_boost += 2 elif comment_count >= 20: engagement_boost += 1 final_score = min(base_score + engagement_boost, 10) logging.info(f"Reddit Interest Score: {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count}, top_comments: {len(top_comments)}) for '{title}'") print(f"Interest Score for '{title[:50]}...': {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count})") return final_score except Exception as e: logging.error(f"Reddit interestingness scoring failed: {e}") print(f"Reddit Interest Error: {e}") return 0 def get_top_comments(post_url, reddit, limit=3): try: submission = reddit.submission(url=post_url) submission.comments.replace_more(limit=0) submission.comment_sort = 'top' top_comments = [comment.body for comment in submission.comments[:limit] if not comment.body.startswith('[deleted]')] logging.info(f"Fetched {len(top_comments)} top comments for {post_url}") return top_comments except Exception as e: logging.error(f"Failed to fetch comments for {post_url}: {e}") return [] def fetch_reddit_posts(): reddit = praw.Reddit( client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT ) feeds = ['FoodPorn', 'restaurant', 'FoodIndustry', 'food'] articles = [] cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS) logging.info(f"Starting fetch with cutoff date: {cutoff_date}") for subreddit_name in feeds: try: subreddit = reddit.subreddit(subreddit_name) for submission in subreddit.top(time_filter='day', limit=100): pub_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc) if pub_date < cutoff_date: logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})") continue # Clean the title before storing cleaned_title = clean_reddit_title(submission.title) articles.append({ "title": cleaned_title, # Use cleaned title "raw_title": submission.title, # Store raw title for reference if needed "link": f"https://www.reddit.com{submission.permalink}", "summary": submission.selftext, "feed_title": get_clean_source_name(subreddit_name), "pub_date": pub_date, "upvotes": submission.score, "comment_count": submission.num_comments }) logging.info(f"Fetched {len(articles)} posts from r/{subreddit_name}") except Exception as e: logging.error(f"Failed to fetch Reddit feed r/{subreddit_name}: {e}") logging.info(f"Total Reddit posts fetched: {len(articles)}") return articles def curate_from_reddit(): articles = fetch_reddit_posts() if not articles: print("No Reddit posts available") logging.info("No Reddit posts available") return None, None, None # Sort by upvotes descending articles.sort(key=lambda x: x["upvotes"], reverse=True) reddit = praw.Reddit( client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT ) attempts = 0 max_attempts = 10 while attempts < max_attempts and articles: article = articles.pop(0) # Take highest-upvote post title = article["title"] # Use cleaned title raw_title = article["raw_title"] # Use raw title for deduplication link = article["link"] summary = article["summary"] source_name = "Reddit" original_source = 'Reddit' if raw_title in posted_titles: # Check against raw title print(f"Skipping already posted post: {raw_title}") logging.info(f"Skipping already posted post: {raw_title}") attempts += 1 continue print(f"Trying Reddit Post: {title} from {source_name}") logging.info(f"Trying Reddit Post: {title} from {source_name}") image_query, relevance_keywords, skip = smart_image_and_filter(title, summary) if skip or any(keyword in title.lower() or keyword in summary.lower() for keyword in RECIPE_KEYWORDS + ["homemade"]): print(f"Skipping filtered Reddit post: {title}") logging.info(f"Skipping filtered Reddit post: {title}") attempts += 1 continue top_comments = get_top_comments(link, reddit, limit=3) interest_score = is_interesting_reddit( title, # Use cleaned title summary, article["upvotes"], article["comment_count"], top_comments ) logging.info(f"Interest Score: {interest_score} for '{title}'") if interest_score < 6: print(f"Reddit Interest Too Low: {interest_score}") logging.info(f"Reddit Interest Too Low: {interest_score}") attempts += 1 continue num_paragraphs = determine_paragraph_count(interest_score) extra_prompt = ( f"Generate exactly {num_paragraphs} paragraphs. " f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details. " "Incorporate relevant insights from these top comments if available: {', '.join(top_comments) if top_comments else 'None'}. " "Do NOT introduce unrelated concepts unless in the content or comments. " "If brief, expand on the core idea with relevant context about its appeal or significance." ) content_to_summarize = f"{title}\n\n{summary}" # Use cleaned title if top_comments: content_to_summarize += f"\n\nTop Comments:\n{'\n'.join(top_comments)}" final_summary = summarize_with_gpt4o( content_to_summarize, source_name, link, interest_score=interest_score, extra_prompt=extra_prompt ) if not final_summary: logging.info(f"Summary failed for '{title}'") attempts += 1 continue final_summary = insert_link_naturally(final_summary, source_name, link) post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title) # Use cleaned title if not post_data: attempts += 1 continue image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords) if not image_url: image_url, image_source, uploader, page_url = get_image(image_query) hook = get_dynamic_hook(post_data["title"]).strip() cta = select_best_cta(post_data["title"], final_summary, post_url=None) post_data["content"] = f"{final_summary}\n\n{cta}" global is_posting is_posting = True try: post_id, post_url = post_to_wp( post_data=post_data, category=category, link=link, author=author, image_url=image_url, original_source=original_source, image_source=image_source, uploader=uploader, pixabay_url=pixabay_url, interest_score=interest_score ) finally: is_posting = False if post_id: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url) post_data["content"] = f"{final_summary}\n\n{cta}" is_posting = True try: post_to_wp( post_data=post_data, category=category, link=link, author=author, image_url=image_url, original_source=original_source, image_source=image_source, uploader=uploader, pixabay_url=pixabay_url, interest_score=interest_score, post_id=post_id ) finally: is_posting = False timestamp = datetime.now(timezone.utc).isoformat() save_json_file(POSTED_TITLES_FILE, raw_title, timestamp) # Save raw title posted_titles.add(raw_title) # Add raw title to set logging.info(f"Successfully saved '{raw_title}' to {POSTED_TITLES_FILE} with timestamp {timestamp}") if image_url: save_json_file(USED_IMAGES_FILE, image_url, timestamp) used_images.add(image_url) logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE} with timestamp {timestamp}") print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Reddit *****") print(f"Actual post URL: {post_url}") logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Reddit *****") logging.info(f"Actual post URL: {post_url}") return post_data, category, random.randint(0, 1800) attempts += 1 logging.info(f"WP posting failed for '{post_data['title']}'") print("No interesting Reddit post found after attempts") logging.info("No interesting Reddit post found after attempts") return None, None, random.randint(600, 1800) def run_reddit_automator(): print(f"{datetime.now(timezone.utc)} - INFO - ***** Reddit Automator Launched *****") logging.info("***** Reddit Automator Launched *****") post_data, category, sleep_time = curate_from_reddit() if not post_data: print(f"No postable Reddit article found - sleeping for {sleep_time} seconds") logging.info(f"No postable Reddit article found - sleeping for {sleep_time} seconds") else: print(f"Completed Reddit run with sleep time: {sleep_time} seconds") logging.info(f"Completed Reddit run with sleep time: {sleep_time} seconds") print(f"Sleeping for {sleep_time}s") time.sleep(sleep_time) return post_data, category, sleep_time if __name__ == "__main__": run_reddit_automator()