# foodie_automator_reddit.py
"""Curate food-related Reddit posts and hand summaries to ``post_to_wp``.

The module fetches the day's top submissions from a fixed list of subreddits,
scores them with an OpenAI model plus an engagement boost, summarizes the best
unposted candidate and passes the prepared post to ``post_to_wp``.
"""
import requests
import random
import time
import logging
import os
import json
import signal
import sys
import re
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Optional, Tuple, Set, Union
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import praw
from dotenv import load_dotenv
from foodie_config import (
    AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
    PERSONA_CONFIGS, CATEGORIES, get_clean_source_name,
    REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT,
    LIGHT_TASK_MODEL, X_API_CREDENTIALS, FILE_PATHS, EXPIRATION_DAYS,
    IMAGE_EXPIRATION_DAYS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
    prepare_post_data, select_best_author, smart_image_and_filter,
    get_flickr_image
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt

# Load environment variables
load_dotenv()

# Global state
is_posting = False           # True while post_to_wp() is running
shutdown_requested = False   # Set by the signal handler when it must defer the exit
logger = logging.getLogger(__name__)


class RedditScraper:
    """Fetch, filter, score and summarize Reddit posts for publication."""

    def __init__(self):
        """Configure logging, signals and API clients, then load persisted state."""
        self.setup_logging()
        self.setup_signal_handlers()
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.posted_titles = self.load_posted_titles()
        self.used_images = self.load_used_images()
        self.reddit = self.setup_reddit_client()
        self.setup_requests_session()

    def setup_logging(self) -> None:
        """Configure file and console logging for the scraper.

        The log file sits next to ``FILE_PATHS["posted_reddit_titles"]`` with a
        ``.log`` suffix. Old entries are pruned before the file is reopened.
        """
        log_file = FILE_PATHS["posted_reddit_titles"].with_suffix('.log')
        self.prune_old_logs(log_file)
        logging.basicConfig(
            filename=str(log_file),
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s"
        )
        # Keep noisy third-party loggers quiet.
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("prawcore").setLevel(logging.WARNING)
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logging.getLogger().addHandler(console_handler)
        logger.info("Logging initialized for Reddit scraper")

    def prune_old_logs(self, log_file: Union[str, "os.PathLike[str]"], max_age_days: int = 30) -> None:
        """Drop log entries older than ``max_age_days`` from ``log_file``.

        An entry is a line starting with a ``YYYY-MM-DD HH:MM:SS,mmm`` timestamp
        plus any continuation lines (e.g. tracebacks) that follow it. Text that
        precedes the first timestamped line is discarded with a warning.

        Args:
            log_file: Path of the log file; a missing file is a no-op.
            max_age_days: Entries older than this many days are removed.
        """
        if not os.path.exists(log_file):
            return

        with open(log_file, 'r') as f:
            lines = f.readlines()

        # Group physical lines into logical entries keyed by a leading timestamp.
        timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}')
        log_entries: List[str] = []
        current_entry: List[str] = []
        for line in lines:
            if timestamp_pattern.match(line):
                if current_entry:
                    log_entries.append(''.join(current_entry))
                current_entry = [line]
            else:
                current_entry.append(line)
        if current_entry:
            log_entries.append(''.join(current_entry))

        # logging's %(asctime)s is written in local time by default, so the
        # cutoff must be naive local time as well (not UTC).
        cutoff = datetime.now() - timedelta(days=max_age_days)
        pruned_entries = []
        for entry in log_entries:
            try:
                timestamp = datetime.strptime(entry[:19], '%Y-%m-%d %H:%M:%S')
            except ValueError:
                logger.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
                continue
            if timestamp > cutoff:
                pruned_entries.append(entry)

        # Only rewrite the file when something was actually removed.
        if len(pruned_entries) != len(log_entries):
            with open(log_file, 'w') as f:
                f.writelines(pruned_entries)

    def setup_signal_handlers(self) -> None:
        """Install SIGTERM/SIGINT handlers for graceful shutdown.

        When no post is in flight the process exits immediately. Otherwise the
        request is remembered in ``shutdown_requested`` so the main loop can
        exit once the post has completed.
        """
        def signal_handler(sig, frame):
            global shutdown_requested
            logger.info("Received termination signal, checking if safe to exit...")
            if is_posting:
                # Defer the exit; run_reddit_automator() checks this flag.
                shutdown_requested = True
                logger.info("Currently posting, will exit after completion.")
            else:
                logger.info("Safe to exit immediately.")
                sys.exit(0)

        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)

    def setup_reddit_client(self) -> praw.Reddit:
        """Build a PRAW client from the configured application credentials."""
        return praw.Reddit(
            client_id=REDDIT_CLIENT_ID,
            client_secret=REDDIT_CLIENT_SECRET,
            user_agent=REDDIT_USER_AGENT
        )

    def setup_requests_session(self) -> None:
        """Create ``self.session`` with retries on 5xx responses for HTTPS."""
        self.session = requests.Session()
        retries = Retry(
            total=5,
            backoff_factor=0.1,
            status_forcelist=[500, 502, 503, 504]
        )
        self.session.mount('https://', HTTPAdapter(max_retries=retries))

    def load_posted_titles(self) -> Set[str]:
        """Return the titles stored in the posted-titles file.

        Returns an empty set if loading fails for any reason (logged).
        """
        try:
            data = load_json_file(FILE_PATHS["posted_reddit_titles"], EXPIRATION_DAYS)
            return {entry["title"] for entry in data if "title" in entry}
        except Exception as e:
            logger.error(f"Error loading posted titles: {e}")
            return set()

    def load_used_images(self) -> Set[str]:
        """Return the image URLs stored in the used-images file.

        Returns an empty set if loading fails for any reason (logged).
        """
        try:
            data = load_json_file(FILE_PATHS["used_images"], IMAGE_EXPIRATION_DAYS)
            return {entry["url"] for entry in data if "url" in entry}
        except Exception as e:
            logger.error(f"Error loading used images: {e}")
            return set()

    def clean_reddit_title(self, title: str) -> str:
        """Strip one leading ``[tag]`` prefix and surrounding whitespace."""
        cleaned_title = re.sub(r'^\[.*?\]\s*', '', title).strip()
        logger.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
        return cleaned_title

    def is_interesting_reddit(self, title: str, summary: str, upvotes: int,
                              comment_count: int, top_comments: List[str]) -> int:
        """Score a Reddit post from 0 to 10.

        The model's rating is the base score (0 when the reply is not a plain
        non-negative integer). Upvotes add up to 3 and comment count up to 2;
        the sum is capped at 10.

        Returns:
            The final score, or 0 if the scoring request fails.
        """
        try:
            content = f"Title: {title}\n\nContent: {summary}"
            if top_comments:
                # Join outside the f-string: a backslash inside an f-string
                # expression is a SyntaxError before Python 3.12.
                joined_comments = "\n".join(top_comments)
                content += f"\n\nTop Comments:\n{joined_comments}"

            response = self.client.chat.completions.create(
                model=LIGHT_TASK_MODEL,
                messages=[
                    {"role": "system", "content": (
                        "Rate this Reddit post from 0-10 based on rarity, buzzworthiness, and engagement potential for food lovers, covering food topics (skip recipes). "
                        "Score 8-10 for rare, highly shareable ideas (e.g., unique dishes or restaurant trends). "
                        "Score 5-7 for fresh, engaging updates with broad appeal. Score below 5 for common or unremarkable content. "
                        "Consider comments for added context (e.g., specific locations or unique details). "
                        "Return only a number."
                    )},
                    {"role": "user", "content": content}
                ],
                max_tokens=5
            )

            # A missing reply is treated like a non-numeric one: base score 0.
            reply = (response.choices[0].message.content or "").strip()
            base_score = int(reply) if reply.isdigit() else 0

            engagement_boost = 0
            if upvotes >= 500:
                engagement_boost += 3
            elif upvotes >= 100:
                engagement_boost += 2
            elif upvotes >= 50:
                engagement_boost += 1

            if comment_count >= 100:
                engagement_boost += 2
            elif comment_count >= 20:
                engagement_boost += 1

            final_score = min(base_score + engagement_boost, 10)
            logger.info(f"Reddit Interest Score: {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count}, top_comments: {len(top_comments)}) for '{title}'")
            return final_score
        except Exception as e:
            logger.error(f"Reddit interestingness scoring failed: {e}")
            return 0

    def get_top_comments(self, post_url: str, limit: int = 3) -> List[str]:
        """Return up to ``limit`` top-sorted comment bodies for ``post_url``.

        Comments whose body starts with ``[deleted]`` are omitted. Returns an
        empty list if the fetch fails (logged).
        """
        try:
            submission = self.reddit.submission(url=post_url)
            submission.comment_sort = 'top'
            # limit=0 drops the "load more comments" placeholders without fetching them.
            submission.comments.replace_more(limit=0)
            top_comments = [
                comment.body
                for comment in submission.comments[:limit]
                if not comment.body.startswith('[deleted]')
            ]
            logger.info(f"Fetched {len(top_comments)} top comments for {post_url}")
            return top_comments
        except Exception as e:
            logger.error(f"Failed to fetch comments for {post_url}: {e}")
            return []

    def fetch_reddit_posts(self) -> List[Dict]:
        """Collect the last 24 hours' top submissions from the configured subreddits.

        Returns:
            A list of dicts with keys ``title``, ``raw_title``, ``link``,
            ``summary``, ``feed_title``, ``pub_date``, ``upvotes`` and
            ``comment_count``. A failing subreddit is logged and skipped.
        """
        feeds = ['FoodPorn', 'restaurant', 'FoodIndustry', 'food']
        articles: List[Dict] = []
        cutoff_date = datetime.now(timezone.utc) - timedelta(hours=24)
        logger.info(f"Starting fetch with cutoff date: {cutoff_date}")

        for subreddit_name in feeds:
            # Remember the running total so the log reports this subreddit only.
            count_before = len(articles)
            try:
                subreddit = self.reddit.subreddit(subreddit_name)
                for submission in subreddit.top(time_filter='day', limit=100):
                    pub_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
                    if pub_date < cutoff_date:
                        logger.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
                        continue
                    cleaned_title = self.clean_reddit_title(submission.title)
                    articles.append({
                        "title": cleaned_title,
                        "raw_title": submission.title,
                        "link": f"https://www.reddit.com{submission.permalink}",
                        "summary": submission.selftext,
                        "feed_title": get_clean_source_name(subreddit_name),
                        "pub_date": pub_date,
                        "upvotes": submission.score,
                        "comment_count": submission.num_comments
                    })
                logger.info(f"Fetched {len(articles) - count_before} posts from r/{subreddit_name}")
            except Exception as e:
                logger.error(f"Failed to fetch Reddit feed r/{subreddit_name}: {e}")

        logger.info(f"Total Reddit posts fetched: {len(articles)}")
        return articles

    def curate_from_reddit(self) -> Tuple[Optional[Dict], Optional[str], int]:
        """Pick the best unposted Reddit post and prepare it for publishing.

        Candidates are tried in descending upvote order. One is skipped when it
        was already posted, is filtered out (``smart_image_and_filter`` or a
        recipe/"homemade" keyword), scores below 6, or cannot be summarized.

        Returns:
            ``(post_data, author, sleep_seconds)`` for the first candidate that
            passes, or ``(None, None, sleep_seconds)`` when none does.
            ``sleep_seconds`` is a random delay between 600 and 1800.
        """
        articles = self.fetch_reddit_posts()
        if not articles:
            logger.info("No Reddit posts available")
            return None, None, random.randint(600, 1800)

        articles.sort(key=lambda x: x["upvotes"], reverse=True)
        blocked_keywords = RECIPE_KEYWORDS + ["homemade"]

        for article in articles:
            title = article["title"]
            raw_title = article["raw_title"]
            link = article["link"]
            summary = article["summary"]

            if raw_title in self.posted_titles:
                logger.info(f"Skipping already posted post: {raw_title}")
                continue

            logger.info(f"Processing Reddit Post: {title}")
            _image_query, _relevance_keywords, skip = smart_image_and_filter(title, summary)
            title_lower = title.lower()
            raw_title_lower = raw_title.lower()
            if skip or any(keyword in title_lower or keyword in raw_title_lower
                           for keyword in blocked_keywords):
                logger.info(f"Skipping filtered Reddit post: {title}")
                continue

            top_comments = self.get_top_comments(link)
            interest_score = self.is_interesting_reddit(
                title, summary, article["upvotes"], article["comment_count"], top_comments
            )
            if interest_score < 6:
                logger.info(f"Reddit Interest Too Low: {interest_score}")
                continue

            num_paragraphs = determine_paragraph_count(interest_score)
            extra_prompt = (
                f"Generate exactly {num_paragraphs} paragraphs.\n"
                f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
                f"Do NOT introduce unrelated concepts.\n"
                f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
                f"Do not include emojis in the summary."
            )

            final_summary = summarize_with_gpt4o(
                f"{title}\n\n{summary}",
                "Reddit",
                link,
                interest_score=interest_score,
                extra_prompt=extra_prompt
            )
            if not final_summary:
                logger.info(f"Summary failed for '{title}'")
                continue

            final_summary = insert_link_naturally(final_summary, "Reddit", link)
            (post_data, author, _category, _image_url, _image_source,
             _uploader, _pixabay_url) = prepare_post_data(final_summary, title)

            if post_data and author:
                # Remember the selection so the next cycle does not pick the same
                # submission again. This is in-memory only.
                # NOTE(review): on-disk persistence is not handled here — confirm
                # whether post_to_wp or a save_json_file call should record it.
                self.posted_titles.add(raw_title)
                return post_data, author, random.randint(600, 1800)

        return None, None, random.randint(600, 1800)


def run_reddit_automator():
    """Run the curate-and-post loop until the process is terminated.

    Each cycle curates one post, publishes it via ``post_to_wp`` when one was
    found, then sleeps for the delay returned by the scraper (or 5 minutes after
    an error). A termination signal that arrived during posting is honoured as
    soon as the post has finished.
    """
    global is_posting
    scraper = RedditScraper()
    while True:
        try:
            post_data, author, sleep_time = scraper.curate_from_reddit()
            if post_data and author:
                is_posting = True
                try:
                    post_to_wp(post_data, author)
                    logger.info(f"Successfully posted: {post_data['title']}")
                finally:
                    is_posting = False
        except Exception as e:
            logger.error(f"Error in Reddit automator: {e}")
            sleep_time = 300  # Wait 5 minutes before retrying

        # The signal handler defers the exit while a post is in flight.
        if shutdown_requested:
            logger.info("Posting finished, exiting as requested.")
            sys.exit(0)

        time.sleep(sleep_time)


if __name__ == "__main__":
    run_reddit_automator()