# foodie_automator_reddit.py
import fcntl
import json
import logging
import os
import random
import re
import signal
import sys
import time
from datetime import datetime, timedelta, timezone
from urllib.parse import quote

import praw
import requests
from dotenv import load_dotenv
from duckduckgo_search import DDGS
from openai import OpenAI
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from foodie_config import (
    AUTHORS,
    RECIPE_KEYWORDS,
    PROMO_KEYWORDS,
    HOME_KEYWORDS,
    PRODUCT_KEYWORDS,
    PERSONA_CONFIGS,
    CATEGORIES,
    get_clean_source_name,
    REDDIT_CLIENT_ID,
    REDDIT_CLIENT_SECRET,
    REDDIT_USER_AGENT,
    LIGHT_TASK_MODEL,
    X_API_CREDENTIALS,
)
from foodie_utils import (
    load_json_file,
    save_json_file,
    get_image,
    generate_image_query,
    upload_image_to_wp,
    determine_paragraph_count,
    insert_link_naturally,
    is_interesting,
    generate_title_from_summary,
    summarize_with_gpt4o,
    generate_category_from_summary,
    post_to_wp,
    prepare_post_data,
    select_best_author,
    smart_image_and_filter,
    get_flickr_image,
    get_next_author_round_robin,
    check_author_rate_limit,
    update_system_activity,
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt

# Load .env after all imports, matching the original statement order.
load_dotenv()

SCRIPT_NAME = "foodie_automator_reddit"
# Read by signal_handler to decide whether exiting immediately is safe.
is_posting = False
LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_reddit.lock"


def signal_handler(sig, frame):
    """Handle SIGTERM/SIGINT: record the stop and exit unless a post is in flight.

    Args:
        sig: Signal number delivered by the interpreter (unused).
        frame: Current stack frame at delivery time (unused).
    """
    logging.info("Received termination signal, marking script as stopped...")
    update_system_activity(SCRIPT_NAME, "stopped")
    # NOTE(review): branch layout reconstructed from a whitespace-collapsed
    # source; sys.exit is placed in the "not posting" branch, as the log
    # messages imply - confirm.
    if not is_posting:
        logging.info("Safe to exit immediately.")
        sys.exit(0)
    logging.info("Currently posting, will exit after completion.")


for _handled_signal in (signal.SIGTERM, signal.SIGINT):
    signal.signal(_handled_signal, signal_handler)

LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_reddit.log"
LOG_PRUNE_DAYS = 30
MAX_RETRIES = 3
RETRY_BACKOFF = 2
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_reddit_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7

# Module-level snapshots loaded at import time. Entries lacking a "title" key
# are ignored when building the lookup sets.
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = {entry["title"] for entry in posted_titles_data if "title" in entry}
used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
used_images = {entry["title"] for entry in used_images_data if "title" in entry}

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Start-of-record marker used when pruning the log file. setup_logging()
# configures the file handler with datefmt "%Y-%m-%d %H:%M:%S" (no
# milliseconds), while logging's default asctime appends ",mmm"; accept both so
# records are actually split apart.
_LOG_TIMESTAMP_RE = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(?:,\d{3})?')


def _prune_log_file(path, max_age_days):
    """Rewrite the log at *path*, keeping only records newer than *max_age_days*."""
    with open(path, 'r') as f:
        lines = f.readlines()

    # Attach continuation lines (tracebacks, multi-line messages) to the
    # timestamped line that precedes them so they are kept or dropped together.
    log_entries = []
    current_entry = []
    for line in lines:
        if _LOG_TIMESTAMP_RE.match(line):
            if current_entry:
                log_entries.append(''.join(current_entry))
            current_entry = [line]
        else:
            current_entry.append(line)
    if current_entry:
        log_entries.append(''.join(current_entry))

    # NOTE(review): timestamps are interpreted as UTC although logging writes
    # local time by default; the skew is small next to a multi-day cutoff.
    cutoff = datetime.now(timezone.utc) - timedelta(days=max_age_days)
    pruned_entries = []
    for entry in log_entries:
        try:
            timestamp = datetime.strptime(entry[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
        except ValueError:
            logging.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
            continue
        if timestamp > cutoff:
            pruned_entries.append(entry)

    with open(path, 'w') as f:
        f.writelines(pruned_entries)


def setup_logging():
    """Configure root logging to LOG_FILE plus the console, pruning old records first.

    If the log directory or file cannot be prepared, logging falls back to
    console-only output instead of raising.
    """
    try:
        log_dir = os.path.dirname(LOG_FILE)

        # Ensure log directory exists
        os.makedirs(log_dir, exist_ok=True)
        logging.debug(f"Log directory created/verified: {log_dir}")

        # Check write permissions
        if not os.access(log_dir, os.W_OK):
            raise PermissionError(f"No write permission for {log_dir}")

        # Test write to log file
        try:
            with open(LOG_FILE, 'a') as f:
                f.write("")
            logging.debug(f"Confirmed write access to {LOG_FILE}")
        except Exception as e:
            raise PermissionError(f"Cannot write to {LOG_FILE}: {e}")

        # Prune old logs
        if os.path.exists(LOG_FILE):
            _prune_log_file(LOG_FILE, LOG_PRUNE_DAYS)
            logging.debug(f"Log file pruned: {LOG_FILE}")

        # Configure logging
        logging.basicConfig(
            filename=LOG_FILE,
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
            force=True  # Ensure this config takes precedence
        )
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("prawcore").setLevel(logging.WARNING)
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logging.getLogger().addHandler(console_handler)
        logging.info("Logging initialized for foodie_automator_reddit.py")
    except Exception as e:
        # Fallback to console logging if file logging fails. basicConfig()
        # without a filename already installs a StreamHandler on the root
        # logger, so no second console handler is added (it would print every
        # record twice).
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
            force=True
        )
        logging.error(f"Failed to setup file logging for {LOG_FILE}: {e}. Using console logging.")
        logging.info("Console logging initialized as fallback for foodie_automator_reddit.py")


def acquire_lock():
    """Take an exclusive, non-blocking lock on LOCK_FILE and record this PID in it.

    Returns:
        The open file object holding the lock; the caller must keep it open
        for as long as the lock is needed.

    Exits the process with status 0 when the lock is already held.
    """
    os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
    # 'a+' creates the file when missing but, unlike 'w', does not truncate it,
    # so a running instance's PID survives a failed attempt to take the lock.
    lock_fd = open(LOCK_FILE, 'a+')
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        lock_fd.close()
        logging.info("Another instance of foodie_automator_reddit.py is running")
        sys.exit(0)
    # The lock is ours: replace any stale contents with our PID.
    lock_fd.seek(0)
    lock_fd.truncate(0)
    lock_fd.write(str(os.getpid()))
    lock_fd.flush()
    return lock_fd


def clean_reddit_title(title):
    """Clean Reddit post title by removing prefixes, newlines, and special characters.

    Returns an empty string when *title* is empty or not a string.
    """
    if not title or not isinstance(title, str):
        logging.warning(f"Invalid title received: {title}")
        return ""
    # Remove [prefixes], newlines, and excessive whitespace
    cleaned_title = re.sub(r'^\[.*?\]\s*', '', title)  # Remove [prefix]
    cleaned_title = re.sub(r'\n+', ' ', cleaned_title)  # Replace newlines with space
    cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()  # Normalize spaces
    # Remove special characters (keep alphanumeric, spaces, and basic punctuation)
    cleaned_title = re.sub(r'[^\w\s.,!?-]', '', cleaned_title)
    logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
    return cleaned_title


def _engagement_boost(upvotes, comment_count):
    """Return the bonus (0-5) added to the model score for upvotes and comments."""
    boost = 0
    if upvotes >= 500:
        boost += 3
    elif upvotes >= 100:
        boost += 2
    elif upvotes >= 50:
        boost += 1
    if comment_count >= 100:
        boost += 2
    elif comment_count >= 20:
        boost += 1
    return boost


def is_interesting_reddit(title, summary, upvotes, comment_count, top_comments):
    """Score a Reddit post from 0 to 10 using the chat model plus engagement.

    Args:
        title: Post title.
        summary: Post body text.
        upvotes: Post score; feeds the engagement bonus.
        comment_count: Number of comments; feeds the engagement bonus.
        top_comments: Comment bodies appended to the prompt; may be empty or None.

    Returns:
        The model's score plus the engagement bonus, capped at 10; 0 when the
        model reply is not a plain integer or every attempt fails.
    """
    top_comments = top_comments or []
    for attempt in range(MAX_RETRIES):
        try:
            content = f"Title: {title}\n\nContent: {summary}"
            if top_comments:
                # Joined outside the f-string: a backslash inside an f-string
                # expression is a SyntaxError before Python 3.12.
                joined_comments = "\n".join(top_comments)
                content += f"\n\nTop Comments:\n{joined_comments}"

            response = client.chat.completions.create(
                model=LIGHT_TASK_MODEL,
                messages=[
                    {"role": "system", "content": (
                        "Rate this Reddit post from 0-10 based on rarity, buzzworthiness, and engagement potential for food lovers, covering food topics (skip recipes). "
                        "Score 8-10 for rare, highly shareable ideas (e.g., unique dishes or restaurant trends). "
                        "Score 5-7 for fresh, engaging updates with broad appeal. Score below 5 for common or unremarkable content. "
                        "Consider comments for added context (e.g., specific locations or unique details). "
                        "Return only a number"
                    )},
                    {"role": "user", "content": content}
                ],
                max_tokens=5
            )
            # A reply with no text content is treated like a non-numeric one.
            raw_score = (response.choices[0].message.content or "").strip()
            base_score = int(raw_score) if raw_score.isdigit() else 0

            final_score = min(base_score + _engagement_boost(upvotes, comment_count), 10)
            logging.info(f"Reddit Interest Score: {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count}, top_comments: {len(top_comments)}) for '{title}'")
            return final_score
        except Exception as e:
            logging.warning(f"Reddit interestingness scoring failed (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
    logging.error(f"Failed to score Reddit post '{title}' after {MAX_RETRIES} attempts")
    return 0


def get_top_comments(post_url, reddit, limit=3):
    """Return up to *limit* top-sorted comment bodies for the post at *post_url*.

    Comments whose body starts with '[deleted]' are skipped. Returns an empty
    list once all MAX_RETRIES attempts have failed.
    """
    for attempt in range(MAX_RETRIES):
        try:
            submission = reddit.submission(url=post_url)
            submission.comment_sort = 'top'
            submission.comments.replace_more(limit=0)
            top_comments = [
                comment.body
                for comment in submission.comments[:limit]
                if not comment.body.startswith('[deleted]')
            ]
            logging.info(f"Fetched {len(top_comments)} top comments for {post_url}")
            return top_comments
        except Exception as e:
            logging.warning(f"Failed to fetch comments for {post_url} (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
    logging.error(f"Failed to fetch comments for {post_url} after {MAX_RETRIES} attempts")
    return []


def fetch_duckduckgo_news_context(title, hours=24):
    """Return lower-cased recent DuckDuckGo News headlines related to *title*.

    Args:
        title: Search subject; " news" is appended to form the query.
        hours: Maximum age of a result to be included. The search itself is
            limited to the past day (timelimit="d").

    Returns:
        The matching headlines joined by spaces, a "No recent news found"
        message when none qualify, or *title* itself when every attempt fails.
    """
    for attempt in range(MAX_RETRIES):
        try:
            with DDGS() as ddgs:
                results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
                cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
                titles = []
                for r in results:
                    try:
                        date_str = r["date"]
                        if '+00:00' in date_str:
                            dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
                        else:
                            dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
                        if dt > cutoff:
                            titles.append(r["title"].lower())
                    except ValueError as e:
                        logging.warning(f"Date parsing failed for '{date_str}': {e}")
                        continue
                context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
                logging.info(f"DuckDuckGo News context for '{title}': {context}")
                return context
        except Exception as e:
            logging.warning(f"DuckDuckGo News context fetch failed for '{title}' (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
    logging.error(f"Failed to fetch DuckDuckGo News context for '{title}' after {MAX_RETRIES} attempts")
    return title


def fetch_reddit_posts():
    """Fetch Reddit posts from specified subreddits, filtering low-quality and [homemade] posts.

    Returns:
        A list of dicts with keys title, raw_title, link, summary, feed_title,
        pub_date, upvotes, comment_count and top_comments; an empty list on an
        unexpected error.
    """
    try:
        reddit = praw.Reddit(
            client_id=REDDIT_CLIENT_ID,
            client_secret=REDDIT_CLIENT_SECRET,
            user_agent=REDDIT_USER_AGENT
        )
        feeds = [
            "food",
            "FoodPorn",
            "spicy",
            "KoreanFood",
            "JapaneseFood",
            "DessertPorn",
            "ChineseFood",
            "IndianFood"
        ]
        articles = []
        cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)
        logging.info(f"Starting fetch with cutoff date: {cutoff_date}")

        for subreddit_name in feeds:
            for attempt in range(MAX_RETRIES):
                # Collected per attempt and merged only on success, so a
                # failure part-way through a listing cannot leave duplicates
                # behind when the subreddit is retried.
                subreddit_articles = []
                try:
                    subreddit = reddit.subreddit(subreddit_name)
                    for submission in subreddit.top(time_filter='day', limit=100):
                        pub_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
                        if pub_date < cutoff_date:
                            logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
                            continue
                        if "[homemade]" in submission.title.lower():
                            logging.info(f"Skipping homemade post: {submission.title}")
                            continue
                        cleaned_title = clean_reddit_title(submission.title)
                        if not cleaned_title or len(cleaned_title) < 5:
                            logging.info(f"Skipping post with invalid or short title: {submission.title}")
                            continue
                        # Filter out posts with empty or very short summaries
                        # (direct image links are allowed through without text).
                        summary = submission.selftext.strip() if submission.selftext else ""
                        if len(summary) < 20 and not submission.url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
                            logging.info(f"Skipping post with insufficient summary: {cleaned_title}")
                            continue
                        # Fetch top comments for additional context
                        link = f"https://www.reddit.com{submission.permalink}"
                        top_comments = get_top_comments(link, reddit)
                        subreddit_articles.append({
                            "title": cleaned_title,
                            "raw_title": submission.title,
                            "link": link,
                            "summary": summary,
                            "feed_title": get_clean_source_name(subreddit_name),
                            "pub_date": pub_date,
                            "upvotes": submission.score,
                            "comment_count": submission.num_comments,
                            "top_comments": top_comments
                        })
                    articles.extend(subreddit_articles)
                    logging.info(f"Fetched {len(subreddit_articles)} posts from r/{subreddit_name}")
                    break
                except Exception as e:
                    logging.error(f"Failed to fetch Reddit feed r/{subreddit_name} (attempt {attempt + 1}): {e}")
                    if attempt < MAX_RETRIES - 1:
                        time.sleep(RETRY_BACKOFF * (2 ** attempt))

        logging.info(f"Total Reddit posts fetched: {len(articles)}")
        return articles
    except Exception as e:
        logging.error(f"Unexpected error in fetch_reddit_posts: {e}", exc_info=True)
        return []


def curate_from_reddit(post, original_source, source_name, link, page_url):
    """Summarize one Reddit post and publish it to WordPress.

    The post is published once, then updated with a share-links footer that
    needs the published URL.

    Args:
        post: Object exposing ``title``, ``selftext`` and ``url`` attributes.
        original_source: Passed through to post_to_wp.
        source_name: Source label used for summarizing and link insertion.
        link: URL of the original post.
        page_url: Overwritten by the value prepare_post_data returns.

    Returns:
        ``(post_id, post_url)`` on success, ``(None, None)`` otherwise.
    """
    logger = logging.getLogger(__name__)
    try:
        content = post.selftext if post.selftext else post.url
        if not content:
            logger.info(f"No content for Reddit post: {post.title}")
            return None, None

        interest_score = is_interesting(content)
        if interest_score < 4:
            logger.info(f"Reddit post '{post.title}' not interesting enough: score {interest_score}")
            return None, None

        summary = summarize_with_gpt4o(content, source_name, link, interest_score=interest_score)
        if not summary:
            logger.warning(f"Failed to summarize Reddit post: {post.title}")
            return None, None

        if post.title in summary:
            summary = summary.replace(post.title, "").strip()
        # NOTE(review): the collapsed source does not show whether this loop
        # belonged inside the branch above; collapsing runs of blank lines is
        # harmless either way - confirm.
        while "\n\n\n" in summary:
            summary = summary.replace("\n\n\n", "\n\n")

        final_summary = insert_link_naturally(summary, source_name, link)
        if not final_summary:
            logger.warning(f"Failed to insert link for Reddit post: {post.title}")
            return None, None

        result = prepare_post_data(final_summary, post.title)
        if not result:
            logger.info(f"Post preparation failed for Reddit post: {post.title}")
            return None, None
        logger.debug(f"prepare_post_data returned {len(result)} values: {result}")
        if len(result) < 7:
            logger.error(f"prepare_post_data returned too few values: {result}")
            return None, None
        post_data, author, category, image_url, image_source, uploader, page_url = result[:7]

        share_text = f"Check out this tasty find: {post_data['title']}"
        share_text_encoded = quote(share_text)
        # NOTE(review): this template contains no {post_url}/{share_text}
        # placeholders, so the .format() call below leaves it unchanged;
        # presumably link markup was lost - confirm against the intended HTML.
        share_links_template = (
            "Share this post: "
            'X | '
            'Facebook'
        )

        # First pass: publish without share links to obtain the post URL.
        post_data["content"] = final_summary
        post_id, post_url = post_to_wp(
            post_data=post_data,
            category=category,
            link=link,
            author=author,
            image_url=image_url,
            original_source=original_source,
            image_source=image_source,
            uploader=uploader,
            page_url=page_url,
            interest_score=interest_score,
            should_post_tweet=True,
            summary=final_summary
        )
        if not post_id:
            logger.warning(f"Failed to post Reddit post to WP: {post_data['title']}")
            return None, None

        # Second pass: update the same post (post_id given, no image, no
        # tweet) with the share-links footer appended.
        post_url_encoded = quote(post_url)
        share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
        post_data["content"] = f"{final_summary}\n\n{share_links}"
        post_id, post_url = post_to_wp(
            post_data=post_data,
            category=category,
            link=link,
            author=author,
            image_url=None,
            original_source=original_source,
            image_source=image_source,
            uploader=uploader,
            page_url=page_url,
            interest_score=interest_score,
            post_id=post_id,
            should_post_tweet=False,
            summary=final_summary
        )
        if post_id:
            logger.info(f"Successfully curated and posted Reddit post: {post_data['title']} (URL: {post_url})")
            return post_id, post_url
        logger.warning(f"Failed to update Reddit post with share links: {post_data['title']}")
        return None, None
    except Exception as e:
        logger.error(f"Error curating Reddit post '{getattr(post, 'title', 'unknown')}': {e}", exc_info=True)
        return None, None


def run_reddit_automator():
    """Run one curation pass under the single-instance lock.

    Returns:
        ``(post_data, category, sleep_time)``; on a fatal error
        ``(None, None, sleep_time)`` with a random 20-30 minute sleep_time.
    """
    lock_fd = None
    try:
        lock_fd = acquire_lock()
        update_system_activity(SCRIPT_NAME, "running", os.getpid())  # Record start
        logging.info("***** Reddit Automator Launched *****")

        # Load JSON files once
        posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
        posted_titles = {entry["title"] for entry in posted_titles_data if "title" in entry}
        used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
        used_images = {entry["title"] for entry in used_images_data if "title" in entry}

        # NOTE(review): curate_from_reddit as defined in this file takes five
        # positional arguments (post, original_source, source_name, link,
        # page_url) and returns 2-tuples, so this call raises TypeError and
        # every run ends in the fatal-error handler below. The intended
        # signature cannot be determined from this file - reconcile the two.
        post_data, category, sleep_time = curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used_images)
        if not post_data:
            logging.info("No postable Reddit article found")
        logging.info("Completed Reddit run")
        update_system_activity(SCRIPT_NAME, "stopped")  # Record stop
        logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
        return post_data, category, sleep_time
    except Exception as e:
        logging.error(f"Fatal error in run_reddit_automator: {e}", exc_info=True)
        update_system_activity(SCRIPT_NAME, "stopped")  # Record stop on error
        sleep_time = random.randint(1200, 1800)  # 20–30 minutes
        logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
        return None, None, sleep_time
    finally:
        if lock_fd:
            try:
                # Unlink while the lock is still held: removing the path after
                # unlocking could delete a file another instance has just
                # locked, letting a third instance start alongside it.
                if os.path.exists(LOCK_FILE):
                    os.remove(LOCK_FILE)
            finally:
                fcntl.flock(lock_fd, fcntl.LOCK_UN)
                lock_fd.close()


if __name__ == "__main__":
    setup_logging()
    post_data, category, sleep_time = run_reddit_automator()
    logging.info(f"Run completed, sleep_time: {sleep_time} seconds")