# foodie_automator_rss.py
# Curates recent food-related RSS articles, scores and summarizes them, and posts the winners to WordPress.

import requests
import random
import time
import logging
import os
import json
import signal
import sys
import re
import email.utils
import feedparser
from duckduckgo_search import DDGS
from datetime import datetime, timedelta, timezone
from bs4 import BeautifulSoup
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from foodie_config import (
    RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
    get_clean_source_name, X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    select_best_author, smart_image_and_filter, get_flickr_image
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
from dotenv import load_dotenv
import fcntl

load_dotenv()

is_posting = False
LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_rss.lock"


def signal_handler(sig, frame):
    # Exit immediately only when no WordPress post is in flight.
    logging.info("Received termination signal, checking if safe to exit...")
    if is_posting:
        logging.info("Currently posting, will exit after completion.")
    else:
        logging.info("Safe to exit immediately.")
        sys.exit(0)


signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_rss.log"
LOG_PRUNE_DAYS = 30
FEED_TIMEOUT = 15
MAX_RETRIES = 3
RETRY_BACKOFF = 2
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7

posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data)
used_images = set(
    entry["title"]
    for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
    if "title" in entry
)


def setup_logging():
    # Prune log lines older than LOG_PRUNE_DAYS, then configure file + console logging.
    malformed_count = 0
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, 'r') as f:
            lines = f.readlines()
        cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
        pruned_lines = []
        for line in lines:
            if len(line) < 19 or not line[:19].replace('-', '').replace(':', '').replace(' ', '').isdigit():
                malformed_count += 1
                continue
            try:
                timestamp = datetime.strptime(line[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
                if timestamp > cutoff:
                    pruned_lines.append(line)
            except ValueError:
                malformed_count += 1
                continue
        with open(LOG_FILE, 'w') as f:
            f.writelines(pruned_lines)
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logging.getLogger().addHandler(console_handler)
    logging.getLogger("requests").setLevel(logging.WARNING)
    # Logged after basicConfig so the message reaches the configured handlers
    # (calling logging.info() earlier would install a default handler and make basicConfig a no-op).
    if malformed_count > 0:
        logging.info(f"Skipped {malformed_count} malformed log lines during pruning")
    logging.info("Logging initialized for foodie_automator_rss.py")


def acquire_lock():
    # Hold an exclusive, non-blocking lock so only one instance runs at a time.
    os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
    lock_fd = open(LOCK_FILE, 'w')
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        lock_fd.write(str(os.getpid()))
        lock_fd.flush()
        return lock_fd
    except IOError:
logging.info("Another instance of foodie_automator_rss.py is running") sys.exit(0) def create_http_session() -> requests.Session: session = requests.Session() retry_strategy = Retry( total=MAX_RETRIES, backoff_factor=RETRY_BACKOFF, status_forcelist=[403, 429, 500, 502, 503, 504], allowed_methods=["GET", "POST"] ) adapter = HTTPAdapter(max_retries=retry_strategy) session.mount("http://", adapter) session.mount("https://", adapter) session.headers.update({ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36' }) return session def parse_date(date_str): try: parsed_date = email.utils.parsedate_to_datetime(date_str) if parsed_date.tzinfo is None: parsed_date = parsed_date.replace(tzinfo=timezone.utc) return parsed_date except Exception as e: logging.error(f"Failed to parse date '{date_str}': {e}") return datetime.now(timezone.utc) def fetch_rss_feeds(): logging.info("Starting fetch_rss_feeds") articles = [] cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS) session = create_http_session() if not RSS_FEEDS: logging.error("RSS_FEEDS is empty in foodie_config.py") return articles logging.info(f"Processing feeds: {RSS_FEEDS}") for feed_url in RSS_FEEDS: for attempt in range(MAX_RETRIES): logging.info(f"Processing feed: {feed_url} (attempt {attempt + 1})") try: response = session.get(feed_url, timeout=FEED_TIMEOUT) response.raise_for_status() soup = BeautifulSoup(response.content, 'xml') items = soup.find_all('item') feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url)) for item in items: try: title = item.find('title').text.strip() if item.find('title') else "Untitled" link = item.find('link').text.strip() if item.find('link') else "" pub_date = item.find('pubDate') pub_date = parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc) if pub_date < cutoff_date: logging.info(f"Skipping old article: {title} (Published: {pub_date})") continue description = item.find('description') summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description else "" content = item.find('content:encoded') content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content else summary articles.append({ "title": title, "link": link, "summary": summary, "content": content_text, "feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title, "pub_date": pub_date }) logging.debug(f"Processed article: {title}") except Exception as e: logging.warning(f"Error processing entry in {feed_url}: {e}") continue logging.info(f"Filtered to {len(articles)} articles from {feed_url}") break except Exception as e: logging.error(f"Failed to fetch RSS feed {feed_url}: {e}") if attempt < MAX_RETRIES - 1: time.sleep(RETRY_BACKOFF * (2 ** attempt)) continue articles.sort(key=lambda x: x["pub_date"], reverse=True) logging.info(f"Total RSS articles fetched: {len(articles)}") return articles def fetch_duckduckgo_news_context(title, hours=24): for attempt in range(MAX_RETRIES): try: with DDGS() as ddgs: results = ddgs.news(f"{title} news", timelimit="d", max_results=5) titles = [] for r in results: try: date_str = r["date"] if '+00:00' in date_str: dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc) else: dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S%Z").replace(tzinfo=timezone.utc) if dt > (datetime.now(timezone.utc) - timedelta(hours=24)): titles.append(r["title"].lower()) except ValueError as e: 
logging.warning(f"Date parsing failed for '{date_str}': {e}") continue context = " ".join(titles) if titles else "No recent news found within 24 hours" logging.info(f"DuckDuckGo News context for '{title}': {context}") return context except Exception as e: logging.warning(f"DuckDuckGo News context fetch failed for '{title}' (attempt {attempt + 1}): {e}") if attempt < MAX_RETRIES - 1: time.sleep(RETRY_BACKOFF * (2 ** attempt)) continue logging.error(f"Failed to fetch DuckDuckGo News context for '{title}' after {MAX_RETRIES} attempts") return title def curate_from_rss(): try: articles = fetch_rss_feeds() if not articles: logging.info("No RSS articles available") return None, None, False # Continue running attempts = 0 max_attempts = 10 while attempts < max_attempts and articles: article = articles.pop(0) title = article["title"] link = article["link"] summary = article.get("summary", "") source_name = article.get("feed_title", "Unknown Source") original_source = f'{source_name}' if title in posted_titles: logging.info(f"Skipping already posted article: {title}") attempts += 1 continue logging.info(f"Trying RSS Article: {title} from {source_name}") image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary) if skip: logging.info(f"Skipping filtered RSS article: {title}") attempts += 1 continue ddg_context = fetch_duckduckgo_news_context(title) scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}" interest_score = is_interesting(scoring_content) logging.info(f"Interest score for '{title}': {interest_score}") if interest_score < 6: logging.info(f"RSS Interest Too Low: {interest_score}") attempts += 1 continue num_paragraphs = determine_paragraph_count(interest_score) extra_prompt = ( f"Generate exactly {num_paragraphs} paragraphs.\n" f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n" f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n" f"Do NOT introduce unrelated concepts unless in the content or additional context.\n" f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n" f"Do not include emojis in the summary." ) content_to_summarize = scoring_content final_summary = summarize_with_gpt4o( content_to_summarize, source_name, link, interest_score=interest_score, extra_prompt=extra_prompt ) if not final_summary: logging.info(f"Summary failed for '{title}'") attempts += 1 continue final_summary = insert_link_naturally(final_summary, source_name, link) post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic) if not post_data: attempts += 1 continue image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic) if not image_url: image_url, image_source, uploader, page_url = get_image(image_query) hook = get_dynamic_hook(post_data["title"]).strip() share_prompt = get_viral_share_prompt(post_data["title"], final_summary) share_links_template = ( f'
            post_data["content"] = f"{final_summary}\n\n{share_links_template}"
            global is_posting
            is_posting = True
            try:
                post_id, post_url = post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    page_url=page_url,
                    interest_score=interest_score,
                    should_post_tweet=True
                )
            except Exception as e:
                logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
                attempts += 1
                continue
            finally:
                is_posting = False
            if post_id:
                # Rebuild the content with resolved share links and update the post in place.
                share_text = f"Check out this foodie gem! {post_data['title']}"
                share_text_encoded = quote(share_text)
                post_url_encoded = quote(post_url)
                share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
                post_data["content"] = f"{final_summary}\n\n{share_links}"
                is_posting = True
                try:
                    post_to_wp(
                        post_data=post_data,
                        category=category,
                        link=link,
                        author=author,
                        image_url=image_url,
                        original_source=original_source,
                        image_source=image_source,
                        uploader=uploader,
                        page_url=page_url,
                        interest_score=interest_score,
                        post_id=post_id,
                        should_post_tweet=False
                    )
                except Exception as e:
                    logging.error(f"Failed to update WordPress post '{title}' with share links: {e}", exc_info=True)
                finally:
                    is_posting = False
                timestamp = datetime.now(timezone.utc).isoformat()
                save_json_file(POSTED_TITLES_FILE, title, timestamp)
                posted_titles.add(title)
                logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
                if image_url:
                    save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                    used_images.add(image_url)
                    logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
                logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
                return post_data, category, True  # Run again immediately
            attempts += 1
            logging.info(f"WP posting failed for '{post_data['title']}'")
        logging.info("No interesting RSS article found after attempts")
        return None, None, False  # Wait before running again
    except Exception as e:
        logging.error(f"Unexpected error in curate_from_rss: {e}", exc_info=True)
        return None, None, False


def run_rss_automator():
    # Acquire the single-instance lock, run one curation pass, and always release the lock.
    lock_fd = None
    try:
        lock_fd = acquire_lock()
        logging.info("***** RSS Automator Launched *****")
        post_data, category, should_continue = curate_from_rss()
        if not post_data:
            logging.info("No postable RSS article found")
        else:
            logging.info("Completed RSS run")
        return post_data, category, should_continue
    except Exception as e:
        logging.error(f"Fatal error in run_rss_automator: {e}", exc_info=True)
        return None, None, False
    finally:
        if lock_fd:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
            lock_fd.close()
            if os.path.exists(LOCK_FILE):
                os.remove(LOCK_FILE)


if __name__ == "__main__":
    setup_logging()
    post_data, category, should_continue = run_rss_automator()
    # Remove sleep timer, let manage_scripts.sh control execution
    logging.info(f"Run completed, should_continue: {should_continue}")