# foodie_automator_rss.py
import requests
import random
import time
import logging
import os
import json
import signal
import sys
import re
import email.utils
import feedparser
from duckduckgo_search import DDGS
from datetime import datetime, timedelta, timezone
from types import SimpleNamespace
from bs4 import BeautifulSoup
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from foodie_config import (
    RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
    get_clean_source_name, X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    select_best_author, smart_image_and_filter, get_flickr_image,
    get_next_author_round_robin, check_author_rate_limit,
    update_system_activity
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
from dotenv import load_dotenv
import fcntl

load_dotenv()

is_posting = False

SCRIPT_NAME = "foodie_automator_rss"
LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_rss.lock"
LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_rss.log"
LOG_PRUNE_DAYS = 30
FEED_TIMEOUT = 15
MAX_RETRIES = 3
RETRY_BACKOFF = 2

POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7


def _parse_log_timestamp(line):
    """Return the UTC-tagged timestamp that starts *line*, or None.

    A record header is recognised by its first 19 characters matching
    ``YYYY-MM-DD HH:MM:SS`` (the ``datefmt`` used for the log file).
    """
    stamp = line[:19]
    if len(line) < 19:
        return None
    if not stamp.replace('-', '').replace(':', '').replace(' ', '').isdigit():
        return None
    try:
        # NOTE(review): logging writes asctime in local time by default, while
        # the value is tagged as UTC here; the cutoff is therefore off by the
        # local UTC offset at most.
        return datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
    except ValueError:
        return None


def _prune_log_lines(lines, cutoff):
    """Split log *lines* into those to keep and a count of orphan lines.

    Lines that do not start with a timestamp (e.g. traceback lines written by
    ``exc_info=True``) are treated as continuations of the preceding record
    and are kept or dropped together with it. Non-timestamped lines that
    appear before any record are counted as malformed and dropped.

    Returns:
        (kept_lines, malformed_count)
    """
    kept = []
    malformed = 0
    keep_record = None  # None until the first timestamped record is seen
    for line in lines:
        timestamp = _parse_log_timestamp(line)
        if timestamp is None:
            if keep_record is None:
                malformed += 1
            elif keep_record:
                kept.append(line)
            continue
        keep_record = timestamp > cutoff
        if keep_record:
            kept.append(line)
    return kept, malformed


def setup_logging():
    """Initialize logging with pruning of old logs.

    Verifies the log directory is writable, drops records older than
    LOG_PRUNE_DAYS from LOG_FILE, then configures the root logger to write to
    LOG_FILE and to the console. On any failure the error is printed and the
    process exits with status 1.
    """
    try:
        log_dir = os.path.dirname(LOG_FILE)
        logging.debug("Attempting to set up logging")
        os.makedirs(log_dir, exist_ok=True)
        logging.debug(f"Log directory created/verified: {log_dir}")
        if not os.access(log_dir, os.W_OK):
            raise PermissionError(f"No write permission for {log_dir}")

        # Test write to log file
        try:
            with open(LOG_FILE, 'a') as f:
                f.write("")
            logging.debug(f"Confirmed write access to {LOG_FILE}")
        except Exception as e:
            raise PermissionError(f"Cannot write to {LOG_FILE}: {e}")

        malformed_count = 0
        if os.path.exists(LOG_FILE):
            with open(LOG_FILE, 'r') as f:
                lines = f.readlines()
            cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
            pruned_lines, malformed_count = _prune_log_lines(lines, cutoff)
            with open(LOG_FILE, 'w') as f:
                f.writelines(pruned_lines)
            logging.debug(f"Log file pruned: {LOG_FILE}")

        logging.basicConfig(
            filename=LOG_FILE,
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
            force=True
        )
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logging.getLogger().addHandler(console_handler)
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("openai").setLevel(logging.WARNING)

        # Reported only now: before basicConfig the root logger is still at
        # WARNING, so an INFO message would be silently discarded.
        if malformed_count > 0:
            logging.info(f"Skipped {malformed_count} malformed log lines during pruning")
        logging.info("Logging initialized for foodie_automator_rss.py")
    except Exception as e:
        print(f"Failed to setup logging: {e}")
        sys.exit(1)


# Call setup_logging immediately
setup_logging()

check_author_rate_limit.script_run_id = int(time.time())
logging.info(f"Set script_run_id to {check_author_rate_limit.script_run_id}")

posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data)
used_images = set(
    entry["title"]
    for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
    if "title" in entry
)


def acquire_lock():
    """Take the single-instance lock and return the open lock file object.

    Takes a non-blocking exclusive ``flock`` on LOCK_FILE and writes the
    current PID into it. If the lock is already held, logs that another
    instance is running and exits with status 0. Errors creating or opening
    the lock file are NOT treated as contention; they propagate to the caller.
    """
    logging.debug("Attempting to acquire lock")
    os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
    # 'a+' rather than 'w': opening must not truncate the PID written by an
    # instance that currently holds the lock.
    lock_fd = open(LOCK_FILE, 'a+')
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:
        lock_fd.close()
        logging.info("Another instance of foodie_automator_rss.py is running")
        sys.exit(0)
    # We own the lock now, so replacing the contents is safe.
    lock_fd.seek(0)
    lock_fd.truncate()
    lock_fd.write(str(os.getpid()))
    lock_fd.flush()
    logging.debug(f"Lock acquired: {LOCK_FILE}")
    return lock_fd


def signal_handler(sig, frame):
    """Handle SIGTERM/SIGINT: mark the script as stopped and exit with 0."""
    logging.info("Received termination signal, marking script as stopped...")
    update_system_activity(SCRIPT_NAME, "stopped")
    sys.exit(0)


signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)


def create_http_session() -> requests.Session:
    """Build a requests session with retrying adapters and a browser User-Agent.

    GET and POST requests are retried up to MAX_RETRIES times with
    RETRY_BACKOFF as the backoff factor on the listed HTTP status codes.
    """
    session = requests.Session()
    retry_strategy = Retry(
        total=MAX_RETRIES,
        backoff_factor=RETRY_BACKOFF,
        status_forcelist=[403, 429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    })
    return session


def parse_date(date_str):
    """Parse an RFC 2822 style date string into a timezone-aware datetime.

    Naive results are tagged as UTC. If parsing fails the error is logged and
    the current UTC time is returned instead.
    """
    try:
        parsed_date = email.utils.parsedate_to_datetime(date_str)
        if parsed_date.tzinfo is None:
            parsed_date = parsed_date.replace(tzinfo=timezone.utc)
        return parsed_date
    except Exception as e:
        logging.error(f"Failed to parse date '{date_str}': {e}")
        return datetime.now(timezone.utc)


def _parse_feed_item(item, source_label, cutoff_date):
    """Convert one RSS ``<item>`` element into an article dict.

    Returns None (after logging) when the item was published before
    *cutoff_date*. Items without a ``pubDate`` are stamped with the current
    UTC time.
    """
    title_tag = item.find('title')
    title = title_tag.text.strip() if title_tag is not None else "Untitled"

    link_tag = item.find('link')
    link = link_tag.text.strip() if link_tag is not None else ""

    pub_tag = item.find('pubDate')
    pub_date = parse_date(pub_tag.text) if pub_tag is not None else datetime.now(timezone.utc)
    if pub_date < cutoff_date:
        logging.info(f"Skipping old article: {title} (Published: {pub_date})")
        return None

    description = item.find('description')
    if description is not None:
        summary = BeautifulSoup(description.text, 'html.parser').get_text().strip()
    else:
        summary = ""

    # Prefer the full body when the feed provides one; otherwise reuse the summary.
    content = item.find('content:encoded')
    if content is not None:
        content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip()
    else:
        content_text = summary

    return {
        "title": title,
        "link": link,
        "summary": summary,
        "content": content_text,
        "feed_title": source_label,
        "pub_date": pub_date
    }


def fetch_rss_feeds():
    """Download every feed in RSS_FEEDS and return recent articles, newest first.

    Each feed is attempted up to MAX_RETRIES times with exponential backoff.
    Items older than EXPIRATION_HOURS are skipped. A malformed item is logged
    and skipped without aborting the rest of its feed.

    Returns:
        list of dicts with keys title, link, summary, content, feed_title,
        pub_date; empty if RSS_FEEDS is empty or nothing could be fetched.
    """
    logging.info("Starting fetch_rss_feeds")
    articles = []
    if not RSS_FEEDS:
        logging.error("RSS_FEEDS is empty in foodie_config.py")
        return articles

    cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)
    logging.info(f"Processing feeds: {RSS_FEEDS}")

    # The context manager closes the session's connection pool when done.
    with create_http_session() as session:
        for feed_url in RSS_FEEDS:
            for attempt in range(MAX_RETRIES):
                logging.info(f"Processing feed: {feed_url} (attempt {attempt + 1})")
                try:
                    response = session.get(feed_url, timeout=FEED_TIMEOUT)
                    response.raise_for_status()
                    soup = BeautifulSoup(response.content, 'xml')
                    feed_title = RSS_FEED_NAMES.get(
                        feed_url, (get_clean_source_name(feed_url), feed_url)
                    )
                    source_label = feed_title[0] if isinstance(feed_title, tuple) else feed_title

                    for item in soup.find_all('item'):
                        try:
                            article = _parse_feed_item(item, source_label, cutoff_date)
                        except Exception as e:
                            logging.warning(f"Error processing entry in {feed_url}: {e}")
                            continue
                        if article is None:
                            continue
                        articles.append(article)
                        logging.debug(f"Processed article: {article['title']}")

                    logging.info(f"Filtered to {len(articles)} articles from {feed_url}")
                    break
                except Exception as e:
                    logging.error(f"Failed to fetch RSS feed {feed_url}: {e}")
                    if attempt < MAX_RETRIES - 1:
                        time.sleep(RETRY_BACKOFF * (2 ** attempt))

    articles.sort(key=lambda x: x["pub_date"], reverse=True)
    logging.info(f"Total RSS articles fetched: {len(articles)}")
    return articles


def fetch_duckduckgo_news_context(title, hours=24):
    """Return lower-cased recent news headlines related to *title*.

    Args:
        title: Search subject; ``" news"`` is appended to form the query.
        hours: Only headlines dated within this many hours are used. The
            search itself is issued with ``timelimit="d"``, so values above
            24 are unlikely to return additional results.

    Returns:
        The matching headlines joined by spaces, a "No recent news found ..."
        message when none qualify, or *title* itself if every attempt failed.
    """
    for attempt in range(MAX_RETRIES):
        try:
            with DDGS() as ddgs:
                results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
                oldest_allowed = datetime.now(timezone.utc) - timedelta(hours=hours)
                titles = []
                for r in results:
                    try:
                        date_str = r["date"]
                        if '+00:00' in date_str:
                            date_format = "%Y-%m-%dT%H:%M:%S+00:00"
                        else:
                            date_format = "%Y-%m-%dT%H:%M:%SZ"
                        dt = datetime.strptime(date_str, date_format).replace(tzinfo=timezone.utc)
                        if dt > oldest_allowed:
                            titles.append(r["title"].lower())
                    except ValueError as e:
                        logging.warning(f"Date parsing failed for '{date_str}': {e}")
                        continue
                context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
                logging.info(f"DuckDuckGo News context for '{title}': {context}")
                return context
        except Exception as e:
            logging.warning(f"DuckDuckGo News context fetch failed for '{title}' (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
    logging.error(f"Failed to fetch DuckDuckGo News context for '{title}' after {MAX_RETRIES} attempts")
    return title


def curate_from_rss(entry, original_source, source_name, link, page_url):
    """Score, summarize and publish one RSS entry to WordPress.

    Args:
        entry: Object exposing ``title`` and ``summary`` attributes.
        original_source: Feed title, forwarded to ``post_to_wp``.
        source_name: Source name used for summarizing and link insertion.
        link: URL of the original article.
        page_url: Accepted for interface compatibility; it is replaced by the
            seventh value returned from ``prepare_post_data``.

    Returns:
        ``(post_id, post_url)`` on success, ``(None, None)`` when the entry is
        skipped or any step fails (failures are logged, not raised).
    """
    logger = logging.getLogger(__name__)
    try:
        content = entry.summary
        if not content:
            logger.info(f"No content for RSS entry: {entry.title}")
            return None, None

        interest_score = is_interesting(content)
        if interest_score < 4:
            logger.info(f"RSS entry '{entry.title}' not interesting enough: score {interest_score}")
            return None, None

        summary = summarize_with_gpt4o(content, source_name, link, interest_score=interest_score)
        if not summary:
            logger.warning(f"Failed to summarize RSS entry: {entry.title}")
            return None, None

        # Remove the original title from the summary if present, then collapse
        # the runs of blank lines the removal may leave behind.
        if entry.title in summary:
            summary = summary.replace(entry.title, "").strip()
            while "\n\n\n" in summary:
                summary = summary.replace("\n\n\n", "\n\n")

        final_summary = insert_link_naturally(summary, source_name, link)
        if not final_summary:
            logger.warning(f"Failed to insert link for RSS entry: {entry.title}")
            return None, None

        # Call prepare_post_data and handle return values dynamically
        result = prepare_post_data(final_summary, entry.title)
        if not result:
            logger.info(f"Post preparation failed for RSS entry: {entry.title}")
            return None, None

        # Log the number of values returned for debugging
        logger.debug(f"prepare_post_data returned {len(result)} values: {result}")

        # Expect at least 7 values; any additional values are ignored
        if len(result) < 7:
            logger.error(f"prepare_post_data returned too few values: {result}")
            return None, None
        post_data, author, category, image_url, image_source, uploader, page_url = result[:7]

        share_text = f"Check out this tasty find: {post_data['title']}"
        share_text_encoded = quote(share_text)
        # NOTE(review): this template contains no {post_url} / {share_text}
        # placeholders, so the .format() call below returns it unchanged and
        # the published text carries no actual share links. The intended
        # markup is not recoverable from this file -- restore it from history.
        share_links_template = (
            "Share this post: "
            'X | '
            'Facebook'
        )

        # First call: Post without share links (the post URL is not known yet)
        post_data["content"] = final_summary
        post_id, post_url = post_to_wp(
            post_data=post_data,
            category=category,
            link=link,
            author=author,
            image_url=image_url,
            original_source=original_source,
            image_source=image_source,
            uploader=uploader,
            page_url=page_url,
            interest_score=interest_score,
            should_post_tweet=True,
            summary=final_summary
        )
        if not post_id:
            logger.warning(f"Failed to post RSS entry to WP: {post_data['title']}")
            return None, None

        # Second call: Update the same post with share links
        post_url_encoded = quote(post_url)
        share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
        post_data["content"] = f"{final_summary}\n\n{share_links}"
        post_id, post_url = post_to_wp(
            post_data=post_data,
            category=category,
            link=link,
            author=author,
            image_url=None,
            original_source=original_source,
            image_source=image_source,
            uploader=uploader,
            page_url=page_url,
            interest_score=interest_score,
            post_id=post_id,
            should_post_tweet=False,
            summary=final_summary
        )
        if not post_id:
            logger.warning(f"Failed to update RSS post with share links: {post_data['title']}")
            return None, None

        logger.info(f"Successfully curated and posted RSS entry: {post_data['title']} (URL: {post_url})")
        return post_id, post_url
    except Exception as e:
        logger.error(f"Error curating RSS entry '{getattr(entry, 'title', 'unknown')}': {e}")
        return None, None


def _finish_run():
    """Mark the script as stopped and return the delay until the next run."""
    update_system_activity(SCRIPT_NAME, "stopped")  # Record stop
    sleep_time = random.randint(1200, 1800)  # 20–30 minutes
    logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
    return sleep_time


def run_rss_automator():
    """Run one pass: fetch feeds and publish the first postable article.

    Holds the single-instance lock for the duration of the run and records
    "running" / "stopped" through ``update_system_activity`` on every exit
    path, including the case where no articles were fetched.

    Returns:
        ``(post_data, category, sleep_time)`` when an article was posted,
        otherwise ``(None, None, sleep_time)``. ``sleep_time`` is a random
        number of seconds between 1200 and 1800.
    """
    lock_fd = None
    try:
        lock_fd = acquire_lock()
        update_system_activity(SCRIPT_NAME, "running", os.getpid())  # Record start
        logging.info("***** RSS Automator Launched *****")

        # Reload posted titles so this run sees the current on-disk state
        posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
        posted_titles = set(entry["title"] for entry in posted_titles_data)
        # Result unused here; the call is kept in case load_json_file prunes
        # expired entries as a side effect -- TODO confirm and drop if not.
        load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)

        # Fetch RSS articles
        articles = fetch_rss_feeds()
        if not articles:
            logging.info("No new RSS articles found")
            return None, None, _finish_run()

        # Process each article until one is successfully posted
        for article in articles:
            title = article["title"]
            if title in posted_titles:
                logging.info(f"Skipping already posted article: {title}")
                continue

            # Lightweight attribute-style view of the article for curate_from_rss
            entry = SimpleNamespace(
                title=title,
                summary=article["summary"],
                link=article["link"]
            )
            original_source = article["feed_title"]
            source_name = get_clean_source_name(original_source)
            link = article["link"]
            page_url = link  # Use the article link as the page_url

            post_id, post_url = curate_from_rss(entry, original_source, source_name, link, page_url)
            if not (post_id and post_url):
                continue

            post_data = {
                "title": title,
                "url": post_url,
                "id": post_id
            }

            # Remember the title so later runs skip it
            timestamp = datetime.now(timezone.utc).isoformat()
            save_json_file(POSTED_TITLES_FILE, title, timestamp)
            posted_titles.add(title)

            category = generate_category_from_summary(article["summary"])

            logging.info("Completed RSS run")
            return post_data, category, _finish_run()

        # If no articles were posted
        logging.info("No postable RSS article found")
        return None, None, _finish_run()
    except Exception as e:
        logging.error(f"Fatal error in run_rss_automator: {e}", exc_info=True)
        return None, None, _finish_run()
    finally:
        if lock_fd:
            try:
                fcntl.flock(lock_fd, fcntl.LOCK_UN)
            finally:
                # Always release the descriptor, even if unlocking failed.
                lock_fd.close()
            try:
                os.remove(LOCK_FILE)
            except FileNotFoundError:
                pass  # already removed; nothing to clean up


if __name__ == "__main__":
    post_data, category, sleep_time = run_rss_automator()
    logging.info(f"Run completed, sleep_time: {sleep_time} seconds")