# foodie_automator_rss.py
"""Fetch articles from RSS feeds, summarize them and publish them to WordPress.

The module exposes :class:`RSSScraper`, which does the fetching/curation, and
:func:`run_rss_automator`, an endless loop that runs one curation cycle, then
sleeps for the delay the cycle returned.
"""
import email.utils
import json
import logging
import os
import random
import re
import signal
import sys
import time
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Optional, Tuple, Set
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from foodie_config import (
    RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
    get_clean_source_name, X_API_CREDENTIALS, FILE_PATHS, EXPIRATION_DAYS,
    IMAGE_EXPIRATION_DAYS, LIGHT_TASK_MODEL
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    select_best_author, smart_image_and_filter, get_flickr_image,
    select_best_persona
)

# Load environment variables
load_dotenv()

# Global state
# True only while a post_to_wp() call is in flight; read by the signal handler.
is_posting = False
# Set by the signal handler when a signal arrives mid-post, so the main loop
# can exit as soon as the current cycle finishes.
shutdown_requested = False
logger = logging.getLogger(__name__)


class RSSScraper:
    """Fetch RSS articles, pick an interesting one and publish it."""

    def __init__(self):
        self.setup_logging()
        self.setup_signal_handlers()
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.posted_titles = self.load_posted_titles()
        self.used_images = self.load_used_images()
        self.session = self.setup_http_session()

    def setup_logging(self) -> None:
        """Configure file and console logging for the scraper.

        The log file sits next to the posted-titles file (same name, ``.log``
        suffix) and is pruned before logging is configured on it.
        """
        log_file = FILE_PATHS["posted_rss_titles"].with_suffix('.log')
        self.prune_old_logs(log_file)
        logging.basicConfig(
            filename=str(log_file),
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s"
        )
        logging.getLogger("requests").setLevel(logging.WARNING)
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logging.getLogger().addHandler(console_handler)
        logger.info("Logging initialized for RSS scraper")

    def prune_old_logs(self, log_file: str, max_age_days: int = 30) -> None:
        """Rewrite ``log_file`` keeping only entries newer than the cutoff.

        Args:
            log_file: Path of the log file (a ``str`` or path-like object).
                Nothing happens if the file does not exist.
            max_age_days: Entries whose timestamp is older than this many
                days are dropped.

        Lines that do not start with a ``YYYY-MM-DD HH:MM:SS`` timestamp are
        dropped as well and reported in a single warning.
        """
        if not os.path.exists(log_file):
            return

        with open(log_file, 'r') as f:
            lines = f.readlines()

        cutoff = datetime.now(timezone.utc) - timedelta(days=max_age_days)
        pruned_lines = []
        malformed_count = 0
        for line in lines:
            try:
                # NOTE: the logging module writes asctime in local time by
                # default; treating it as UTC shifts the cutoff by at most
                # the UTC offset, which is negligible for a day-scale window.
                timestamp = datetime.strptime(
                    line[:19], '%Y-%m-%d %H:%M:%S'
                ).replace(tzinfo=timezone.utc)
            except ValueError:
                malformed_count += 1
                continue
            if timestamp > cutoff:
                pruned_lines.append(line)

        if malformed_count > 0:
            logger.warning(f"Skipped {malformed_count} malformed log lines during pruning")

        with open(log_file, 'w') as f:
            f.writelines(pruned_lines)

    def setup_signal_handlers(self) -> None:
        """Install SIGTERM/SIGINT handlers for graceful shutdown.

        If no post is in flight the process exits immediately. Otherwise the
        module-level ``shutdown_requested`` flag is set and the main loop
        exits once the current cycle has finished.
        """
        def signal_handler(sig, frame):
            global shutdown_requested
            logger.info("Received termination signal, checking if safe to exit...")
            if is_posting:
                # Do not interrupt a post in flight; defer the exit.
                shutdown_requested = True
                logger.info("Currently posting, will exit after completion.")
            else:
                logger.info("Safe to exit immediately.")
                sys.exit(0)

        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)

    def setup_http_session(self) -> requests.Session:
        """Return a ``requests`` session with retries and a browser User-Agent."""
        session = requests.Session()
        retry_strategy = Retry(
            total=3,
            backoff_factor=2,
            status_forcelist=[403, 429, 500, 502, 503, 504],
            allowed_methods=["GET", "POST"]
        )
        adapter = HTTPAdapter(
            max_retries=retry_strategy,
            pool_connections=10,
            pool_maxsize=10
        )
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
        })
        return session

    def load_posted_titles(self) -> Set[str]:
        """Return the set of already-posted titles, or an empty set on error."""
        try:
            data = load_json_file(FILE_PATHS["posted_rss_titles"], EXPIRATION_DAYS)
            return {entry["title"] for entry in data if "title" in entry}
        except Exception as e:
            logger.error(f"Error loading posted titles: {e}")
            return set()

    def load_used_images(self) -> Set[str]:
        """Return the set of already-used image URLs, or an empty set on error."""
        try:
            data = load_json_file(FILE_PATHS["used_images"], IMAGE_EXPIRATION_DAYS)
            return {entry["url"] for entry in data if "url" in entry}
        except Exception as e:
            logger.error(f"Error loading used images: {e}")
            return set()

    def parse_date(self, date_str: str) -> datetime:
        """Parse an RFC 2822 style date string into an aware ``datetime``.

        A parsed value without a timezone is assumed to be UTC. If parsing
        fails, the error is logged and the current UTC time is returned.
        """
        try:
            parsed_date = email.utils.parsedate_to_datetime(date_str)
            if parsed_date.tzinfo is None:
                parsed_date = parsed_date.replace(tzinfo=timezone.utc)
            return parsed_date
        except Exception as e:
            logger.error(f"Failed to parse date '{date_str}': {e}")
            return datetime.now(timezone.utc)

    def _parse_feed_item(self, item, feed_name, cutoff_date: datetime) -> Optional[Dict]:
        """Turn one ``<item>`` element into an article dict.

        Returns ``None`` when the item was published before ``cutoff_date``.
        Items without a ``pubDate`` are stamped with the current UTC time.
        """
        title_tag = item.find('title')
        title = title_tag.text.strip() if title_tag else "Untitled"
        link_tag = item.find('link')
        link = link_tag.text.strip() if link_tag else ""

        date_tag = item.find('pubDate')
        pub_date = self.parse_date(date_tag.text) if date_tag else datetime.now(timezone.utc)
        if pub_date < cutoff_date:
            logger.info(f"Skipping old article: {title} (Published: {pub_date})")
            return None

        # Both fields may contain HTML; reduce them to plain text.
        description = item.find('description')
        summary = (
            BeautifulSoup(description.text, 'html.parser').get_text().strip()
            if description else ""
        )
        content = item.find('content:encoded')
        content_text = (
            BeautifulSoup(content.text, 'html.parser').get_text().strip()
            if content else summary
        )

        return {
            "title": title,
            "link": link,
            "summary": summary,
            "content": content_text,
            "feed_title": feed_name,
            "pub_date": pub_date
        }

    def fetch_rss_feeds(self) -> List[Dict]:
        """Download every feed in ``RSS_FEEDS`` and return recent articles.

        Only items published within the last 24 hours are kept. A feed that
        fails to download, or an item that fails to parse, is logged and
        skipped. The result is sorted newest first.

        Returns:
            A list of dicts with the keys ``title``, ``link``, ``summary``,
            ``content``, ``feed_title`` and ``pub_date``.
        """
        logger.info("Starting fetch_rss_feeds")
        articles = []
        cutoff_date = datetime.now(timezone.utc) - timedelta(hours=24)

        if not RSS_FEEDS:
            logger.error("RSS_FEEDS is empty in foodie_config.py")
            return articles

        for feed_url in RSS_FEEDS:
            logger.info(f"Processing feed: {feed_url}")
            try:
                response = self.session.get(feed_url, timeout=15)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'xml')
                items = soup.find_all('item')

                feed_title = RSS_FEED_NAMES.get(
                    feed_url, (get_clean_source_name(feed_url), feed_url)
                )
                # The configured value may be a (name, url) tuple or a bare name.
                feed_name = feed_title[0] if isinstance(feed_title, tuple) else feed_title

                feed_count = 0
                for item in items:
                    try:
                        article = self._parse_feed_item(item, feed_name, cutoff_date)
                    except Exception as e:
                        logger.warning(f"Error processing entry in {feed_url}: {e}")
                        continue
                    if article is not None:
                        articles.append(article)
                        feed_count += 1

                # Report this feed's own count, not the running total.
                logger.info(f"Filtered to {feed_count} articles from {feed_url}")
            except Exception as e:
                logger.error(f"Failed to fetch RSS feed {feed_url}: {e}")
                continue

        articles.sort(key=lambda x: x["pub_date"], reverse=True)
        logger.info(f"Total RSS articles fetched: {len(articles)}")
        return articles

    def curate_from_rss(self) -> Tuple[Optional[Dict], Optional[str], int]:
        """Publish the first suitable RSS article to WordPress.

        Articles are tried newest first. One is skipped if its title was
        already posted, if ``smart_image_and_filter`` flags it, if its
        interest score is below 6, or if any preparation step fails.

        Returns:
            ``(post_data, author_username, sleep_seconds)`` after a
            successful post, otherwise ``(None, None, sleep_seconds)``.
            ``sleep_seconds`` is a random value between 600 and 1800.
        """
        global is_posting

        articles = self.fetch_rss_feeds()
        if not articles:
            logger.info("No RSS articles available")
            return None, None, random.randint(600, 1800)

        for article in articles:
            if shutdown_requested:
                logger.info("Shutdown requested, stopping curation.")
                break

            title = article["title"]
            link = article["link"]
            summary = article["summary"]
            content = article["content"]
            source_name = article["feed_title"]

            if title in self.posted_titles:
                logger.info(f"Skipping already posted article: {title}")
                continue

            logger.info(f"Processing RSS Article: {title} from {source_name}")

            # Only the skip flag is used here.
            _image_query, _relevance_keywords, skip = smart_image_and_filter(title, summary)
            if skip:
                logger.info(f"Skipping filtered RSS article: {title}")
                continue

            scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
            interest_score = is_interesting(scoring_content)
            logger.info(f"Interest score for '{title}': {interest_score}")
            if interest_score < 6:
                logger.info(f"RSS Interest Too Low: {interest_score}")
                continue

            num_paragraphs = determine_paragraph_count(interest_score)
            extra_prompt = (
                f"Generate exactly {num_paragraphs} paragraphs.\n"
                f"Focus on the most interesting aspects of the content.\n"
                f"Use a {select_best_persona(interest_score, content)} tone.\n"
                f"Make it engaging and shareable."
            )

            summary = summarize_with_gpt4o(content, source_name, link, interest_score, extra_prompt)
            if not summary:
                logger.warning(f"Failed to generate summary for '{title}'")
                continue

            summary = insert_link_naturally(summary, source_name, link)
            if not summary:
                logger.warning(f"Failed to insert link for '{title}'")
                continue

            post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(
                summary, title, f"RSS: {source_name}"
            )
            if not post_data or not author:
                logger.warning(f"Failed to prepare post data for '{title}'")
                continue

            try:
                # Tell the signal handler a post is in flight so it defers exit.
                is_posting = True
                try:
                    post_id, post_url = post_to_wp(
                        post_data=post_data,
                        category=category,
                        link=link,
                        author=author,
                        image_url=image_url,
                        original_source=source_name,
                        image_source=image_source,
                        uploader=uploader,
                        pixabay_url=page_url,
                        interest_score=interest_score
                    )
                finally:
                    is_posting = False
            except Exception as e:
                logger.error(f"Error in RSS automator: {e}")
                continue

            if not (post_id and post_url):
                logger.warning(f"WordPress post failed for '{title}'")
                continue

            logger.info(f"Successfully posted '{title}' to WordPress (ID: {post_id})")
            self.posted_titles.add(title)
            try:
                save_json_file(
                    FILE_PATHS["posted_rss_titles"],
                    title,
                    datetime.now(timezone.utc).isoformat()
                )
            except Exception as e:
                # The article is live; a persistence failure must not make
                # this cycle go on to publish another article.
                logger.error(f"Failed to record posted title '{title}': {e}")
            return post_data, author["username"], random.randint(600, 1800)

        return None, None, random.randint(600, 1800)


def run_rss_automator():
    """Run curation cycles forever, sleeping between them.

    Each cycle calls ``RSSScraper.curate_from_rss``, which does the actual
    publishing, and then sleeps for the delay it returned. An unexpected
    error is logged and followed by a 5 minute pause. If a termination
    signal arrived while a post was in flight, the loop exits after that
    cycle instead of sleeping.
    """
    scraper = RSSScraper()
    while True:
        try:
            post_data, author, sleep_time = scraper.curate_from_rss()
            if post_data and author:
                # curate_from_rss() has already published; do not post again.
                logger.info(f"Successfully posted: {post_data['title']}")
            if shutdown_requested:
                logger.info("Posting finished, exiting after deferred termination signal.")
                sys.exit(0)
            time.sleep(sleep_time)
        except Exception as e:
            logger.error(f"Error in RSS automator: {e}")
            time.sleep(300)  # Wait 5 minutes before retrying


if __name__ == "__main__":
    run_rss_automator()