# foodie_automator_rss.py
import requests
import random
import time
import logging
import os
import json
import signal
import sys
import email.utils
from datetime import datetime, timedelta, timezone
from bs4 import BeautifulSoup
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from foodie_config import (
    RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
    get_clean_source_name, X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    select_best_author, smart_image_and_filter
)
from foodie_hooks import get_dynamic_hook, select_best_cta
import feedparser
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Any, Optional
from dotenv import load_dotenv

load_dotenv()

# Flag to indicate if we're in the middle of posting
is_posting = False


def signal_handler(sig, frame):
    logging.info("Received termination signal, checking if safe to exit...")
    if is_posting:
        logging.info("Currently posting, will exit after completion.")
    else:
        logging.info("Safe to exit immediately.")
        sys.exit(0)


signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log"
LOG_PRUNE_DAYS = 30
MAX_WORKERS = 5
RATE_LIMIT_DELAY = 1
FEED_TIMEOUT = 30
MAX_RETRIES = 3
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7

posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data)
used_images = set(
    entry["title"]
    for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
    if "title" in entry
)


def setup_logging():
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, 'r') as f:
            lines = f.readlines()
        cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
        pruned_lines = []
        for line in lines:
            try:
                timestamp = datetime.strptime(line[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
                if timestamp > cutoff:
                    pruned_lines.append(line)
            except ValueError:
                logging.warning(f"Skipping malformed log line: {line.strip()[:50]}...")
                continue
        with open(LOG_FILE, 'w') as f:
            f.writelines(pruned_lines)
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logging.getLogger().addHandler(console_handler)
    logging.info("Logging initialized for foodie_automator_rss.py")


setup_logging()


def create_http_session() -> requests.Session:
    session = requests.Session()
    retry_strategy = Retry(
        total=MAX_RETRIES,
        backoff_factor=2,  # Increased backoff factor for better retry handling
        status_forcelist=[429, 500, 502, 503, 504, 403],  # Added 403 to retry list
        allowed_methods=["GET", "POST"]
    )
    adapter = HTTPAdapter(
        max_retries=retry_strategy,
        pool_connections=10,
        pool_maxsize=10
    )
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    # Add a realistic User-Agent header
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    })
    return session

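# Illustrative use of the session above; the URL is a placeholder, not one of RSS_FEEDS.
# Responses with a status in the retry list (429, 403, and several 5xx codes) are retried
# up to MAX_RETRIES times with exponential backoff before the caller sees an exception:
#
#   session = create_http_session()
#   response = session.get("https://example.com/feed.xml", timeout=FEED_TIMEOUT)
#   feed = feedparser.parse(response.content)
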
def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.FeedParserDict]:
    logging.debug(f"Fetching feed: {feed_url}")
    try:
        response = session.get(feed_url, timeout=15)  # Reduced timeout to 15 seconds
        response.raise_for_status()
        feed = feedparser.parse(response.content)
        if feed.bozo:
            logging.warning(f"Feed parsing error for {feed_url}: {feed.bozo_exception}")
            return None
        logging.debug(f"Successfully fetched feed: {feed_url}")
        return feed
    except Exception as e:
        logging.error(f"Error fetching feed {feed_url}: {str(e)}")
        return None


def is_interesting_rss(title: str, summary: str, pub_date: datetime) -> bool:
    try:
        if not title or not summary:
            return False
        if datetime.now(timezone.utc) - pub_date > timedelta(days=7):
            return False
        score = 0
        title_lower = title.lower()
        if any(keyword in title_lower for keyword in RECIPE_KEYWORDS):
            score += 3
        if any(keyword in title_lower for keyword in PROMO_KEYWORDS):
            score += 2
        if any(keyword in title_lower for keyword in HOME_KEYWORDS):
            score += 1
        summary_lower = summary.lower()
        if len(summary.split()) < 100:
            score -= 2
        if any(keyword in summary_lower for keyword in PRODUCT_KEYWORDS):
            score += 1
        # Threshold example: a recipe keyword (+3) plus a promo keyword (+2) passes,
        # but a summary under 100 words (-2) can drop the same article below 4.
        return score >= 4
    except Exception as e:
        logging.error(f"Error in is_interesting_rss: {str(e)}")
        return False


def fetch_rss_feeds() -> List[Dict[str, Any]]:
    logging.info("Starting fetch_rss_feeds")
    session = create_http_session()
    articles = []
    try:
        logging.info(f"Processing {len(RSS_FEEDS)} feeds: {RSS_FEEDS}")
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = []
            for feed_url in RSS_FEEDS:
                logging.debug(f"Scheduling feed: {feed_url}")
                future = executor.submit(process_feed, feed_url, session)
                futures.append(future)
            for future in as_completed(futures):
                try:
                    feed_articles = future.result()
                    logging.info(f"Completed feed processing, got {len(feed_articles)} articles")
                    articles.extend(feed_articles)
                except Exception as e:
                    logging.error(f"Error processing feed in future: {str(e)}")
                    continue
        logging.info(f"Finished fetch_rss_feeds, total articles: {len(articles)}")
        return articles
    except Exception as e:
        logging.error(f"Error in fetch_rss_feeds: {str(e)}")
        return []


def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any]]:
    logging.info(f"Processing feed: {feed_url}")
    try:
        feed = fetch_feed(feed_url, session)
        if not feed:
            logging.warning(f"No feed data for {feed_url}")
            return []
        articles = []
        logging.debug(f"Feed entries count: {len(feed.entries)}")
        for entry in feed.entries:
            try:
                logging.debug(f"Processing entry: {entry.get('title', 'No title')}")
                # published_parsed is a UTC struct_time, so build the datetime directly in UTC
                # instead of routing it through the host's local timezone via time.mktime()
                pub_date = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc)
                # Safely extract content
                content = ""
                if hasattr(entry, 'content') and isinstance(entry.content, list) and len(entry.content) > 0:
                    content_item = entry.content[0]
                    if isinstance(content_item, dict) and 'value' in content_item:
                        content = content_item['value']
                    elif hasattr(content_item, 'value'):
                        content = content_item.value
                elif hasattr(entry, 'description'):
                    content = entry.description
                elif hasattr(entry, 'summary'):
                    content = entry.summary
                article = {
                    "title": entry.title,
                    "link": entry.link,
                    "summary": entry.summary if hasattr(entry, 'summary') else entry.description if hasattr(entry, 'description') else "",
                    "content": content,
                    "feed_title": get_clean_source_name(feed_url),
                    "pub_date": pub_date
                }
                if is_interesting_rss(article["title"], article["summary"], pub_date):
                    logging.info(f"Interesting article found: {article['title']}")
                    articles.append(article)
                time.sleep(RATE_LIMIT_DELAY)
            except Exception as e:
                logging.warning(f"Error processing entry in {feed_url}: {str(e)}")
                continue
        logging.info(f"Finished processing {feed_url}, found {len(articles)} articles")
        return articles
    except Exception as e:
        logging.error(f"Error processing feed {feed_url}: {str(e)}")
        return []


def parse_date(date_str):
    try:
        parsed_date = email.utils.parsedate_to_datetime(date_str)
        if parsed_date.tzinfo is None:
            parsed_date = parsed_date.replace(tzinfo=timezone.utc)
        return parsed_date
    except Exception as e:
        logging.error(f"Failed to parse date '{date_str}': {e}")
        return datetime.now(timezone.utc)

def curate_from_rss():
    articles = fetch_rss_feeds()
    if not articles:
        print("No RSS articles available")
        logging.info("No RSS articles available")
        return None, None, None
    attempts = 0
    max_attempts = 10
    while attempts < max_attempts and articles:
        article = articles.pop(0)
        title = article["title"]
        link = article["link"]
        summary = article["summary"]
        content = article["content"]
        source_name = article["feed_title"]
        original_source = source_name
        if title in posted_titles:
            print(f"Skipping already posted article: {title}")
            logging.info(f"Skipping already posted article: {title}")
            attempts += 1
            continue
        print(f"Trying RSS Article: {title} from {source_name}")
        logging.info(f"Trying RSS Article: {title} from {source_name}")
        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
        if skip:
            print(f"Skipping filtered RSS article: {title}")
            logging.info(f"Skipping filtered RSS article: {title}")
            attempts += 1
            continue
        scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
        interest_score = is_interesting(scoring_content)
        logging.info(f"Interest score for '{title}': {interest_score}")
        if interest_score < 6:
            print(f"RSS Interest Too Low: {interest_score}")
            logging.info(f"RSS Interest Too Low: {interest_score}")
            attempts += 1
            continue
        num_paragraphs = determine_paragraph_count(interest_score)
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs.\n"
            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
            f"Do NOT introduce unrelated concepts.\n"
            f"Expand on the core idea with relevant context about its appeal or significance.\n"
            f"Do not include emojis in the summary."
        )
        content_to_summarize = scoring_content
        final_summary = summarize_with_gpt4o(
            content_to_summarize,
            source_name,
            link,
            interest_score=interest_score,
            extra_prompt=extra_prompt
        )
        if not final_summary:
            logging.info(f"Summary failed for '{title}'")
            attempts += 1
            continue
        final_summary = insert_link_naturally(final_summary, source_name, link)
        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
        if not post_data:
            attempts += 1
            continue
        hook = get_dynamic_hook(post_data["title"]).strip()
        cta = select_best_cta(post_data["title"], final_summary, post_url=None)
        post_data["content"] = f"{final_summary}\n\n{cta}"
        global is_posting
        is_posting = True
        try:
            post_id, post_url = post_to_wp(
                post_data=post_data,
                category=category,
                link=link,
                author=author,
                image_url=image_url,
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
                pixabay_url=pixabay_url,
                interest_score=interest_score,
                should_post_tweet=True  # Post the X tweet on the first call
            )
        finally:
            is_posting = False
        if post_id:
            # Rebuild the CTA now that the live post URL is known, then update the post in place
            cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
            post_data["content"] = f"{final_summary}\n\n{cta}"
            is_posting = True
            try:
                post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    pixabay_url=pixabay_url,
                    interest_score=interest_score,
                    post_id=post_id,
                    should_post_tweet=False  # Skip X tweet on the update call
                )
            finally:
                is_posting = False
            timestamp = datetime.now(timezone.utc).isoformat()
            save_json_file(POSTED_TITLES_FILE, title, timestamp)
            posted_titles.add(title)
            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
            if image_url:
                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                used_images.add(image_url)
                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
            print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
            return post_data, category, random.randint(0, 1800)
        attempts += 1
        logging.info(f"WP posting failed for '{post_data['title']}'")
    print("No interesting RSS article found after attempts")
    logging.info("No interesting RSS article found after attempts")
    return None, None, random.randint(600, 1800)


def run_rss_automator():
    print(f"{datetime.now(timezone.utc)} - INFO - ***** RSS Automator Launched *****")
    logging.info("***** RSS Automator Launched *****")
    post_data, category, sleep_time = curate_from_rss()
    if sleep_time is None:
        # curate_from_rss returns None when no articles were fetched at all; fall back to
        # the same retry window it uses when no article passes the filters
        sleep_time = random.randint(600, 1800)
    print(f"Sleeping for {sleep_time}s")
    logging.info(f"Completed run with sleep time: {sleep_time} seconds")
    time.sleep(sleep_time)
    return post_data, category, sleep_time


if __name__ == "__main__":
    run_rss_automator()
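
# Example invocation (assumes the /home/shane/foodie_automator paths above exist and that
# WordPress and X credentials are supplied via the environment / .env file):
#
#   python foodie_automator_rss.py
#
# Each run performs a single curation pass, sleeps for the returned interval, and exits;
# scheduling repeated runs (cron, systemd timer, etc.) is assumed to happen outside this script.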