import requests import random import time import logging import os import json import email.utils from datetime import datetime, timedelta, timezone from bs4 import BeautifulSoup from openai import OpenAI from urllib.parse import quote from requests.packages.urllib3.util.retry import Retry from requests.adapters import HTTPAdapter from foodie_config import RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS, SUMMARY_PERSONA_PROMPTS, CATEGORIES, get_clean_source_name from foodie_utils import ( load_json_file, save_json_file, get_image, generate_image_query, upload_image_to_wp, determine_paragraph_count, insert_link_naturally, is_interesting, generate_title_from_summary, summarize_with_gpt4o, generate_category_from_summary, post_to_wp, prepare_post_data, select_best_author, smart_image_and_filter ) from foodie_hooks import get_dynamic_hook, select_best_cta import feedparser from concurrent.futures import ThreadPoolExecutor, as_completed from typing import List, Dict, Any, Optional LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log" LOG_PRUNE_DAYS = 30 MAX_WORKERS = 5 # Number of concurrent workers for parallel processing RATE_LIMIT_DELAY = 1 # Delay between API calls in seconds FEED_TIMEOUT = 30 # Timeout for feed requests in seconds MAX_RETRIES = 3 # Maximum number of retries for failed requests def setup_logging(): """Configure logging with rotation and cleanup.""" if os.path.exists(LOG_FILE): with open(LOG_FILE, 'r') as f: lines = f.readlines() cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS) pruned_lines = [] for line in lines: try: timestamp = datetime.strptime(line[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc) if timestamp > cutoff: pruned_lines.append(line) except ValueError: logging.warning(f"Skipping malformed log line: {line.strip()[:50]}...") continue with open(LOG_FILE, 'w') as f: f.writelines(pruned_lines) logging.basicConfig( filename=LOG_FILE, level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S" ) def create_http_session() -> requests.Session: """Create and configure an HTTP session with retry logic.""" session = requests.Session() retry_strategy = Retry( total=MAX_RETRIES, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=["GET", "POST"] ) adapter = HTTPAdapter( max_retries=retry_strategy, pool_connections=10, pool_maxsize=10 ) session.mount("http://", adapter) session.mount("https://", adapter) return session def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.FeedParserDict]: """Fetch and parse an RSS feed with error handling and retries.""" try: response = session.get(feed_url, timeout=FEED_TIMEOUT) response.raise_for_status() feed = feedparser.parse(response.content) if feed.bozo: logging.warning(f"Feed parsing error for {feed_url}: {feed.bozo_exception}") return None return feed except Exception as e: logging.error(f"Error fetching feed {feed_url}: {str(e)}") return None def is_interesting_rss(title: str, summary: str, pub_date: datetime) -> bool: """Enhanced content filtering with improved scoring.""" try: # Basic validation if not title or not summary: return False # Check if content is too old if datetime.now(timezone.utc) - pub_date > timedelta(days=7): return False # Calculate interest score score = 0 # Title analysis title_lower = title.lower() if any(keyword in title_lower for keyword in RECIPE_KEYWORDS): score += 3 if any(keyword in title_lower for keyword in PROMO_KEYWORDS): score += 2 if any(keyword in title_lower for keyword in HOME_KEYWORDS): score += 1 # Content analysis summary_lower = summary.lower() if len(summary.split()) < 100: score -= 2 if any(keyword in summary_lower for keyword in PRODUCT_KEYWORDS): score += 1 return score >= 4 except Exception as e: logging.error(f"Error in is_interesting_rss: {str(e)}") return False def fetch_rss_feeds() -> List[Dict[str, Any]]: """Fetch RSS feeds with parallel processing and improved error handling.""" session = create_http_session() articles = [] try: with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: futures = [] for feed_url in RSS_FEEDS: future = executor.submit(process_feed, feed_url, session) futures.append(future) for future in as_completed(futures): try: feed_articles = future.result() articles.extend(feed_articles) except Exception as e: logging.error(f"Error processing feed: {str(e)}") continue return articles except Exception as e: logging.error(f"Error in fetch_rss_feeds: {str(e)}") return [] def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any]]: """Process a single RSS feed and extract articles.""" try: feed = fetch_feed(feed_url, session) if not feed: return [] articles = [] for entry in feed.entries: try: pub_date = datetime.fromtimestamp(time.mktime(entry.published_parsed), tz=timezone.utc) article = { "title": entry.title, "link": entry.link, "summary": entry.summary if hasattr(entry, 'summary') else entry.description, "feed_title": get_clean_source_name(feed.feed.title), "pub_date": pub_date } if is_interesting_rss(article["title"], article["summary"], pub_date): articles.append(article) time.sleep(RATE_LIMIT_DELAY) except Exception as e: logging.warning(f"Error processing entry: {str(e)}") continue return articles except Exception as e: logging.error(f"Error processing feed {feed_url}: {str(e)}") return [] def parse_date(date_str): try: parsed_date = email.utils.parsedate_to_datetime(date_str) if parsed_date.tzinfo is None: parsed_date = parsed_date.replace(tzinfo=timezone.utc) return parsed_date except Exception as e: logging.error(f"Failed to parse date '{date_str}': {e}") return datetime.now(timezone.utc) def curate_from_rss(): articles = fetch_rss_feeds() if not articles: print("No RSS articles available") logging.info("No RSS articles available") return None, None, None attempts = 0 max_attempts = 10 while attempts < max_attempts and articles: article = articles.pop(0) # Take newest article title = article["title"] link = article["link"] summary = article["summary"] content = article["content"] feed_url = article["feed_title"] source_name = feed_url[0] if isinstance(feed_url, tuple) and len(feed_url) > 0 else feed_url original_source = f'{source_name}' if title in posted_titles: print(f"Skipping already posted article: {title}") logging.info(f"Skipping already posted article: {title}") attempts += 1 continue print(f"Trying RSS Article: {title} from {source_name}") logging.info(f"Trying RSS Article: {title} from {source_name}") image_query, relevance_keywords, skip = smart_image_and_filter(title, summary) if skip: print(f"Skipping filtered RSS article: {title}") logging.info(f"Skipping filtered RSS article: {title}") attempts += 1 continue # Score using title, summary, and content scoring_content = f"{title}\n\n{summary}\n\nContent: {content}" interest_score = is_interesting(scoring_content) logging.info(f"Interest score for '{title}': {interest_score}") if interest_score < 6: print(f"RSS Interest Too Low: {interest_score}") logging.info(f"RSS Interest Too Low: {interest_score}") attempts += 1 continue num_paragraphs = determine_paragraph_count(interest_score) extra_prompt = ( f"Generate exactly {num_paragraphs} paragraphs. " f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details. " f"Do NOT introduce unrelated concepts. Expand on the core idea with relevant context about its appeal or significance." ) content_to_summarize = scoring_content final_summary = summarize_with_gpt4o( content_to_summarize, source_name, link, interest_score=interest_score, extra_prompt=extra_prompt ) if not final_summary: logging.info(f"Summary failed for '{title}'") attempts += 1 continue final_summary = insert_link_naturally(final_summary, source_name, link) post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title) if not post_data: attempts += 1 continue hook = get_dynamic_hook(post_data["title"]).strip() cta = select_best_cta(post_data["title"], final_summary, post_url=None) post_data["content"] = f"{final_summary}\n\n{cta}" post_id, post_url = post_to_wp( post_data=post_data, category=category, link=link, author=author, image_url=image_url, original_source=original_source, image_source=image_source, uploader=uploader, pixabay_url=pixabay_url, interest_score=interest_score ) if post_id: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url) post_data["content"] = f"{final_summary}\n\n{cta}" post_to_wp( post_data=post_data, category=category, link=link, author=author, image_url=image_url, original_source=original_source, image_source=image_source, uploader=uploader, pixabay_url=pixabay_url, interest_score=interest_score, post_id=post_id ) timestamp = datetime.now(timezone.utc).isoformat() save_json_file(POSTED_TITLES_FILE, title, timestamp) posted_titles.add(title) logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}") if image_url: save_json_file(USED_IMAGES_FILE, image_url, timestamp) used_images.add(image_url) logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}") print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****") logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****") return post_data, category, random.randint(0, 1800) attempts += 1 logging.info(f"WP posting failed for '{post_data['title']}'") print("No interesting RSS article found after attempts") logging.info("No interesting RSS article found after attempts") return None, None, random.randint(600, 1800) def run_rss_automator(): print(f"{datetime.now(timezone.utc)} - INFO - ***** RSS Automator Launched *****") logging.info("***** RSS Automator Launched *****") post_data, category, sleep_time = curate_from_rss() print(f"Sleeping for {sleep_time}s") logging.info(f"Completed run with sleep time: {sleep_time} seconds") time.sleep(sleep_time) return post_data, category, sleep_time if __name__ == "__main__": run_rss_automator()