# foodie-automator/foodie_automator_rss.py

# foodie_automator_rss.py
import requests
import random
import time
import logging
import os
import json
import signal
import sys
import re
import email.utils
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Optional, Tuple, Set
from bs4 import BeautifulSoup
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from foodie_config import (
    RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
    get_clean_source_name, X_API_CREDENTIALS, FILE_PATHS, EXPIRATION_DAYS,
    IMAGE_EXPIRATION_DAYS, LIGHT_TASK_MODEL
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    select_best_author, smart_image_and_filter, get_flickr_image
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
from dotenv import load_dotenv

# Load environment variables (via python-dotenv) before anything reads them;
# RSSScraper.__init__ reads OPENAI_API_KEY with os.getenv().
load_dotenv()

# Global state
# is_posting: set to True by run_rss_automator() only while post_to_wp() is
# running; the signal handler installed by RSSScraper reads it to decide
# whether exiting immediately is safe.
is_posting = False
# Module-level logger; handlers are attached to the root logger in
# RSSScraper.setup_logging().
logger = logging.getLogger(__name__)

class RSSScraper:
    """Fetch the configured RSS feeds and curate one article per cycle.

    Creating an instance configures logging, installs SIGTERM/SIGINT
    handlers, builds the OpenAI client and a retrying HTTP session, and loads
    the sets of already-posted titles and already-used images.
    """

    # Articles scoring below this value in is_interesting() are skipped.
    MIN_INTEREST_SCORE = 6
    # Inclusive bounds, in seconds, of the randomized delay handed back to the
    # caller by curate_from_rss().
    SLEEP_RANGE_SECONDS = (600, 1800)

    def __init__(self):
        self.setup_logging()
        self.setup_signal_handlers()
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.posted_titles = self.load_posted_titles()
        self.used_images = self.load_used_images()
        self.session = self.setup_http_session()

    def setup_logging(self) -> None:
        """Configure file and console logging for the scraper.

        The log file sits next to the posted-titles file (same name with a
        ``.log`` suffix) and is pruned before logging is configured.
        """
        log_format = "%(asctime)s - %(levelname)s - %(message)s"
        log_file = FILE_PATHS["posted_rss_titles"].with_suffix('.log')
        self.prune_old_logs(log_file)

        logging.basicConfig(
            filename=str(log_file),
            level=logging.INFO,
            format=log_format
        )
        logging.getLogger("requests").setLevel(logging.WARNING)
        # Mirror everything to the console in addition to the log file.
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter(log_format))
        logging.getLogger().addHandler(console_handler)
        logger.info("Logging initialized for RSS scraper")

    def prune_old_logs(self, log_file: str, max_age_days: int = 30) -> None:
        """Remove log entries older than ``max_age_days`` from ``log_file``.

        An entry starts at a line beginning with a ``YYYY-MM-DD HH:MM:SS``
        timestamp. Lines without a timestamp (tracebacks, wrapped messages)
        are treated as continuations of the entry above them and are kept or
        dropped together with it. Undated lines that precede the first
        timestamped entry cannot be attributed to anything and are dropped.

        Args:
            log_file: Path of the log file (``str`` or path-like). A missing
                file is a no-op.
            max_age_days: Entries older than this many days are removed.
        """
        if not os.path.exists(log_file):
            return

        with open(log_file, 'r') as f:
            lines = f.readlines()

        # logging renders %(asctime)s in local time by default, so the cutoff
        # must be a naive local time as well (not UTC).
        cutoff = datetime.now() - timedelta(days=max_age_days)
        kept_lines: List[str] = []
        orphan_count = 0
        # None until the first timestamped line is seen; afterwards, whether
        # the entry currently being read is recent enough to keep.
        keeping_entry: Optional[bool] = None

        for line in lines:
            try:
                timestamp = datetime.strptime(line[:19], '%Y-%m-%d %H:%M:%S')
            except ValueError:
                # Continuation line: it shares the fate of its owning entry.
                if keeping_entry is None:
                    orphan_count += 1
                elif keeping_entry:
                    kept_lines.append(line)
                continue
            keeping_entry = timestamp > cutoff
            if keeping_entry:
                kept_lines.append(line)

        if orphan_count > 0:
            logger.warning(
                f"Dropped {orphan_count} undated log lines with no preceding entry during pruning"
            )

        # Only rewrite the file when something was actually removed.
        if len(kept_lines) != len(lines):
            with open(log_file, 'w') as f:
                f.writelines(kept_lines)

    def setup_signal_handlers(self) -> None:
        """Install SIGTERM/SIGINT handlers that avoid exiting mid-post."""
        def signal_handler(sig, frame):
            logger.info("Received termination signal, checking if safe to exit...")
            if is_posting:
                # NOTE(review): the deferred exit is only announced here;
                # nothing in this class triggers it once posting finishes, so
                # the driver loop would have to act on it.
                logger.info("Currently posting, will exit after completion.")
            else:
                logger.info("Safe to exit immediately.")
                sys.exit(0)

        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)

    def setup_http_session(self) -> "requests.Session":
        """Build a ``requests`` session with retries and a browser User-Agent.

        Returns:
            A session whose http/https adapters retry up to 3 times (backoff
            factor 2) on the listed status codes for GET and POST requests.
        """
        session = requests.Session()
        retry_strategy = Retry(
            total=3,
            backoff_factor=2,
            status_forcelist=[403, 429, 500, 502, 503, 504],
            allowed_methods=["GET", "POST"]
        )
        adapter = HTTPAdapter(
            max_retries=retry_strategy,
            pool_connections=10,
            pool_maxsize=10
        )
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
        })
        return session

    def _load_title_set(self, path_key: str, expiration_days, description: str) -> Set[str]:
        """Load the ``"title"`` values stored in the JSON file at ``FILE_PATHS[path_key]``.

        Any failure is logged (using ``description`` in the message) and an
        empty set is returned so start-up can continue.
        """
        try:
            data = load_json_file(FILE_PATHS[path_key], expiration_days)
            return {entry["title"] for entry in data if "title" in entry}
        except Exception as e:
            logger.error(f"Error loading {description}: {e}")
            return set()

    def load_posted_titles(self) -> Set[str]:
        """Load and return the set of posted titles."""
        return self._load_title_set("posted_rss_titles", EXPIRATION_DAYS, "posted titles")

    def load_used_images(self) -> Set[str]:
        """Load and return the set of used images (read from each entry's ``"title"`` key)."""
        return self._load_title_set("used_images", IMAGE_EXPIRATION_DAYS, "used images")

    def parse_date(self, date_str: str) -> datetime:
        """Parse an RFC 2822 style date string into a timezone-aware datetime.

        Args:
            date_str: Date text such as the content of an RSS ``pubDate``.

        Returns:
            The parsed datetime; a value parsed without a timezone is tagged
            as UTC. If parsing fails, the failure is logged and the current
            UTC time is returned instead.
        """
        try:
            parsed_date = email.utils.parsedate_to_datetime(date_str)
            if parsed_date.tzinfo is None:
                parsed_date = parsed_date.replace(tzinfo=timezone.utc)
            return parsed_date
        except Exception as e:
            # Best effort on untrusted feed data: never let a bad date abort the item.
            logger.error(f"Failed to parse date '{date_str}': {e}")
            return datetime.now(timezone.utc)

    def _parse_item(self, item, source_name: str, cutoff_date: datetime) -> Optional[Dict]:
        """Convert one RSS ``<item>`` element into an article dict.

        Returns ``None`` (after logging) when the item was published before
        ``cutoff_date``. Items without a ``pubDate`` are treated as published
        now.
        """
        title_tag = item.find('title')
        title = title_tag.text.strip() if title_tag is not None else "Untitled"
        link_tag = item.find('link')
        link = link_tag.text.strip() if link_tag is not None else ""
        date_tag = item.find('pubDate')
        pub_date = self.parse_date(date_tag.text) if date_tag is not None else datetime.now(timezone.utc)

        if pub_date < cutoff_date:
            logger.info(f"Skipping old article: {title} (Published: {pub_date})")
            return None

        # Description and full content may contain HTML; keep the text only.
        description = item.find('description')
        summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description is not None else ""
        content = item.find('content:encoded')
        content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content is not None else summary

        return {
            "title": title,
            "link": link,
            "summary": summary,
            "content": content_text,
            "feed_title": source_name,
            "pub_date": pub_date
        }

    def fetch_rss_feeds(self) -> List[Dict]:
        """Fetch every feed in ``RSS_FEEDS`` and collect articles from the last 24 hours.

        A feed that fails to download or parse is logged and skipped, and so
        is any single item that raises while being processed.

        Returns:
            Article dicts (``title``, ``link``, ``summary``, ``content``,
            ``feed_title``, ``pub_date``) sorted newest first; an empty list
            when no feeds are configured or nothing qualifies.
        """
        logger.info("Starting fetch_rss_feeds")
        articles: List[Dict] = []
        cutoff_date = datetime.now(timezone.utc) - timedelta(hours=24)

        if not RSS_FEEDS:
            logger.error("RSS_FEEDS is empty in foodie_config.py")
            return articles

        for feed_url in RSS_FEEDS:
            logger.info(f"Processing feed: {feed_url}")
            try:
                response = self.session.get(feed_url, timeout=15)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'xml')
                items = soup.find_all('item')

                # The configured name may be a tuple whose first element is
                # the display name; resolve it once per feed.
                feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
                source_name = feed_title[0] if isinstance(feed_title, tuple) else feed_title

                feed_count = 0
                for item in items:
                    try:
                        article = self._parse_item(item, source_name, cutoff_date)
                    except Exception as e:
                        logger.warning(f"Error processing entry in {feed_url}: {e}")
                        continue
                    if article is not None:
                        articles.append(article)
                        feed_count += 1
                # Report this feed's own contribution, not the running total.
                logger.info(f"Filtered to {feed_count} articles from {feed_url}")
            except Exception as e:
                logger.error(f"Failed to fetch RSS feed {feed_url}: {e}")
                continue

        articles.sort(key=lambda x: x["pub_date"], reverse=True)
        logger.info(f"Total RSS articles fetched: {len(articles)}")
        return articles

    def _next_delay(self) -> int:
        """Return a random delay in seconds within ``SLEEP_RANGE_SECONDS``."""
        return random.randint(*self.SLEEP_RANGE_SECONDS)

    def curate_from_rss(self) -> Tuple[Optional[Dict], Optional[str], int]:
        """Pick the newest suitable article and prepare it for posting.

        Articles are examined newest first. Each one is skipped if its title
        was already handled, if ``smart_image_and_filter`` flags it, if its
        interest score is below ``MIN_INTEREST_SCORE``, or if summarization
        or post preparation yields nothing.

        Returns:
            ``(post_data, author, sleep_seconds)``. ``post_data`` and
            ``author`` are ``None`` when no article qualified;
            ``sleep_seconds`` is always a random delay for the caller.
        """
        articles = self.fetch_rss_feeds()
        if not articles:
            logger.info("No RSS articles available")
            return None, None, self._next_delay()

        for article in articles:
            title = article["title"]
            link = article["link"]
            summary = article["summary"]
            content = article["content"]
            source_name = article["feed_title"]

            if title in self.posted_titles:
                logger.info(f"Skipping already posted article: {title}")
                continue

            logger.info(f"Processing RSS Article: {title} from {source_name}")

            # Only the skip flag is needed here; the other returned values are unused.
            _, _, skip = smart_image_and_filter(title, summary)
            if skip:
                logger.info(f"Skipping filtered RSS article: {title}")
                continue

            scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
            interest_score = is_interesting(scoring_content)
            logger.info(f"Interest score for '{title}': {interest_score}")

            if interest_score < self.MIN_INTEREST_SCORE:
                logger.info(f"RSS Interest Too Low: {interest_score}")
                continue

            num_paragraphs = determine_paragraph_count(interest_score)
            extra_prompt = (
                f"Generate exactly {num_paragraphs} paragraphs.\n"
                f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
                f"Do NOT introduce unrelated concepts.\n"
                f"Expand on the core idea with relevant context about its appeal or significance.\n"
                f"Do not include emojis in the summary."
            )

            final_summary = summarize_with_gpt4o(
                scoring_content,
                source_name,
                link,
                interest_score=interest_score,
                extra_prompt=extra_prompt
            )

            if not final_summary:
                logger.info(f"Summary failed for '{title}'")
                continue

            final_summary = insert_link_naturally(final_summary, source_name, link)
            # Only the post payload and the author are used by the caller.
            post_data, author, *_ = prepare_post_data(final_summary, title)

            if post_data and author:
                # Remember the source title in memory; otherwise the next
                # cycle would select and post this same article again.
                self.posted_titles.add(title)
                return post_data, author, self._next_delay()

        return None, None, self._next_delay()

def run_rss_automator():
    """Run the RSS automator loop forever.

    Each cycle asks the scraper for one curated article, posts it with
    ``post_to_wp`` when one is available, then sleeps for the delay the
    scraper returned. Any exception raised during a cycle is logged with its
    traceback and the loop resumes after five minutes.
    """
    # Declared up front: the flag is rebound below and read by the signal
    # handler that RSSScraper installs.
    global is_posting

    scraper = RSSScraper()
    while True:
        try:
            post_data, author, sleep_time = scraper.curate_from_rss()
            if post_data and author:
                is_posting = True
                try:
                    post_to_wp(post_data, author)
                    logger.info(f"Successfully posted: {post_data['title']}")
                finally:
                    # Always clear the flag, even when posting raised.
                    is_posting = False
            time.sleep(sleep_time)
        except Exception as e:
            # logger.exception keeps the traceback, which logger.error dropped.
            logger.exception(f"Error in RSS automator: {e}")
            time.sleep(300)  # Wait 5 minutes before retrying

# Start the endless automator loop only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    run_rss_automator()