# foodie_automator_rss.py
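"""
Curate articles from the RSS feeds configured in foodie_config and republish
them to WordPress.

Flow: fetch every feed in RSS_FEEDS concurrently, keep entries that pass a
keyword-based interest filter, summarize the chosen article with GPT-4o (via
foodie_utils), attach an image and call-to-action, and post it to WordPress.
Posted titles and used images are tracked in JSON files so repeated runs do
not duplicate content.
"""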

import requests
import random
import time
import logging
import os
import json
import signal
import sys
import email.utils
from datetime import datetime, timedelta, timezone
from bs4 import BeautifulSoup
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from foodie_config import (
    RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
    get_clean_source_name, X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    select_best_author, smart_image_and_filter
)
from foodie_hooks import get_dynamic_hook, select_best_cta
import feedparser
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Any, Optional
from dotenv import load_dotenv

load_dotenv()

# Flag to indicate if we're in the middle of posting
is_posting = False


def signal_handler(sig, frame):
    logging.info("Received termination signal, checking if safe to exit...")
    if is_posting:
        logging.info("Currently posting, will exit after completion.")
    else:
        logging.info("Safe to exit immediately.")
        sys.exit(0)


signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log"
LOG_PRUNE_DAYS = 30
MAX_WORKERS = 5
RATE_LIMIT_DELAY = 1
FEED_TIMEOUT = 30
MAX_RETRIES = 3

POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7

posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data)
used_images = set(
    entry["title"]
    for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
    if "title" in entry
)
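
# Dedupe state: posted_rss_titles.json and used_images.json act as rolling
# caches keyed by title/image URL. load_json_file and save_json_file live in
# foodie_utils; they are assumed to store {"title": ..., "timestamp": ...}
# entries and to drop anything older than the expiration window passed in.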


def setup_logging():
    """Prune log lines older than LOG_PRUNE_DAYS, then log to both file and console."""
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, 'r') as f:
            lines = f.readlines()
        cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
        pruned_lines = []
        for line in lines:
            try:
                timestamp = datetime.strptime(line[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
                if timestamp > cutoff:
                    pruned_lines.append(line)
            except ValueError:
                logging.warning(f"Skipping malformed log line: {line.strip()[:50]}...")
                continue
        with open(LOG_FILE, 'w') as f:
            f.writelines(pruned_lines)

    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logging.getLogger().addHandler(console_handler)
    logging.info("Logging initialized for foodie_automator_rss.py")


setup_logging()
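
# The session factory below mounts a retrying adapter on both http:// and
# https://: GET/POST requests are retried on connection errors and on
# 429/500/502/503/504 responses with exponential backoff. Note that
# Retry(allowed_methods=...) requires urllib3 >= 1.26; older releases used
# method_whitelist instead.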


def create_http_session() -> requests.Session:
    session = requests.Session()
    retry_strategy = Retry(
        total=MAX_RETRIES,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST"]
    )
    adapter = HTTPAdapter(
        max_retries=retry_strategy,
        pool_connections=10,
        pool_maxsize=10
    )
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.FeedParserDict]:
    try:
        response = session.get(feed_url, timeout=FEED_TIMEOUT)
        response.raise_for_status()
        feed = feedparser.parse(response.content)

        if feed.bozo:
            logging.warning(f"Feed parsing error for {feed_url}: {feed.bozo_exception}")
            return None

        return feed
    except Exception as e:
        logging.error(f"Error fetching feed {feed_url}: {str(e)}")
        return None
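
# Worked example for the scorer below (the actual keyword lists come from
# foodie_config, so the specific terms are assumptions): a title matching a
# RECIPE_KEYWORD (+3) and a PROMO_KEYWORD (+2) passes the >= 4 cutoff when the
# summary is at least 100 words; if the summary is shorter (-2) it also needs
# a PRODUCT_KEYWORD hit (+1) to reach 4. A single HOME_KEYWORD (+1) can never
# pass on its own.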


def is_interesting_rss(title: str, summary: str, pub_date: datetime) -> bool:
    """Cheap keyword-based pre-filter: score the entry and require score >= 4.
    Entries older than seven days are rejected outright."""
    try:
        if not title or not summary:
            return False

        if datetime.now(timezone.utc) - pub_date > timedelta(days=7):
            return False

        score = 0
        title_lower = title.lower()
        if any(keyword in title_lower for keyword in RECIPE_KEYWORDS):
            score += 3
        if any(keyword in title_lower for keyword in PROMO_KEYWORDS):
            score += 2
        if any(keyword in title_lower for keyword in HOME_KEYWORDS):
            score += 1

        summary_lower = summary.lower()
        if len(summary.split()) < 100:
            score -= 2
        if any(keyword in summary_lower for keyword in PRODUCT_KEYWORDS):
            score += 1

        return score >= 4
    except Exception as e:
        logging.error(f"Error in is_interesting_rss: {str(e)}")
        return False


def fetch_rss_feeds() -> List[Dict[str, Any]]:
    session = create_http_session()
    articles = []

    try:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = []
            for feed_url in RSS_FEEDS:
                future = executor.submit(process_feed, feed_url, session)
                futures.append(future)

            for future in as_completed(futures):
                try:
                    feed_articles = future.result()
                    articles.extend(feed_articles)
                except Exception as e:
                    logging.error(f"Error processing feed: {str(e)}")
                    continue

        return articles
    except Exception as e:
        logging.error(f"Error in fetch_rss_feeds: {str(e)}")
        return []


def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any]]:
    try:
        feed = fetch_feed(feed_url, session)
        if not feed:
            return []

        articles = []
        for entry in feed.entries:
            try:
                # Entries without a parsed publish date raise here and are
                # skipped by the except below.
                pub_date = datetime.fromtimestamp(time.mktime(entry.published_parsed), tz=timezone.utc)

                article = {
                    "title": entry.title,
                    "link": entry.link,
                    "summary": entry.summary if hasattr(entry, 'summary') else entry.description,
                    "content": getattr(entry, 'content', [{'value': ''}])[0].value,
                    "feed_title": get_clean_source_name(feed_url),
                    "pub_date": pub_date
                }

                if is_interesting_rss(article["title"], article["summary"], pub_date):
                    articles.append(article)

                time.sleep(RATE_LIMIT_DELAY)
            except Exception as e:
                logging.warning(f"Error processing entry: {str(e)}")
                continue

        return articles
    except Exception as e:
        logging.error(f"Error processing feed {feed_url}: {str(e)}")
        return []


def parse_date(date_str):
    try:
        parsed_date = email.utils.parsedate_to_datetime(date_str)
        if parsed_date.tzinfo is None:
            parsed_date = parsed_date.replace(tzinfo=timezone.utc)
        return parsed_date
    except Exception as e:
        logging.error(f"Failed to parse date '{date_str}': {e}")
        return datetime.now(timezone.utc)


def curate_from_rss():
    """Try fetched articles until one passes all filters, then summarize and
    post it to WordPress. Returns (post_data, category, sleep_seconds)."""
    articles = fetch_rss_feeds()
    if not articles:
        print("No RSS articles available")
        logging.info("No RSS articles available")
        # Return a fallback sleep interval so run_rss_automator() can sleep safely.
        return None, None, random.randint(600, 1800)

    attempts = 0
    max_attempts = 10
    while attempts < max_attempts and articles:
        article = articles.pop(0)
        title = article["title"]
        link = article["link"]
        summary = article["summary"]
        content = article["content"]
        source_name = article["feed_title"]
        original_source = f'<a href="{link}">{source_name}</a>'

        if title in posted_titles:
            print(f"Skipping already posted article: {title}")
            logging.info(f"Skipping already posted article: {title}")
            attempts += 1
            continue

        print(f"Trying RSS Article: {title} from {source_name}")
        logging.info(f"Trying RSS Article: {title} from {source_name}")

        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
        if skip:
            print(f"Skipping filtered RSS article: {title}")
            logging.info(f"Skipping filtered RSS article: {title}")
            attempts += 1
            continue

        scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
        interest_score = is_interesting(scoring_content)
        logging.info(f"Interest score for '{title}': {interest_score}")
        if interest_score < 6:
            print(f"RSS Interest Too Low: {interest_score}")
            logging.info(f"RSS Interest Too Low: {interest_score}")
            attempts += 1
            continue

        num_paragraphs = determine_paragraph_count(interest_score)
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs. "
            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details. "
            f"Do NOT introduce unrelated concepts. Expand on the core idea with relevant context about its appeal or significance. "
            "Do not include emojis in the summary."
        )
        content_to_summarize = scoring_content
        final_summary = summarize_with_gpt4o(
            content_to_summarize,
            source_name,
            link,
            interest_score=interest_score,
            extra_prompt=extra_prompt
        )
        if not final_summary:
            logging.info(f"Summary failed for '{title}'")
            attempts += 1
            continue

        final_summary = insert_link_naturally(final_summary, source_name, link)
        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
        if not post_data:
            attempts += 1
            continue

        hook = get_dynamic_hook(post_data["title"]).strip()
        cta = select_best_cta(post_data["title"], final_summary, post_url=None)

        post_data["content"] = f"{final_summary}\n\n{cta}"
        global is_posting
        is_posting = True
        try:
            post_id, post_url = post_to_wp(
                post_data=post_data,
                category=category,
                link=link,
                author=author,
                image_url=image_url,
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
                pixabay_url=pixabay_url,
                interest_score=interest_score
            )
        finally:
            is_posting = False

        if post_id:
            # Update the published post with a CTA that now knows its own URL.
            cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
            post_data["content"] = f"{final_summary}\n\n{cta}"
            is_posting = True
            try:
                post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    pixabay_url=pixabay_url,
                    interest_score=interest_score,
                    post_id=post_id
                )
            finally:
                is_posting = False

            timestamp = datetime.now(timezone.utc).isoformat()
            save_json_file(POSTED_TITLES_FILE, title, timestamp)
            posted_titles.add(title)
            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")

            if image_url:
                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                used_images.add(image_url)
                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")

            print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
            return post_data, category, random.randint(0, 1800)

        attempts += 1
        logging.info(f"WP posting failed for '{post_data['title']}'")

    print("No interesting RSS article found after attempts")
    logging.info("No interesting RSS article found after attempts")
    return None, None, random.randint(600, 1800)


def run_rss_automator():
    print(f"{datetime.now(timezone.utc)} - INFO - ***** RSS Automator Launched *****")
    logging.info("***** RSS Automator Launched *****")
    post_data, category, sleep_time = curate_from_rss()
    print(f"Sleeping for {sleep_time}s")
    logging.info(f"Completed run with sleep time: {sleep_time} seconds")
    time.sleep(sleep_time)
    return post_data, category, sleep_time


if __name__ == "__main__":
    run_rss_automator()
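
# A hypothetical wrapper for repeated runs (not defined here): since
# run_rss_automator() sleeps before returning, a bare loop in a caller module
# is enough, e.g.
#
#     from foodie_automator_rss import run_rss_automator
#
#     while True:
#         run_rss_automator()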