# foodie-automator/foodie_automator_rss.py

# foodie_automator_rss.py
import requests
import random
import time
import logging
import os
import json
import signal
import sys
import re
import email.utils
from datetime import datetime, timedelta, timezone
from bs4 import BeautifulSoup
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from foodie_config import (
    RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
    get_clean_source_name, X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    select_best_author, smart_image_and_filter, get_flickr_image
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
from dotenv import load_dotenv

load_dotenv()

# Version tag for this script; logged at import time so a run can be matched
# against the deployed revision.
# NOTE(review): this logging.info() call executes before setup_logging() is
# invoked further down in this module. If the root logger is still
# unconfigured at this point, the message goes through logging's implicit
# default configuration (WARNING level) and will likely never reach the log
# file - consider moving it after setup_logging().
SCRIPT_VERSION = "1.2.0"
logging.info(f"Starting foodie_automator_rss.py version {SCRIPT_VERSION}")

# Set to True by the posting code while a WordPress call is in flight, so the
# signal handler can avoid killing the process mid-request.
is_posting = False


def signal_handler(sig, frame):
    """Handle SIGTERM/SIGINT: exit right away unless a post is in progress.

    Args:
        sig: Signal number delivered by the interpreter (unused).
        frame: Interrupted stack frame (unused).

    Raises:
        SystemExit: With exit code 0 when ``is_posting`` is False.

    When ``is_posting`` is True the handler only logs and returns, letting the
    interrupted call continue. NOTE(review): nothing in the code visible here
    re-checks for the signal once posting finishes, so the "exit after
    completion" promise in the log message is not acted upon.
    """
    logging.info("Received termination signal, checking if safe to exit...")
    if not is_posting:
        logging.info("Safe to exit immediately.")
        sys.exit(0)
    logging.info("Currently posting, will exit after completion.")


# Route both termination signals through the same handler.
for _signum in (signal.SIGTERM, signal.SIGINT):
    signal.signal(_signum, signal_handler)

# --- Logging / HTTP tuning ---------------------------------------------------
LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log"
LOG_PRUNE_DAYS = 30   # setup_logging() drops log lines older than this many days
FEED_TIMEOUT = 15     # timeout passed to each feed GET request
MAX_RETRIES = 3       # total retries configured on the HTTP session

# --- Persistent de-duplication state -------------------------------------------
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
# Also used by fetch_rss_feeds() as the maximum article age, in hours.
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7

# Both files are read once at import time. The second argument to
# load_json_file is presumably an expiry window for stored entries - confirm
# against foodie_utils.
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = {record["title"] for record in posted_titles_data}
# Image records are keyed under "title" as well; entries lacking the key are skipped.
used_images = {
    record["title"]
    for record in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
    if "title" in record
}

def setup_logging():
    """Prune old entries from LOG_FILE, then log to that file and the console.

    Pruning keeps only lines that start with a ``YYYY-MM-DD HH:MM:SS``
    timestamp newer than ``LOG_PRUNE_DAYS`` days; lines without a parseable
    timestamp are dropped and counted as malformed.

    The root logger's existing handlers are removed before configuring.
    ``logging.basicConfig`` does nothing when the root logger already has
    handlers, and any earlier module-level ``logging.info`` call installs a
    default stderr handler implicitly - which would otherwise leave the file
    handler and the INFO level unconfigured.
    """
    malformed_count = 0

    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, 'r') as f:
            lines = f.readlines()
        cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
        pruned_lines = []
        for line in lines:
            stamp = line[:19]
            # Cheap pre-filter: a timestamp prefix is only digits, '-', ':' and ' '.
            if len(line) < 19 or not stamp.replace('-', '').replace(':', '').replace(' ', '').isdigit():
                malformed_count += 1
                continue
            try:
                # Log timestamps carry no zone; they are compared as if UTC.
                timestamp = datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
            except ValueError:
                malformed_count += 1
                continue
            if timestamp > cutoff:
                pruned_lines.append(line)
        with open(LOG_FILE, 'w') as f:
            f.writelines(pruned_lines)

    # Start from a clean root logger so basicConfig below actually applies and
    # repeated calls do not stack duplicate handlers.
    root_logger = logging.getLogger()
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
        stale_handler.close()

    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    root_logger.addHandler(console_handler)
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.info("Logging initialized for foodie_automator_rss.py")

    # Reported only now: logging before basicConfig would trigger the implicit
    # default configuration and the message would be lost.
    if malformed_count > 0:
        logging.info(f"Skipped {malformed_count} malformed log lines during pruning")

setup_logging()

def create_http_session() -> requests.Session:
    """Build a requests session with retrying adapters and a browser User-Agent.

    Returns:
        A ``requests.Session`` whose ``http://`` and ``https://`` adapters
        retry up to ``MAX_RETRIES`` times (backoff factor 2) on GET/POST
        requests answered with 403, 429, 500, 502, 503 or 504.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=MAX_RETRIES,
            backoff_factor=2,
            status_forcelist=[403, 429, 500, 502, 503, 504],
            allowed_methods=["GET", "POST"],
        ),
        pool_connections=10,
        pool_maxsize=10,
    )

    http = requests.Session()
    # One adapter instance serves both schemes.
    for scheme in ("http://", "https://"):
        http.mount(scheme, retrying_adapter)
    # Present a desktop-browser User-Agent on every request.
    http.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    return http

def parse_date(date_str):
    """Parse a feed date string into a timezone-aware ``datetime``.

    RFC 2822 dates (the usual RSS ``pubDate`` format) are tried first; if that
    fails, an ISO 8601 string (optionally ending in ``Z``) is accepted as a
    fallback. Values parsed without a zone are assumed to be UTC.

    Args:
        date_str: Raw date text taken from the feed.

    Returns:
        An aware ``datetime``. If neither format matches, the failure is
        logged and the current UTC time is returned, so the caller treats the
        entry as freshly published.
    """
    parse_errors = (TypeError, ValueError, AttributeError, IndexError, OverflowError)
    try:
        parsed_date = email.utils.parsedate_to_datetime(date_str)
    except parse_errors as e:
        try:
            iso_text = date_str.strip()
            # Older fromisoformat() implementations reject the 'Z' suffix.
            if iso_text.endswith(("Z", "z")):
                iso_text = iso_text[:-1] + "+00:00"
            parsed_date = datetime.fromisoformat(iso_text)
        except parse_errors:
            logging.error(f"Failed to parse date '{date_str}': {e}")
            return datetime.now(timezone.utc)

    if parsed_date.tzinfo is None:
        parsed_date = parsed_date.replace(tzinfo=timezone.utc)
    return parsed_date

def _parse_rss_item(item, feed_name, cutoff_date):
    """Turn one ``<item>`` element into an article dict, or None if it is older than *cutoff_date*."""
    title_tag = item.find('title')
    title = title_tag.text.strip() if title_tag is not None else "Untitled"
    link_tag = item.find('link')
    link = link_tag.text.strip() if link_tag is not None else ""

    # Items without a pubDate are treated as published right now.
    pub_tag = item.find('pubDate')
    pub_date = parse_date(pub_tag.text) if pub_tag is not None else datetime.now(timezone.utc)
    if pub_date < cutoff_date:
        logging.info(f"Skipping old article: {title} (Published: {pub_date})")
        return None

    # Both fields may contain embedded HTML; reduce them to plain text.
    description = item.find('description')
    summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description is not None else ""
    content = item.find('content:encoded')
    content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content is not None else summary

    return {
        "title": title,
        "link": link,
        "summary": summary,
        "content": content_text,
        "feed_title": feed_name,
        "pub_date": pub_date
    }


def fetch_rss_feeds():
    """Download every feed in ``RSS_FEEDS`` and collect the recent articles.

    Articles published more than ``EXPIRATION_HOURS`` hours ago are skipped.
    A feed that cannot be fetched or parsed is logged and skipped; a single
    bad item is logged and skipped without abandoning the rest of its feed.

    Returns:
        A list of dicts with keys ``title``, ``link``, ``summary``,
        ``content``, ``feed_title`` and ``pub_date``, sorted newest first.
        Empty when ``RSS_FEEDS`` is empty or nothing qualified.
    """
    logging.info("Starting fetch_rss_feeds")
    articles = []

    if not RSS_FEEDS:
        logging.error("RSS_FEEDS is empty in foodie_config.py")
        return articles

    cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)

    # The context manager closes the session's pooled connections on exit.
    with create_http_session() as session:
        for feed_url in RSS_FEEDS:
            logging.info(f"Processing feed: {feed_url}")
            try:
                response = session.get(feed_url, timeout=FEED_TIMEOUT)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'xml')
                items = soup.find_all('item')

                # Configured names may be stored as a tuple whose first element
                # is the display name; fall back to a name derived from the URL.
                feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
                feed_name = feed_title[0] if isinstance(feed_title, tuple) else feed_title

                count_before = len(articles)
                for item in items:
                    try:
                        article = _parse_rss_item(item, feed_name, cutoff_date)
                    except Exception as e:
                        # Best effort: one malformed entry must not drop the feed.
                        logging.warning(f"Error processing entry in {feed_url}: {e}")
                        continue
                    if article is None:
                        continue
                    articles.append(article)
                    logging.debug(f"Processed article: {article['title']}")
                # Report this feed's own contribution, not the running total.
                logging.info(f"Filtered to {len(articles) - count_before} articles from {feed_url}")
            except Exception as e:
                logging.error(f"Failed to fetch RSS feed {feed_url}: {e}")
                continue

    articles.sort(key=lambda x: x["pub_date"], reverse=True)
    logging.info(f"Total RSS articles fetched: {len(articles)}")
    return articles

def _strip_title_from_summary(summary, title):
    """Remove *title* (bold-wrapped or plain, case-insensitive) from each non-blank line of *summary*."""
    title_pattern = re.compile(
        r'\*\*' + re.escape(title) + r'\*\*|' + re.escape(title),
        re.IGNORECASE
    )
    cleaned_paragraphs = []
    for para in summary.split('\n'):
        if para.strip():
            cleaned_para = title_pattern.sub('', para).strip()
            cleaned_paragraphs.append(re.sub(r'\s+', ' ', cleaned_para))
    return '\n'.join(cleaned_paragraphs)


def _build_share_links(share_prompt, post_url, share_text):
    """Return the share-prompt paragraph with X and Facebook links for the given URL and text."""
    return (
        f'<p>{share_prompt} '
        f'<a href="https://x.com/intent/tweet?url={post_url}&text={share_text}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
        f'<a href="https://www.facebook.com/sharer/sharer.php?u={post_url}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
    )


def curate_from_rss():
    """Pick the newest suitable RSS article, summarise it and publish it.

    Articles are tried newest first, for at most 10 attempts. An article is
    skipped when its title was already posted, when ``smart_image_and_filter``
    flags it, when its interest score is below 6, or when summarising or
    post preparation fails. The first article that publishes successfully is
    recorded in the posted-titles and used-images files.

    Returns:
        ``(post_data, category, sleep_seconds)``. On success ``sleep_seconds``
        is a random value in 0-1800; when nothing was posted the first two
        items are None and ``sleep_seconds`` is a random value in 600-1800.
    """
    # Declared once up front; the flag is toggled around every post_to_wp call.
    global is_posting

    articles = fetch_rss_feeds()
    if not articles:
        print("No RSS articles available")
        logging.info("No RSS articles available")
        return None, None, random.randint(600, 1800)

    attempts = 0
    max_attempts = 10
    while attempts < max_attempts and articles:
        article = articles.pop(0)
        title = article["title"]
        link = article["link"]
        summary = article["summary"]
        content = article["content"]
        source_name = article["feed_title"]
        original_source = f'<a href="{link}">{source_name}</a>'

        if title in posted_titles:
            print(f"Skipping already posted article: {title}")
            logging.info(f"Skipping already posted article: {title}")
            attempts += 1
            continue

        print(f"Trying RSS Article: {title} from {source_name}")
        logging.info(f"Trying RSS Article: {title} from {source_name}")

        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
        if skip:
            print(f"Skipping filtered RSS article: {title}")
            logging.info(f"Skipping filtered RSS article: {title}")
            attempts += 1
            continue

        scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
        interest_score = is_interesting(scoring_content)
        logging.info(f"Interest score for '{title}': {interest_score}")
        if interest_score < 6:
            print(f"RSS Interest Too Low: {interest_score}")
            logging.info(f"RSS Interest Too Low: {interest_score}")
            attempts += 1
            continue

        num_paragraphs = determine_paragraph_count(interest_score)
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs.\n"
            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
            f"Do NOT introduce unrelated concepts.\n"
            f"Expand on the core idea with relevant context about its appeal or significance.\n"
            f"Do not include emojis in the summary."
        )
        final_summary = summarize_with_gpt4o(
            scoring_content,
            source_name,
            link,
            interest_score=interest_score,
            extra_prompt=extra_prompt
        )
        if not final_summary:
            logging.info(f"Summary failed for '{title}'")
            attempts += 1
            continue

        # Remove the original title from the summary while preserving paragraphs
        final_summary = _strip_title_from_summary(final_summary, title)

        final_summary = insert_link_naturally(final_summary, source_name, link)
        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
        if not post_data:
            attempts += 1
            continue

        # Fetch image: Flickr first, then the generic image lookup. This
        # replaces the image details returned by prepare_post_data above.
        # NOTE(review): pixabay_url still comes from prepare_post_data while
        # page_url belongs to the image fetched here - confirm which of the two
        # post_to_wp should receive.
        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
        if not image_url:
            image_url, image_source, uploader, page_url = get_image(image_query)

        # Log the fetched image details
        logging.info(f"Fetched image for '{post_data['title']}': URL={image_url}, Source={image_source}, Uploader={uploader}, Page URL={page_url}")

        hook = get_dynamic_hook(post_data["title"]).strip()

        # Generate viral share prompt. The first publish carries literal
        # {post_url}/{share_text} placeholders because the post URL is not
        # known until WordPress has created the post.
        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
        share_links_template = _build_share_links(share_prompt, "{post_url}", "{share_text}")
        post_data["content"] = f"{final_summary}\n\n{share_links_template}"

        is_posting = True
        try:
            post_id, post_url = post_to_wp(
                post_data=post_data,
                category=category,
                link=link,
                author=author,
                image_url=image_url,
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
                pixabay_url=pixabay_url,
                interest_score=interest_score,
                should_post_tweet=True
            )
        finally:
            is_posting = False

        if post_id:
            share_text = f"Check out this foodie gem! {post_data['title']}"
            share_text_encoded = quote(share_text)
            post_url_encoded = quote(post_url)
            # Built directly rather than via str.format on the template: the
            # generated share prompt may contain '{' or '}', which would make
            # format() raise after the post has already been published.
            share_links = _build_share_links(share_prompt, post_url_encoded, share_text_encoded)
            post_data["content"] = f"{final_summary}\n\n{share_links}"
            is_posting = True
            try:
                # Second call updates the existing post with the real links.
                post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    pixabay_url=pixabay_url,
                    interest_score=interest_score,
                    post_id=post_id,
                    should_post_tweet=False
                )
            finally:
                is_posting = False

            timestamp = datetime.now(timezone.utc).isoformat()
            save_json_file(POSTED_TITLES_FILE, title, timestamp)
            posted_titles.add(title)
            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")

            if image_url:
                # Check if image is already used. Entries without a "title" key
                # are ignored, matching how the module-level set is built.
                used_images_list = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
                used_image_urls = {entry["title"] for entry in used_images_list if "title" in entry}
                if image_url in used_image_urls:
                    # NOTE(review): the post was already published with the
                    # original image; this re-fetch only affects what gets
                    # recorded below - confirm that is intended.
                    logging.warning(f"Image '{image_url}' already used, attempting to fetch a new image")
                    image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
                    if not image_url:
                        image_url, image_source, uploader, page_url = get_image(image_query)
                    logging.info(f"New image fetched: URL={image_url}, Source={image_source}, Uploader={uploader}, Page URL={page_url}")

                # The re-fetch may come back empty; never record a missing image.
                if image_url:
                    save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                    used_images.add(image_url)
                    logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")

            print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
            return post_data, category, random.randint(0, 1800)

        attempts += 1
        logging.info(f"WP posting failed for '{post_data['title']}'")

    print("No interesting RSS article found after attempts")
    logging.info("No interesting RSS article found after attempts")
    return None, None, random.randint(600, 1800)

def run_rss_automator():
    """Run one curation pass, then sleep for the delay it asks for.

    Returns:
        The ``(post_data, category, sleep_time)`` tuple produced by
        ``curate_from_rss``, returned after the sleep has completed.
    """
    launch_banner = "***** RSS Automator Launched *****"
    print(f"{datetime.now(timezone.utc)} - INFO - {launch_banner}")
    logging.info(launch_banner)

    curated_post, curated_category, pause_seconds = curate_from_rss()

    print(f"Sleeping for {pause_seconds}s")
    logging.info(f"Completed run with sleep time: {pause_seconds} seconds")
    # Block here so the caller regains control only after the pause.
    time.sleep(pause_seconds)
    return curated_post, curated_category, pause_seconds


if __name__ == "__main__":
    run_rss_automator()