foodie-automator/foodie_automator_rss.py

# foodie_automator_rss.py
import requests
import random
import time
import logging
import os
import json
import signal
import sys
import re
import email.utils
import feedparser
from duckduckgo_search import DDGS
from datetime import datetime, timedelta, timezone
from bs4 import BeautifulSoup
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from foodie_config import (
    RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
    get_clean_source_name, X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    select_best_author, smart_image_and_filter, get_flickr_image,
    get_next_author_round_robin  # Add this line
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
from dotenv import load_dotenv
import fcntl

load_dotenv()

is_posting = False
LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_rss.lock"
LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_rss.log"
LOG_PRUNE_DAYS = 30
FEED_TIMEOUT = 15
MAX_RETRIES = 3
RETRY_BACKOFF = 2

POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7

def setup_logging():
    """Initialize logging with pruning of old logs."""
    try:
        logging.debug("Attempting to set up logging")
        os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
        logging.debug(f"Log directory created/verified: {os.path.dirname(LOG_FILE)}")
        if not os.access(os.path.dirname(LOG_FILE), os.W_OK):
            raise PermissionError(f"No write permission for {os.path.dirname(LOG_FILE)}")

        # Test write to log file
        try:
            with open(LOG_FILE, 'a') as f:
                f.write("")
            logging.debug(f"Confirmed write access to {LOG_FILE}")
        except Exception as e:
            raise PermissionError(f"Cannot write to {LOG_FILE}: {e}")

        if os.path.exists(LOG_FILE):
            with open(LOG_FILE, 'r') as f:
                lines = f.readlines()
            cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
            pruned_lines = []
            malformed_count = 0
            for line in lines:
                if len(line) < 19 or not line[:19].replace('-', '').replace(':', '').replace(' ', '').isdigit():
                    malformed_count += 1
                    continue
                try:
                    timestamp = datetime.strptime(line[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
                    if timestamp > cutoff:
                        pruned_lines.append(line)
                except ValueError:
                    malformed_count += 1
                    continue
            if malformed_count > 0:
                logging.info(f"Skipped {malformed_count} malformed log lines during pruning")
            with open(LOG_FILE, 'w') as f:
                f.writelines(pruned_lines)
            logging.debug(f"Log file pruned: {LOG_FILE}")

        logging.basicConfig(
            filename=LOG_FILE,
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
            force=True
        )
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logging.getLogger().addHandler(console_handler)
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("openai").setLevel(logging.WARNING)
        logging.info("Logging initialized for foodie_automator_rss.py")
    except Exception as e:
        print(f"Failed to setup logging: {e}")
        sys.exit(1)

# Call setup_logging immediately
setup_logging()

posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data)
used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)

def acquire_lock():
    try:
        logging.debug("Attempting to acquire lock")
        os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
        lock_fd = open(LOCK_FILE, 'w')
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        lock_fd.write(str(os.getpid()))
        lock_fd.flush()
        logging.debug(f"Lock acquired: {LOCK_FILE}")
        return lock_fd
    except IOError:
        logging.info("Another instance of foodie_automator_rss.py is running")
        sys.exit(0)

def signal_handler(sig, frame):
    logging.info("Received termination signal, checking if safe to exit...")
    if is_posting:
        logging.info("Currently posting, will exit after completion.")
    else:
        logging.info("Safe to exit immediately.")
        sys.exit(0)

signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

def create_http_session() -> requests.Session:
    session = requests.Session()
    retry_strategy = Retry(
        total=MAX_RETRIES,
        backoff_factor=RETRY_BACKOFF,
        status_forcelist=[403, 429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    })
    return session

def parse_date(date_str):
    try:
        parsed_date = email.utils.parsedate_to_datetime(date_str)
        if parsed_date.tzinfo is None:
            parsed_date = parsed_date.replace(tzinfo=timezone.utc)
        return parsed_date
    except Exception as e:
        logging.error(f"Failed to parse date '{date_str}': {e}")
        return datetime.now(timezone.utc)

def fetch_rss_feeds():
    logging.info("Starting fetch_rss_feeds")
    articles = []
    cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)
    session = create_http_session()

    if not RSS_FEEDS:
        logging.error("RSS_FEEDS is empty in foodie_config.py")
        return articles

    logging.info(f"Processing feeds: {RSS_FEEDS}")
    for feed_url in RSS_FEEDS:
        for attempt in range(MAX_RETRIES):
            logging.info(f"Processing feed: {feed_url} (attempt {attempt + 1})")
            try:
                response = session.get(feed_url, timeout=FEED_TIMEOUT)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'xml')
                items = soup.find_all('item')

                feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
                for item in items:
                    try:
                        title = item.find('title').text.strip() if item.find('title') else "Untitled"
                        link = item.find('link').text.strip() if item.find('link') else ""
                        pub_date = item.find('pubDate')
                        pub_date = parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc)

                        if pub_date < cutoff_date:
                            logging.info(f"Skipping old article: {title} (Published: {pub_date})")
                            continue

                        description = item.find('description')
                        summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description else ""
                        content = item.find('content:encoded')
                        content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content else summary

                        articles.append({
                            "title": title,
                            "link": link,
                            "summary": summary,
                            "content": content_text,
                            "feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title,
                            "pub_date": pub_date
                        })
                        logging.debug(f"Processed article: {title}")
                    except Exception as e:
                        logging.warning(f"Error processing entry in {feed_url}: {e}")
                        continue
                logging.info(f"Filtered to {len(articles)} articles from {feed_url}")
                break
            except Exception as e:
                logging.error(f"Failed to fetch RSS feed {feed_url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    time.sleep(RETRY_BACKOFF * (2 ** attempt))
                continue
    articles.sort(key=lambda x: x["pub_date"], reverse=True)
    logging.info(f"Total RSS articles fetched: {len(articles)}")
    return articles

def fetch_duckduckgo_news_context(title, hours=24):
    for attempt in range(MAX_RETRIES):
        try:
            with DDGS() as ddgs:
                results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
                titles = []
                for r in results:
                    try:
                        date_str = r["date"]
                        if '+00:00' in date_str:
                            dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
                        else:
                            dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
                        if dt > (datetime.now(timezone.utc) - timedelta(hours=24)):
                            titles.append(r["title"].lower())
                    except ValueError as e:
                        logging.warning(f"Date parsing failed for '{date_str}': {e}")
                        continue
                context = " ".join(titles) if titles else "No recent news found within 24 hours"
                logging.info(f"DuckDuckGo News context for '{title}': {context}")
                return context
        except Exception as e:
            logging.warning(f"DuckDuckGo News context fetch failed for '{title}' (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
            continue
    logging.error(f"Failed to fetch DuckDuckGo News context for '{title}' after {MAX_RETRIES} attempts")
    return title

def curate_from_rss():
    try:
        global posted_titles_data, posted_titles, used_images
        posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
        posted_titles = set(entry["title"] for entry in posted_titles_data)
        used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)
        logging.debug(f"Loaded {len(posted_titles)} posted titles and {len(used_images)} used images")

        articles = fetch_rss_feeds()
        if not articles:
            print("No RSS articles available")
            logging.info("No RSS articles available")
            return None, None, random.randint(600, 1800)

        attempts = 0
        max_attempts = 10
        while attempts < max_attempts and articles:
            article = articles.pop(0)
            title = article["title"]
            link = article["link"]
            summary = article.get("summary", "")
            source_name = article.get("feed_title", "Unknown Source")
            original_source = f'<a href="{link}">{source_name}</a>'

            if title in posted_titles:
                print(f"Skipping already posted article: {title}")
                logging.info(f"Skipping already posted article: {title}")
                attempts += 1
                continue

            print(f"Trying RSS Article: {title} from {source_name}")
            logging.info(f"Trying RSS Article: {title} from {source_name}")

            try:
                image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
            except Exception as e:
                print(f"Smart image/filter error for '{title}': {e}")
                logging.warning(f"Failed to process smart_image_and_filter for '{title}': {e}")
                attempts += 1
                continue

            if skip:
                print(f"Skipping filtered RSS article: {title}")
                logging.info(f"Skipping filtered RSS article: {title}")
                attempts += 1
                continue

            ddg_context = fetch_duckduckgo_news_context(title)
            scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
            interest_score = is_interesting(scoring_content)
            print(f"Interest Score for '{title[:50]}...': {interest_score}")
            logging.info(f"Interest score for '{title}': {interest_score}")
            if interest_score < 6:
                print(f"RSS Interest Too Low: {interest_score}")
                logging.info(f"RSS Interest Too Low: {interest_score}")
                attempts += 1
                continue

            num_paragraphs = determine_paragraph_count(interest_score)
            extra_prompt = (
                f"Generate exactly {num_paragraphs} paragraphs.\n"
                f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
                f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
                f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
                f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
                f"Do not include emojis in the summary."
            )
            content_to_summarize = scoring_content
            final_summary = summarize_with_gpt4o(
                content_to_summarize,
                source_name,
                link,
                interest_score=interest_score,
                extra_prompt=extra_prompt
            )
            if not final_summary:
                print(f"Summary failed for '{title}'")
                logging.info(f"Summary failed for '{title}'")
                attempts += 1
                continue

            final_summary = insert_link_naturally(final_summary, source_name, link)

            # Use round-robin author selection
            author = get_next_author_round_robin()
            author_username = author["username"]
            logging.info(f"Selected author via round-robin: {author_username}")

            post_data = {
                "title": generate_title_from_summary(final_summary),
                "content": final_summary,
                "status": "publish",
                "author": author_username,
                "categories": [generate_category_from_summary(final_summary)]
            }
            category = post_data["categories"][0]
            image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
            if not image_url:
                print(f"Flickr image fetch failed for '{image_query}', trying fallback")
                logging.warning(f"Flickr image fetch failed for '{image_query}', trying fallback")
                image_url, image_source, uploader, page_url = get_image(image_query)
                if not image_url:
                    print(f"All image uploads failed for '{title}' - posting without image")
                    logging.warning(f"All image uploads failed for '{title}' - posting without image")
                    image_source = None
                    uploader = None
                    page_url = None

            hook = get_dynamic_hook(post_data["title"]).strip()

            share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
            share_links_template = (
                f'<p>{share_prompt} '
                f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
                f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
            )
            post_data["content"] = f"{final_summary}\n\n{share_links_template}"

            global is_posting
            is_posting = True
            try:
                post_id, post_url = post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    page_url=page_url,
                    interest_score=interest_score,
                    should_post_tweet=True
                )
                if not post_id:
                    print(f"Failed to post to WordPress for '{title}'")
                    logging.warning(f"Failed to post to WordPress for '{title}'")
                    attempts += 1
                    continue
            except Exception as e:
                print(f"WordPress posting error for '{title}': {e}")
                logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
                attempts += 1
                continue
            finally:
                is_posting = False

            if post_id:
                share_text = f"Check out this foodie gem! {post_data['title']}"
                share_text_encoded = quote(share_text)
                post_url_encoded = quote(post_url)
                share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
                post_data["content"] = f"{final_summary}\n\n{share_links}"
                is_posting = True
                try:
                    post_to_wp(
                        post_data=post_data,
                        category=category,
                        link=link,
                        author=author,
                        image_url=image_url,
                        original_source=original_source,
                        image_source=image_source,
                        uploader=uploader,
                        page_url=page_url,
                        interest_score=interest_score,
                        post_id=post_id,
                        should_post_tweet=False
                    )
                except Exception as e:
                    print(f"Failed to update WordPress post '{title}' with share links: {e}")
                    logging.error(f"Failed to update WordPress post '{title}' with share links: {e}", exc_info=True)
                finally:
                    is_posting = False

                timestamp = datetime.now(timezone.utc).isoformat()
                save_json_file(POSTED_TITLES_FILE, title, timestamp)
                posted_titles.add(title)
                print(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
                logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")

                if image_url:
                    save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                    used_images.add(image_url)
                    print(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
                    logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")

                print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
                logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
                return post_data, category, random.randint(0, 1800)

            attempts += 1
            print(f"WP posting failed for '{post_data['title']}'")
            logging.info(f"WP posting failed for '{post_data['title']}'")

        print("No interesting RSS article found after attempts")
        logging.info("No interesting RSS article found after attempts")
        return None, None, random.randint(600, 1800)
    except Exception as e:
        logging.error(f"Unexpected error in curate_from_rss: {e}", exc_info=True)
        print(f"Unexpected error in curate_from_rss: {e}")
        return None, None, random.randint(600, 1800)

def run_rss_automator():
    lock_fd = None
    try:
        lock_fd = acquire_lock()
        print(f"{datetime.now(timezone.utc)} - INFO - ***** RSS Automator Launched *****")
        logging.info("***** RSS Automator Launched *****")
        post_data, category, sleep_time = curate_from_rss()
        if not post_data:
            print("No postable RSS article found")
            logging.info("No postable RSS article found")
        print(f"Sleeping for {sleep_time}s")
        logging.info(f"Completed run with sleep time: {sleep_time} seconds")
        time.sleep(sleep_time)
        return post_data, category, sleep_time
    except Exception as e:
        print(f"Fatal error in run_rss_automator: {e}")
        logging.error(f"Fatal error in run_rss_automator: {e}", exc_info=True)
        return None, None, random.randint(600, 1800)
    finally:
        if lock_fd:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
            lock_fd.close()
            os.remove(LOCK_FILE) if os.path.exists(LOCK_FILE) else None

if __name__ == "__main__":
    run_rss_automator()