Initial commit of foodie automator scripts

2025-04-20 20:12:00 +10:00
commit d4f098639e
7 changed files with 2120 additions and 0 deletions
@@ -0,0 +1,330 @@
+import requests
+import random
+import time
+import logging
+import os
+import json
+import email.utils
+from datetime import datetime, timedelta, timezone
+from bs4 import BeautifulSoup
+from openai import OpenAI
+from urllib.parse import quote
+from requests.packages.urllib3.util.retry import Retry
+from requests.adapters import HTTPAdapter
+from foodie_config import RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS, SUMMARY_PERSONA_PROMPTS, CATEGORIES, get_clean_source_name
+from foodie_utils import (
+    load_json_file, save_json_file, get_image, generate_image_query,
+    upload_image_to_wp, determine_paragraph_count, insert_link_naturally, is_interesting,
+    generate_title_from_summary, summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
+    prepare_post_data, select_best_author, smart_image_and_filter
+)
+from foodie_hooks import get_dynamic_hook, select_best_cta
+import feedparser
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import List, Dict, Any, Optional
+
+LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log"  # File pruned and written to by setup_logging()
+LOG_PRUNE_DAYS = 30  # Log lines older than this many days are removed by setup_logging()
+MAX_WORKERS = 5  # Number of concurrent workers for parallel processing
+RATE_LIMIT_DELAY = 1  # Delay between API calls in seconds
+FEED_TIMEOUT = 30  # Timeout for feed requests in seconds
+MAX_RETRIES = 3  # Maximum number of retries for failed requests
+
+def setup_logging():
+    """Prune expired entries from the log file, then configure root logging.
+
+    Lines of LOG_FILE whose first 19 characters parse as a
+    ``%Y-%m-%d %H:%M:%S`` timestamp newer than LOG_PRUNE_DAYS days are kept.
+    Older lines, and lines that do not start with such a timestamp, are
+    dropped. The root logger is then configured to append to LOG_FILE at
+    INFO level, and a single warning summarises how many malformed lines
+    were discarded.
+    """
+    malformed_count = 0
+    if os.path.exists(LOG_FILE):
+        with open(LOG_FILE, 'r') as f:
+            lines = f.readlines()
+        # The formatter configured below writes local-time timestamps with no
+        # UTC offset, so the cutoff is computed in naive local time as well.
+        cutoff = datetime.now() - timedelta(days=LOG_PRUNE_DAYS)
+        pruned_lines = []
+        for line in lines:
+            try:
+                timestamp = datetime.strptime(line[:19], '%Y-%m-%d %H:%M:%S')
+            except ValueError:
+                # Do NOT log here: calling logging.warning() before
+                # basicConfig() installs a default stderr handler, which
+                # turns the basicConfig(filename=...) call below into a no-op.
+                malformed_count += 1
+                continue
+            if timestamp > cutoff:
+                pruned_lines.append(line)
+        with open(LOG_FILE, 'w') as f:
+            f.writelines(pruned_lines)
+
+    logging.basicConfig(
+        filename=LOG_FILE,
+        level=logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S"
+    )
+
+    # Report the pruning result only once the file handler is in place.
+    if malformed_count:
+        logging.warning(f"Skipped {malformed_count} malformed log line(s) while pruning {LOG_FILE}")
+
+def create_http_session() -> requests.Session:
+    """Build a requests session that retries transient failures.
+
+    One adapter is mounted for both the ``http://`` and ``https://``
+    prefixes. It retries GET and POST requests up to MAX_RETRIES times, with
+    a backoff factor of 1, on status codes 429, 500, 502, 503 and 504, and
+    uses a connection pool of size 10.
+    """
+    adapter = HTTPAdapter(
+        max_retries=Retry(
+            total=MAX_RETRIES,
+            backoff_factor=1,
+            status_forcelist=[429, 500, 502, 503, 504],
+            allowed_methods=["GET", "POST"],
+        ),
+        pool_connections=10,
+        pool_maxsize=10,
+    )
+    http = requests.Session()
+    for scheme in ("http://", "https://"):
+        http.mount(scheme, adapter)
+    return http
+
+def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.FeedParserDict]:
+    """Download and parse one RSS feed.
+
+    Args:
+        feed_url: Address of the feed to download.
+        session: HTTP session used for the GET request (FEED_TIMEOUT applies).
+
+    Returns:
+        The parsed feed, or None when the request fails, the server answers
+        with an error status, or the document is malformed and yields no
+        entries. All failures are logged; none are raised to the caller.
+    """
+    try:
+        response = session.get(feed_url, timeout=FEED_TIMEOUT)
+        response.raise_for_status()
+        feed = feedparser.parse(response.content)
+
+        if feed.bozo:
+            logging.warning(f"Feed parsing error for {feed_url}: {feed.bozo_exception}")
+            # feedparser sets bozo for any well-formedness problem but still
+            # returns whatever it managed to parse; only give up on the feed
+            # when nothing usable came out of it.
+            if not feed.entries:
+                return None
+
+        return feed
+    except Exception as e:
+        logging.error(f"Error fetching feed {feed_url}: {str(e)}")
+        return None
+
+def is_interesting_rss(title: str, summary: str, pub_date: datetime) -> bool:
+    """Decide whether a feed entry scores high enough to be kept.
+
+    Entries with an empty title or summary, or published more than seven
+    days ago, are rejected outright. Otherwise points are awarded for
+    keywords found as substrings of the lower-cased text: recipe +3, promo
+    +2 and home +1 in the title, product +1 in the summary. Summaries
+    shorter than 100 words cost 2 points. A total of 4 or more passes.
+    Any exception is logged and treated as "not interesting".
+    """
+    try:
+        # Reject entries missing either text field.
+        if not (title and summary):
+            return False
+
+        # Reject entries older than one week.
+        age = datetime.now(timezone.utc) - pub_date
+        if age > timedelta(days=7):
+            return False
+
+        # Title keyword groups and the points each one contributes.
+        headline = title.lower()
+        title_weights = (
+            (RECIPE_KEYWORDS, 3),
+            (PROMO_KEYWORDS, 2),
+            (HOME_KEYWORDS, 1),
+        )
+        points = 0
+        for keywords, weight in title_weights:
+            if any(word in headline for word in keywords):
+                points += weight
+
+        # Summary: penalise short text, reward product mentions.
+        blurb = summary.lower()
+        if len(summary.split()) < 100:
+            points -= 2
+        if any(word in blurb for word in PRODUCT_KEYWORDS):
+            points += 1
+
+        return points >= 4
+    except Exception as e:
+        logging.error(f"Error in is_interesting_rss: {str(e)}")
+        return False
+
+def fetch_rss_feeds() -> List[Dict[str, Any]]:
+    """Fetch every feed in RSS_FEEDS concurrently and collect their articles.
+
+    Each feed is handled by process_feed on a pool of MAX_WORKERS threads
+    that share one HTTP session. A feed that raises is logged and skipped.
+
+    Returns:
+        The article dicts from all feeds, newest first by "pub_date", or an
+        empty list if the fetch as a whole fails.
+    """
+    articles = []
+
+    try:
+        # The context manager closes the session's pooled connections once
+        # all workers have finished (the executor block exits first).
+        with create_http_session() as session:
+            with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+                futures = [
+                    executor.submit(process_feed, feed_url, session)
+                    for feed_url in RSS_FEEDS
+                ]
+
+                for future in as_completed(futures):
+                    try:
+                        articles.extend(future.result())
+                    except Exception as e:
+                        logging.error(f"Error processing feed: {str(e)}")
+                        continue
+
+        # as_completed yields feeds in whatever order they finish; sort so
+        # that consumers taking the first item really get the newest article.
+        articles.sort(key=lambda article: article["pub_date"], reverse=True)
+        return articles
+    except Exception as e:
+        logging.error(f"Error in fetch_rss_feeds: {str(e)}")
+        return []
+
+def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any]]:
+    """Fetch one feed and return the entries that pass is_interesting_rss.
+
+    Args:
+        feed_url: Address of the feed to process.
+        session: HTTP session handed to fetch_feed.
+
+    Returns:
+        A list of dicts with the keys "title", "link", "summary", "content",
+        "feed_title" and "pub_date" (timezone-aware, UTC). Entries that
+        cannot be processed are logged and skipped; a feed-level failure is
+        logged and yields an empty list.
+    """
+    try:
+        feed = fetch_feed(feed_url, session)
+        # Pace network requests once per fetch. The entry loop below makes no
+        # network calls, so sleeping per entry only slowed the run down.
+        time.sleep(RATE_LIMIT_DELAY)
+        if not feed:
+            return []
+
+        # Same for every entry of this feed, so resolve it once.
+        feed_title = get_clean_source_name(feed.feed.title)
+
+        articles = []
+        for entry in feed.entries:
+            try:
+                parsed_time = entry.get("published_parsed") or entry.get("updated_parsed")
+                if not parsed_time:
+                    logging.warning("Error processing entry: no publication date")
+                    continue
+                # feedparser normalises parsed dates to UTC, so build the
+                # datetime from the fields directly; time.mktime() would
+                # misread them as local time.
+                pub_date = datetime(*parsed_time[:6], tzinfo=timezone.utc)
+
+                # Full body text when the feed supplies it (a list of
+                # content dicts in feedparser), otherwise an empty string.
+                content_parts = entry.get("content") or []
+                content = content_parts[0].get("value", "") if content_parts else ""
+
+                article = {
+                    "title": entry.title,
+                    "link": entry.link,
+                    "summary": entry.summary if hasattr(entry, 'summary') else entry.description,
+                    "content": content,
+                    "feed_title": feed_title,
+                    "pub_date": pub_date
+                }
+
+                if is_interesting_rss(article["title"], article["summary"], pub_date):
+                    articles.append(article)
+            except Exception as e:
+                logging.warning(f"Error processing entry: {str(e)}")
+                continue
+
+        return articles
+    except Exception as e:
+        logging.error(f"Error processing feed {feed_url}: {str(e)}")
+        return []
+
+def parse_date(date_str):
+    """Convert an RFC 2822 style date string into a timezone-aware datetime.
+
+    A result without timezone information is labelled as UTC. If the string
+    cannot be parsed, the failure is logged and the current UTC time is
+    returned instead.
+    """
+    try:
+        moment = email.utils.parsedate_to_datetime(date_str)
+    except Exception as e:
+        logging.error(f"Failed to parse date '{date_str}': {e}")
+        return datetime.now(timezone.utc)
+    if moment.tzinfo is not None:
+        return moment
+    return moment.replace(tzinfo=timezone.utc)
+
+def curate_from_rss():
+    """Pick one RSS article, summarise it and publish it to WordPress.
+
+    Articles from fetch_rss_feeds are tried in order, for at most 10
+    attempts. An article is skipped when its title was already posted, when
+    smart_image_and_filter flags it, when its interest score is below 6, or
+    when summarising, post preparation or posting fails. The first article
+    that posts successfully is posted a second time with its own URL in the
+    call-to-action, then recorded as posted.
+
+    Returns:
+        A tuple ``(post_data, category, sleep_seconds)``. On success
+        sleep_seconds is a random value in 0-1800; when nothing was posted
+        post_data and category are None and sleep_seconds is in 600-1800.
+    """
+    # NOTE(review): posted_titles, used_images, POSTED_TITLES_FILE and
+    # USED_IMAGES_FILE are neither defined nor imported in the visible part
+    # of this module; they must exist at module level for this function to
+    # run.
+    articles = fetch_rss_feeds()
+    if not articles:
+        print("No RSS articles available")
+        logging.info("No RSS articles available")
+        # Return a real delay: the caller passes this value to time.sleep(),
+        # which raises TypeError on None.
+        return None, None, random.randint(600, 1800)
+
+    attempts = 0
+    max_attempts = 10
+    while attempts < max_attempts and articles:
+        article = articles.pop(0)  # Take the next candidate from the front
+        title = article["title"]
+        link = article["link"]
+        summary = article["summary"]
+        # Not every producer of article dicts supplies "content"; fall back
+        # to an empty body instead of raising KeyError.
+        content = article.get("content", "")
+        feed_url = article["feed_title"]
+        source_name = feed_url[0] if isinstance(feed_url, tuple) and len(feed_url) > 0 else feed_url
+        original_source = f'<a href="{link}">{source_name}</a>'
+
+        if title in posted_titles:
+            print(f"Skipping already posted article: {title}")
+            logging.info(f"Skipping already posted article: {title}")
+            attempts += 1
+            continue
+
+        print(f"Trying RSS Article: {title} from {source_name}")
+        logging.info(f"Trying RSS Article: {title} from {source_name}")
+
+        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
+        if skip:
+            print(f"Skipping filtered RSS article: {title}")
+            logging.info(f"Skipping filtered RSS article: {title}")
+            attempts += 1
+            continue
+
+        # Score using title, summary, and content
+        scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
+        interest_score = is_interesting(scoring_content)
+        logging.info(f"Interest score for '{title}': {interest_score}")
+        if interest_score < 6:
+            print(f"RSS Interest Too Low: {interest_score}")
+            logging.info(f"RSS Interest Too Low: {interest_score}")
+            attempts += 1
+            continue
+
+        num_paragraphs = determine_paragraph_count(interest_score)
+        extra_prompt = (
+            f"Generate exactly {num_paragraphs} paragraphs. "
+            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details. "
+            f"Do NOT introduce unrelated concepts. Expand on the core idea with relevant context about its appeal or significance."
+        )
+        content_to_summarize = scoring_content
+        final_summary = summarize_with_gpt4o(
+            content_to_summarize,
+            source_name,
+            link,
+            interest_score=interest_score,
+            extra_prompt=extra_prompt
+        )
+        if not final_summary:
+            logging.info(f"Summary failed for '{title}'")
+            attempts += 1
+            continue
+
+        final_summary = insert_link_naturally(final_summary, source_name, link)
+        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
+        if not post_data:
+            attempts += 1
+            continue
+
+        # NOTE(review): the hook is computed but never used below; the call
+        # is kept in case get_dynamic_hook has side effects - confirm.
+        hook = get_dynamic_hook(post_data["title"]).strip()
+        cta = select_best_cta(post_data["title"], final_summary, post_url=None)
+
+        # First pass: create the post, whose URL is not known yet.
+        post_data["content"] = f"{final_summary}\n\n{cta}"
+        post_id, post_url = post_to_wp(
+            post_data=post_data,
+            category=category,
+            link=link,
+            author=author,
+            image_url=image_url,
+            original_source=original_source,
+            image_source=image_source,
+            uploader=uploader,
+            pixabay_url=pixabay_url,
+            interest_score=interest_score
+        )
+
+        if post_id:
+            # Second pass: rebuild the call-to-action with the real post URL
+            # and send the post again, this time with its post_id.
+            cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
+            post_data["content"] = f"{final_summary}\n\n{cta}"
+            post_to_wp(
+                post_data=post_data,
+                category=category,
+                link=link,
+                author=author,
+                image_url=image_url,
+                original_source=original_source,
+                image_source=image_source,
+                uploader=uploader,
+                pixabay_url=pixabay_url,
+                interest_score=interest_score,
+                post_id=post_id
+            )
+
+            timestamp = datetime.now(timezone.utc).isoformat()
+            save_json_file(POSTED_TITLES_FILE, title, timestamp)
+            posted_titles.add(title)
+            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
+
+            if image_url:
+                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
+                used_images.add(image_url)
+                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
+
+            print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
+            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
+            return post_data, category, random.randint(0, 1800)
+
+        attempts += 1
+        logging.info(f"WP posting failed for '{post_data['title']}'")
+
+    print("No interesting RSS article found after attempts")
+    logging.info("No interesting RSS article found after attempts")
+    return None, None, random.randint(600, 1800)
+
+def run_rss_automator():
+    """Run one curation pass, then sleep for the delay it suggests.
+
+    Returns:
+        The ``(post_data, category, sleep_time)`` tuple from
+        curate_from_rss, with sleep_time replaced by a random 600-1800
+        second delay when none was supplied.
+    """
+    print(f"{datetime.now(timezone.utc)} - INFO - ***** RSS Automator Launched *****")
+    logging.info("***** RSS Automator Launched *****")
+    post_data, category, sleep_time = curate_from_rss()
+    if sleep_time is None:
+        # time.sleep(None) raises TypeError; use the same back-off range the
+        # curation step uses when it has nothing to post.
+        sleep_time = random.randint(600, 1800)
+    print(f"Sleeping for {sleep_time}s")
+    logging.info(f"Completed run with sleep time: {sleep_time} seconds")
+    time.sleep(sleep_time)
+    return post_data, category, sleep_time
+
+if __name__ == "__main__":
+    # Configure file logging first; without it the INFO-level messages
+    # emitted throughout this module are discarded by the default root
+    # logger.
+    setup_logging()
+    run_rss_automator()