Merge X posting into main files

2025-04-28 21:23:12 +10:00
parent a1d2ce4215
commit ea7d36a22b
7 changed files with 394 additions and 446 deletions
@@ -1,3 +1,4 @@
+# foodie_automator_rss.py
 import requests
 import random
 import time
@@ -13,12 +14,17 @@ from openai import OpenAI
 from urllib.parse import quote
 from requests.packages.urllib3.util.retry import Retry
 from requests.adapters import HTTPAdapter
-from foodie_config import RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS, SUMMARY_PERSONA_PROMPTS, CATEGORIES, get_clean_source_name
+from foodie_config import (
+    RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
+    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
+    get_clean_source_name, X_API_CREDENTIALS
+)
 from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
-    upload_image_to_wp, determine_paragraph_count, insert_link_naturally, is_interesting,
-    generate_title_from_summary, summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
-    prepare_post_data, select_best_author, smart_image_and_filter
+    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
+    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
+    generate_category_from_summary, post_to_wp, prepare_post_data,
+    select_best_author, smart_image_and_filter
 )
 from foodie_hooks import get_dynamic_hook, select_best_cta
 import feedparser
@@ -27,6 +33,7 @@ from typing import List, Dict, Any, Optional
 from dotenv import load_dotenv

 load_dotenv()
+
 # Flag to indicate if we're in the middle of posting
 is_posting = False

@@ -43,10 +50,10 @@ signal.signal(signal.SIGINT, signal_handler)

 LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log"
 LOG_PRUNE_DAYS = 30
-MAX_WORKERS = 5  # Number of concurrent workers for parallel processing
-RATE_LIMIT_DELAY = 1  # Delay between API calls in seconds
-FEED_TIMEOUT = 30  # Timeout for feed requests in seconds
-MAX_RETRIES = 3  # Maximum number of retries for failed requests
+MAX_WORKERS = 5
+RATE_LIMIT_DELAY = 1
+FEED_TIMEOUT = 30
+MAX_RETRIES = 3

 POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
 USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
@@ -58,7 +65,6 @@ posted_titles = set(entry["title"] for entry in posted_titles_data)
 used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)

 def setup_logging():
-    """Configure logging with rotation and cleanup."""
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, 'r') as f:
            lines = f.readlines()
@@ -81,9 +87,14 @@ def setup_logging():
        format="%(asctime)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+    logging.getLogger().addHandler(console_handler)
+    logging.info("Logging initialized for foodie_automator_rss.py")
+
+setup_logging()

 def create_http_session() -> requests.Session:
-    """Create and configure an HTTP session with retry logic."""
    session = requests.Session()
    retry_strategy = Retry(
        total=MAX_RETRIES,
@@ -101,7 +112,6 @@ def create_http_session() -> requests.Session:
    return session

 def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.FeedParserDict]:
-    """Fetch and parse an RSS feed with error handling and retries."""
    try:
        response = session.get(feed_url, timeout=FEED_TIMEOUT)
        response.raise_for_status()
@@ -117,20 +127,14 @@ def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.
        return None

 def is_interesting_rss(title: str, summary: str, pub_date: datetime) -> bool:
-    """Enhanced content filtering with improved scoring."""
    try:
-        # Basic validation
        if not title or not summary:
            return False
            
-        # Check if content is too old
        if datetime.now(timezone.utc) - pub_date > timedelta(days=7):
            return False
            
-        # Calculate interest score
        score = 0
-        
-        # Title analysis
        title_lower = title.lower()
        if any(keyword in title_lower for keyword in RECIPE_KEYWORDS):
            score += 3
@@ -139,7 +143,6 @@ def is_interesting_rss(title: str, summary: str, pub_date: datetime) -> bool:
        if any(keyword in title_lower for keyword in HOME_KEYWORDS):
            score += 1
            
-        # Content analysis
        summary_lower = summary.lower()
        if len(summary.split()) < 100:
            score -= 2
@@ -152,7 +155,6 @@ def is_interesting_rss(title: str, summary: str, pub_date: datetime) -> bool:
        return False

 def fetch_rss_feeds() -> List[Dict[str, Any]]:
-    """Fetch RSS feeds with parallel processing and improved error handling."""
    session = create_http_session()
    articles = []
    
@@ -177,7 +179,6 @@ def fetch_rss_feeds() -> List[Dict[str, Any]]:
        return []

 def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any]]:
-    """Process a single RSS feed and extract articles."""
    try:
        feed = fetch_feed(feed_url, session)
        if not feed:
@@ -192,7 +193,8 @@ def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any
                    "title": entry.title,
                    "link": entry.link,
                    "summary": entry.summary if hasattr(entry, 'summary') else entry.description,
-                    "feed_title": get_clean_source_name(feed.feed.title),
+                    "content": (entry.get('content') or [{}])[0].get('value', ''),  # '' when the entry has no content
+                    "feed_title": get_clean_source_name(feed_url),
                    "pub_date": pub_date
                }
                
@@ -229,13 +231,12 @@ def curate_from_rss():
    attempts = 0
    max_attempts = 10
    while attempts < max_attempts and articles:
-        article = articles.pop(0)  # Take newest article
+        article = articles.pop(0)
        title = article["title"]
        link = article["link"]
        summary = article["summary"]
        content = article["content"]
-        feed_url = article["feed_title"]
-        source_name = feed_url[0] if isinstance(feed_url, tuple) and len(feed_url) > 0 else feed_url
+        source_name = article["feed_title"]
        original_source = f'<a href="{link}">{source_name}</a>'

        if title in posted_titles:
@@ -254,7 +255,6 @@ def curate_from_rss():
            attempts += 1
            continue

-        # Score using title, summary, and content
        scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
        interest_score = is_interesting(scoring_content)
        logging.info(f"Interest score for '{title}': {interest_score}")