Revert RSS script to the old version

2025-04-30 19:19:53 +10:00
parent 2dd5311c57
commit 5cf367fb40
1 changed file with 70 additions and 139 deletions
@@ -16,7 +16,7 @@ from requests.packages.urllib3.util.retry import Retry
 from requests.adapters import HTTPAdapter
 from foodie_config import (
    RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
-    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
+    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES, CTAS,
    get_clean_source_name, X_API_CREDENTIALS
 )
 from foodie_utils import (
@@ -24,12 +24,9 @@ from foodie_utils import (
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
-    select_best_author, smart_image_and_filter
+    select_best_author, smart_image_and_filter, get_flickr_image_via_ddg
 )
 from foodie_hooks import get_dynamic_hook, select_best_cta
-import feedparser
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import List, Dict, Any, Optional
 from dotenv import load_dotenv

 load_dotenv()
@@ -50,9 +47,7 @@ signal.signal(signal.SIGINT, signal_handler)

 LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log"
 LOG_PRUNE_DAYS = 30
-MAX_WORKERS = 5
-RATE_LIMIT_DELAY = 1
-FEED_TIMEOUT = 30
+FEED_TIMEOUT = 15
 MAX_RETRIES = 3

 POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
@@ -90,6 +85,7 @@ def setup_logging():
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logging.getLogger().addHandler(console_handler)
+    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.info("Logging initialized for foodie_automator_rss.py")

 setup_logging()
@@ -98,8 +94,8 @@ def create_http_session() -> requests.Session:
    session = requests.Session()
    retry_strategy = Retry(
        total=MAX_RETRIES,
-        backoff_factor=2,  # Increased backoff factor for better retry handling
-        status_forcelist=[429, 500, 502, 503, 504, 403],  # Added 403 to retry list
+        backoff_factor=2,
+        status_forcelist=[403, 429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST"]
    )
    adapter = HTTPAdapter(
@@ -109,137 +105,11 @@ def create_http_session() -> requests.Session:
    )
    session.mount("http://", adapter)
    session.mount("https://", adapter)
-    # Add a realistic User-Agent header
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    })
    return session

-def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.FeedParserDict]:
-    logging.debug(f"Fetching feed: {feed_url}")
-    try:
-        response = session.get(feed_url, timeout=15)  # Reduced timeout to 15 seconds
-        response.raise_for_status()
-        feed = feedparser.parse(response.content)
-        
-        if feed.bozo:
-            logging.warning(f"Feed parsing error for {feed_url}: {feed.bozo_exception}")
-            return None
-            
-        logging.debug(f"Successfully fetched feed: {feed_url}")
-        return feed
-    except Exception as e:
-        logging.error(f"Error fetching feed {feed_url}: {str(e)}")
-        return None
-
-def is_interesting_rss(title: str, summary: str, pub_date: datetime) -> bool:
-    try:
-        if not title or not summary:
-            return False
-            
-        if datetime.now(timezone.utc) - pub_date > timedelta(days=7):
-            return False
-            
-        score = 0
-        title_lower = title.lower()
-        if any(keyword in title_lower for keyword in RECIPE_KEYWORDS):
-            score += 3
-        if any(keyword in title_lower for keyword in PROMO_KEYWORDS):
-            score += 2
-        if any(keyword in title_lower for keyword in HOME_KEYWORDS):
-            score += 1
-            
-        summary_lower = summary.lower()
-        if len(summary.split()) < 100:
-            score -= 2
-        if any(keyword in summary_lower for keyword in PRODUCT_KEYWORDS):
-            score += 1
-            
-        return score >= 4
-    except Exception as e:
-        logging.error(f"Error in is_interesting_rss: {str(e)}")
-        return False
-
-def fetch_rss_feeds() -> List[Dict[str, Any]]:
-    logging.info("Starting fetch_rss_feeds")
-    session = create_http_session()
-    articles = []
-    
-    try:
-        logging.info(f"Processing {len(RSS_FEEDS)} feeds: {RSS_FEEDS}")
-        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-            futures = []
-            for feed_url in RSS_FEEDS:
-                logging.debug(f"Scheduling feed: {feed_url}")
-                future = executor.submit(process_feed, feed_url, session)
-                futures.append(future)
-                
-            for future in as_completed(futures):
-                try:
-                    feed_articles = future.result()
-                    logging.info(f"Completed feed processing, got {len(feed_articles)} articles")
-                    articles.extend(feed_articles)
-                except Exception as e:
-                    logging.error(f"Error processing feed in future: {str(e)}")
-                    continue
-        logging.info(f"Finished fetch_rss_feeds, total articles: {len(articles)}")
-        return articles
-    except Exception as e:
-        logging.error(f"Error in fetch_rss_feeds: {str(e)}")
-        return []
-
-def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any]]:
-    logging.info(f"Processing feed: {feed_url}")
-    try:
-        feed = fetch_feed(feed_url, session)
-        if not feed:
-            logging.warning(f"No feed data for {feed_url}")
-            return []
-            
-        articles = []
-        logging.debug(f"Feed entries count: {len(feed.entries)}")
-        for entry in feed.entries:
-            try:
-                logging.debug(f"Processing entry: {entry.get('title', 'No title')}")
-                pub_date = datetime.fromtimestamp(time.mktime(entry.published_parsed), tz=timezone.utc)
-                
-                # Safely extract content
-                content = ""
-                if hasattr(entry, 'content') and isinstance(entry.content, list) and len(entry.content) > 0:
-                    content_item = entry.content[0]
-                    if isinstance(content_item, dict) and 'value' in content_item:
-                        content = content_item['value']
-                    elif hasattr(content_item, 'value'):
-                        content = content_item.value
-                elif hasattr(entry, 'description'):
-                    content = entry.description
-                elif hasattr(entry, 'summary'):
-                    content = entry.summary
-
-                article = {
-                    "title": entry.title,
-                    "link": entry.link,
-                    "summary": entry.summary if hasattr(entry, 'summary') else entry.description if hasattr(entry, 'description') else "",
-                    "content": content,
-                    "feed_title": get_clean_source_name(feed_url),
-                    "pub_date": pub_date
-                }
-                
-                if is_interesting_rss(article["title"], article["summary"], pub_date):
-                    logging.info(f"Interesting article found: {article['title']}")
-                    articles.append(article)
-                    
-                time.sleep(RATE_LIMIT_DELAY)
-            except Exception as e:
-                logging.warning(f"Error processing entry in {feed_url}: {str(e)}")
-                continue
-                
-        logging.info(f"Finished processing {feed_url}, found {len(articles)} articles")
-        return articles
-    except Exception as e:
-        logging.error(f"Error processing feed {feed_url}: {str(e)}")
-        return []
-
 def parse_date(date_str):
    try:
        parsed_date = email.utils.parsedate_to_datetime(date_str)
@@ -250,12 +120,68 @@ def parse_date(date_str):
        logging.error(f"Failed to parse date '{date_str}': {e}")
        return datetime.now(timezone.utc)

+def fetch_rss_feeds():
+    logging.info("Starting fetch_rss_feeds")
+    articles = []
+    cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)
+    session = create_http_session()
+
+    if not RSS_FEEDS:
+        logging.error("RSS_FEEDS is empty in foodie_config.py")
+        return articles
+
+    for feed_url in RSS_FEEDS:
+        logging.info(f"Processing feed: {feed_url}")
+        try:
+            response = session.get(feed_url, timeout=FEED_TIMEOUT)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'xml')
+            items = soup.find_all('item')
+
+            feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
+            for item in items:
+                try:
+                    title = item.find('title').text.strip() if item.find('title') else "Untitled"
+                    link = item.find('link').text.strip() if item.find('link') else ""
+                    pub_date = item.find('pubDate')
+                    pub_date = parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc)
+
+                    if pub_date < cutoff_date:
+                        logging.info(f"Skipping old article: {title} (Published: {pub_date})")
+                        continue
+
+                    description = item.find('description')
+                    summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description else ""
+                    content = item.find('content:encoded')
+                    content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content else summary
+
+                    articles.append({
+                        "title": title,
+                        "link": link,
+                        "summary": summary,
+                        "content": content_text,
+                        "feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title,
+                        "pub_date": pub_date
+                    })
+                    logging.debug(f"Processed article: {title}")
+                except Exception as e:
+                    logging.warning(f"Error processing entry in {feed_url}: {e}")
+                    continue
+            logging.info(f"Filtered to {len(articles)} articles from {feed_url}")
+        except Exception as e:
+            logging.error(f"Failed to fetch RSS feed {feed_url}: {e}")
+            continue
+
+    articles.sort(key=lambda x: x["pub_date"], reverse=True)
+    logging.info(f"Total RSS articles fetched: {len(articles)}")
+    return articles
+
 def curate_from_rss():
    articles = fetch_rss_feeds()
    if not articles:
        print("No RSS articles available")
        logging.info("No RSS articles available")
-        return None, None, None
+        return None, None, random.randint(600, 1800)

    attempts = 0
    max_attempts = 10
@@ -320,6 +246,11 @@ def curate_from_rss():
            attempts += 1
            continue

+        # Fetch image
+        image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
+        if not image_url:
+            image_url, image_source, uploader, page_url = get_image(image_query)
+
        hook = get_dynamic_hook(post_data["title"]).strip()
        cta = select_best_cta(post_data["title"], final_summary, post_url=None)

@@ -338,7 +269,7 @@ def curate_from_rss():
                uploader=uploader,
                pixabay_url=pixabay_url,
                interest_score=interest_score,
-                should_post_tweet=True  # Post the X tweet on the first call
+                should_post_tweet=True
            )
        finally:
            is_posting = False
@@ -360,7 +291,7 @@ def curate_from_rss():
                    pixabay_url=pixabay_url,
                    interest_score=interest_score,
                    post_id=post_id,
-                    should_post_tweet=False  # Skip X tweet on the update call
+                    should_post_tweet=False
                )
            finally:
                is_posting = False