From 5cf367fb40a6803216548702a2d8c214ed2b083e Mon Sep 17 00:00:00 2001
From: Shane
Date: Wed, 30 Apr 2025 19:19:53 +1000
Subject: [PATCH] update rss script to old

---
 foodie_automator_rss.py | 209 ++++++++++++++--------------------------
 1 file changed, 70 insertions(+), 139 deletions(-)

diff --git a/foodie_automator_rss.py b/foodie_automator_rss.py
index 11ba873..61537a3 100644
--- a/foodie_automator_rss.py
+++ b/foodie_automator_rss.py
@@ -16,7 +16,7 @@ from requests.packages.urllib3.util.retry import Retry
 from requests.adapters import HTTPAdapter
 from foodie_config import (
     RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
-    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
+    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES, CTAS,
     get_clean_source_name, X_API_CREDENTIALS
 )
 from foodie_utils import (
@@ -24,12 +24,9 @@ from foodie_utils import (
     upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
     is_interesting, generate_title_from_summary, summarize_with_gpt4o,
     generate_category_from_summary, post_to_wp, prepare_post_data,
-    select_best_author, smart_image_and_filter
+    select_best_author, smart_image_and_filter, get_flickr_image_via_ddg
 )
 from foodie_hooks import get_dynamic_hook, select_best_cta
-import feedparser
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import List, Dict, Any, Optional
 from dotenv import load_dotenv
 
 load_dotenv()
@@ -50,9 +47,7 @@ signal.signal(signal.SIGINT, signal_handler)
 
 LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log"
 LOG_PRUNE_DAYS = 30
-MAX_WORKERS = 5
-RATE_LIMIT_DELAY = 1
-FEED_TIMEOUT = 30
+FEED_TIMEOUT = 15
 MAX_RETRIES = 3
 
 POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
@@ -90,6 +85,7 @@ def setup_logging():
     console_handler = logging.StreamHandler()
     console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
     logging.getLogger().addHandler(console_handler)
+    logging.getLogger("requests").setLevel(logging.WARNING)
     logging.info("Logging initialized for foodie_automator_rss.py")
 
 setup_logging()
@@ -98,8 +94,8 @@ def create_http_session() -> requests.Session:
     session = requests.Session()
     retry_strategy = Retry(
         total=MAX_RETRIES,
-        backoff_factor=2,  # Increased backoff factor for better retry handling
-        status_forcelist=[429, 500, 502, 503, 504, 403],  # Added 403 to retry list
+        backoff_factor=2,
+        status_forcelist=[403, 429, 500, 502, 503, 504],
         allowed_methods=["GET", "POST"]
     )
     adapter = HTTPAdapter(
@@ -109,137 +105,11 @@ def create_http_session() -> requests.Session:
     )
     session.mount("http://", adapter)
     session.mount("https://", adapter)
-    # Add a realistic User-Agent header
     session.headers.update({
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
     })
     return session
 
-def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.FeedParserDict]:
-    logging.debug(f"Fetching feed: {feed_url}")
-    try:
-        response = session.get(feed_url, timeout=15)  # Reduced timeout to 15 seconds
-        response.raise_for_status()
-        feed = feedparser.parse(response.content)
-
-        if feed.bozo:
-            logging.warning(f"Feed parsing error for {feed_url}: {feed.bozo_exception}")
-            return None
-
-        logging.debug(f"Successfully fetched feed: {feed_url}")
-        return feed
-    except Exception as e:
-        logging.error(f"Error fetching feed {feed_url}: {str(e)}")
-        return None
-
-def is_interesting_rss(title: str, summary: str, pub_date: datetime) -> bool:
-    try:
-        if not title or not summary:
-            return False
-
-        if datetime.now(timezone.utc) - pub_date > timedelta(days=7):
-            return False
-
-        score = 0
-        title_lower = title.lower()
-        if any(keyword in title_lower for keyword in RECIPE_KEYWORDS):
-            score += 3
-        if any(keyword in title_lower for keyword in PROMO_KEYWORDS):
-            score += 2
-        if any(keyword in title_lower for keyword in HOME_KEYWORDS):
-            score += 1
-
-        summary_lower = summary.lower()
-        if len(summary.split()) < 100:
-            score -= 2
-        if any(keyword in summary_lower for keyword in PRODUCT_KEYWORDS):
-            score += 1
-
-        return score >= 4
-    except Exception as e:
-        logging.error(f"Error in is_interesting_rss: {str(e)}")
-        return False
-
-def fetch_rss_feeds() -> List[Dict[str, Any]]:
-    logging.info("Starting fetch_rss_feeds")
-    session = create_http_session()
-    articles = []
-
-    try:
-        logging.info(f"Processing {len(RSS_FEEDS)} feeds: {RSS_FEEDS}")
-        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-            futures = []
-            for feed_url in RSS_FEEDS:
-                logging.debug(f"Scheduling feed: {feed_url}")
-                future = executor.submit(process_feed, feed_url, session)
-                futures.append(future)
-
-            for future in as_completed(futures):
-                try:
-                    feed_articles = future.result()
-                    logging.info(f"Completed feed processing, got {len(feed_articles)} articles")
-                    articles.extend(feed_articles)
-                except Exception as e:
-                    logging.error(f"Error processing feed in future: {str(e)}")
-                    continue
-        logging.info(f"Finished fetch_rss_feeds, total articles: {len(articles)}")
-        return articles
-    except Exception as e:
-        logging.error(f"Error in fetch_rss_feeds: {str(e)}")
-        return []
-
-def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any]]:
-    logging.info(f"Processing feed: {feed_url}")
-    try:
-        feed = fetch_feed(feed_url, session)
-        if not feed:
-            logging.warning(f"No feed data for {feed_url}")
-            return []
-
-        articles = []
-        logging.debug(f"Feed entries count: {len(feed.entries)}")
-        for entry in feed.entries:
-            try:
-                logging.debug(f"Processing entry: {entry.get('title', 'No title')}")
-                pub_date = datetime.fromtimestamp(time.mktime(entry.published_parsed), tz=timezone.utc)
-
-                # Safely extract content
-                content = ""
-                if hasattr(entry, 'content') and isinstance(entry.content, list) and len(entry.content) > 0:
-                    content_item = entry.content[0]
-                    if isinstance(content_item, dict) and 'value' in content_item:
-                        content = content_item['value']
-                    elif hasattr(content_item, 'value'):
-                        content = content_item.value
-                elif hasattr(entry, 'description'):
-                    content = entry.description
-                elif hasattr(entry, 'summary'):
-                    content = entry.summary
-
-                article = {
-                    "title": entry.title,
-                    "link": entry.link,
-                    "summary": entry.summary if hasattr(entry, 'summary') else entry.description if hasattr(entry, 'description') else "",
-                    "content": content,
-                    "feed_title": get_clean_source_name(feed_url),
-                    "pub_date": pub_date
-                }
-
-                if is_interesting_rss(article["title"], article["summary"], pub_date):
-                    logging.info(f"Interesting article found: {article['title']}")
-                    articles.append(article)
-
-                time.sleep(RATE_LIMIT_DELAY)
-            except Exception as e:
-                logging.warning(f"Error processing entry in {feed_url}: {str(e)}")
-                continue
-
-        logging.info(f"Finished processing {feed_url}, found {len(articles)} articles")
-        return articles
-    except Exception as e:
-        logging.error(f"Error processing feed {feed_url}: {str(e)}")
-        return []
-
 def parse_date(date_str):
     try:
         parsed_date = email.utils.parsedate_to_datetime(date_str)
@@ -250,12 +120,68 @@ def parse_date(date_str):
         logging.error(f"Failed to parse date '{date_str}': {e}")
         return datetime.now(timezone.utc)
 
+def fetch_rss_feeds():
+    logging.info("Starting fetch_rss_feeds")
+    articles = []
+    cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)
+    session = create_http_session()
+
+    if not RSS_FEEDS:
+        logging.error("RSS_FEEDS is empty in foodie_config.py")
+        return articles
+
+    for feed_url in RSS_FEEDS:
+        logging.info(f"Processing feed: {feed_url}")
+        try:
+            response = session.get(feed_url, timeout=FEED_TIMEOUT)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'xml')
+            items = soup.find_all('item')
+
+            feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
+            for item in items:
+                try:
+                    title = item.find('title').text.strip() if item.find('title') else "Untitled"
+                    link = item.find('link').text.strip() if item.find('link') else ""
+                    pub_date = item.find('pubDate')
+                    pub_date = parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc)
+
+                    if pub_date < cutoff_date:
+                        logging.info(f"Skipping old article: {title} (Published: {pub_date})")
+                        continue
+
+                    description = item.find('description')
+                    summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description else ""
+                    content = item.find('content:encoded')
+                    content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content else summary
+
+                    articles.append({
+                        "title": title,
+                        "link": link,
+                        "summary": summary,
+                        "content": content_text,
+                        "feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title,
+                        "pub_date": pub_date
+                    })
+                    logging.debug(f"Processed article: {title}")
+                except Exception as e:
+                    logging.warning(f"Error processing entry in {feed_url}: {e}")
+                    continue
+            logging.info(f"Filtered to {len(articles)} articles from {feed_url}")
+        except Exception as e:
+            logging.error(f"Failed to fetch RSS feed {feed_url}: {e}")
+            continue
+
+    articles.sort(key=lambda x: x["pub_date"], reverse=True)
+    logging.info(f"Total RSS articles fetched: {len(articles)}")
+    return articles
+
 def curate_from_rss():
     articles = fetch_rss_feeds()
     if not articles:
         print("No RSS articles available")
         logging.info("No RSS articles available")
-        return None, None, None
+        return None, None, random.randint(600, 1800)
 
     attempts = 0
     max_attempts = 10
@@ -320,6 +246,11 @@ def curate_from_rss():
             attempts += 1
             continue
 
+        # Fetch image
+        image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
+        if not image_url:
+            image_url, image_source, uploader, page_url = get_image(image_query)
+
         hook = get_dynamic_hook(post_data["title"]).strip()
         cta = select_best_cta(post_data["title"], final_summary, post_url=None)
 
@@ -338,7 +269,7 @@ def curate_from_rss():
                 uploader=uploader,
                 pixabay_url=pixabay_url,
                 interest_score=interest_score,
-                should_post_tweet=True  # Post the X tweet on the first call
+                should_post_tweet=True
             )
         finally:
             is_posting = False
@@ -360,7 +291,7 @@ def curate_from_rss():
                 pixabay_url=pixabay_url,
                 interest_score=interest_score,
                 post_id=post_id,
-                should_post_tweet=False  # Skip X tweet on the update call
+                should_post_tweet=False
             )
         finally:
             is_posting = False
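
Note on the new fetch path: the patched fetch_rss_feeds() drops feedparser and the ThreadPoolExecutor pipeline in favour of requests plus BeautifulSoup's XML parser, fetching feeds sequentially through the single retrying session. It also relies on names this patch does not add (EXPIRATION_HOURS, BeautifulSoup, random, get_image, image_query, relevance_keywords), which are assumed to already exist in foodie_automator_rss.py. The sketch below is a minimal, standalone way to exercise the same parsing path against one feed; it is not part of the patch, FEED_URL and CUTOFF_HOURS are placeholders, and it assumes beautifulsoup4 plus lxml are installed (the 'xml' parser requires lxml).

# Standalone sketch: parse one RSS feed the way the patched fetch_rss_feeds() does.
# FEED_URL and CUTOFF_HOURS are placeholders; the script itself uses RSS_FEEDS and EXPIRATION_HOURS.
import email.utils
from datetime import datetime, timezone, timedelta

import requests
from bs4 import BeautifulSoup  # needs beautifulsoup4 + lxml for the 'xml' parser

FEED_URL = "https://example.com/feed.xml"  # placeholder, not from the patch
CUTOFF_HOURS = 24                          # stand-in for EXPIRATION_HOURS

def parse_pub_date(date_str):
    # Loosely mirrors parse_date() in the script: RFC 2822 date, falling back to "now" on failure.
    try:
        parsed = email.utils.parsedate_to_datetime(date_str)
        return parsed if parsed.tzinfo else parsed.replace(tzinfo=timezone.utc)
    except Exception:
        return datetime.now(timezone.utc)

cutoff = datetime.now(timezone.utc) - timedelta(hours=CUTOFF_HOURS)
response = requests.get(FEED_URL, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.content, "xml")

for item in soup.find_all("item"):
    title = item.find("title").text.strip() if item.find("title") else "Untitled"
    pub_date_tag = item.find("pubDate")
    pub_date = parse_pub_date(pub_date_tag.text) if pub_date_tag else datetime.now(timezone.utc)
    if pub_date < cutoff:
        continue  # same age filter the patched fetch_rss_feeds() applies
    description = item.find("description")
    summary = BeautifulSoup(description.text, "html.parser").get_text().strip() if description else ""
    print(f"{pub_date:%Y-%m-%d %H:%M} | {title} | {summary[:80]}")

Compared with the removed ThreadPoolExecutor version, total fetch time now grows with the number of feeds, but all requests share one session's retry/backoff behaviour and the per-entry RATE_LIMIT_DELAY sleep is gone.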