From 3fc1b4036879921ec4add64837d2c78b33717a9c Mon Sep 17 00:00:00 2001 From: Shane Date: Wed, 30 Apr 2025 19:02:18 +1000 Subject: [PATCH] Replace the create_http_session --- foodie_automator_rss.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/foodie_automator_rss.py b/foodie_automator_rss.py index 46f149e..676d037 100644 --- a/foodie_automator_rss.py +++ b/foodie_automator_rss.py @@ -98,8 +98,8 @@ def create_http_session() -> requests.Session: session = requests.Session() retry_strategy = Retry( total=MAX_RETRIES, - backoff_factor=1, - status_forcelist=[429, 500, 502, 503, 504], + backoff_factor=2, # Increased backoff factor for better retry handling + status_forcelist=[429, 500, 502, 503, 504, 403], # Added 403 to retry list allowed_methods=["GET", "POST"] ) adapter = HTTPAdapter( @@ -109,6 +109,10 @@ def create_http_session() -> requests.Session: ) session.mount("http://", adapter) session.mount("https://", adapter) + # Add a realistic User-Agent header + session.headers.update({ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36' + }) return session def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.FeedParserDict]: @@ -189,11 +193,24 @@ def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any try: pub_date = datetime.fromtimestamp(time.mktime(entry.published_parsed), tz=timezone.utc) + # Safely extract content + content = "" + if hasattr(entry, 'content') and isinstance(entry.content, list) and len(entry.content) > 0: + content_item = entry.content[0] + if isinstance(content_item, dict) and 'value' in content_item: + content = content_item['value'] + elif hasattr(content_item, 'value'): + content = content_item.value + elif hasattr(entry, 'description'): + content = entry.description + elif hasattr(entry, 'summary'): + content = entry.summary + article = { "title": entry.title, "link": entry.link, - "summary": entry.summary if hasattr(entry, 'summary') else entry.description, - "content": getattr(entry, 'content', [{'value': ''}])[0].value, + "summary": entry.summary if hasattr(entry, 'summary') else entry.description if hasattr(entry, 'description') else "", + "content": content, "feed_title": get_clean_source_name(feed_url), "pub_date": pub_date }