|
|
|
|
@@ -98,8 +98,8 @@ def create_http_session() -> requests.Session: |
|
|
|
|
session = requests.Session() |
|
|
|
|
retry_strategy = Retry( |
|
|
|
|
total=MAX_RETRIES, |
|
|
|
|
backoff_factor=1, |
|
|
|
|
status_forcelist=[429, 500, 502, 503, 504], |
|
|
|
|
backoff_factor=2, # Increased backoff factor for better retry handling |
|
|
|
|
status_forcelist=[429, 500, 502, 503, 504, 403], # Added 403 to retry list |
|
|
|
|
allowed_methods=["GET", "POST"] |
|
|
|
|
) |
|
|
|
|
adapter = HTTPAdapter( |
|
|
|
|
@@ -109,6 +109,10 @@ def create_http_session() -> requests.Session: |
|
|
|
|
) |
|
|
|
|
session.mount("http://", adapter) |
|
|
|
|
session.mount("https://", adapter) |
|
|
|
|
# Add a realistic User-Agent header |
|
|
|
|
session.headers.update({ |
|
|
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36' |
|
|
|
|
}) |
|
|
|
|
return session |
|
|
|
|
|
|
|
|
|
def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.FeedParserDict]: |
|
|
|
|
@@ -189,11 +193,24 @@ def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any |
|
|
|
|
try: |
|
|
|
|
pub_date = datetime.fromtimestamp(time.mktime(entry.published_parsed), tz=timezone.utc) |
|
|
|
|
|
|
|
|
|
# Safely extract content |
|
|
|
|
content = "" |
|
|
|
|
if hasattr(entry, 'content') and isinstance(entry.content, list) and len(entry.content) > 0: |
|
|
|
|
content_item = entry.content[0] |
|
|
|
|
if isinstance(content_item, dict) and 'value' in content_item: |
|
|
|
|
content = content_item['value'] |
|
|
|
|
elif hasattr(content_item, 'value'): |
|
|
|
|
content = content_item.value |
|
|
|
|
elif hasattr(entry, 'description'): |
|
|
|
|
content = entry.description |
|
|
|
|
elif hasattr(entry, 'summary'): |
|
|
|
|
content = entry.summary |
|
|
|
|
|
|
|
|
|
article = { |
|
|
|
|
"title": entry.title, |
|
|
|
|
"link": entry.link, |
|
|
|
|
"summary": entry.summary if hasattr(entry, 'summary') else entry.description, |
|
|
|
|
"content": getattr(entry, 'content', [{'value': ''}])[0].value, |
|
|
|
|
"summary": entry.summary if hasattr(entry, 'summary') else entry.description if hasattr(entry, 'description') else "", |
|
|
|
|
"content": content, |
|
|
|
|
"feed_title": get_clean_source_name(feed_url), |
|
|
|
|
"pub_date": pub_date |
|
|
|
|
} |
|
|
|
|
|