|
|
|
@@ -116,8 +116,9 @@ def create_http_session() -> requests.Session:
     return session

 def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.FeedParserDict]:
+    logging.debug(f"Fetching feed: {feed_url}")
     try:
-        response = session.get(feed_url, timeout=FEED_TIMEOUT)
+        response = session.get(feed_url, timeout=15) # Reduced timeout to 15 seconds
         response.raise_for_status()
         feed = feedparser.parse(response.content)

@@ -125,6 +126,7 @@ def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.
             logging.warning(f"Feed parsing error for {feed_url}: {feed.bozo_exception}")
             return None

+        logging.debug(f"Successfully fetched feed: {feed_url}")
         return feed
     except Exception as e:
         logging.error(f"Error fetching feed {feed_url}: {str(e)}")
@@ -159,38 +161,46 @@ def is_interesting_rss(title: str, summary: str, pub_date: datetime) -> bool:
     return False

 def fetch_rss_feeds() -> List[Dict[str, Any]]:
+    logging.info("Starting fetch_rss_feeds")
     session = create_http_session()
     articles = []

     try:
+        logging.info(f"Processing {len(RSS_FEEDS)} feeds: {RSS_FEEDS}")
         with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
             futures = []
             for feed_url in RSS_FEEDS:
+                logging.debug(f"Scheduling feed: {feed_url}")
                 future = executor.submit(process_feed, feed_url, session)
                 futures.append(future)

             for future in as_completed(futures):
                 try:
                     feed_articles = future.result()
+                    logging.info(f"Completed feed processing, got {len(feed_articles)} articles")
                     articles.extend(feed_articles)
                 except Exception as e:
-                    logging.error(f"Error processing feed: {str(e)}")
+                    logging.error(f"Error processing feed in future: {str(e)}")
                     continue

+        logging.info(f"Finished fetch_rss_feeds, total articles: {len(articles)}")
         return articles
     except Exception as e:
         logging.error(f"Error in fetch_rss_feeds: {str(e)}")
         return []

 def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any]]:
+    logging.info(f"Processing feed: {feed_url}")
     try:
         feed = fetch_feed(feed_url, session)
         if not feed:
+            logging.warning(f"No feed data for {feed_url}")
             return []

         articles = []
+        logging.debug(f"Feed entries count: {len(feed.entries)}")
         for entry in feed.entries:
             try:
+                logging.debug(f"Processing entry: {entry.get('title', 'No title')}")
                 pub_date = datetime.fromtimestamp(time.mktime(entry.published_parsed), tz=timezone.utc)

                 # Safely extract content
@@ -216,13 +226,15 @@ def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any
                 }

                 if is_interesting_rss(article["title"], article["summary"], pub_date):
+                    logging.info(f"Interesting article found: {article['title']}")
                     articles.append(article)

                 time.sleep(RATE_LIMIT_DELAY)
             except Exception as e:
-                logging.warning(f"Error processing entry: {str(e)}")
+                logging.warning(f"Error processing entry in {feed_url}: {str(e)}")
                 continue

+        logging.info(f"Finished processing {feed_url}, found {len(articles)} articles")
         return articles
     except Exception as e:
         logging.error(f"Error processing feed {feed_url}: {str(e)}")
|
|
|
|
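Most of the added calls are logging.debug and logging.info, so they are only visible if the root logger is configured at DEBUG level (INFO suffices for the info-level messages). A minimal driver, assuming the module is importable as rss_fetcher (name assumed), could be:

import logging

from rss_fetcher import fetch_rss_feeds  # module name is an assumption

# Surface the debug/info messages added by this patch.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)s %(message)s",
)

articles = fetch_rss_feeds()
logging.info("Collected %d interesting articles", len(articles))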