incorporate external context from DDG

This commit is contained in:
2025-05-04 09:07:45 +10:00
parent 427a5cb919
commit e5ebd000fe
4 changed files with 79 additions and 37 deletions
+34 -18
View File
@@ -9,6 +9,8 @@ import signal
import sys
import re
import email.utils
import feedparser
from duckduckgo_search import DDGS
from datetime import datetime, timedelta, timezone
from bs4 import BeautifulSoup
from openai import OpenAI
@@ -136,6 +138,7 @@ def fetch_rss_feeds():
logging.error("RSS_FEEDS is empty in foodie_config.py")
return articles
logging.info(f"Processing feeds: {RSS_FEEDS}")
for feed_url in RSS_FEEDS:
logging.info(f"Processing feed: {feed_url}")
try:
@@ -182,6 +185,30 @@ def fetch_rss_feeds():
logging.info(f"Total RSS articles fetched: {len(articles)}")
return articles
def fetch_duckduckgo_news_context(title, hours=24):
    """Return recent DuckDuckGo News headlines related to *title* as one string.

    Queries DuckDuckGo News for ``"<title> news"`` (at most 5 results) and
    keeps the lower-cased headlines whose publication date falls within the
    last *hours* hours, joined by single spaces.

    Args:
        title: Article title used to build the search query.
        hours: Size of the recency window in hours (default 24).

    Returns:
        The space-joined recent headlines; the string
        ``"No recent news found within <hours> hours"`` when nothing
        qualifies; or *title* itself when the lookup fails for any reason
        (the failure is logged as a warning, never raised).
    """
    try:
        # DuckDuckGo only offers coarse day/week/month windows; choose the
        # smallest one that covers the requested number of hours, then apply
        # the precise cutoff to each result below.
        if hours <= 24:
            timelimit = "d"
        elif hours <= 24 * 7:
            timelimit = "w"
        else:
            timelimit = "m"

        with DDGS() as ddgs:
            results = ddgs.news(f"{title} news", timelimit=timelimit, max_results=5)

        # Computed once so every result is compared against the same instant.
        cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
        titles = []
        for r in results or []:
            date_str = r.get("date")
            headline = r.get("title")
            if not isinstance(date_str, str) or not isinstance(headline, str):
                # One malformed result must not discard the usable ones.
                logging.warning(f"Skipping DuckDuckGo result without date/title: {r}")
                continue
            try:
                # fromisoformat() rejects a trailing 'Z' before Python 3.11,
                # so normalise it to an explicit UTC offset first.
                iso_str = date_str[:-1] + "+00:00" if date_str.endswith("Z") else date_str
                dt = datetime.fromisoformat(iso_str)
            except ValueError as e:
                logging.warning(f"Date parsing failed for '{date_str}': {e}")
                continue
            if dt.tzinfo is None:
                # Timestamps without an offset are treated as UTC.
                dt = dt.replace(tzinfo=timezone.utc)
            if dt > cutoff:
                titles.append(headline.lower())

        context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
        logging.info(f"DuckDuckGo News context for '{title}': {context}")
        return context
    except Exception as e:
        # Best-effort enrichment: any failure falls back to the bare title so
        # the caller can continue without extra context.
        logging.warning(f"DuckDuckGo News context fetch failed for '{title}': {e}")
        return title
def curate_from_rss():
articles = fetch_rss_feeds()
if not articles:
@@ -216,10 +243,12 @@ def curate_from_rss():
attempts += 1
continue
scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
# Fetch additional context via DDG
ddg_context = fetch_duckduckgo_news_context(title)
scoring_content = f"{title}\n\n{summary}\n\nContent: {content}\n\nAdditional Context: {ddg_context}"
interest_score = is_interesting(scoring_content)
logging.info(f"Interest score for '{title}': {interest_score}")
if interest_score < 6:
if interest_score < 7:
print(f"RSS Interest Too Low: {interest_score}")
logging.info(f"RSS Interest Too Low: {interest_score}")
attempts += 1
@@ -228,8 +257,9 @@ def curate_from_rss():
num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n"
f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
f"Do NOT introduce unrelated concepts.\n"
f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
f"Expand on the core idea with relevant context about its appeal or significance.\n"
f"Do not include emojis in the summary."
)
@@ -246,20 +276,6 @@ def curate_from_rss():
attempts += 1
continue
# Remove the original title from the summary while preserving paragraphs
title_pattern = re.compile(
r'\*\*' + re.escape(title) + r'\*\*|' + re.escape(title),
re.IGNORECASE
)
paragraphs = final_summary.split('\n')
cleaned_paragraphs = []
for para in paragraphs:
if para.strip():
cleaned_para = title_pattern.sub('', para).strip()
cleaned_para = re.sub(r'\s+', ' ', cleaned_para)
cleaned_paragraphs.append(cleaned_para)
final_summary = '\n'.join(cleaned_paragraphs)
final_summary = insert_link_naturally(final_summary, source_name, link)
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
if not post_data: