From e5ebd000fe6ce3a68833080a087f5ea889b150db Mon Sep 17 00:00:00 2001
From: Shane
Date: Sun, 4 May 2025 09:07:45 +1000
Subject: [PATCH] Incorporate external DuckDuckGo News context into curation
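
Each curator now fetches recent DuckDuckGo News headlines for the candidate
title and threads them into both the interest-scoring content and the
summarization prompt as "Additional Context". Since the summarizer is now
told not to mention the original title, the RSS-side title-scrubbing regex
is dropped. Also raises the RSS interest threshold from 6 to 7, returns a
randomized back-off value instead of None when curate_from_reddit() finds
no posts, and softens the source-attribution phrasing in
insert_link_naturally(). Requires the duckduckgo-search package.

Illustrative use of the new helper (a sketch; the headline text below is
made up):

    >>> fetch_duckduckgo_news_context("birria tacos", hours=24)
    'birria tacos take over la street food scene ...'
    >>> # falls back to the bare title if the DDG lookup fails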
---
 foodie_automator_google.py | 12 +++++----
 foodie_automator_reddit.py | 44 ++++++++++++++++++++++++--------
 foodie_automator_rss.py    | 52 +++++++++++++++++++++++++-------------
 foodie_utils.py            |  8 +++---
4 files changed, 79 insertions(+), 37 deletions(-)
diff --git a/foodie_automator_google.py b/foodie_automator_google.py
index 7c6ccf9..202408e 100644
--- a/foodie_automator_google.py
+++ b/foodie_automator_google.py
@@ -215,7 +215,9 @@ def curate_from_google_trends(geo_list=['US']):
attempts += 1
continue
- scoring_content = f"{title}\n\n{summary}"
+ # Fetch additional context via DDG
+ ddg_context = fetch_duckduckgo_news_context(title)
+ scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
interest_score = is_interesting(scoring_content)
logging.info(f"Interest score for '{title}': {interest_score}")
if interest_score < 6:
@@ -227,8 +229,9 @@ def curate_from_google_trends(geo_list=['US']):
num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n"
- f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
- f"Do NOT introduce unrelated concepts.\n"
+ f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
+ f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
+ f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
f"Do not include emojis in the summary."
)
@@ -291,8 +294,7 @@ def curate_from_google_trends(geo_list=['US']):
share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
- # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
- post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
+ post_data["content"] = f"{final_summary}\n\n{share_links}"
is_posting = True
try:
post_to_wp(
diff --git a/foodie_automator_reddit.py b/foodie_automator_reddit.py
index f194789..a8962fa 100644
--- a/foodie_automator_reddit.py
+++ b/foodie_automator_reddit.py
@@ -8,6 +8,7 @@ import json
import signal
import sys
import re
+from duckduckgo_search import DDGS
from datetime import datetime, timedelta, timezone
from openai import OpenAI
from urllib.parse import quote
@@ -168,6 +169,30 @@ def get_top_comments(post_url, reddit, limit=3):
except Exception as e:
logging.error(f"Failed to fetch comments for {post_url}: {e}")
return []
+
+def fetch_duckduckgo_news_context(title, hours=24):
+    try:
+        with DDGS() as ddgs:
+            # timelimit="d" already restricts DDG to the past day; the date
+            # check below tightens the window to the `hours` argument.
+            results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
+        titles = []
+        for r in results:
+            try:
+                date_str = r["date"]
+                if '+00:00' in date_str:
+                    dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
+                else:
+                    dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
+                if dt > (datetime.now(timezone.utc) - timedelta(hours=hours)):
+                    titles.append(r["title"].lower())
+            except ValueError as e:
+                logging.warning(f"Date parsing failed for '{date_str}': {e}")
+                continue
+        context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
+        logging.info(f"DuckDuckGo News context for '{title}': {context}")
+        return context
+    except Exception as e:
+        logging.warning(f"DuckDuckGo News context fetch failed for '{title}': {e}")
+        # Fall back to the bare title so callers always get usable text
+        return title
+
def fetch_reddit_posts():
reddit = praw.Reddit(
@@ -211,7 +236,7 @@ def curate_from_reddit():
if not articles:
print("No Reddit posts available")
logging.info("No Reddit posts available")
- return None, None, None
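+ # assumption: the caller treats this third value as a retry delay in seconds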
+ return None, None, random.randint(600, 1800)
articles.sort(key=lambda x: x["upvotes"], reverse=True)
@@ -249,6 +274,9 @@ def curate_from_reddit():
continue
top_comments = get_top_comments(link, reddit, limit=3)
+ # Fetch additional context via DDG
+ ddg_context = fetch_duckduckgo_news_context(title)
+ comments_text = '\n'.join(top_comments) if top_comments else 'None'
+ content_to_summarize = f"{title}\n\n{summary}\n\nTop Comments:\n{comments_text}\n\nAdditional Context: {ddg_context}"
interest_score = is_interesting_reddit(
title,
summary,
@@ -266,15 +294,13 @@ def curate_from_reddit():
num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n"
- f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
+ f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
f"Incorporate relevant insights from these top comments if available: {', '.join(top_comments) if top_comments else 'None'}.\n"
- f"Do NOT introduce unrelated concepts unless in the content or comments.\n"
+ f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
+ f"Do NOT introduce unrelated concepts unless in the content, comments, or additional context.\n"
f"If brief, expand on the core idea with relevant context about its appeal or significance.\n"
f"Do not include emojis in the summary."
)
- content_to_summarize = f"{title}\n\n{summary}"
- if top_comments:
- content_to_summarize += f"\n\nTop Comments:\n{'\n'.join(top_comments)}"
final_summary = summarize_with_gpt4o(
content_to_summarize,
@@ -300,7 +326,6 @@ def curate_from_reddit():
image_url, image_source, uploader, page_url = get_image(image_query)
hook = get_dynamic_hook(post_data["title"]).strip()
- # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=None)
# Generate viral share prompt
share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
@@ -309,7 +334,7 @@ def curate_from_reddit():
f'…'
)
- post_data["content"] = f"{final_summary}\n\n{share_links_template}" # Removed cta from content
+ post_data["content"] = f"{final_summary}\n\n{share_links_template}"
global is_posting
is_posting = True
@@ -335,8 +360,7 @@ def curate_from_reddit():
share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
- # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
- post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
+ post_data["content"] = f"{final_summary}\n\n{share_links}"
is_posting = True
try:
post_to_wp(
diff --git a/foodie_automator_rss.py b/foodie_automator_rss.py
index 4c4d0ff..05c5743 100644
--- a/foodie_automator_rss.py
+++ b/foodie_automator_rss.py
@@ -9,6 +9,8 @@ import signal
import sys
import re
import email.utils
+import feedparser
+from duckduckgo_search import DDGS
from datetime import datetime, timedelta, timezone
from bs4 import BeautifulSoup
from openai import OpenAI
@@ -136,6 +138,7 @@ def fetch_rss_feeds():
logging.error("RSS_FEEDS is empty in foodie_config.py")
return articles
+ logging.info(f"Processing feeds: {RSS_FEEDS}")
for feed_url in RSS_FEEDS:
logging.info(f"Processing feed: {feed_url}")
try:
@@ -182,6 +185,30 @@ def fetch_rss_feeds():
logging.info(f"Total RSS articles fetched: {len(articles)}")
return articles
+def fetch_duckduckgo_news_context(title, hours=24):
+    # Same helper as in foodie_automator_reddit.py.
+    try:
+        with DDGS() as ddgs:
+            results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
+        titles = []
+        for r in results:
+            try:
+                date_str = r["date"]
+                if '+00:00' in date_str:
+                    dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
+                else:
+                    dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
+                if dt > (datetime.now(timezone.utc) - timedelta(hours=hours)):
+                    titles.append(r["title"].lower())
+            except ValueError as e:
+                logging.warning(f"Date parsing failed for '{date_str}': {e}")
+                continue
+        context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
+        logging.info(f"DuckDuckGo News context for '{title}': {context}")
+        return context
+    except Exception as e:
+        logging.warning(f"DuckDuckGo News context fetch failed for '{title}': {e}")
+        return title
+
def curate_from_rss():
articles = fetch_rss_feeds()
if not articles:
@@ -216,10 +243,12 @@ def curate_from_rss():
attempts += 1
continue
- scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
+ # Fetch additional context via DDG
+ ddg_context = fetch_duckduckgo_news_context(title)
+ scoring_content = f"{title}\n\n{summary}\n\nContent: {content}\n\nAdditional Context: {ddg_context}"
interest_score = is_interesting(scoring_content)
logging.info(f"Interest score for '{title}': {interest_score}")
- if interest_score < 6:
+ if interest_score < 7:  # stricter bar than the Google/Reddit threshold of 6
print(f"RSS Interest Too Low: {interest_score}")
logging.info(f"RSS Interest Too Low: {interest_score}")
attempts += 1
@@ -228,8 +257,9 @@ def curate_from_rss():
num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n"
- f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
- f"Do NOT introduce unrelated concepts.\n"
+ f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
+ f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
+ f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
f"Expand on the core idea with relevant context about its appeal or significance.\n"
f"Do not include emojis in the summary."
)
@@ -246,20 +276,6 @@ def curate_from_rss():
attempts += 1
continue
- # Remove the original title from the summary while preserving paragraphs
- title_pattern = re.compile(
- r'\*\*' + re.escape(title) + r'\*\*|' + re.escape(title),
- re.IGNORECASE
- )
- paragraphs = final_summary.split('\n')
- cleaned_paragraphs = []
- for para in paragraphs:
- if para.strip():
- cleaned_para = title_pattern.sub('', para).strip()
- cleaned_para = re.sub(r'\s+', ' ', cleaned_para)
- cleaned_paragraphs.append(cleaned_para)
- final_summary = '\n'.join(cleaned_paragraphs)
-
final_summary = insert_link_naturally(final_summary, source_name, link)
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
if not post_data:
diff --git a/foodie_utils.py b/foodie_utils.py
index 252e602..83f4e3a 100644
--- a/foodie_utils.py
+++ b/foodie_utils.py
@@ -612,10 +612,10 @@ def insert_link_naturally(summary, source_name, source_url):
target_para = random.choice([p for p in paragraphs if p.strip()])
link_pattern = f'<a href="{source_url}">{source_name}</a>'
phrases = [
- f"Learn more from {link_pattern}",
- f"{link_pattern} shares this insight",
- f"Discover more at {link_pattern}",
- f"Check out {link_pattern} for details"
+ f"According to {link_pattern}", # Changed to a more neutral phrasing
+ f"{link_pattern} notes this insight", # Adjusted phrasing
+ f"Details shared by {link_pattern}", # Adjusted phrasing
+ f"Source: {link_pattern}" # Simple attribution
]
insertion_phrase = random.choice(phrases)