Compare commits

..

11 Commits

Author SHA1 Message Date
Shane 3d0d320648 fix 2025-05-04 13:12:20 +10:00
Shane 504d7f6349 fix 2025-05-04 12:57:22 +10:00
Shane ccddefbc8b try 2025-05-04 12:44:50 +10:00
Shane d2022222c3 try 2025-05-04 12:14:00 +10:00
Shane 7fba0fe96a fix 2025-05-04 12:06:46 +10:00
Shane 6be8493878 fix 2025-05-04 11:09:02 +10:00
Shane e445b6ef33 fix 2025-05-04 10:44:43 +10:00
Shane 5554abdc4a fix 2025-05-04 10:35:31 +10:00
Shane 64d17d5599 try 2025-05-04 10:27:26 +10:00
Shane aa0f3364d5 fix image swap 2025-05-04 09:47:47 +10:00
Shane e5ebd000fe incorporate external context from DDG 2025-05-04 09:07:45 +10:00
4 changed files with 286 additions and 219 deletions
+11 -11
View File
@@ -208,14 +208,15 @@ def curate_from_google_trends(geo_list=['US']):
print(f"Trying Google Trend: {title} from {source_name}")
logging.info(f"Trying Google Trend: {title} from {source_name}")
image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
if skip:
print(f"Skipping filtered Google Trend: {title}")
logging.info(f"Skipping filtered Google Trend: {title}")
attempts += 1
continue
scoring_content = f"{title}\n\n{summary}"
ddg_context = fetch_duckduckgo_news_context(title)
scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
interest_score = is_interesting(scoring_content)
logging.info(f"Interest score for '{title}': {interest_score}")
if interest_score < 6:
@@ -227,8 +228,9 @@ def curate_from_google_trends(geo_list=['US']):
num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n"
f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
f"Do NOT introduce unrelated concepts.\n"
f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
f"Do not include emojis in the summary."
)
@@ -247,18 +249,17 @@ def curate_from_google_trends(geo_list=['US']):
final_summary = insert_link_naturally(final_summary, source_name, link)
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
if not post_data:
attempts += 1
continue
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
if not image_url:
image_url, image_source, uploader, page_url = get_image(image_query)
hook = get_dynamic_hook(post_data["title"]).strip()
# Generate viral share prompt
share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
share_links_template = (
f'<p>{share_prompt} '
@@ -279,7 +280,7 @@ def curate_from_google_trends(geo_list=['US']):
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
page_url=page_url,
interest_score=interest_score,
should_post_tweet=True
)
@@ -291,8 +292,7 @@ def curate_from_google_trends(geo_list=['US']):
share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
# Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
post_data["content"] = f"{final_summary}\n\n{share_links}"
is_posting = True
try:
post_to_wp(
@@ -304,7 +304,7 @@ def curate_from_google_trends(geo_list=['US']):
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
page_url=page_url,
interest_score=interest_score,
post_id=post_id,
should_post_tweet=False
+38 -16
View File
@@ -8,6 +8,7 @@ import json
import signal
import sys
import re
from duckduckgo_search import DDGS
from datetime import datetime, timedelta, timezone
from openai import OpenAI
from urllib.parse import quote
@@ -169,6 +170,30 @@ def get_top_comments(post_url, reddit, limit=3):
logging.error(f"Failed to fetch comments for {post_url}: {e}")
return []
def fetch_duckduckgo_news_context(title, hours=24):
    """Build a short context string from recent DuckDuckGo News headlines.

    Searches DuckDuckGo News for ``"<title> news"`` (at most five results),
    keeps the results published within the last ``hours`` hours and joins
    their lower-cased headlines with single spaces.

    Args:
        title: Headline or topic to look up.
        hours: Size of the recency window, in hours (default 24).

    Returns:
        The joined headlines, or a "No recent news found ..." placeholder when
        no result falls inside the window. If the lookup itself fails for any
        reason, ``title`` is returned unchanged so callers always get a string.
    """
    try:
        # Pick the narrowest DuckDuckGo time filter that still covers the
        # requested window ("d" = day, "w" = week, "m" = month, None = any).
        if hours <= 24:
            timelimit = "d"
        elif hours <= 24 * 7:
            timelimit = "w"
        elif hours <= 24 * 31:
            timelimit = "m"
        else:
            timelimit = None

        with DDGS() as ddgs:
            results = ddgs.news(f"{title} news", timelimit=timelimit, max_results=5)
            cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
            titles = []
            for r in results or []:
                date_str = r.get("date")
                try:
                    # fromisoformat understands "+00:00"-style offsets; the
                    # trailing "Z" spelling is normalised to an offset first.
                    dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
                except (AttributeError, TypeError, ValueError) as e:
                    # A missing or malformed date only disqualifies this one
                    # result; it must not abort the whole lookup.
                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
                    continue
                if dt.tzinfo is None:
                    # Timestamps without an offset are treated as UTC.
                    dt = dt.replace(tzinfo=timezone.utc)
                headline = r.get("title")
                if dt > cutoff and headline:
                    titles.append(headline.lower())
            context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
            logging.info(f"DuckDuckGo News context for '{title}': {context}")
            return context
    except Exception as e:
        # Best-effort enrichment: any failure degrades to the bare title.
        logging.warning(f"DuckDuckGo News context fetch failed for '{title}': {e}")
        return title
def fetch_reddit_posts():
reddit = praw.Reddit(
client_id=REDDIT_CLIENT_ID,
@@ -211,7 +236,7 @@ def curate_from_reddit():
if not articles:
print("No Reddit posts available")
logging.info("No Reddit posts available")
return None, None, None
return None, None, random.randint(600, 1800)
articles.sort(key=lambda x: x["upvotes"], reverse=True)
@@ -241,7 +266,7 @@ def curate_from_reddit():
print(f"Trying Reddit Post: {title} from {source_name}")
logging.info(f"Trying Reddit Post: {title} from {source_name}")
image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
if skip or any(keyword in title.lower() or keyword in raw_title.lower() for keyword in RECIPE_KEYWORDS + ["homemade"]):
print(f"Skipping filtered Reddit post: {title}")
logging.info(f"Skipping filtered Reddit post: {title}")
@@ -249,6 +274,8 @@ def curate_from_reddit():
continue
top_comments = get_top_comments(link, reddit, limit=3)
ddg_context = fetch_duckduckgo_news_context(title)
content_to_summarize = f"{title}\n\n{summary}\n\nTop Comments:\n{'\n'.join(top_comments) if top_comments else 'None'}\n\nAdditional Context: {ddg_context}"
interest_score = is_interesting_reddit(
title,
summary,
@@ -266,15 +293,13 @@ def curate_from_reddit():
num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n"
f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
f"Incorporate relevant insights from these top comments if available: {', '.join(top_comments) if top_comments else 'None'}.\n"
f"Do NOT introduce unrelated concepts unless in the content or comments.\n"
f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
f"Do NOT introduce unrelated concepts unless in the content, comments, or additional context.\n"
f"If brief, expand on the core idea with relevant context about its appeal or significance.\n"
f"Do not include emojis in the summary."
)
content_to_summarize = f"{title}\n\n{summary}"
if top_comments:
content_to_summarize += f"\n\nTop Comments:\n{'\n'.join(top_comments)}"
final_summary = summarize_with_gpt4o(
content_to_summarize,
@@ -290,26 +315,24 @@ def curate_from_reddit():
final_summary = insert_link_naturally(final_summary, source_name, link)
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
if not post_data:
attempts += 1
continue
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
if not image_url:
image_url, image_source, uploader, page_url = get_image(image_query)
hook = get_dynamic_hook(post_data["title"]).strip()
# Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=None)
# Generate viral share prompt
share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
share_links_template = (
f'<p>{share_prompt} '
f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
)
post_data["content"] = f"{final_summary}\n\n{share_links_template}" # Removed cta from content
post_data["content"] = f"{final_summary}\n\n{share_links_template}"
global is_posting
is_posting = True
@@ -323,7 +346,7 @@ def curate_from_reddit():
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
page_url=page_url,
interest_score=interest_score,
should_post_tweet=True
)
@@ -335,8 +358,7 @@ def curate_from_reddit():
share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
# Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
post_data["content"] = f"{final_summary}\n\n{share_links}"
is_posting = True
try:
post_to_wp(
@@ -348,7 +370,7 @@ def curate_from_reddit():
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
page_url=page_url,
interest_score=interest_score,
post_id=post_id,
should_post_tweet=False
+44 -37
View File
@@ -9,6 +9,8 @@ import signal
import sys
import re
import email.utils
import feedparser
from duckduckgo_search import DDGS
from datetime import datetime, timedelta, timezone
from bs4 import BeautifulSoup
from openai import OpenAI
@@ -136,6 +138,7 @@ def fetch_rss_feeds():
logging.error("RSS_FEEDS is empty in foodie_config.py")
return articles
logging.info(f"Processing feeds: {RSS_FEEDS}")
for feed_url in RSS_FEEDS:
logging.info(f"Processing feed: {feed_url}")
try:
@@ -182,8 +185,32 @@ def fetch_rss_feeds():
logging.info(f"Total RSS articles fetched: {len(articles)}")
return articles
def fetch_duckduckgo_news_context(title, hours=24):
    """Build a short context string from recent DuckDuckGo News headlines.

    Searches DuckDuckGo News for ``"<title> news"`` (at most five results),
    keeps the results published within the last ``hours`` hours and joins
    their lower-cased headlines with single spaces.

    Args:
        title: Headline or topic to look up.
        hours: Size of the recency window, in hours (default 24).

    Returns:
        The joined headlines, or a "No recent news found ..." placeholder when
        no result falls inside the window. If the lookup itself fails for any
        reason, ``title`` is returned unchanged so callers always get a string.
    """
    try:
        # Pick the narrowest DuckDuckGo time filter that still covers the
        # requested window ("d" = day, "w" = week, "m" = month, None = any).
        if hours <= 24:
            timelimit = "d"
        elif hours <= 24 * 7:
            timelimit = "w"
        elif hours <= 24 * 31:
            timelimit = "m"
        else:
            timelimit = None

        with DDGS() as ddgs:
            results = ddgs.news(f"{title} news", timelimit=timelimit, max_results=5)
            cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
            titles = []
            for r in results or []:
                date_str = r.get("date")
                try:
                    # fromisoformat understands "+00:00"-style offsets; the
                    # trailing "Z" spelling is normalised to an offset first.
                    dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
                except (AttributeError, TypeError, ValueError) as e:
                    # A missing or malformed date only disqualifies this one
                    # result; it must not abort the whole lookup.
                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
                    continue
                if dt.tzinfo is None:
                    # Timestamps without an offset are treated as UTC.
                    dt = dt.replace(tzinfo=timezone.utc)
                headline = r.get("title")
                if dt > cutoff and headline:
                    titles.append(headline.lower())
            context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
            logging.info(f"DuckDuckGo News context for '{title}': {context}")
            return context
    except Exception as e:
        # Best-effort enrichment: any failure degrades to the bare title.
        logging.warning(f"DuckDuckGo News context fetch failed for '{title}': {e}")
        return title
def curate_from_rss():
articles = fetch_rss_feeds()
articles = fetch_rss_feeds() # Corrected from fetch_rss_articles to fetch_rss_feeds
if not articles:
print("No RSS articles available")
logging.info("No RSS articles available")
@@ -195,9 +222,8 @@ def curate_from_rss():
article = articles.pop(0)
title = article["title"]
link = article["link"]
summary = article["summary"]
content = article["content"]
source_name = article["feed_title"]
summary = article.get("summary", "")
source_name = article.get("feed_title", "Unknown Source") # Adjusted to match fetch_rss_feeds output
original_source = f'<a href="{link}">{source_name}</a>'
if title in posted_titles:
@@ -209,14 +235,15 @@ def curate_from_rss():
print(f"Trying RSS Article: {title} from {source_name}")
logging.info(f"Trying RSS Article: {title} from {source_name}")
image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
if skip:
print(f"Skipping filtered RSS article: {title}")
logging.info(f"Skipping filtered RSS article: {title}")
attempts += 1
continue
scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
ddg_context = fetch_duckduckgo_news_context(title)
scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
interest_score = is_interesting(scoring_content)
logging.info(f"Interest score for '{title}': {interest_score}")
if interest_score < 6:
@@ -228,9 +255,10 @@ def curate_from_rss():
num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n"
f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
f"Do NOT introduce unrelated concepts.\n"
f"Expand on the core idea with relevant context about its appeal or significance.\n"
f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
f"Do not include emojis in the summary."
)
content_to_summarize = scoring_content
@@ -246,46 +274,26 @@ def curate_from_rss():
attempts += 1
continue
# Remove the original title from the summary while preserving paragraphs
title_pattern = re.compile(
r'\*\*' + re.escape(title) + r'\*\*|' + re.escape(title),
re.IGNORECASE
)
paragraphs = final_summary.split('\n')
cleaned_paragraphs = []
for para in paragraphs:
if para.strip():
cleaned_para = title_pattern.sub('', para).strip()
cleaned_para = re.sub(r'\s+', ' ', cleaned_para)
cleaned_paragraphs.append(cleaned_para)
final_summary = '\n'.join(cleaned_paragraphs)
final_summary = insert_link_naturally(final_summary, source_name, link)
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
if not post_data:
attempts += 1
continue
# Fetch image
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
if not image_url:
logging.info(f"Flickr fetch failed for '{image_query}'. Falling back to Pixabay.")
image_url, image_source, uploader, page_url = get_image(image_query)
if not image_url:
logging.info(f"Pixabay fetch failed for '{image_query}'. Skipping article '{title}'.")
attempts += 1
continue
hook = get_dynamic_hook(post_data["title"]).strip()
# Generate viral share prompt
share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
share_links_template = (
f'<p>{share_prompt} '
f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
)
post_data["content"] = f"{final_summary}\n\n{share_links_template}" # Removed cta from content
post_data["content"] = f"{final_summary}\n\n{share_links_template}"
global is_posting
is_posting = True
@@ -299,7 +307,7 @@ def curate_from_rss():
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
page_url=page_url,
interest_score=interest_score,
should_post_tweet=True
)
@@ -311,8 +319,7 @@ def curate_from_rss():
share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
# Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
post_data["content"] = f"{final_summary}\n\n{share_links}"
is_posting = True
try:
post_to_wp(
@@ -324,7 +331,7 @@ def curate_from_rss():
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
page_url=page_url,
interest_score=interest_score,
post_id=post_id,
should_post_tweet=False
+183 -145
View File
@@ -29,6 +29,8 @@ from foodie_config import (
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
IMAGE_EXPIRATION_DAYS = 7 # 7 days, consistent with foodie_automator_rss.py
def load_json_file(file_path, expiration_hours):
entries = []
cutoff = datetime.now(timezone.utc) - timedelta(hours=expiration_hours)
@@ -341,9 +343,10 @@ def smart_image_and_filter(title, summary):
prompt = (
"Analyze this article title and summary. Extract key entities (brands, locations, cuisines, or topics) "
"for an image search about food industry trends or viral content. Prioritize specific terms if present, "
"otherwise focus on the main theme. "
"otherwise focus on the main theme. Also identify the main topic of the article (e.g., a specific food item or cuisine). "
"Return 'SKIP' if the article is about home appliances, recipes, promotions, or contains 'homemade', else 'KEEP'. "
"Return as JSON with double quotes for all property names and string values (e.g., {\"image_query\": \"specific term\", \"relevance\": [\"keyword1\", \"keyword2\"], \"action\": \"KEEP\" or \"SKIP\"})."
"Return as JSON with double quotes for all property names and string values (e.g., "
"{\"image_query\": \"specific term\", \"relevance\": [\"keyword1\", \"keyword2\"], \"main_topic\": \"main food item\", \"action\": \"KEEP\" or \"SKIP\"})."
)
response = client.chat.completions.create(
@@ -357,38 +360,54 @@ def smart_image_and_filter(title, summary):
raw_result = response.choices[0].message.content.strip()
logging.info(f"Raw GPT smart image/filter response: '{raw_result}'")
# Remove ```json markers and fix single quotes in JSON structure
cleaned_result = re.sub(r'```json\s*|\s*```', '', raw_result).strip()
# Replace single quotes with double quotes, but preserve single quotes within string values
fixed_result = re.sub(r"(?<!\\)'(?=\s*[\w\s]*\])|(?<=\[|\{|\s)'|'(?=\s*[\]\},:])|(?<=\w)'(?=\s*:)", '"', cleaned_result)
try:
result = json.loads(fixed_result)
except json.JSONDecodeError as e:
logging.warning(f"JSON parsing failed: {e}, raw: '{fixed_result}'. Using fallback.")
return "food trends", ["cuisine", "dining"], False
# Fallback: Extract main topic using simple keyword matching
main_topic = extract_main_topic(title.lower() + " " + summary.lower())
return main_topic, [main_topic, "food"], False
if not isinstance(result, dict) or "image_query" not in result or "relevance" not in result or "action" not in result:
logging.warning(f"Invalid GPT response format: {result}, using fallback")
return "food trends", ["cuisine", "dining"], False
main_topic = extract_main_topic(title.lower() + " " + summary.lower())
return main_topic, [main_topic, "food"], False
image_query = result["image_query"]
relevance_keywords = result["relevance"]
main_topic = result.get("main_topic", extract_main_topic(title.lower() + " " + summary.lower()))
skip_flag = result["action"] == "SKIP" or "homemade" in title.lower() or "homemade" in summary.lower()
logging.info(f"Smart image query: {image_query}, Relevance: {relevance_keywords}, Skip: {skip_flag}")
logging.info(f"Smart image query: {image_query}, Relevance: {relevance_keywords}, Main Topic: {main_topic}, Skip: {skip_flag}")
if not image_query or len(image_query.split()) < 2:
specific_single_words = ["kimchi", "sushi", "pizza", "taco", "burger"]
if not image_query:
logging.warning(f"Image query is empty, using fallback")
return main_topic, [main_topic, "food"], skip_flag
if len(image_query.split()) < 2 and image_query.lower() not in specific_single_words:
logging.warning(f"Image query '{image_query}' too vague, using fallback")
return "food trends", ["cuisine", "dining"], skip_flag
return main_topic, [main_topic, "food"], skip_flag
return image_query, relevance_keywords, skip_flag
return image_query, relevance_keywords, main_topic, skip_flag
except Exception as e:
logging.error(f"Smart image/filter failed: {e}, using fallback")
return "food trends", ["cuisine", "dining"], False
main_topic = extract_main_topic(title.lower() + " " + summary.lower())
return main_topic, [main_topic, "food"], False
def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_password, image_source="Pixabay", uploader=None, pixabay_url=None):
def extract_main_topic(text):
    """Return the first known food keyword contained in ``text``.

    Matching is a case-insensitive substring test, so plural forms such as
    "tacos" still match "taco". Keywords are tried in the order listed below,
    not in the order they occur in ``text``.

    Args:
        text: Free text to scan (typically title plus summary). ``None`` or an
            empty string is treated as containing no keyword.

    Returns:
        The matching keyword, or the generic term "food trends" when none of
        the keywords is present.
    """
    # Common food-related keywords (expand as needed); list order is priority.
    food_keywords = ("kimchi", "sushi", "pizza", "taco", "burger", "ramen", "curry", "pasta", "salad", "soup")
    # Normalise here instead of relying on every caller to lower-case first.
    haystack = (text or "").lower()
    # Fallback to a generic term if no specific food item is found.
    return next((keyword for keyword in food_keywords if keyword in haystack), "food trends")
def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_password, image_source="Pixabay", uploader=None, page_url=None):
try:
safe_title = post_title.encode('ascii', 'ignore').decode('ascii').replace(' ', '_')[:50]
headers = {
@@ -401,12 +420,11 @@ def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_passw
}
logging.info(f"Fetching image from {image_url} for '{post_title}'")
# Add rate limit handling for image download
for attempt in range(3):
try:
image_response = requests.get(image_url, headers=image_headers, timeout=10)
if image_response.status_code == 429:
wait_time = 10 * (2 ** attempt) # 10s, 20s, 40s
wait_time = 10 * (2 ** attempt)
logging.warning(f"Rate limit hit for {image_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
time.sleep(wait_time)
continue
@@ -431,7 +449,12 @@ def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_passw
response.raise_for_status()
image_id = response.json()["id"]
caption = f'<a href="{pixabay_url}">{image_source}</a> by {uploader}' if pixabay_url and uploader else image_source
if page_url and uploader:
caption = f'<a href="{page_url}">{image_source}</a> by {uploader}'
elif page_url:
caption = f'<a href="{page_url}">{image_source}</a>'
else:
caption = image_source
requests.post(
f"{wp_base_url}/media/{image_id}",
headers={"Authorization": headers["Authorization"], "Content-Type": "application/json"},
@@ -565,74 +588,57 @@ def insert_link_naturally(summary, source_name, source_url):
try:
logging.info(f"Input summary to insert_link_naturally: {summary!r}")
prompt = (
"Take this summary and insert a single HTML link naturally into one paragraph (randomly chosen). "
"Use the format '<a href=\"{source_url}\">{source_name}</a>' and weave it into the text seamlessly, "
"e.g., 'The latest scoop from {source_name} reveals...' or '{source_name} shares this insight.' "
"Vary the phrasing creatively to avoid repetition (dont always use 'dives into'). "
"Place the link at a sentence boundary (after a period, not within numbers like '6.30am' or '1.5'). "
"Maintain the original tone, flow, and paragraph structure, preserving all existing newlines exactly as they are. "
"Each paragraph in the input summary is separated by a single \\n; ensure the output maintains this exact separation. "
"Do not add or remove newlines beyond the original summary structure. "
"Return the modified summary with exactly one link.\n\n"
"Summary:\n{summary}\n\n"
"Source Name: {source_name}\nSource URL: {source_url}"
).format(summary=summary, source_name=source_name, source_url=source_url)
response = client.chat.completions.create(
model=LIGHT_TASK_MODEL,
messages=[
{"role": "system", "content": prompt},
{"role": "user", "content": "Insert the link naturally into the summary."}
],
max_tokens=1000,
temperature=0.7
)
new_summary = response.choices[0].message.content.strip()
link_pattern = f'<a href="{source_url}">{source_name}</a>'
if new_summary and new_summary.count(link_pattern) == 1:
paragraphs = new_summary.split('\n')
paragraphs = [p.strip() for p in paragraphs]
new_summary = '\n'.join(paragraphs)
logging.info(f"Summary with naturally embedded link (normalized): {new_summary!r}")
return new_summary
logging.warning(f"GPT failed to insert link correctly: {new_summary}. Using fallback.")
except Exception as e:
logging.error(f"Link insertion failed: {e}")
# Fallback path
time_pattern = r'\b\d{1,2}\.\d{2}(?:am|pm)\b'
protected_summary = re.sub(time_pattern, lambda m: m.group(0).replace('.', '@'), summary)
paragraphs = protected_summary.split('\n')
paragraphs = summary.split('\n')
if not paragraphs or all(not p.strip() for p in paragraphs):
logging.error("No valid paragraphs to insert link.")
return summary
target_para = random.choice([p for p in paragraphs if p.strip()])
eligible_paragraphs = [p for p in paragraphs if p.strip() and len(re.split(r'(?<=[.!?])\s+', p.strip())) >= 2]
if not eligible_paragraphs:
logging.warning("No paragraph with multiple sentences found, appending to last paragraph.")
target_para = paragraphs[-1].strip()
link_pattern = f'<a href="{source_url}">{source_name}</a>'
phrases = [
f"Learn more from {link_pattern}",
f"{link_pattern} shares this insight",
f"Discover more at {link_pattern}",
f"Check out {link_pattern} for details"
]
insertion_phrase = random.choice(phrases)
sentences = re.split(r'(?<=[.!?])\s+', target_para)
insertion_point = -1
for i, sent in enumerate(sentences):
if sent.strip() and '@' not in sent:
insertion_point = sum(len(s) + 1 for s in sentences[:i+1])
break
if insertion_point == -1:
insertion_point = len(target_para)
new_para = f"{target_para[:insertion_point]} {insertion_phrase}. {target_para[insertion_point:]}".strip()
paragraphs[paragraphs.index(target_para)] = new_para
new_para = f"{target_para} Source: {link_pattern}."
paragraphs[-1] = new_para
new_summary = '\n'.join(paragraphs)
logging.info(f"Appended link to summary: {new_summary!r}")
return new_summary
new_summary = new_summary.replace('@', '.')
target_para = random.choice(eligible_paragraphs)
sentences = re.split(r'(?<=[.!?])\s+', target_para.strip())
eligible_sentences = [(i, s) for i, s in enumerate(sentences) if i < len(sentences)-1 and s.strip()]
if not eligible_sentences:
eligible_sentences = [(i, s) for i, s in enumerate(sentences) if s.strip()]
sentence_idx, sentence = random.choice(eligible_sentences)
link_pattern = f'<a href="{source_url}">{source_name}</a>'
words = sentence.split()
if len(words) < 5: # Ensure enough words for natural insertion
new_sentence = f"{sentence.rstrip('.')} according to {link_pattern}."
else:
split_point = random.randint(2, len(words)-3) # Split further into the sentence
# Remove trailing punctuation from the first part and ensure proper grammar
first_part = ' '.join(words[:split_point]).rstrip(',')
second_part = ' '.join(words[split_point:]).lstrip(',')
new_sentence = f"{first_part} according to {link_pattern} {second_part}"
# Ensure the sentence ends with a period
if not new_sentence.endswith('.'):
new_sentence += '.'
sentences[sentence_idx] = new_sentence
new_para = ' '.join(sentences)
paragraphs[paragraphs.index(target_para)] = new_para
new_summary = '\n'.join(paragraphs)
logging.info(f"Summary with naturally embedded link: {new_summary!r}")
return new_summary
except Exception as e:
logging.error(f"Link insertion failed: {e}")
link_pattern = f'<a href="{source_url}">{source_name}</a>'
new_summary = f"{summary}\n\nSource: {link_pattern}."
logging.info(f"Fallback summary with link: {new_summary!r}")
return new_summary
@@ -702,7 +708,7 @@ def get_wp_tag_id(tag_name, wp_base_url, wp_username, wp_password):
logging.error(f"Failed to get WP tag ID for '{tag_name}': {e}")
return None
def post_to_wp(post_data, category, link, author, image_url, original_source, image_source="Pixabay", uploader=None, pixabay_url=None, interest_score=4, post_id=None, should_post_tweet=True):
def post_to_wp(post_data, category, link, author, image_url, original_source, image_source="Pixabay", uploader=None, page_url=None, interest_score=4, post_id=None, should_post_tweet=True):
wp_base_url = "https://insiderfoodie.com/wp-json/wp/v2"
logging.info(f"Starting post_to_wp for '{post_data['title']}', image_source: {image_source}")
@@ -749,6 +755,8 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
content = "Content unavailable. Check the original source for details."
formatted_content = "\n".join(f"<p>{para}</p>" for para in content.split('\n') if para.strip())
# Removed the block that appends image attribution to the content
author_id_map = {
"owenjohnson": 10,
"javiermorales": 2,
@@ -759,17 +767,16 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
}
author_id = author_id_map.get(author["username"], 5)
# Handle image upload
image_id = None
if image_url:
logging.info(f"Attempting image upload for '{post_data['title']}', URL: {image_url}, source: {image_source}")
image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, pixabay_url)
image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, page_url)
if not image_id:
logging.info(f"Flickr upload failed for '{post_data['title']}', falling back to Pixabay")
pixabay_query = post_data["title"][:50]
image_url, image_source, uploader, pixabay_url = get_image(pixabay_query)
image_url, image_source, uploader, page_url = get_image(pixabay_query)
if image_url:
image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, pixabay_url)
image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, page_url)
if not image_id:
logging.warning(f"All image uploads failed for '{post_data['title']}' - posting without image")
@@ -808,11 +815,9 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
post_id = post_info["id"]
post_url = post_info["link"]
# Save to recent_posts.json
timestamp = datetime.now(timezone.utc).isoformat()
save_post_to_recent(post_data["title"], post_url, author["username"], timestamp)
# Post article tweet to X only if should_post_tweet is True
if should_post_tweet:
try:
post = {"title": post_data["title"], "url": post_url}
@@ -864,42 +869,39 @@ used_images = set()
# Load used images from file if it exists
if os.path.exists(used_images_file):
try:
with open(used_images_file, 'r') as f:
content = f.read().strip()
if not content:
logging.warning(f"Used images file {used_images_file} is empty. Resetting to empty list.")
data = []
entries = load_json_file(used_images_file, IMAGE_EXPIRATION_DAYS * 24) # Use load_json_file for consistency
for entry in entries:
if isinstance(entry, dict) and "title" in entry and entry["title"].startswith('https://'):
used_images.add(entry["title"])
else:
data = json.loads(content)
if not isinstance(data, list):
logging.warning(f"Invalid format in {used_images_file}: expected a list, got {type(data)}. Converting to list.")
if isinstance(data, dict):
# If it's a dict, try to extract URLs from values
data = [v for v in data.values() if isinstance(v, str) and v.startswith('https://')]
else:
logging.warning(f"Cannot convert {type(data)} to list. Resetting to empty list.")
data = []
# Filter out non-string or non-URL entries
data = [item for item in data if isinstance(item, str) and item.startswith('https://')]
used_images.update(data)
logging.warning(f"Skipping invalid entry in {used_images_file}: {entry}")
logging.info(f"Loaded {len(used_images)} used image URLs from {used_images_file}")
except Exception as e:
logging.warning(f"Failed to load used images from {used_images_file}: {e}. Resetting to empty set.")
used_images = set()
with open(used_images_file, 'w') as f:
json.dump([], f)
f.write("")
# Function to save used_images to file
def save_used_images():
try:
# Ensure used_images contains only valid URLs
valid_urls = [url for url in used_images if isinstance(url, str) and url.startswith('https://')]
if len(valid_urls) != len(used_images):
logging.warning(f"Found {len(used_images) - len(valid_urls)} invalid URLs in used_images set")
# Load existing entries to preserve timestamps
entries = load_json_file(used_images_file, IMAGE_EXPIRATION_DAYS * 24)
existing_entries = {entry["title"]: entry for entry in entries if isinstance(entry, dict) and "title" in entry}
# Create new entries for used_images
timestamp = datetime.now(timezone.utc).isoformat()
updated_entries = []
for url in used_images:
if url in existing_entries:
updated_entries.append(existing_entries[url])
else:
updated_entries.append({"title": url, "timestamp": timestamp})
with open(used_images_file, 'w') as f:
json.dump(valid_urls, f, indent=2)
logging.info(f"Saved {len(valid_urls)} used image URLs to {used_images_file}")
for entry in updated_entries:
f.write(json.dumps(entry) + '\n')
logging.info(f"Saved {len(updated_entries)} used image URLs to {used_images_file}")
except Exception as e:
logging.warning(f"Failed to save used images to {used_images_file}: {e}")
@@ -930,8 +932,18 @@ def process_photo(photo, search_query):
logging.warning(f"Medium size not available for photo {photo.id}: {e}")
return None
if not img_url or img_url in used_images:
logging.info(f"Image URL invalid or already used for photo {photo.id}: {img_url}")
if not img_url:
logging.info(f"Image URL invalid for photo {photo.id}")
return None
# Check if the image is highly relevant to the query
query_keywords = set(search_query.lower().split())
photo_keywords = set(tags + title.split())
is_relevant = bool(query_keywords & photo_keywords) # Check if any query keyword is in tags or title
# Allow reuse of highly relevant images
if img_url in used_images and not is_relevant:
logging.info(f"Image already used and not highly relevant for photo {photo.id}: {img_url}")
return None
uploader = photo.owner.username
@@ -1037,14 +1049,13 @@ def classify_keywords(keywords):
logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
return {kw: "specific" for kw in keywords}
def get_flickr_image(search_query, relevance_keywords):
def get_flickr_image(search_query, relevance_keywords, main_topic):
global last_flickr_request_time, flickr_request_count
reset_flickr_request_count()
flickr_request_count += 1
logging.info(f"Flickr request count: {flickr_request_count}/3600")
# Enforce a minimum delay of 10 seconds between Flickr requests
current_time = time.time()
time_since_last_request = current_time - last_flickr_request_time
if time_since_last_request < 10:
@@ -1052,7 +1063,15 @@ def get_flickr_image(search_query, relevance_keywords):
last_flickr_request_time = time.time()
# Step 1: Search DDG to find Flickr photo IDs
# Step 1: Search Flickr directly with the original query
logging.info(f"Searching Flickr directly with query: '{search_query}'")
photos = search_flickr(search_query)
for photo in photos:
result = process_photo(photo, search_query)
if result:
return result
# Step 2: Search DDG to find Flickr photo IDs
logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
photo_ids = search_ddg_for_flickr(search_query)
if photo_ids:
@@ -1063,13 +1082,12 @@ def get_flickr_image(search_query, relevance_keywords):
if result:
return result
# Step 2: Break down the query into keywords and classify them for direct Flickr API search
# Step 3: Break down the query into keywords and classify them
keywords = search_query.lower().split()
if len(keywords) > 1:
classifications = classify_keywords(keywords)
logging.info(f"Keyword classifications: {classifications}")
# Prioritize specific keywords
specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
if specific_keywords:
for keyword in specific_keywords:
@@ -1080,9 +1098,17 @@ def get_flickr_image(search_query, relevance_keywords):
if result:
return result
# Step 3: Final fallback using relevance keywords
# Step 4: Fallback using main topic
logging.info(f"No results found. Falling back to main topic: '{main_topic}'")
photos = search_flickr(main_topic)
for photo in photos:
result = process_photo(photo, main_topic)
if result:
return result
# Step 5: Final fallback using relevance keywords
fallback_query = " ".join(relevance_keywords) if isinstance(relevance_keywords, list) else relevance_keywords
logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
logging.info(f"No results with main topic. Falling back to relevance keywords: '{fallback_query}'")
photos = search_flickr(fallback_query)
for photo in photos:
result = process_photo(photo, search_query)
@@ -1116,46 +1142,58 @@ def select_best_author(summary):
logging.error(f"Author selection failed: {e}")
return "owenjohnson"
def prepare_post_data(final_summary, original_title, context_info=""):
innovative_title = generate_title_from_summary(final_summary)
if not innovative_title:
logging.info(f"Title generation failed for '{original_title}' {context_info}")
def prepare_post_data(summary, title, main_topic=None):
try:
logging.info(f"Preparing post data for summary: {summary[:100]}...")
# Use the original generate_title_from_summary function to generate the title
new_title = generate_title_from_summary(summary)
if not new_title:
logging.warning("Title generation failed, using fallback title")
new_title = "A Tasty Food Discovery Awaits You"
logging.info(f"Generated new title: '{new_title}'")
# Update to unpack four values
search_query, relevance_keywords, generated_main_topic, skip_flag = smart_image_and_filter(new_title, summary)
if skip_flag:
logging.info("Summary filtered out during post preparation")
return None, None, None, None, None, None, None
# Pass innovative_title and final_summary as separate arguments
search_query, relevance_keywords, _ = generate_image_query(innovative_title, final_summary)
if not search_query:
logging.info(f"Image query generation failed for '{innovative_title}' {context_info}")
# Use the provided main_topic if available, otherwise use the generated one
effective_main_topic = main_topic if main_topic else generated_main_topic
image_url, image_source, uploader, page_url = get_flickr_image(search_query, relevance_keywords, effective_main_topic)
if not image_url:
image_url, image_source, uploader, page_url = get_image(search_query)
if not image_url:
logging.warning("No image found for post, skipping")
return None, None, None, None, None, None, None
logging.info(f"Fetching Flickr image for query: '{search_query}' {context_info}")
image_url, image_source, uploader, page_url = get_flickr_image(search_query, relevance_keywords)
# Select a full author dictionary from AUTHORS (already imported from foodie_config)
author = random.choice(AUTHORS)
if not image_url:
logging.info(f"Flickr fetch failed for '{search_query}' - falling back to Pixabay {context_info}")
# Use the same title and summary for fallback
image_query, _, _ = generate_image_query(innovative_title, final_summary)
image_url, image_source, uploader, page_url = get_image(image_query)
if not image_url:
logging.info(f"Pixabay fetch failed for title '{innovative_title}' - falling back to summary {context_info}")
image_query, _, _ = generate_image_query(final_summary, final_summary) # Using summary as both title and summary for fallback
image_url, image_source, uploader, page_url = get_image(image_query)
if not image_url:
logging.info(f"Image fetch failed again for '{original_title}' - proceeding without image {context_info}")
categories = ["Food", "Trends", "Eats", "Culture"]
category = random.choice(categories)
post_data = {"title": innovative_title, "content": final_summary}
selected_username = select_best_author(final_summary)
author = next((a for a in AUTHORS if a["username"] == selected_username), None)
if not author:
logging.error(f"Author '{selected_username}' not found in AUTHORS, defaulting to owenjohnson")
author = {"username": "owenjohnson", "password": "rfjk xhn6 2RPy FuQ9 cGlU K8mC"}
category = generate_category_from_summary(final_summary)
post_data = {
"title": new_title,
"content": summary,
"status": "publish",
"author": author["username"], # Use the username in post_data
"categories": [category]
}
logging.info(f"Post data prepared: Title: '{new_title}', Category: {category}, Author: {author['username']}")
return post_data, author, category, image_url, image_source, uploader, page_url
except Exception as e:
logging.error(f"Failed to prepare post data: {e}")
return None, None, None, None, None, None, None
def save_post_to_recent(post_title, post_url, author_username, timestamp):
try:
recent_posts = load_json_file('/home/shane/foodie_automator/recent_posts.json')
recent_posts = load_json_file('/home/shane/foodie_automator/recent_posts.json', 24) # Added expiration_hours
entry = {
"title": post_title,
"url": post_url,