fix

try
2025-05-04 13:12:20 +10:00 · 2025-05-04 12:57:22 +10:00 · 2025-05-04 12:44:50 +10:00 · 2025-05-04 12:14:00 +10:00 · 2025-05-04 12:06:46 +10:00 · 2025-05-04 11:09:02 +10:00
4 changed files with 286 additions and 219 deletions
@@ -208,14 +208,15 @@ def curate_from_google_trends(geo_list=['US']):
        print(f"Trying Google Trend: {title} from {source_name}")
        logging.info(f"Trying Google Trend: {title} from {source_name}")

-        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
+        image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
        if skip:
            print(f"Skipping filtered Google Trend: {title}")
            logging.info(f"Skipping filtered Google Trend: {title}")
            attempts += 1
            continue

-        scoring_content = f"{title}\n\n{summary}"
+        ddg_context = fetch_duckduckgo_news_context(title)
+        scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
        interest_score = is_interesting(scoring_content)
        logging.info(f"Interest score for '{title}': {interest_score}")
        if interest_score < 6:
@@ -227,8 +228,9 @@ def curate_from_google_trends(geo_list=['US']):
        num_paragraphs = determine_paragraph_count(interest_score)
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs.\n"
-            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
-            f"Do NOT introduce unrelated concepts.\n"
+            f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
+            f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
+            f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
            f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
            f"Do not include emojis in the summary."
        )
@@ -247,18 +249,17 @@ def curate_from_google_trends(geo_list=['US']):

        final_summary = insert_link_naturally(final_summary, source_name, link)

-        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
+        post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
        if not post_data:
            attempts += 1
            continue

-        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
+        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
        if not image_url:
            image_url, image_source, uploader, page_url = get_image(image_query)

        hook = get_dynamic_hook(post_data["title"]).strip()

-        # Generate viral share prompt
        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
        share_links_template = (
            f'<p>{share_prompt} '
@@ -279,7 +280,7 @@ def curate_from_google_trends(geo_list=['US']):
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
-                pixabay_url=pixabay_url,
+                page_url=page_url,
                interest_score=interest_score,
                should_post_tweet=True
            )
@@ -291,8 +292,7 @@ def curate_from_google_trends(geo_list=['US']):
            share_text_encoded = quote(share_text)
            post_url_encoded = quote(post_url)
            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
-            # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
-            post_data["content"] = f"{final_summary}\n\n{share_links}"  # Removed cta from content
+            post_data["content"] = f"{final_summary}\n\n{share_links}"
            is_posting = True
            try:
                post_to_wp(
@@ -304,7 +304,7 @@ def curate_from_google_trends(geo_list=['US']):
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
-                    pixabay_url=pixabay_url,
+                    page_url=page_url,
                    interest_score=interest_score,
                    post_id=post_id,
                    should_post_tweet=False
@@ -8,6 +8,7 @@ import json
 import signal
 import sys
 import re
+from duckduckgo_search import DDGS
 from datetime import datetime, timedelta, timezone
 from openai import OpenAI
 from urllib.parse import quote
@@ -169,6 +170,30 @@ def get_top_comments(post_url, reddit, limit=3):
        logging.error(f"Failed to fetch comments for {post_url}: {e}")
        return []
    
+def fetch_duckduckgo_news_context(title, hours=24):
+    """Build a short text context from recent DuckDuckGo News headlines.
+
+    Searches DuckDuckGo News for "<title> news", keeps the lowercased
+    headlines of results dated within the last 24 hours, and joins them
+    with single spaces.
+
+    Args:
+        title: Topic or headline to search for.
+        hours: Accepted but never read in this function; the recency window
+            below is hard-coded to 24 hours.
+
+    Returns:
+        The space-joined headlines, the literal string
+        "No recent news found within 24 hours" when no result qualifies, or
+        `title` itself if an exception escapes the per-result handling
+        (that failure is logged as a warning).
+    """
+    try:
+        with DDGS() as ddgs:
+            # NOTE(review): timelimit="d" presumably restricts results to the
+            # past day -- confirm against the duckduckgo_search documentation.
+            results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
+            titles = []
+            for r in results:
+                try:
+                    # NOTE(review): assuming each result is a dict, a missing
+                    # "date" or "title" key raises KeyError, which the
+                    # ValueError handler below does not catch; it reaches the
+                    # outer handler and the whole call returns `title`,
+                    # discarding any headlines already collected.
+                    date_str = r["date"]
+                    # Only two timestamp layouts are handled: a literal
+                    # "+00:00" suffix or a trailing "Z". Both are tagged UTC.
+                    if '+00:00' in date_str:
+                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
+                    else:
+                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
+                    # Keep only results newer than 24 hours before now (UTC).
+                    if dt > (datetime.now(timezone.utc) - timedelta(hours=24)):
+                        titles.append(r["title"].lower())
+                except ValueError as e:
+                    # A date in any other layout is logged and that result is skipped.
+                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
+                    continue
+            context = " ".join(titles) if titles else "No recent news found within 24 hours"
+            logging.info(f"DuckDuckGo News context for '{title}': {context}")
+            return context
+    except Exception as e:
+        # Best-effort lookup: on failure, fall back to the title itself.
+        logging.warning(f"DuckDuckGo News context fetch failed for '{title}': {e}")
+        return title
+
 def fetch_reddit_posts():
    reddit = praw.Reddit(
        client_id=REDDIT_CLIENT_ID,
@@ -211,7 +236,7 @@ def curate_from_reddit():
    if not articles:
        print("No Reddit posts available")
        logging.info("No Reddit posts available")
-        return None, None, None
+        return None, None, random.randint(600, 1800)

    articles.sort(key=lambda x: x["upvotes"], reverse=True)
    
@@ -241,7 +266,7 @@ def curate_from_reddit():
        print(f"Trying Reddit Post: {title} from {source_name}")
        logging.info(f"Trying Reddit Post: {title} from {source_name}")
        
-        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
+        image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
        if skip or any(keyword in title.lower() or keyword in raw_title.lower() for keyword in RECIPE_KEYWORDS + ["homemade"]):
            print(f"Skipping filtered Reddit post: {title}")
            logging.info(f"Skipping filtered Reddit post: {title}")
@@ -249,6 +274,8 @@ def curate_from_reddit():
            continue
        
        top_comments = get_top_comments(link, reddit, limit=3)
+        ddg_context = fetch_duckduckgo_news_context(title)
+        content_to_summarize = f"{title}\n\n{summary}\n\nTop Comments:\n{'\n'.join(top_comments) if top_comments else 'None'}\n\nAdditional Context: {ddg_context}"
        interest_score = is_interesting_reddit(
            title,
            summary,
@@ -266,15 +293,13 @@ def curate_from_reddit():
        num_paragraphs = determine_paragraph_count(interest_score)
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs.\n"
-            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
+            f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
            f"Incorporate relevant insights from these top comments if available: {', '.join(top_comments) if top_comments else 'None'}.\n"
-            f"Do NOT introduce unrelated concepts unless in the content or comments.\n"
+            f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
+            f"Do NOT introduce unrelated concepts unless in the content, comments, or additional context.\n"
            f"If brief, expand on the core idea with relevant context about its appeal or significance.\n"
            f"Do not include emojis in the summary."
        )
-        content_to_summarize = f"{title}\n\n{summary}"
-        if top_comments:
-            content_to_summarize += f"\n\nTop Comments:\n{'\n'.join(top_comments)}"
        
        final_summary = summarize_with_gpt4o(
            content_to_summarize,
@@ -290,26 +315,24 @@ def curate_from_reddit():
        
        final_summary = insert_link_naturally(final_summary, source_name, link)
        
-        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
+        post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
        if not post_data:
            attempts += 1
            continue
        
-        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
+        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
        if not image_url:
            image_url, image_source, uploader, page_url = get_image(image_query)
        
        hook = get_dynamic_hook(post_data["title"]).strip()
-        # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=None)
        
-        # Generate viral share prompt
        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
        share_links_template = (
            f'<p>{share_prompt} '
            f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
            f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
        )
-        post_data["content"] = f"{final_summary}\n\n{share_links_template}"  # Removed cta from content
+        post_data["content"] = f"{final_summary}\n\n{share_links_template}"
        
        global is_posting
        is_posting = True
@@ -323,7 +346,7 @@ def curate_from_reddit():
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
-                pixabay_url=pixabay_url,
+                page_url=page_url,
                interest_score=interest_score,
                should_post_tweet=True
            )
@@ -335,8 +358,7 @@ def curate_from_reddit():
            share_text_encoded = quote(share_text)
            post_url_encoded = quote(post_url)
            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
-            # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
-            post_data["content"] = f"{final_summary}\n\n{share_links}"  # Removed cta from content
+            post_data["content"] = f"{final_summary}\n\n{share_links}"
            is_posting = True
            try:
                post_to_wp(
@@ -348,7 +370,7 @@ def curate_from_reddit():
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
-                    pixabay_url=pixabay_url,
+                    page_url=page_url,
                    interest_score=interest_score,
                    post_id=post_id,
                    should_post_tweet=False
@@ -9,6 +9,8 @@ import signal
 import sys
 import re
 import email.utils
+import feedparser
+from duckduckgo_search import DDGS
 from datetime import datetime, timedelta, timezone
 from bs4 import BeautifulSoup
 from openai import OpenAI
@@ -136,6 +138,7 @@ def fetch_rss_feeds():
        logging.error("RSS_FEEDS is empty in foodie_config.py")
        return articles

+    logging.info(f"Processing feeds: {RSS_FEEDS}")
    for feed_url in RSS_FEEDS:
        logging.info(f"Processing feed: {feed_url}")
        try:
@@ -182,8 +185,32 @@ def fetch_rss_feeds():
    logging.info(f"Total RSS articles fetched: {len(articles)}")
    return articles

+def fetch_duckduckgo_news_context(title, hours=24):
+    """Build a short text context from recent DuckDuckGo News headlines.
+
+    Searches DuckDuckGo News for "<title> news", keeps the lowercased
+    headlines of results dated within the last 24 hours, and joins them
+    with single spaces.
+
+    Args:
+        title: Topic or headline to search for.
+        hours: Accepted but never read in this function; the recency window
+            below is hard-coded to 24 hours.
+
+    Returns:
+        The space-joined headlines, the literal string
+        "No recent news found within 24 hours" when no result qualifies, or
+        `title` itself if an exception escapes the per-result handling
+        (that failure is logged as a warning).
+    """
+    try:
+        with DDGS() as ddgs:
+            # NOTE(review): timelimit="d" presumably restricts results to the
+            # past day -- confirm against the duckduckgo_search documentation.
+            results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
+            titles = []
+            for r in results:
+                try:
+                    # NOTE(review): assuming each result is a dict, a missing
+                    # "date" or "title" key raises KeyError, which the
+                    # ValueError handler below does not catch; it reaches the
+                    # outer handler and the whole call returns `title`,
+                    # discarding any headlines already collected.
+                    date_str = r["date"]
+                    # Only two timestamp layouts are handled: a literal
+                    # "+00:00" suffix or a trailing "Z". Both are tagged UTC.
+                    if '+00:00' in date_str:
+                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
+                    else:
+                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
+                    # Keep only results newer than 24 hours before now (UTC).
+                    if dt > (datetime.now(timezone.utc) - timedelta(hours=24)):
+                        titles.append(r["title"].lower())
+                except ValueError as e:
+                    # A date in any other layout is logged and that result is skipped.
+                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
+                    continue
+            context = " ".join(titles) if titles else "No recent news found within 24 hours"
+            logging.info(f"DuckDuckGo News context for '{title}': {context}")
+            return context
+    except Exception as e:
+        # Best-effort lookup: on failure, fall back to the title itself.
+        logging.warning(f"DuckDuckGo News context fetch failed for '{title}': {e}")
+        return title
+
 def curate_from_rss():
-    articles = fetch_rss_feeds()
+    articles = fetch_rss_feeds()  # Corrected from fetch_rss_articles to fetch_rss_feeds
    if not articles:
        print("No RSS articles available")
        logging.info("No RSS articles available")
@@ -195,9 +222,8 @@ def curate_from_rss():
        article = articles.pop(0)
        title = article["title"]
        link = article["link"]
-        summary = article["summary"]
-        content = article["content"]
-        source_name = article["feed_title"]
+        summary = article.get("summary", "")
+        source_name = article.get("feed_title", "Unknown Source")  # Adjusted to match fetch_rss_feeds output
        original_source = f'<a href="{link}">{source_name}</a>'

        if title in posted_titles:
@@ -209,14 +235,15 @@ def curate_from_rss():
        print(f"Trying RSS Article: {title} from {source_name}")
        logging.info(f"Trying RSS Article: {title} from {source_name}")

-        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
+        image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
        if skip:
            print(f"Skipping filtered RSS article: {title}")
            logging.info(f"Skipping filtered RSS article: {title}")
            attempts += 1
            continue

-        scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
+        ddg_context = fetch_duckduckgo_news_context(title)
+        scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
        interest_score = is_interesting(scoring_content)
        logging.info(f"Interest score for '{title}': {interest_score}")
        if interest_score < 6:
@@ -228,9 +255,10 @@ def curate_from_rss():
        num_paragraphs = determine_paragraph_count(interest_score)
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs.\n"
-            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
-            f"Do NOT introduce unrelated concepts.\n"
-            f"Expand on the core idea with relevant context about its appeal or significance.\n"
+            f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
+            f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
+            f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
+            f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
            f"Do not include emojis in the summary."
        )
        content_to_summarize = scoring_content
@@ -246,46 +274,26 @@ def curate_from_rss():
            attempts += 1
            continue

-        # Remove the original title from the summary while preserving paragraphs
-        title_pattern = re.compile(
-            r'\*\*' + re.escape(title) + r'\*\*|' + re.escape(title),
-            re.IGNORECASE
-        )
-        paragraphs = final_summary.split('\n')
-        cleaned_paragraphs = []
-        for para in paragraphs:
-            if para.strip():
-                cleaned_para = title_pattern.sub('', para).strip()
-                cleaned_para = re.sub(r'\s+', ' ', cleaned_para)
-                cleaned_paragraphs.append(cleaned_para)
-        final_summary = '\n'.join(cleaned_paragraphs)
-
        final_summary = insert_link_naturally(final_summary, source_name, link)
-        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
+
+        post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
        if not post_data:
            attempts += 1
            continue

-        # Fetch image
-        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
+        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
        if not image_url:
-            logging.info(f"Flickr fetch failed for '{image_query}'. Falling back to Pixabay.")
            image_url, image_source, uploader, page_url = get_image(image_query)
-            if not image_url:
-                logging.info(f"Pixabay fetch failed for '{image_query}'. Skipping article '{title}'.")
-                attempts += 1
-                continue

        hook = get_dynamic_hook(post_data["title"]).strip()

-        # Generate viral share prompt
        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
        share_links_template = (
            f'<p>{share_prompt} '
            f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
            f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
        )
-        post_data["content"] = f"{final_summary}\n\n{share_links_template}"  # Removed cta from content
+        post_data["content"] = f"{final_summary}\n\n{share_links_template}"

        global is_posting
        is_posting = True
@@ -299,7 +307,7 @@ def curate_from_rss():
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
-                pixabay_url=pixabay_url,
+                page_url=page_url,
                interest_score=interest_score,
                should_post_tweet=True
            )
@@ -311,8 +319,7 @@ def curate_from_rss():
            share_text_encoded = quote(share_text)
            post_url_encoded = quote(post_url)
            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
-            # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
-            post_data["content"] = f"{final_summary}\n\n{share_links}"  # Removed cta from content
+            post_data["content"] = f"{final_summary}\n\n{share_links}"
            is_posting = True
            try:
                post_to_wp(
@@ -324,7 +331,7 @@ def curate_from_rss():
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
-                    pixabay_url=pixabay_url,
+                    page_url=page_url,
                    interest_score=interest_score,
                    post_id=post_id,
                    should_post_tweet=False
@@ -29,6 +29,8 @@ from foodie_config import (
 load_dotenv()
 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

+IMAGE_EXPIRATION_DAYS = 7  # 7 days, consistent with foodie_automator_rss.py
+
 def load_json_file(file_path, expiration_hours):
    entries = []
    cutoff = datetime.now(timezone.utc) - timedelta(hours=expiration_hours)
@@ -341,9 +343,10 @@ def smart_image_and_filter(title, summary):
        prompt = (
            "Analyze this article title and summary. Extract key entities (brands, locations, cuisines, or topics) "
            "for an image search about food industry trends or viral content. Prioritize specific terms if present, "
-            "otherwise focus on the main theme. "
+            "otherwise focus on the main theme. Also identify the main topic of the article (e.g., a specific food item or cuisine). "
            "Return 'SKIP' if the article is about home appliances, recipes, promotions, or contains 'homemade', else 'KEEP'. "
-            "Return as JSON with double quotes for all property names and string values (e.g., {\"image_query\": \"specific term\", \"relevance\": [\"keyword1\", \"keyword2\"], \"action\": \"KEEP\" or \"SKIP\"})."
+            "Return as JSON with double quotes for all property names and string values (e.g., "
+            "{\"image_query\": \"specific term\", \"relevance\": [\"keyword1\", \"keyword2\"], \"main_topic\": \"main food item\", \"action\": \"KEEP\" or \"SKIP\"})."
        )
        
        response = client.chat.completions.create(
@@ -357,38 +360,54 @@ def smart_image_and_filter(title, summary):
        raw_result = response.choices[0].message.content.strip()
        logging.info(f"Raw GPT smart image/filter response: '{raw_result}'")
        
-        # Remove ```json markers and fix single quotes in JSON structure
        cleaned_result = re.sub(r'```json\s*|\s*```', '', raw_result).strip()
-        # Replace single quotes with double quotes, but preserve single quotes within string values
        fixed_result = re.sub(r"(?<!\\)'(?=\s*[\w\s]*\])|(?<=\[|\{|\s)'|'(?=\s*[\]\},:])|(?<=\w)'(?=\s*:)", '"', cleaned_result)
        
        try:
            result = json.loads(fixed_result)
        except json.JSONDecodeError as e:
            logging.warning(f"JSON parsing failed: {e}, raw: '{fixed_result}'. Using fallback.")
-            return "food trends", ["cuisine", "dining"], False
+            # Fallback: Extract main topic using simple keyword matching
+            main_topic = extract_main_topic(title.lower() + " " + summary.lower())
+            return main_topic, [main_topic, "food"], False
        
        if not isinstance(result, dict) or "image_query" not in result or "relevance" not in result or "action" not in result:
            logging.warning(f"Invalid GPT response format: {result}, using fallback")
-            return "food trends", ["cuisine", "dining"], False
+            main_topic = extract_main_topic(title.lower() + " " + summary.lower())
+            return main_topic, [main_topic, "food"], False
        
        image_query = result["image_query"]
        relevance_keywords = result["relevance"]
+        main_topic = result.get("main_topic", extract_main_topic(title.lower() + " " + summary.lower()))
        skip_flag = result["action"] == "SKIP" or "homemade" in title.lower() or "homemade" in summary.lower()
        
-        logging.info(f"Smart image query: {image_query}, Relevance: {relevance_keywords}, Skip: {skip_flag}")
+        logging.info(f"Smart image query: {image_query}, Relevance: {relevance_keywords}, Main Topic: {main_topic}, Skip: {skip_flag}")
        
-        if not image_query or len(image_query.split()) < 2:
+        specific_single_words = ["kimchi", "sushi", "pizza", "taco", "burger"]
+        if not image_query:
+            logging.warning(f"Image query is empty, using fallback")
+            return main_topic, [main_topic, "food"], skip_flag
+        if len(image_query.split()) < 2 and image_query.lower() not in specific_single_words:
            logging.warning(f"Image query '{image_query}' too vague, using fallback")
-            return "food trends", ["cuisine", "dining"], skip_flag
+            return main_topic, [main_topic, "food"], skip_flag
        
-        return image_query, relevance_keywords, skip_flag
+        return image_query, relevance_keywords, main_topic, skip_flag
    
    except Exception as e:
        logging.error(f"Smart image/filter failed: {e}, using fallback")
-        return "food trends", ["cuisine", "dining"], False
+        main_topic = extract_main_topic(title.lower() + " " + summary.lower())
+        return main_topic, [main_topic, "food"], False

-def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_password, image_source="Pixabay", uploader=None, pixabay_url=None):
+def extract_main_topic(text):
+    """Return the first known food keyword contained in `text`.
+
+    Matching is a plain substring test, so a keyword also matches inside a
+    longer word (e.g. "taco" in "tacos"). The keywords are lowercase and the
+    test is case-sensitive, so `text` must already be lowercased to match.
+    When several keywords occur, the one listed earliest wins.
+
+    Args:
+        text: Text to scan for a keyword.
+
+    Returns:
+        The matching keyword, or "food trends" when none is present.
+    """
+    # Common food-related keywords (expand as needed)
+    food_keywords = ["kimchi", "sushi", "pizza", "taco", "burger", "ramen", "curry", "pasta", "salad", "soup"]
+    for keyword in food_keywords:
+        if keyword in text:
+            return keyword
+    # Fallback to a generic term if no specific food item is found
+    return "food trends"
+
+def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_password, image_source="Pixabay", uploader=None, page_url=None):
    try:
        safe_title = post_title.encode('ascii', 'ignore').decode('ascii').replace(' ', '_')[:50]
        headers = {
@@ -401,12 +420,11 @@ def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_passw
        }
        logging.info(f"Fetching image from {image_url} for '{post_title}'")
        
-        # Add rate limit handling for image download
        for attempt in range(3):
            try:
                image_response = requests.get(image_url, headers=image_headers, timeout=10)
                if image_response.status_code == 429:
-                    wait_time = 10 * (2 ** attempt)  # 10s, 20s, 40s
+                    wait_time = 10 * (2 ** attempt)
                    logging.warning(f"Rate limit hit for {image_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
                    time.sleep(wait_time)
                    continue
@@ -431,7 +449,12 @@ def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_passw
        response.raise_for_status()
        
        image_id = response.json()["id"]
-        caption = f'<a href="{pixabay_url}">{image_source}</a> by {uploader}' if pixabay_url and uploader else image_source
+        if page_url and uploader:
+            caption = f'<a href="{page_url}">{image_source}</a> by {uploader}'
+        elif page_url:
+            caption = f'<a href="{page_url}">{image_source}</a>'
+        else:
+            caption = image_source
        requests.post(
            f"{wp_base_url}/media/{image_id}",
            headers={"Authorization": headers["Authorization"], "Content-Type": "application/json"},
@@ -565,74 +588,57 @@ def insert_link_naturally(summary, source_name, source_url):
    try:
        logging.info(f"Input summary to insert_link_naturally: {summary!r}")

-        prompt = (
-            "Take this summary and insert a single HTML link naturally into one paragraph (randomly chosen). "
-            "Use the format '<a href=\"{source_url}\">{source_name}</a>' and weave it into the text seamlessly, "
-            "e.g., 'The latest scoop from {source_name} reveals...' or '{source_name} shares this insight.' "
-            "Vary the phrasing creatively to avoid repetition (don’t always use 'dives into'). "
-            "Place the link at a sentence boundary (after a period, not within numbers like '6.30am' or '1.5'). "
-            "Maintain the original tone, flow, and paragraph structure, preserving all existing newlines exactly as they are. "
-            "Each paragraph in the input summary is separated by a single \\n; ensure the output maintains this exact separation. "
-            "Do not add or remove newlines beyond the original summary structure. "
-            "Return the modified summary with exactly one link.\n\n"
-            "Summary:\n{summary}\n\n"
-            "Source Name: {source_name}\nSource URL: {source_url}"
-        ).format(summary=summary, source_name=source_name, source_url=source_url)
-        
-        response = client.chat.completions.create(
-            model=LIGHT_TASK_MODEL,
-            messages=[
-                {"role": "system", "content": prompt},
-                {"role": "user", "content": "Insert the link naturally into the summary."}
-            ],
-            max_tokens=1000,
-            temperature=0.7
-        )
-        new_summary = response.choices[0].message.content.strip()
-        link_pattern = f'<a href="{source_url}">{source_name}</a>'
-        if new_summary and new_summary.count(link_pattern) == 1:
-            paragraphs = new_summary.split('\n')
-            paragraphs = [p.strip() for p in paragraphs]
-            new_summary = '\n'.join(paragraphs)
-            logging.info(f"Summary with naturally embedded link (normalized): {new_summary!r}")
-            return new_summary
-        
-        logging.warning(f"GPT failed to insert link correctly: {new_summary}. Using fallback.")
-    except Exception as e:
-        logging.error(f"Link insertion failed: {e}")
-
-    # Fallback path
-    time_pattern = r'\b\d{1,2}\.\d{2}(?:am|pm)\b'
-    protected_summary = re.sub(time_pattern, lambda m: m.group(0).replace('.', '@'), summary)
-    paragraphs = protected_summary.split('\n')
+        paragraphs = summary.split('\n')
        if not paragraphs or all(not p.strip() for p in paragraphs):
            logging.error("No valid paragraphs to insert link.")
            return summary

-    target_para = random.choice([p for p in paragraphs if p.strip()])
+        eligible_paragraphs = [p for p in paragraphs if p.strip() and len(re.split(r'(?<=[.!?])\s+', p.strip())) >= 2]
+        if not eligible_paragraphs:
+            logging.warning("No paragraph with multiple sentences found, appending to last paragraph.")
+            target_para = paragraphs[-1].strip()
            link_pattern = f'<a href="{source_url}">{source_name}</a>'
-    phrases = [
-        f"Learn more from {link_pattern}",
-        f"{link_pattern} shares this insight",
-        f"Discover more at {link_pattern}",
-        f"Check out {link_pattern} for details"
-    ]
-    insertion_phrase = random.choice(phrases)
-    
-    sentences = re.split(r'(?<=[.!?])\s+', target_para)
-    insertion_point = -1
-    for i, sent in enumerate(sentences):
-        if sent.strip() and '@' not in sent:
-            insertion_point = sum(len(s) + 1 for s in sentences[:i+1])
-            break
-    if insertion_point == -1:
-        insertion_point = len(target_para)
-    
-    new_para = f"{target_para[:insertion_point]} {insertion_phrase}. {target_para[insertion_point:]}".strip()
-    paragraphs[paragraphs.index(target_para)] = new_para
+            new_para = f"{target_para} Source: {link_pattern}."
+            paragraphs[-1] = new_para
            new_summary = '\n'.join(paragraphs)
+            logging.info(f"Appended link to summary: {new_summary!r}")
+            return new_summary

-    new_summary = new_summary.replace('@', '.')
+        target_para = random.choice(eligible_paragraphs)
+        sentences = re.split(r'(?<=[.!?])\s+', target_para.strip())
+        
+        eligible_sentences = [(i, s) for i, s in enumerate(sentences) if i < len(sentences)-1 and s.strip()]
+        if not eligible_sentences:
+            eligible_sentences = [(i, s) for i, s in enumerate(sentences) if s.strip()]
+        
+        sentence_idx, sentence = random.choice(eligible_sentences)
+        link_pattern = f'<a href="{source_url}">{source_name}</a>'
+        
+        words = sentence.split()
+        if len(words) < 5:  # Ensure enough words for natural insertion
+            new_sentence = f"{sentence.rstrip('.')} according to {link_pattern}."
+        else:
+            split_point = random.randint(2, len(words)-3)  # Split further into the sentence
+            # Remove trailing punctuation from the first part and ensure proper grammar
+            first_part = ' '.join(words[:split_point]).rstrip(',')
+            second_part = ' '.join(words[split_point:]).lstrip(',')
+            new_sentence = f"{first_part} according to {link_pattern} {second_part}"
+            # Ensure the sentence ends with a period
+            if not new_sentence.endswith('.'):
+                new_sentence += '.'
+        
+        sentences[sentence_idx] = new_sentence
+        new_para = ' '.join(sentences)
+        paragraphs[paragraphs.index(target_para)] = new_para
+        
+        new_summary = '\n'.join(paragraphs)
+        logging.info(f"Summary with naturally embedded link: {new_summary!r}")
+        return new_summary
+
+    except Exception as e:
+        logging.error(f"Link insertion failed: {e}")
+        link_pattern = f'<a href="{source_url}">{source_name}</a>'
+        new_summary = f"{summary}\n\nSource: {link_pattern}."
        logging.info(f"Fallback summary with link: {new_summary!r}")
        return new_summary

@@ -702,7 +708,7 @@ def get_wp_tag_id(tag_name, wp_base_url, wp_username, wp_password):
        logging.error(f"Failed to get WP tag ID for '{tag_name}': {e}")
        return None

-def post_to_wp(post_data, category, link, author, image_url, original_source, image_source="Pixabay", uploader=None, pixabay_url=None, interest_score=4, post_id=None, should_post_tweet=True):
+def post_to_wp(post_data, category, link, author, image_url, original_source, image_source="Pixabay", uploader=None, page_url=None, interest_score=4, post_id=None, should_post_tweet=True):
    wp_base_url = "https://insiderfoodie.com/wp-json/wp/v2"
    logging.info(f"Starting post_to_wp for '{post_data['title']}', image_source: {image_source}")
    
@@ -749,6 +755,8 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
            content = "Content unavailable. Check the original source for details."
        formatted_content = "\n".join(f"<p>{para}</p>" for para in content.split('\n') if para.strip())
        
+        # Image attribution is intentionally not appended to the post content.
+        
        author_id_map = {
            "owenjohnson": 10,
            "javiermorales": 2,
@@ -759,17 +767,16 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
        }
        author_id = author_id_map.get(author["username"], 5)
        
-        # Handle image upload
        image_id = None
        if image_url:
            logging.info(f"Attempting image upload for '{post_data['title']}', URL: {image_url}, source: {image_source}")
-            image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, pixabay_url)
+            image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, page_url)
            if not image_id:
                logging.info(f"Flickr upload failed for '{post_data['title']}', falling back to Pixabay")
                pixabay_query = post_data["title"][:50]
-                image_url, image_source, uploader, pixabay_url = get_image(pixabay_query)
+                image_url, image_source, uploader, page_url = get_image(pixabay_query)
                if image_url:
-                    image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, pixabay_url)
+                    image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, page_url)
            if not image_id:
                logging.warning(f"All image uploads failed for '{post_data['title']}' - posting without image")
        
@@ -808,11 +815,9 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
        post_id = post_info["id"]
        post_url = post_info["link"]

-        # Save to recent_posts.json
        timestamp = datetime.now(timezone.utc).isoformat()
        save_post_to_recent(post_data["title"], post_url, author["username"], timestamp)

-        # Post article tweet to X only if should_post_tweet is True
        if should_post_tweet:
            try:
                post = {"title": post_data["title"], "url": post_url}
@@ -864,42 +869,39 @@ used_images = set()
 # Load used images from file if it exists
 if os.path.exists(used_images_file):
    try:
-        with open(used_images_file, 'r') as f:
-            content = f.read().strip()
-            if not content:
-                logging.warning(f"Used images file {used_images_file} is empty. Resetting to empty list.")
-                data = []
+        entries = load_json_file(used_images_file, IMAGE_EXPIRATION_DAYS * 24)  # Use load_json_file for consistency
+        for entry in entries:
+            if isinstance(entry, dict) and "title" in entry and entry["title"].startswith('https://'):
+                used_images.add(entry["title"])
            else:
-                data = json.loads(content)
-                if not isinstance(data, list):
-                    logging.warning(f"Invalid format in {used_images_file}: expected a list, got {type(data)}. Converting to list.")
-                    if isinstance(data, dict):
-                        # If it's a dict, try to extract URLs from values
-                        data = [v for v in data.values() if isinstance(v, str) and v.startswith('https://')]
-                    else:
-                        logging.warning(f"Cannot convert {type(data)} to list. Resetting to empty list.")
-                        data = []
-                # Filter out non-string or non-URL entries
-                data = [item for item in data if isinstance(item, str) and item.startswith('https://')]
-            used_images.update(data)
+                logging.warning(f"Skipping invalid entry in {used_images_file}: {entry}")
        logging.info(f"Loaded {len(used_images)} used image URLs from {used_images_file}")
    except Exception as e:
        logging.warning(f"Failed to load used images from {used_images_file}: {e}. Resetting to empty set.")
        used_images = set()
        with open(used_images_file, 'w') as f:
-            json.dump([], f)
+            f.write("") 

 # Function to save used_images to file
 def save_used_images():
    try:
-        # Ensure used_images contains only valid URLs
-        valid_urls = [url for url in used_images if isinstance(url, str) and url.startswith('https://')]
-        if len(valid_urls) != len(used_images):
-            logging.warning(f"Found {len(used_images) - len(valid_urls)} invalid URLs in used_images set")
+        # Load existing entries to preserve timestamps
+        entries = load_json_file(used_images_file, IMAGE_EXPIRATION_DAYS * 24)
+        existing_entries = {entry["title"]: entry for entry in entries if isinstance(entry, dict) and "title" in entry}
+        
+        # Rebuild the entry list: keep stored entries as-is, stamp unseen URLs with the current time
+        timestamp = datetime.now(timezone.utc).isoformat()
+        updated_entries = []
+        for url in used_images:
+            if url in existing_entries:
+                updated_entries.append(existing_entries[url])
+            else:
+                updated_entries.append({"title": url, "timestamp": timestamp})
        
        with open(used_images_file, 'w') as f:
-            json.dump(valid_urls, f, indent=2)
-        logging.info(f"Saved {len(valid_urls)} used image URLs to {used_images_file}")
+            for entry in updated_entries:
+                f.write(json.dumps(entry) + '\n')
+        logging.info(f"Saved {len(updated_entries)} used image URLs to {used_images_file}")
    except Exception as e:
        logging.warning(f"Failed to save used images to {used_images_file}: {e}")

@@ -930,8 +932,18 @@ def process_photo(photo, search_query):
            logging.warning(f"Medium size not available for photo {photo.id}: {e}")
            return None
    
-    if not img_url or img_url in used_images:
-        logging.info(f"Image URL invalid or already used for photo {photo.id}: {img_url}")
+    if not img_url:
+        logging.info(f"Image URL invalid for photo {photo.id}")
+        return None
+    
+    # Check if the image is highly relevant to the query
+    query_keywords = set(search_query.lower().split())
+    photo_keywords = set(tags + title.split())
+    is_relevant = bool(query_keywords & photo_keywords)  # Check if any query keyword is in tags or title
+    
+    # Allow reuse of highly relevant images
+    if img_url in used_images and not is_relevant:
+        logging.info(f"Image already used and not highly relevant for photo {photo.id}: {img_url}")
        return None
    
    uploader = photo.owner.username
@@ -1037,14 +1049,13 @@ def classify_keywords(keywords):
        logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
        return {kw: "specific" for kw in keywords}

-def get_flickr_image(search_query, relevance_keywords):
+def get_flickr_image(search_query, relevance_keywords, main_topic):
    global last_flickr_request_time, flickr_request_count
    
    reset_flickr_request_count()
    flickr_request_count += 1
    logging.info(f"Flickr request count: {flickr_request_count}/3600")
    
-    # Enforce a minimum delay of 10 seconds between Flickr requests
    current_time = time.time()
    time_since_last_request = current_time - last_flickr_request_time
    if time_since_last_request < 10:
@@ -1052,7 +1063,15 @@ def get_flickr_image(search_query, relevance_keywords):
    
    last_flickr_request_time = time.time()
    
-    # Step 1: Search DDG to find Flickr photo IDs
+    # Step 1: Search Flickr directly with the original query
+    logging.info(f"Searching Flickr directly with query: '{search_query}'")
+    photos = search_flickr(search_query)
+    for photo in photos:
+        result = process_photo(photo, search_query)
+        if result:
+            return result
+
+    # Step 2: Search DDG to find Flickr photo IDs
    logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
    photo_ids = search_ddg_for_flickr(search_query)
    if photo_ids:
@@ -1063,13 +1082,12 @@ def get_flickr_image(search_query, relevance_keywords):
                if result:
                    return result

-    # Step 2: Break down the query into keywords and classify them for direct Flickr API search
+    # Step 3: Break down the query into keywords and classify them
    keywords = search_query.lower().split()
    if len(keywords) > 1:
        classifications = classify_keywords(keywords)
        logging.info(f"Keyword classifications: {classifications}")
        
-        # Prioritize specific keywords
        specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
        if specific_keywords:
            for keyword in specific_keywords:
@@ -1080,9 +1098,17 @@ def get_flickr_image(search_query, relevance_keywords):
                    if result:
                        return result

-    # Step 3: Final fallback using relevance keywords
+    # Step 4: Fallback using main topic
+    logging.info(f"No results found. Falling back to main topic: '{main_topic}'")
+    photos = search_flickr(main_topic)
+    for photo in photos:
+        result = process_photo(photo, main_topic)
+        if result:
+            return result
+
+    # Step 5: Final fallback using relevance keywords
    fallback_query = " ".join(relevance_keywords) if isinstance(relevance_keywords, list) else relevance_keywords
-    logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
+    logging.info(f"No results with main topic. Falling back to relevance keywords: '{fallback_query}'")
    photos = search_flickr(fallback_query)
    for photo in photos:
        result = process_photo(photo, search_query)
@@ -1116,46 +1142,58 @@ def select_best_author(summary):
        logging.error(f"Author selection failed: {e}")
        return "owenjohnson"

-def prepare_post_data(final_summary, original_title, context_info=""):
-    innovative_title = generate_title_from_summary(final_summary)
-    if not innovative_title:
-        logging.info(f"Title generation failed for '{original_title}' {context_info}")
+def prepare_post_data(summary, title, main_topic=None):
+    try:
+        logging.info(f"Preparing post data for summary: {summary[:100]}...")
+        
+        # Generate the title with generate_title_from_summary; fall back to a generic title if it returns nothing
+        new_title = generate_title_from_summary(summary)
+        if not new_title:
+            logging.warning("Title generation failed, using fallback title")
+            new_title = "A Tasty Food Discovery Awaits You"
+        logging.info(f"Generated new title: '{new_title}'")
+        
+        # smart_image_and_filter returns four values: search query, relevance keywords, main topic, skip flag
+        search_query, relevance_keywords, generated_main_topic, skip_flag = smart_image_and_filter(new_title, summary)
+        if skip_flag:
+            logging.info("Summary filtered out during post preparation")
            return None, None, None, None, None, None, None
        
-    # Pass innovative_title and final_summary as separate arguments
-    search_query, relevance_keywords, _ = generate_image_query(innovative_title, final_summary)
-    if not search_query:
-        logging.info(f"Image query generation failed for '{innovative_title}' {context_info}")
+        # Use the provided main_topic if available, otherwise use the generated one
+        effective_main_topic = main_topic if main_topic else generated_main_topic
+        
+        image_url, image_source, uploader, page_url = get_flickr_image(search_query, relevance_keywords, effective_main_topic)
+        if not image_url:
+            image_url, image_source, uploader, page_url = get_image(search_query)
+        
+        if not image_url:
+            logging.warning("No image found for post, skipping")
            return None, None, None, None, None, None, None
        
-    logging.info(f"Fetching Flickr image for query: '{search_query}' {context_info}")
-    image_url, image_source, uploader, page_url = get_flickr_image(search_query, relevance_keywords)
+        # Select a full author dictionary from AUTHORS (already imported from foodie_config)
+        author = random.choice(AUTHORS)
        
-    if not image_url:
-        logging.info(f"Flickr fetch failed for '{search_query}' - falling back to Pixabay {context_info}")
-        # Use the same title and summary for fallback
-        image_query, _, _ = generate_image_query(innovative_title, final_summary)
-        image_url, image_source, uploader, page_url = get_image(image_query)
-        if not image_url:
-            logging.info(f"Pixabay fetch failed for title '{innovative_title}' - falling back to summary {context_info}")
-            image_query, _, _ = generate_image_query(final_summary, final_summary)  # Using summary as both title and summary for fallback
-            image_url, image_source, uploader, page_url = get_image(image_query)
-            if not image_url:
-                logging.info(f"Image fetch failed again for '{original_title}' - proceeding without image {context_info}")
+        categories = ["Food", "Trends", "Eats", "Culture"]
+        category = random.choice(categories)
        
-    post_data = {"title": innovative_title, "content": final_summary}
-    selected_username = select_best_author(final_summary)
-    author = next((a for a in AUTHORS if a["username"] == selected_username), None)
-    if not author:
-        logging.error(f"Author '{selected_username}' not found in AUTHORS, defaulting to owenjohnson")
-        author = {"username": "owenjohnson", "password": "rfjk xhn6 2RPy FuQ9 cGlU K8mC"}
-    category = generate_category_from_summary(final_summary)
+        post_data = {
+            "title": new_title,
+            "content": summary,
+            "status": "publish",
+            "author": author["username"],  # Use the username in post_data
+            "categories": [category]
+        }
        
+        logging.info(f"Post data prepared: Title: '{new_title}', Category: {category}, Author: {author['username']}")
        return post_data, author, category, image_url, image_source, uploader, page_url
    
+    except Exception as e:
+        logging.error(f"Failed to prepare post data: {e}")
+        return None, None, None, None, None, None, None
+
 def save_post_to_recent(post_title, post_url, author_username, timestamp):
    try:
-        recent_posts = load_json_file('/home/shane/foodie_automator/recent_posts.json')
+        recent_posts = load_json_file('/home/shane/foodie_automator/recent_posts.json', 24)  # Added expiration_hours
        entry = {
            "title": post_title,
            "url": post_url,
Author	SHA1	Message	Date
Shane	3d0d320648	fix	2025-05-04 13:12:20 +10:00
Shane	504d7f6349	fix	2025-05-04 12:57:22 +10:00
Shane	ccddefbc8b	try	2025-05-04 12:44:50 +10:00
Shane	d2022222c3	try	2025-05-04 12:14:00 +10:00
Shane	7fba0fe96a	fix	2025-05-04 12:06:46 +10:00
Shane	6be8493878	fix	2025-05-04 11:09:02 +10:00
Shane	e445b6ef33	fix	2025-05-04 10:44:43 +10:00
Shane	5554abdc4a	fix	2025-05-04 10:35:31 +10:00
Shane	64d17d5599	try	2025-05-04 10:27:26 +10:00
Shane	aa0f3364d5	fix image swap	2025-05-04 09:47:47 +10:00
Shane	e5ebd000fe	incorporate external context from DDG	2025-05-04 09:07:45 +10:00