add more subreddits

fix insert link naturally
fix
2025-05-04 14:42:54 +10:00 · 2025-05-04 13:40:28 +10:00 · 2025-05-04 13:12:20 +10:00 · 2025-05-04 12:57:22 +10:00 · 2025-05-04 12:44:50 +10:00 · 2025-05-04 12:14:00 +10:00
5 changed files with 284 additions and 221 deletions
@@ -208,14 +208,15 @@ def curate_from_google_trends(geo_list=['US']):
        print(f"Trying Google Trend: {title} from {source_name}")
        logging.info(f"Trying Google Trend: {title} from {source_name}")
-        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
+        image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
        if skip:
            print(f"Skipping filtered Google Trend: {title}")
            logging.info(f"Skipping filtered Google Trend: {title}")
            attempts += 1
            continue
-        scoring_content = f"{title}\n\n{summary}"
+        ddg_context = fetch_duckduckgo_news_context(title)
        scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
        interest_score = is_interesting(scoring_content)
        logging.info(f"Interest score for '{title}': {interest_score}")
        if interest_score < 6:
@@ -227,8 +228,9 @@ def curate_from_google_trends(geo_list=['US']):
        num_paragraphs = determine_paragraph_count(interest_score)
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs.\n"
-            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
+            f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
-            f"Do NOT introduce unrelated concepts.\n"
+            f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
            f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
            f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
            f"Do not include emojis in the summary."
        )
@@ -247,18 +249,17 @@ def curate_from_google_trends(geo_list=['US']):
        final_summary = insert_link_naturally(final_summary, source_name, link)
-        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
+        post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
        if not post_data:
            attempts += 1
            continue
-        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
+        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
        if not image_url:
            image_url, image_source, uploader, page_url = get_image(image_query)
        hook = get_dynamic_hook(post_data["title"]).strip()
        # Generate viral share prompt
        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
        share_links_template = (
            f'<p>{share_prompt} '
@@ -279,7 +280,7 @@ def curate_from_google_trends(geo_list=['US']):
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
-                pixabay_url=pixabay_url,
+                page_url=page_url,
                interest_score=interest_score,
                should_post_tweet=True
            )
@@ -291,8 +292,7 @@ def curate_from_google_trends(geo_list=['US']):
            share_text_encoded = quote(share_text)
            post_url_encoded = quote(post_url)
            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
-            # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
+            post_data["content"] = f"{final_summary}\n\n{share_links}"
            post_data["content"] = f"{final_summary}\n\n{share_links}"  # Removed cta from content
            is_posting = True
            try:
                post_to_wp(
@@ -304,7 +304,7 @@ def curate_from_google_trends(geo_list=['US']):
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
-                    pixabay_url=pixabay_url,
+                    page_url=page_url,
                    interest_score=interest_score,
                    post_id=post_id,
                    should_post_tweet=False
@@ -8,6 +8,7 @@ import json
 import signal
 import sys
 import re
 from duckduckgo_search import DDGS
 from datetime import datetime, timedelta, timezone
 from openai import OpenAI
 from urllib.parse import quote
@@ -169,6 +170,30 @@ def get_top_comments(post_url, reddit, limit=3):
        logging.error(f"Failed to fetch comments for {post_url}: {e}")
        return []
 def fetch_duckduckgo_news_context(title, hours=24):
    """Build a short text context from recent DuckDuckGo News headlines.

    Searches DuckDuckGo News for ``"<title> news"`` and keeps the lower-cased
    headlines of results whose ``date`` is newer than ``hours`` hours ago.

    Args:
        title: Headline/topic to search for.
        hours: Size of the recency window in hours. The search itself is
            issued with ``timelimit="d"`` (presumably "past day" - confirm
            against the duckduckgo_search docs), so values above 24 are
            unlikely to widen the result set.

    Returns:
        The kept headlines joined by single spaces, or the sentence
        ``"No recent news found within <hours> hours"`` when none qualify.
        If the search itself fails, ``title`` is returned unchanged so the
        caller always receives a string.
    """
    try:
        with DDGS() as ddgs:
            results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
            # Loop-invariant: compute the cutoff once instead of per result.
            cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
            titles = []
            for r in results:
                date_str = None
                try:
                    date_str = r["date"]
                    # fromisoformat() only understands a trailing "Z" from
                    # Python 3.11 on, so normalise it to an explicit offset.
                    iso_str = date_str[:-1] + "+00:00" if date_str.endswith("Z") else date_str
                    dt = datetime.fromisoformat(iso_str)
                    if dt.tzinfo is None:
                        # Naive timestamps are assumed to be UTC.
                        dt = dt.replace(tzinfo=timezone.utc)
                    if dt > cutoff:
                        titles.append(r["title"].lower())
                except ValueError as e:
                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
                    continue
                except (KeyError, TypeError, AttributeError) as e:
                    # One malformed result must not discard the headlines
                    # already collected from the other results.
                    logging.warning(f"Skipping malformed DuckDuckGo News result for '{title}': {e!r}")
                    continue
            context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
            logging.info(f"DuckDuckGo News context for '{title}': {context}")
            return context
    except Exception as e:
        # Best-effort enrichment: any search failure falls back to the title.
        logging.warning(f"DuckDuckGo News context fetch failed for '{title}': {e}")
        return title
 def fetch_reddit_posts():
    reddit = praw.Reddit(
        client_id=REDDIT_CLIENT_ID,
@@ -211,7 +236,7 @@ def curate_from_reddit():
    if not articles:
        print("No Reddit posts available")
        logging.info("No Reddit posts available")
-        return None, None, None
+        return None, None, random.randint(600, 1800)
    articles.sort(key=lambda x: x["upvotes"], reverse=True)
@@ -241,7 +266,7 @@ def curate_from_reddit():
        print(f"Trying Reddit Post: {title} from {source_name}")
        logging.info(f"Trying Reddit Post: {title} from {source_name}")
-        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
+        image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
        if skip or any(keyword in title.lower() or keyword in raw_title.lower() for keyword in RECIPE_KEYWORDS + ["homemade"]):
            print(f"Skipping filtered Reddit post: {title}")
            logging.info(f"Skipping filtered Reddit post: {title}")
@@ -249,6 +274,8 @@ def curate_from_reddit():
            continue
        top_comments = get_top_comments(link, reddit, limit=3)
        ddg_context = fetch_duckduckgo_news_context(title)
        content_to_summarize = f"{title}\n\n{summary}\n\nTop Comments:\n{'\n'.join(top_comments) if top_comments else 'None'}\n\nAdditional Context: {ddg_context}"
        interest_score = is_interesting_reddit(
            title,
            summary,
@@ -266,15 +293,13 @@ def curate_from_reddit():
        num_paragraphs = determine_paragraph_count(interest_score)
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs.\n"
-            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
+            f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
            f"Incorporate relevant insights from these top comments if available: {', '.join(top_comments) if top_comments else 'None'}.\n"
-            f"Do NOT introduce unrelated concepts unless in the content or comments.\n"
+            f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
            f"Do NOT introduce unrelated concepts unless in the content, comments, or additional context.\n"
            f"If brief, expand on the core idea with relevant context about its appeal or significance.\n"
            f"Do not include emojis in the summary."
        )
        content_to_summarize = f"{title}\n\n{summary}"
        if top_comments:
            content_to_summarize += f"\n\nTop Comments:\n{'\n'.join(top_comments)}"
        final_summary = summarize_with_gpt4o(
            content_to_summarize,
@@ -290,26 +315,24 @@ def curate_from_reddit():
        final_summary = insert_link_naturally(final_summary, source_name, link)
-        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
+        post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
        if not post_data:
            attempts += 1
            continue
-        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
+        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
        if not image_url:
            image_url, image_source, uploader, page_url = get_image(image_query)
        hook = get_dynamic_hook(post_data["title"]).strip()
        # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=None)
        # Generate viral share prompt
        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
        share_links_template = (
            f'<p>{share_prompt} '
            f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
            f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
        )
-        post_data["content"] = f"{final_summary}\n\n{share_links_template}"  # Removed cta from content
+        post_data["content"] = f"{final_summary}\n\n{share_links_template}"
        global is_posting
        is_posting = True
@@ -323,7 +346,7 @@ def curate_from_reddit():
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
-                pixabay_url=pixabay_url,
+                page_url=page_url,
                interest_score=interest_score,
                should_post_tweet=True
            )
@@ -335,8 +358,7 @@ def curate_from_reddit():
            share_text_encoded = quote(share_text)
            post_url_encoded = quote(post_url)
            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
-            # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
+            post_data["content"] = f"{final_summary}\n\n{share_links}"
            post_data["content"] = f"{final_summary}\n\n{share_links}"  # Removed cta from content
            is_posting = True
            try:
                post_to_wp(
@@ -348,7 +370,7 @@ def curate_from_reddit():
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
-                    pixabay_url=pixabay_url,
+                    page_url=page_url,
                    interest_score=interest_score,
                    post_id=post_id,
                    should_post_tweet=False
@@ -9,6 +9,8 @@ import signal
 import sys
 import re
 import email.utils
 import feedparser
 from duckduckgo_search import DDGS
 from datetime import datetime, timedelta, timezone
 from bs4 import BeautifulSoup
 from openai import OpenAI
@@ -136,6 +138,7 @@ def fetch_rss_feeds():
        logging.error("RSS_FEEDS is empty in foodie_config.py")
        return articles
    logging.info(f"Processing feeds: {RSS_FEEDS}")
    for feed_url in RSS_FEEDS:
        logging.info(f"Processing feed: {feed_url}")
        try:
@@ -182,8 +185,32 @@ def fetch_rss_feeds():
    logging.info(f"Total RSS articles fetched: {len(articles)}")
    return articles
 def fetch_duckduckgo_news_context(title, hours=24):
    """Build a short text context from recent DuckDuckGo News headlines.

    Searches DuckDuckGo News for ``"<title> news"`` and keeps the lower-cased
    headlines of results whose ``date`` is newer than ``hours`` hours ago.

    Args:
        title: Headline/topic to search for.
        hours: Size of the recency window in hours. The search itself is
            issued with ``timelimit="d"`` (presumably "past day" - confirm
            against the duckduckgo_search docs), so values above 24 are
            unlikely to widen the result set.

    Returns:
        The kept headlines joined by single spaces, or the sentence
        ``"No recent news found within <hours> hours"`` when none qualify.
        If the search itself fails, ``title`` is returned unchanged so the
        caller always receives a string.
    """
    try:
        with DDGS() as ddgs:
            results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
            # Loop-invariant: compute the cutoff once instead of per result.
            cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
            titles = []
            for r in results:
                date_str = None
                try:
                    date_str = r["date"]
                    # fromisoformat() only understands a trailing "Z" from
                    # Python 3.11 on, so normalise it to an explicit offset.
                    iso_str = date_str[:-1] + "+00:00" if date_str.endswith("Z") else date_str
                    dt = datetime.fromisoformat(iso_str)
                    if dt.tzinfo is None:
                        # Naive timestamps are assumed to be UTC.
                        dt = dt.replace(tzinfo=timezone.utc)
                    if dt > cutoff:
                        titles.append(r["title"].lower())
                except ValueError as e:
                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
                    continue
                except (KeyError, TypeError, AttributeError) as e:
                    # One malformed result must not discard the headlines
                    # already collected from the other results.
                    logging.warning(f"Skipping malformed DuckDuckGo News result for '{title}': {e!r}")
                    continue
            context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
            logging.info(f"DuckDuckGo News context for '{title}': {context}")
            return context
    except Exception as e:
        # Best-effort enrichment: any search failure falls back to the title.
        logging.warning(f"DuckDuckGo News context fetch failed for '{title}': {e}")
        return title
 def curate_from_rss():
-    articles = fetch_rss_feeds()
+    articles = fetch_rss_feeds()  # Corrected from fetch_rss_articles to fetch_rss_feeds
    if not articles:
        print("No RSS articles available")
        logging.info("No RSS articles available")
@@ -195,9 +222,8 @@ def curate_from_rss():
        article = articles.pop(0)
        title = article["title"]
        link = article["link"]
-        summary = article["summary"]
+        summary = article.get("summary", "")
-        content = article["content"]
+        source_name = article.get("feed_title", "Unknown Source")  # Adjusted to match fetch_rss_feeds output
        source_name = article["feed_title"]
        original_source = f'<a href="{link}">{source_name}</a>'
        if title in posted_titles:
@@ -209,14 +235,15 @@ def curate_from_rss():
        print(f"Trying RSS Article: {title} from {source_name}")
        logging.info(f"Trying RSS Article: {title} from {source_name}")
-        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
+        image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
        if skip:
            print(f"Skipping filtered RSS article: {title}")
            logging.info(f"Skipping filtered RSS article: {title}")
            attempts += 1
            continue
-        scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
+        ddg_context = fetch_duckduckgo_news_context(title)
        scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
        interest_score = is_interesting(scoring_content)
        logging.info(f"Interest score for '{title}': {interest_score}")
        if interest_score < 6:
@@ -228,9 +255,10 @@ def curate_from_rss():
        num_paragraphs = determine_paragraph_count(interest_score)
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs.\n"
-            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
+            f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
-            f"Do NOT introduce unrelated concepts.\n"
+            f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
-            f"Expand on the core idea with relevant context about its appeal or significance.\n"
+            f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
            f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
            f"Do not include emojis in the summary."
        )
        content_to_summarize = scoring_content
@@ -246,46 +274,26 @@ def curate_from_rss():
            attempts += 1
            continue
        # Remove the original title from the summary while preserving paragraphs
        title_pattern = re.compile(
            r'\*\*' + re.escape(title) + r'\*\*|' + re.escape(title),
            re.IGNORECASE
        )
        paragraphs = final_summary.split('\n')
        cleaned_paragraphs = []
        for para in paragraphs:
            if para.strip():
                cleaned_para = title_pattern.sub('', para).strip()
                cleaned_para = re.sub(r'\s+', ' ', cleaned_para)
                cleaned_paragraphs.append(cleaned_para)
        final_summary = '\n'.join(cleaned_paragraphs)
        final_summary = insert_link_naturally(final_summary, source_name, link)
-        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
+
        post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
        if not post_data:
            attempts += 1
            continue
-        # Fetch image
+        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
        if not image_url:
            logging.info(f"Flickr fetch failed for '{image_query}'. Falling back to Pixabay.")
            image_url, image_source, uploader, page_url = get_image(image_query)
            if not image_url:
                logging.info(f"Pixabay fetch failed for '{image_query}'. Skipping article '{title}'.")
                attempts += 1
                continue
        hook = get_dynamic_hook(post_data["title"]).strip()
        # Generate viral share prompt
        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
        share_links_template = (
            f'<p>{share_prompt} '
            f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
            f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
        )
-        post_data["content"] = f"{final_summary}\n\n{share_links_template}"  # Removed cta from content
+        post_data["content"] = f"{final_summary}\n\n{share_links_template}"
        global is_posting
        is_posting = True
@@ -299,7 +307,7 @@ def curate_from_rss():
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
-                pixabay_url=pixabay_url,
+                page_url=page_url,
                interest_score=interest_score,
                should_post_tweet=True
            )
@@ -311,8 +319,7 @@ def curate_from_rss():
            share_text_encoded = quote(share_text)
            post_url_encoded = quote(post_url)
            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
-            # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
+            post_data["content"] = f"{final_summary}\n\n{share_links}"
            post_data["content"] = f"{final_summary}\n\n{share_links}"  # Removed cta from content
            is_posting = True
            try:
                post_to_wp(
@@ -324,7 +331,7 @@ def curate_from_rss():
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
-                    pixabay_url=pixabay_url,
+                    page_url=page_url,
                    interest_score=interest_score,
                    post_id=post_id,
                    should_post_tweet=False
@@ -245,7 +245,7 @@ RSS_FEED_NAMES = {
    "https://www.eater.com/rss/full.xml": ("Eater", "https://www.eater.com/"),
    "https://www.nrn.com/rss.xml": ("Nation's Restaurant News", "https://www.nrn.com/"),
    "https://rss.nytimes.com/services/xml/rss/nyt/DiningandWine.xml": ("The New York Times", "https://www.nytimes.com/section/food"),
-    "https://www.theguardian.com/food/rss": ("The Guardian Food", "https://www.theguardian.com/food")
+    "https://www.theguardian.com/food/rss": ("The Guardian", "https://www.theguardian.com/food")
 }
 RECIPE_KEYWORDS = ["recipe", "cook", "bake", "baking", "cooking", "ingredient", "method", "mix", "stir", "preheat", "dinners", "make", "dish", "healthy"]
@@ -264,7 +264,12 @@ REDDIT_USER_AGENT = os.getenv("REDDIT_USER_AGENT")
 REDDIT_SUBREDDITS = [
    "food",
    "FoodPorn",
-    "spicy"
+    "spicy",
    "KoreanFood",
    "JapaneseFood",
    "DessertPorn",
    "ChineseFood",
    "IndianFood"    
 ]
 FAST_FOOD_KEYWORDS = [
    "mcdonald", "burger king", "wendy", "taco bell", "kfc", 
@@ -29,6 +29,8 @@ from foodie_config import (
 load_dotenv()
 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 IMAGE_EXPIRATION_DAYS = 7  # 7 days, consistent with foodie_automator_rss.py
 def load_json_file(file_path, expiration_hours):
    entries = []
    cutoff = datetime.now(timezone.utc) - timedelta(hours=expiration_hours)
@@ -341,9 +343,10 @@ def smart_image_and_filter(title, summary):
        prompt = (
            "Analyze this article title and summary. Extract key entities (brands, locations, cuisines, or topics) "
            "for an image search about food industry trends or viral content. Prioritize specific terms if present, "
-            "otherwise focus on the main theme. "
+            "otherwise focus on the main theme. Also identify the main topic of the article (e.g., a specific food item or cuisine). "
            "Return 'SKIP' if the article is about home appliances, recipes, promotions, or contains 'homemade', else 'KEEP'. "
-            "Return as JSON with double quotes for all property names and string values (e.g., {\"image_query\": \"specific term\", \"relevance\": [\"keyword1\", \"keyword2\"], \"action\": \"KEEP\" or \"SKIP\"})."
+            "Return as JSON with double quotes for all property names and string values (e.g., "
            "{\"image_query\": \"specific term\", \"relevance\": [\"keyword1\", \"keyword2\"], \"main_topic\": \"main food item\", \"action\": \"KEEP\" or \"SKIP\"})."
        )
        response = client.chat.completions.create(
@@ -357,38 +360,54 @@ def smart_image_and_filter(title, summary):
        raw_result = response.choices[0].message.content.strip()
        logging.info(f"Raw GPT smart image/filter response: '{raw_result}'")
        # Remove ```json markers and fix single quotes in JSON structure
        cleaned_result = re.sub(r'```json\s*|\s*```', '', raw_result).strip()
        # Replace single quotes with double quotes, but preserve single quotes within string values
        fixed_result = re.sub(r"(?<!\\)'(?=\s*[\w\s]*\])|(?<=\[|\{|\s)'|'(?=\s*[\]\},:])|(?<=\w)'(?=\s*:)", '"', cleaned_result)
        try:
            result = json.loads(fixed_result)
        except json.JSONDecodeError as e:
            logging.warning(f"JSON parsing failed: {e}, raw: '{fixed_result}'. Using fallback.")
-            return "food trends", ["cuisine", "dining"], False
+            # Fallback: Extract main topic using simple keyword matching
            main_topic = extract_main_topic(title.lower() + " " + summary.lower())
            return main_topic, [main_topic, "food"], False
        if not isinstance(result, dict) or "image_query" not in result or "relevance" not in result or "action" not in result:
            logging.warning(f"Invalid GPT response format: {result}, using fallback")
-            return "food trends", ["cuisine", "dining"], False
+            main_topic = extract_main_topic(title.lower() + " " + summary.lower())
            return main_topic, [main_topic, "food"], False
        image_query = result["image_query"]
        relevance_keywords = result["relevance"]
        main_topic = result.get("main_topic", extract_main_topic(title.lower() + " " + summary.lower()))
        skip_flag = result["action"] == "SKIP" or "homemade" in title.lower() or "homemade" in summary.lower()
-        logging.info(f"Smart image query: {image_query}, Relevance: {relevance_keywords}, Skip: {skip_flag}")
+        logging.info(f"Smart image query: {image_query}, Relevance: {relevance_keywords}, Main Topic: {main_topic}, Skip: {skip_flag}")
-        if not image_query or len(image_query.split()) < 2:
+        specific_single_words = ["kimchi", "sushi", "pizza", "taco", "burger"]
        if not image_query:
            logging.warning(f"Image query is empty, using fallback")
            return main_topic, [main_topic, "food"], skip_flag
        if len(image_query.split()) < 2 and image_query.lower() not in specific_single_words:
            logging.warning(f"Image query '{image_query}' too vague, using fallback")
-            return "food trends", ["cuisine", "dining"], skip_flag
+            return main_topic, [main_topic, "food"], skip_flag
-        return image_query, relevance_keywords, skip_flag
+        return image_query, relevance_keywords, main_topic, skip_flag
    except Exception as e:
        logging.error(f"Smart image/filter failed: {e}, using fallback")
-        return "food trends", ["cuisine", "dining"], False
+        main_topic = extract_main_topic(title.lower() + " " + summary.lower())
        return main_topic, [main_topic, "food"], False
-def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_password, image_source="Pixabay", uploader=None, pixabay_url=None):
def extract_main_topic(text):
    """Return the first known food keyword mentioned in ``text``.

    Keywords are tried in the priority order of ``food_keywords`` (not in
    order of appearance in the text). Matching is case-insensitive.

    Args:
        text: Free text to scan, e.g. a title and summary joined together.

    Returns:
        The matching keyword in its singular form, or ``"food trends"`` when
        ``text`` is empty or mentions none of the keywords.
    """
    # Common food-related keywords (expand as needed)
    food_keywords = ["kimchi", "sushi", "pizza", "taco", "burger", "ramen", "curry", "pasta", "salad", "soup"]
    if not text:
        return "food trends"
    for keyword in food_keywords:
        # The keyword must END at a word boundary (optionally pluralised) so it
        # is not lifted out of an unrelated longer word, e.g. "ramen" inside
        # "sacramento" or "curry" inside "scurrying". No leading boundary is
        # required, so compounds such as "cheeseburger" still count.
        if re.search(re.escape(keyword) + r"(?:s|es)?\b", text, re.IGNORECASE):
            return keyword
    # Fallback to a generic term if no specific food item is found
    return "food trends"
 def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_password, image_source="Pixabay", uploader=None, page_url=None):
    try:
        safe_title = post_title.encode('ascii', 'ignore').decode('ascii').replace(' ', '_')[:50]
        headers = {
@@ -401,12 +420,11 @@ def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_passw
        }
        logging.info(f"Fetching image from {image_url} for '{post_title}'")
        # Add rate limit handling for image download
        for attempt in range(3):
            try:
                image_response = requests.get(image_url, headers=image_headers, timeout=10)
                if image_response.status_code == 429:
-                    wait_time = 10 * (2 ** attempt)  # 10s, 20s, 40s
+                    wait_time = 10 * (2 ** attempt)
                    logging.warning(f"Rate limit hit for {image_url}. Retrying after {wait_time}s (attempt {attempt+1}/3).")
                    time.sleep(wait_time)
                    continue
@@ -431,7 +449,12 @@ def upload_image_to_wp(image_url, post_title, wp_base_url, wp_username, wp_passw
        response.raise_for_status()
        image_id = response.json()["id"]
-        caption = f'<a href="{pixabay_url}">{image_source}</a> by {uploader}' if pixabay_url and uploader else image_source
+        if page_url and uploader:
            caption = f'<a href="{page_url}">{image_source}</a> by {uploader}'
        elif page_url:
            caption = f'<a href="{page_url}">{image_source}</a>'
        else:
            caption = image_source
        requests.post(
            f"{wp_base_url}/media/{image_id}",
            headers={"Authorization": headers["Authorization"], "Content-Type": "application/json"},
@@ -565,76 +588,50 @@ def insert_link_naturally(summary, source_name, source_url):
    try:
        logging.info(f"Input summary to insert_link_naturally: {summary!r}")
-        prompt = (
+        paragraphs = summary.split('\n')
-            "Take this summary and insert a single HTML link naturally into one paragraph (randomly chosen). "
+        if not paragraphs or all(not p.strip() for p in paragraphs):
-            "Use the format '<a href=\"{source_url}\">{source_name}</a>' and weave it into the text seamlessly, "
+            logging.error("No valid paragraphs to insert link.")
-            "e.g., 'The latest scoop from {source_name} reveals...' or '{source_name} shares this insight.' "
+            return summary
            "Vary the phrasing creatively to avoid repetition (don’t always use 'dives into'). "
            "Place the link at a sentence boundary (after a period, not within numbers like '6.30am' or '1.5'). "
            "Maintain the original tone, flow, and paragraph structure, preserving all existing newlines exactly as they are. "
            "Each paragraph in the input summary is separated by a single \\n; ensure the output maintains this exact separation. "
            "Do not add or remove newlines beyond the original summary structure. "
            "Return the modified summary with exactly one link.\n\n"
            "Summary:\n{summary}\n\n"
            "Source Name: {source_name}\nSource URL: {source_url}"
        ).format(summary=summary, source_name=source_name, source_url=source_url)
-        response = client.chat.completions.create(
+        eligible_paragraphs = [p for p in paragraphs if p.strip() and len(re.split(r'(?<=[.!?])\s+', p.strip())) >= 2]
-            model=LIGHT_TASK_MODEL,
+        if not eligible_paragraphs:
-            messages=[
+            logging.warning("No paragraph with multiple sentences found, appending to last paragraph.")
-                {"role": "system", "content": prompt},
+            target_para = paragraphs[-1].strip()
-                {"role": "user", "content": "Insert the link naturally into the summary."}
+            link_pattern = f'<a href="{source_url}">{source_name}</a>'
-            ],
+            new_para = f"{target_para} Source: {link_pattern}."
-            max_tokens=1000,
+            paragraphs[-1] = new_para
            temperature=0.7
        )
        new_summary = response.choices[0].message.content.strip()
        link_pattern = f'<a href="{source_url}">{source_name}</a>'
        if new_summary and new_summary.count(link_pattern) == 1:
            paragraphs = new_summary.split('\n')
            paragraphs = [p.strip() for p in paragraphs]
            new_summary = '\n'.join(paragraphs)
-            logging.info(f"Summary with naturally embedded link (normalized): {new_summary!r}")
+            logging.info(f"Appended link to summary: {new_summary!r}")
            return new_summary
-        logging.warning(f"GPT failed to insert link correctly: {new_summary}. Using fallback.")
+        target_para = random.choice(eligible_paragraphs)
        sentences = re.split(r'(?<=[.!?])\s+', target_para.strip())
        eligible_sentences = [(i, s) for i, s in enumerate(sentences) if s.strip()]
        if not eligible_sentences:
            logging.error("No eligible sentences found for link insertion.")
            return summary
        sentence_idx, sentence = random.choice(eligible_sentences)
        link_pattern = f'<a href="{source_url}">{source_name}</a>'
        # Insert the link at the end of the sentence
        new_sentence = f"{sentence.rstrip('.')} according to {link_pattern}."
        sentences[sentence_idx] = new_sentence
        new_para = ' '.join(sentences)
        paragraphs[paragraphs.index(target_para)] = new_para
        new_summary = '\n'.join(paragraphs)
        logging.info(f"Summary with naturally embedded link: {new_summary!r}")
        return new_summary
    except Exception as e:
        logging.error(f"Link insertion failed: {e}")
-
+        link_pattern = f'<a href="{source_url}">{source_name}</a>'
-    # Fallback path
+        new_summary = f"{summary}\n\nSource: {link_pattern}."
-    time_pattern = r'\b\d{1,2}\.\d{2}(?:am|pm)\b'
+        logging.info(f"Fallback summary with link: {new_summary!r}")
-    protected_summary = re.sub(time_pattern, lambda m: m.group(0).replace('.', '@'), summary)
+        return new_summary
    paragraphs = protected_summary.split('\n')
    if not paragraphs or all(not p.strip() for p in paragraphs):
        logging.error("No valid paragraphs to insert link.")
        return summary
    target_para = random.choice([p for p in paragraphs if p.strip()])
    link_pattern = f'<a href="{source_url}">{source_name}</a>'
    phrases = [
        f"Learn more from {link_pattern}",
        f"{link_pattern} shares this insight",
        f"Discover more at {link_pattern}",
        f"Check out {link_pattern} for details"
    ]
    insertion_phrase = random.choice(phrases)
    sentences = re.split(r'(?<=[.!?])\s+', target_para)
    insertion_point = -1
    for i, sent in enumerate(sentences):
        if sent.strip() and '@' not in sent:
            insertion_point = sum(len(s) + 1 for s in sentences[:i+1])
            break
    if insertion_point == -1:
        insertion_point = len(target_para)
    new_para = f"{target_para[:insertion_point]} {insertion_phrase}. {target_para[insertion_point:]}".strip()
    paragraphs[paragraphs.index(target_para)] = new_para
    new_summary = '\n'.join(paragraphs)
    new_summary = new_summary.replace('@', '.')
    logging.info(f"Fallback summary with link: {new_summary!r}")
    return new_summary
 def generate_category_from_summary(summary):
    try:
@@ -702,7 +699,7 @@ def get_wp_tag_id(tag_name, wp_base_url, wp_username, wp_password):
        logging.error(f"Failed to get WP tag ID for '{tag_name}': {e}")
        return None
-def post_to_wp(post_data, category, link, author, image_url, original_source, image_source="Pixabay", uploader=None, pixabay_url=None, interest_score=4, post_id=None, should_post_tweet=True):
+def post_to_wp(post_data, category, link, author, image_url, original_source, image_source="Pixabay", uploader=None, page_url=None, interest_score=4, post_id=None, should_post_tweet=True):
    wp_base_url = "https://insiderfoodie.com/wp-json/wp/v2"
    logging.info(f"Starting post_to_wp for '{post_data['title']}', image_source: {image_source}")
@@ -749,6 +746,8 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
            content = "Content unavailable. Check the original source for details."
        formatted_content = "\n".join(f"<p>{para}</p>" for para in content.split('\n') if para.strip())
        # Removed the block that appends image attribution to the content
        author_id_map = {
            "owenjohnson": 10,
            "javiermorales": 2,
@@ -759,17 +758,16 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
        }
        author_id = author_id_map.get(author["username"], 5)
        # Handle image upload
        image_id = None
        if image_url:
            logging.info(f"Attempting image upload for '{post_data['title']}', URL: {image_url}, source: {image_source}")
-            image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, pixabay_url)
+            image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, page_url)
            if not image_id:
                logging.info(f"Flickr upload failed for '{post_data['title']}', falling back to Pixabay")
                pixabay_query = post_data["title"][:50]
-                image_url, image_source, uploader, pixabay_url = get_image(pixabay_query)
+                image_url, image_source, uploader, page_url = get_image(pixabay_query)
                if image_url:
-                    image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, pixabay_url)
+                    image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, page_url)
            if not image_id:
                logging.warning(f"All image uploads failed for '{post_data['title']}' - posting without image")
@@ -808,11 +806,9 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
        post_id = post_info["id"]
        post_url = post_info["link"]
        # Save to recent_posts.json
        timestamp = datetime.now(timezone.utc).isoformat()
        save_post_to_recent(post_data["title"], post_url, author["username"], timestamp)
        # Post article tweet to X only if should_post_tweet is True
        if should_post_tweet:
            try:
                post = {"title": post_data["title"], "url": post_url}
@@ -864,42 +860,39 @@ used_images = set()
 # Load used images from file if it exists
 if os.path.exists(used_images_file):
    try:
-        with open(used_images_file, 'r') as f:
+        entries = load_json_file(used_images_file, IMAGE_EXPIRATION_DAYS * 24)  # Use load_json_file for consistency
-            content = f.read().strip()
+        for entry in entries:
-            if not content:
+            if isinstance(entry, dict) and "title" in entry and entry["title"].startswith('https://'):
-                logging.warning(f"Used images file {used_images_file} is empty. Resetting to empty list.")
+                used_images.add(entry["title"])
                data = []
            else:
-                data = json.loads(content)
+                logging.warning(f"Skipping invalid entry in {used_images_file}: {entry}")
                if not isinstance(data, list):
                    logging.warning(f"Invalid format in {used_images_file}: expected a list, got {type(data)}. Converting to list.")
                    if isinstance(data, dict):
                        # If it's a dict, try to extract URLs from values
                        data = [v for v in data.values() if isinstance(v, str) and v.startswith('https://')]
                    else:
                        logging.warning(f"Cannot convert {type(data)} to list. Resetting to empty list.")
                        data = []
                # Filter out non-string or non-URL entries
                data = [item for item in data if isinstance(item, str) and item.startswith('https://')]
            used_images.update(data)
        logging.info(f"Loaded {len(used_images)} used image URLs from {used_images_file}")
    except Exception as e:
        logging.warning(f"Failed to load used images from {used_images_file}: {e}. Resetting to empty set.")
        used_images = set()
        with open(used_images_file, 'w') as f:
-            json.dump([], f)
+            f.write("") 
 # Function to save used_images to file
 def save_used_images():
    try:
-        # Ensure used_images contains only valid URLs
+        # Load existing entries to preserve timestamps
-        valid_urls = [url for url in used_images if isinstance(url, str) and url.startswith('https://')]
+        entries = load_json_file(used_images_file, IMAGE_EXPIRATION_DAYS * 24)
-        if len(valid_urls) != len(used_images):
+        existing_entries = {entry["title"]: entry for entry in entries if isinstance(entry, dict) and "title" in entry}
-            logging.warning(f"Found {len(used_images) - len(valid_urls)} invalid URLs in used_images set")
+        
        # Create new entries for used_images
        timestamp = datetime.now(timezone.utc).isoformat()
        updated_entries = []
        for url in used_images:
            if url in existing_entries:
                updated_entries.append(existing_entries[url])
            else:
                updated_entries.append({"title": url, "timestamp": timestamp})
        with open(used_images_file, 'w') as f:
-            json.dump(valid_urls, f, indent=2)
+            for entry in updated_entries:
-        logging.info(f"Saved {len(valid_urls)} used image URLs to {used_images_file}")
+                f.write(json.dumps(entry) + '\n')
        logging.info(f"Saved {len(updated_entries)} used image URLs to {used_images_file}")
    except Exception as e:
        logging.warning(f"Failed to save used images to {used_images_file}: {e}")
@@ -930,8 +923,18 @@ def process_photo(photo, search_query):
            logging.warning(f"Medium size not available for photo {photo.id}: {e}")
            return None
-    if not img_url or img_url in used_images:
+    if not img_url:
-        logging.info(f"Image URL invalid or already used for photo {photo.id}: {img_url}")
+        logging.info(f"Image URL invalid for photo {photo.id}")
        return None
    # Check if the image is highly relevant to the query
    query_keywords = set(search_query.lower().split())
    photo_keywords = set(tags + title.split())
    is_relevant = bool(query_keywords & photo_keywords)  # Check if any query keyword is in tags or title
    # Allow reuse of highly relevant images
    if img_url in used_images and not is_relevant:
        logging.info(f"Image already used and not highly relevant for photo {photo.id}: {img_url}")
        return None
    uploader = photo.owner.username
@@ -1037,14 +1040,13 @@ def classify_keywords(keywords):
        logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
        return {kw: "specific" for kw in keywords}
-def get_flickr_image(search_query, relevance_keywords):
+def get_flickr_image(search_query, relevance_keywords, main_topic):
    global last_flickr_request_time, flickr_request_count
    reset_flickr_request_count()
    flickr_request_count += 1
    logging.info(f"Flickr request count: {flickr_request_count}/3600")
    # Enforce a minimum delay of 10 seconds between Flickr requests
    current_time = time.time()
    time_since_last_request = current_time - last_flickr_request_time
    if time_since_last_request < 10:
@@ -1052,7 +1054,15 @@ def get_flickr_image(search_query, relevance_keywords):
    last_flickr_request_time = time.time()
-    # Step 1: Search DDG to find Flickr photo IDs
+    # Step 1: Search Flickr directly with the original query
    logging.info(f"Searching Flickr directly with query: '{search_query}'")
    photos = search_flickr(search_query)
    for photo in photos:
        result = process_photo(photo, search_query)
        if result:
            return result
    # Step 2: Search DDG to find Flickr photo IDs
    logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
    photo_ids = search_ddg_for_flickr(search_query)
    if photo_ids:
@@ -1063,13 +1073,12 @@ def get_flickr_image(search_query, relevance_keywords):
                if result:
                    return result
-    # Step 2: Break down the query into keywords and classify them for direct Flickr API search
+    # Step 3: Break down the query into keywords and classify them
    keywords = search_query.lower().split()
    if len(keywords) > 1:
        classifications = classify_keywords(keywords)
        logging.info(f"Keyword classifications: {classifications}")
        # Prioritize specific keywords
        specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
        if specific_keywords:
            for keyword in specific_keywords:
@@ -1080,9 +1089,17 @@ def get_flickr_image(search_query, relevance_keywords):
                    if result:
                        return result
-    # Step 3: Final fallback using relevance keywords
+    # Step 4: Fallback using main topic
    logging.info(f"No results found. Falling back to main topic: '{main_topic}'")
    photos = search_flickr(main_topic)
    for photo in photos:
        result = process_photo(photo, main_topic)
        if result:
            return result
    # Step 5: Final fallback using relevance keywords
    fallback_query = " ".join(relevance_keywords) if isinstance(relevance_keywords, list) else relevance_keywords
-    logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
+    logging.info(f"No results with main topic. Falling back to relevance keywords: '{fallback_query}'")
    photos = search_flickr(fallback_query)
    for photo in photos:
        result = process_photo(photo, search_query)
@@ -1116,46 +1133,58 @@ def select_best_author(summary):
        logging.error(f"Author selection failed: {e}")
        return "owenjohnson"
-def prepare_post_data(final_summary, original_title, context_info=""):
+def prepare_post_data(summary, title, main_topic=None):
-    innovative_title = generate_title_from_summary(final_summary)
+    try:
-    if not innovative_title:
+        logging.info(f"Preparing post data for summary: {summary[:100]}...")
        logging.info(f"Title generation failed for '{original_title}' {context_info}")
        return None, None, None, None, None, None, None
-    # Pass innovative_title and final_summary as separate arguments
+        # Use the original generate_title_from_summary function to generate the title
-    search_query, relevance_keywords, _ = generate_image_query(innovative_title, final_summary)
+        new_title = generate_title_from_summary(summary)
-    if not search_query:
+        if not new_title:
-        logging.info(f"Image query generation failed for '{innovative_title}' {context_info}")
+            logging.warning("Title generation failed, using fallback title")
-        return None, None, None, None, None, None, None
+            new_title = "A Tasty Food Discovery Awaits You"
        logging.info(f"Generated new title: '{new_title}'")
-    logging.info(f"Fetching Flickr image for query: '{search_query}' {context_info}")
+        # Update to unpack four values
-    image_url, image_source, uploader, page_url = get_flickr_image(search_query, relevance_keywords)
+        search_query, relevance_keywords, generated_main_topic, skip_flag = smart_image_and_filter(new_title, summary)
        if skip_flag:
            logging.info("Summary filtered out during post preparation")
            return None, None, None, None, None, None, None
-    if not image_url:
+        # Use the provided main_topic if available, otherwise use the generated one
-        logging.info(f"Flickr fetch failed for '{search_query}' - falling back to Pixabay {context_info}")
+        effective_main_topic = main_topic if main_topic else generated_main_topic
-        # Use the same title and summary for fallback
+        
-        image_query, _, _ = generate_image_query(innovative_title, final_summary)
+        image_url, image_source, uploader, page_url = get_flickr_image(search_query, relevance_keywords, effective_main_topic)
        image_url, image_source, uploader, page_url = get_image(image_query)
        if not image_url:
-            logging.info(f"Pixabay fetch failed for title '{innovative_title}' - falling back to summary {context_info}")
+            image_url, image_source, uploader, page_url = get_image(search_query)
            image_query, _, _ = generate_image_query(final_summary, final_summary)  # Using summary as both title and summary for fallback
            image_url, image_source, uploader, page_url = get_image(image_query)
            if not image_url:
                logging.info(f"Image fetch failed again for '{original_title}' - proceeding without image {context_info}")
-    post_data = {"title": innovative_title, "content": final_summary}
+        if not image_url:
-    selected_username = select_best_author(final_summary)
+            logging.warning("No image found for post, skipping")
-    author = next((a for a in AUTHORS if a["username"] == selected_username), None)
+            return None, None, None, None, None, None, None
    if not author:
        logging.error(f"Author '{selected_username}' not found in AUTHORS, defaulting to owenjohnson")
        author = {"username": "owenjohnson", "password": "rfjk xhn6 2RPy FuQ9 cGlU K8mC"}
    category = generate_category_from_summary(final_summary)
-    return post_data, author, category, image_url, image_source, uploader, page_url
+        # Select a full author dictionary from AUTHORS (already imported from foodie_config)
        author = random.choice(AUTHORS)
        categories = ["Food", "Trends", "Eats", "Culture"]
        category = random.choice(categories)
        post_data = {
            "title": new_title,
            "content": summary,
            "status": "publish",
            "author": author["username"],  # Use the username in post_data
            "categories": [category]
        }
        logging.info(f"Post data prepared: Title: '{new_title}', Category: {category}, Author: {author['username']}")
        return post_data, author, category, image_url, image_source, uploader, page_url
    except Exception as e:
        logging.error(f"Failed to prepare post data: {e}")
        return None, None, None, None, None, None, None
 def save_post_to_recent(post_title, post_url, author_username, timestamp):
    try:
-        recent_posts = load_json_file('/home/shane/foodie_automator/recent_posts.json')
+        recent_posts = load_json_file('/home/shane/foodie_automator/recent_posts.json', 24)  # Added expiration_hours
        entry = {
            "title": post_title,
            "url": post_url,
Author	SHA1	Message	Date
Shane	4116d5f742	add more subreddits	2025-05-04 14:42:54 +10:00
Shane	2ecab209c5	fix insert link naturally	2025-05-04 13:40:28 +10:00
Shane	3d0d320648	fix	2025-05-04 13:12:20 +10:00
Shane	504d7f6349	fix	2025-05-04 12:57:22 +10:00
Shane	ccddefbc8b	try	2025-05-04 12:44:50 +10:00
Shane	d2022222c3	try	2025-05-04 12:14:00 +10:00
Shane	7fba0fe96a	fix	2025-05-04 12:06:46 +10:00
Shane	6be8493878	fix	2025-05-04 11:09:02 +10:00
Shane	e445b6ef33	fix	2025-05-04 10:44:43 +10:00
Shane	5554abdc4a	fix	2025-05-04 10:35:31 +10:00
Shane	64d17d5599	try	2025-05-04 10:27:26 +10:00
Shane	aa0f3364d5	fix image swap	2025-05-04 09:47:47 +10:00
Shane	e5ebd000fe	incorporate external context from DDG	2025-05-04 09:07:45 +10:00