From aa0f3364d59f59aa71fbff9417460e6f611cefa2 Mon Sep 17 00:00:00 2001
From: Shane
Date: Sun, 4 May 2025 09:47:47 +1000
Subject: [PATCH] fix image swap

---
 foodie_utils.py | 139 ++++++++++++++++++++++++++----------------------
 1 file changed, 76 insertions(+), 63 deletions(-)

diff --git a/foodie_utils.py b/foodie_utils.py
index 83f4e3a..6143211 100644
--- a/foodie_utils.py
+++ b/foodie_utils.py
@@ -378,11 +378,15 @@ def smart_image_and_filter(title, summary):
 
         logging.info(f"Smart image query: {image_query}, Relevance: {relevance_keywords}, Skip: {skip_flag}")
 
-        if not image_query or len(image_query.split()) < 2:
-            logging.warning(f"Image query '{image_query}' too vague, using fallback")
+        if not image_query:
+            logging.warning("Image query is empty, using fallback")
             return "food trends", ["cuisine", "dining"], skip_flag
-
-        return image_query, relevance_keywords, skip_flag
+        # Allow single-word queries if they are specific (e.g., food items)
+        specific_single_words = ["kimchi", "sushi", "pizza", "taco", "burger"]  # Add more as needed
+        if len(image_query.split()) < 2 and image_query.lower() not in specific_single_words:
+            logging.warning(f"Image query '{image_query}' too vague, using fallback")
+            return "food trends", ["cuisine", "dining"], skip_flag
+        return image_query, relevance_keywords, skip_flag
 
     except Exception as e:
         logging.error(f"Smart image/filter failed: {e}, using fallback")
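
Review note: the new guard is easier to eyeball as a standalone function. Below is a minimal sketch under the same rules, with values copied from the diff; choose_image_query is a hypothetical name used for illustration only:

SPECIFIC_SINGLE_WORDS = {"kimchi", "sushi", "pizza", "taco", "burger"}

def choose_image_query(image_query):
    # Empty queries and vague single words fall back to the generic default.
    if not image_query:
        return "food trends"
    if len(image_query.split()) < 2 and image_query.lower() not in SPECIFIC_SINGLE_WORDS:
        return "food trends"
    return image_query

assert choose_image_query("kimchi") == "kimchi"               # allowlisted single word passes
assert choose_image_query("street") == "food trends"          # vague single word falls back
assert choose_image_query("night market food") == "night market food"
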
" "Return the modified summary with exactly one link.\n\n" "Summary:\n{summary}\n\n" "Source Name: {source_name}\nSource URL: {source_url}" ).format(summary=summary, source_name=source_name, source_url=source_url) - response = client.chat.completions.create( - model=LIGHT_TASK_MODEL, - messages=[ - {"role": "system", "content": prompt}, - {"role": "user", "content": "Insert the link naturally into the summary."} - ], - max_tokens=1000, - temperature=0.7 - ) - new_summary = response.choices[0].message.content.strip() - link_pattern = f'{source_name}' - if new_summary and new_summary.count(link_pattern) == 1: - paragraphs = new_summary.split('\n') - paragraphs = [p.strip() for p in paragraphs] - new_summary = '\n'.join(paragraphs) - logging.info(f"Summary with naturally embedded link (normalized): {new_summary!r}") - return new_summary + # Add retry mechanism + for attempt in range(3): + try: + response = client.chat.completions.create( + model=LIGHT_TASK_MODEL, + messages=[ + {"role": "system", "content": prompt}, + {"role": "user", "content": "Insert the link naturally into the summary."} + ], + max_tokens=1000, + temperature=0.7 + ) + new_summary = response.choices[0].message.content.strip() + link_pattern = f'{source_name}' + if new_summary and new_summary.count(link_pattern) == 1: + paragraphs = new_summary.split('\n') + paragraphs = [p.strip() for p in paragraphs] + new_summary = '\n'.join(paragraphs) + logging.info(f"Summary with naturally embedded link (normalized): {new_summary!r}") + return new_summary + else: + logging.warning(f"GPT attempt {attempt + 1}/3 failed to insert link correctly: {new_summary}") + except Exception as e: + logging.error(f"Link insertion attempt {attempt + 1}/3 failed: {e}") - logging.warning(f"GPT failed to insert link correctly: {new_summary}. Using fallback.") + logging.warning(f"GPT failed to insert link after 3 attempts. Using fallback.") except Exception as e: logging.error(f"Link insertion failed: {e}") @@ -612,10 +619,10 @@ def insert_link_naturally(summary, source_name, source_url): target_para = random.choice([p for p in paragraphs if p.strip()]) link_pattern = f'{source_name}' phrases = [ - f"According to {link_pattern}", # Changed to a more neutral phrasing - f"{link_pattern} notes this insight", # Adjusted phrasing - f"Details shared by {link_pattern}", # Adjusted phrasing - f"Source: {link_pattern}" # Simple attribution + f"According to {link_pattern}", + f"{link_pattern} notes this insight", + f"Details shared by {link_pattern}", + f"Source: {link_pattern}" ] insertion_phrase = random.choice(phrases) @@ -864,42 +871,39 @@ used_images = set() # Load used images from file if it exists if os.path.exists(used_images_file): try: - with open(used_images_file, 'r') as f: - content = f.read().strip() - if not content: - logging.warning(f"Used images file {used_images_file} is empty. Resetting to empty list.") - data = [] + entries = load_json_file(used_images_file, IMAGE_EXPIRATION_DAYS * 24) # Use load_json_file for consistency + for entry in entries: + if isinstance(entry, dict) and "title" in entry and entry["title"].startswith('https://'): + used_images.add(entry["title"]) else: - data = json.loads(content) - if not isinstance(data, list): - logging.warning(f"Invalid format in {used_images_file}: expected a list, got {type(data)}. 
@@ -864,42 +871,39 @@
 used_images = set()
 
 # Load used images from file if it exists
 if os.path.exists(used_images_file):
     try:
-        with open(used_images_file, 'r') as f:
-            content = f.read().strip()
-            if not content:
-                logging.warning(f"Used images file {used_images_file} is empty. Resetting to empty list.")
-                data = []
+        entries = load_json_file(used_images_file, IMAGE_EXPIRATION_DAYS * 24)  # Use load_json_file for consistency
+        for entry in entries:
+            if isinstance(entry, dict) and "title" in entry and entry["title"].startswith('https://'):
+                used_images.add(entry["title"])
             else:
-                data = json.loads(content)
-                if not isinstance(data, list):
-                    logging.warning(f"Invalid format in {used_images_file}: expected a list, got {type(data)}. Converting to list.")
-                    if isinstance(data, dict):
-                        # If it's a dict, try to extract URLs from values
-                        data = [v for v in data.values() if isinstance(v, str) and v.startswith('https://')]
-                    else:
-                        logging.warning(f"Cannot convert {type(data)} to list. Resetting to empty list.")
-                        data = []
-                # Filter out non-string or non-URL entries
-                data = [item for item in data if isinstance(item, str) and item.startswith('https://')]
-                used_images.update(data)
+                logging.warning(f"Skipping invalid entry in {used_images_file}: {entry}")
         logging.info(f"Loaded {len(used_images)} used image URLs from {used_images_file}")
     except Exception as e:
         logging.warning(f"Failed to load used images from {used_images_file}: {e}. Resetting to empty set.")
         used_images = set()
         with open(used_images_file, 'w') as f:
-            json.dump([], f)
+            f.write("")
 
 # Function to save used_images to file
 def save_used_images():
     try:
-        # Ensure used_images contains only valid URLs
-        valid_urls = [url for url in used_images if isinstance(url, str) and url.startswith('https://')]
-        if len(valid_urls) != len(used_images):
-            logging.warning(f"Found {len(used_images) - len(valid_urls)} invalid URLs in used_images set")
+        # Load existing entries to preserve timestamps
+        entries = load_json_file(used_images_file, IMAGE_EXPIRATION_DAYS * 24)
+        existing_entries = {entry["title"]: entry for entry in entries if isinstance(entry, dict) and "title" in entry}
+
+        # Create new entries for used_images
+        timestamp = datetime.now(timezone.utc).isoformat()
+        updated_entries = []
+        for url in used_images:
+            if url in existing_entries:
+                updated_entries.append(existing_entries[url])
+            else:
+                updated_entries.append({"title": url, "timestamp": timestamp})
         with open(used_images_file, 'w') as f:
-            json.dump(valid_urls, f, indent=2)
-        logging.info(f"Saved {len(valid_urls)} used image URLs to {used_images_file}")
+            for entry in updated_entries:
+                f.write(json.dumps(entry) + '\n')
+        logging.info(f"Saved {len(updated_entries)} used image URLs to {used_images_file}")
     except Exception as e:
         logging.warning(f"Failed to save used images to {used_images_file}: {e}")
@@ -938,7 +942,7 @@ def process_photo(photo, search_query):
         page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
 
         used_images.add(img_url)
-        save_used_images()
+        save_used_images()  # This will now save in the correct format
 
         flickr_data = {
             "title": search_query,
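
Review note: the save path now writes JSON Lines, one {"title": ..., "timestamp": ...} object per line, which appears to be the format load_json_file expects. A minimal round-trip sketch of that format, standard library only; the file name is illustrative:

import json
from datetime import datetime, timezone

path = "used_images_demo.jsonl"  # hypothetical file name
urls = {"https://live.staticflickr.com/123/456.jpg"}

# Write one entry per line, mirroring save_used_images() above.
stamp = datetime.now(timezone.utc).isoformat()
with open(path, "w") as f:
    for url in urls:
        f.write(json.dumps({"title": url, "timestamp": stamp}) + "\n")

# Read back, keeping only dict entries whose title looks like an image URL,
# mirroring the new loading loop above.
loaded = set()
with open(path) as f:
    for line in f:
        entry = json.loads(line)
        if isinstance(entry, dict) and entry.get("title", "").startswith("https://"):
            loaded.add(entry["title"])

assert loaded == urls
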
@@ -1052,7 +1056,15 @@ def get_flickr_image(search_query, relevance_keywords):
 
     last_flickr_request_time = time.time()
 
-    # Step 1: Search DDG to find Flickr photo IDs
+    # Step 1: Search Flickr directly with the original query
+    logging.info(f"Searching Flickr directly with query: '{search_query}'")
+    photos = search_flickr(search_query)
+    for photo in photos:
+        result = process_photo(photo, search_query)
+        if result:
+            return result
+
+    # Step 2: Search DDG to find Flickr photo IDs
     logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
     photo_ids = search_ddg_for_flickr(search_query)
     if photo_ids:
@@ -1063,7 +1075,7 @@ def get_flickr_image(search_query, relevance_keywords):
         if result:
             return result
 
-    # Step 2: Break down the query into keywords and classify them for direct Flickr API search
+    # Step 3: Break down the query into keywords and classify them
     keywords = search_query.lower().split()
     if len(keywords) > 1:
         classifications = classify_keywords(keywords)
@@ -1080,7 +1092,7 @@ def get_flickr_image(search_query, relevance_keywords):
         if result:
             return result
 
-    # Step 3: Final fallback using relevance keywords
+    # Step 4: Final fallback using relevance keywords
     fallback_query = " ".join(relevance_keywords) if isinstance(relevance_keywords, list) else relevance_keywords
     logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
     photos = search_flickr(fallback_query)
@@ -1155,7 +1167,7 @@ def prepare_post_data(final_summary, original_title, context_info=""):
 
 def save_post_to_recent(post_title, post_url, author_username, timestamp):
     try:
-        recent_posts = load_json_file('/home/shane/foodie_automator/recent_posts.json')
+        recent_posts = load_json_file('/home/shane/foodie_automator/recent_posts.json', 24)  # Added expiration_hours
         entry = {
             "title": post_title,
             "url": post_url,
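
Review note: both call sites now pass expiration_hours (IMAGE_EXPIRATION_DAYS * 24 and 24), so the patch leans on load_json_file to age out old entries. Its implementation is not shown in this diff; the sketch below is only the contract the patch appears to assume, not the real helper:

import json
from datetime import datetime, timedelta, timezone

def load_json_file(path, expiration_hours):
    # Assumed contract: read JSON Lines, skip malformed entries, and drop
    # anything whose timezone-aware ISO-8601 timestamp is past the window.
    cutoff = datetime.now(timezone.utc) - timedelta(hours=expiration_hours)
    entries = []
    try:
        with open(path) as f:
            for line in f:
                try:
                    entry = json.loads(line)
                    stamp = datetime.fromisoformat(entry["timestamp"])
                    if stamp >= cutoff:
                        entries.append(entry)
                except (json.JSONDecodeError, KeyError, TypeError, ValueError):
                    continue
    except FileNotFoundError:
        pass
    return entries
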