From aa0f3364d59f59aa71fbff9417460e6f611cefa2 Mon Sep 17 00:00:00 2001
From: Shane
Date: Sun, 4 May 2025 09:47:47 +1000
Subject: [PATCH] fix image swap

---
 foodie_utils.py | 139 ++++++++++++++++++++++++++----------------------
 1 file changed, 76 insertions(+), 63 deletions(-)

diff --git a/foodie_utils.py b/foodie_utils.py
index 83f4e3a..6143211 100644
--- a/foodie_utils.py
+++ b/foodie_utils.py
@@ -378,11 +378,15 @@ def smart_image_and_filter(title, summary):
 
         logging.info(f"Smart image query: {image_query}, Relevance: {relevance_keywords}, Skip: {skip_flag}")
 
-        if not image_query or len(image_query.split()) < 2:
-            logging.warning(f"Image query '{image_query}' too vague, using fallback")
+        if not image_query:
+            logging.warning("Image query is empty, using fallback")
             return "food trends", ["cuisine", "dining"], skip_flag
-
-        return image_query, relevance_keywords, skip_flag
+        # Allow single-word queries if they are specific (e.g., food items)
+        specific_single_words = ["kimchi", "sushi", "pizza", "taco", "burger"]  # Add more as needed
+        if len(image_query.split()) < 2 and image_query.lower() not in specific_single_words:
+            logging.warning(f"Image query '{image_query}' too vague, using fallback")
+            return "food trends", ["cuisine", "dining"], skip_flag
+        return image_query, relevance_keywords, skip_flag
 
     except Exception as e:
         logging.error(f"Smart image/filter failed: {e}, using fallback")
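
Review note: the new guard is easier to eyeball as a standalone function. Below is a minimal sketch under the same rules, with values copied from the diff; choose_image_query is a hypothetical name used for illustration only:

SPECIFIC_SINGLE_WORDS = {"kimchi", "sushi", "pizza", "taco", "burger"}

def choose_image_query(image_query):
    # Empty queries and vague single words fall back to the generic default.
    if not image_query:
        return "food trends"
    if len(image_query.split()) < 2 and image_query.lower() not in SPECIFIC_SINGLE_WORDS:
        return "food trends"
    return image_query

assert choose_image_query("kimchi") == "kimchi"               # allowlisted single word passes
assert choose_image_query("street") == "food trends"          # vague single word falls back
assert choose_image_query("night market food") == "night market food"
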
" "Return the modified summary with exactly one link.\n\n" "Summary:\n{summary}\n\n" "Source Name: {source_name}\nSource URL: {source_url}" ).format(summary=summary, source_name=source_name, source_url=source_url) - response = client.chat.completions.create( - model=LIGHT_TASK_MODEL, - messages=[ - {"role": "system", "content": prompt}, - {"role": "user", "content": "Insert the link naturally into the summary."} - ], - max_tokens=1000, - temperature=0.7 - ) - new_summary = response.choices[0].message.content.strip() - link_pattern = f'{source_name}' - if new_summary and new_summary.count(link_pattern) == 1: - paragraphs = new_summary.split('\n') - paragraphs = [p.strip() for p in paragraphs] - new_summary = '\n'.join(paragraphs) - logging.info(f"Summary with naturally embedded link (normalized): {new_summary!r}") - return new_summary + # Add retry mechanism + for attempt in range(3): + try: + response = client.chat.completions.create( + model=LIGHT_TASK_MODEL, + messages=[ + {"role": "system", "content": prompt}, + {"role": "user", "content": "Insert the link naturally into the summary."} + ], + max_tokens=1000, + temperature=0.7 + ) + new_summary = response.choices[0].message.content.strip() + link_pattern = f'{source_name}' + if new_summary and new_summary.count(link_pattern) == 1: + paragraphs = new_summary.split('\n') + paragraphs = [p.strip() for p in paragraphs] + new_summary = '\n'.join(paragraphs) + logging.info(f"Summary with naturally embedded link (normalized): {new_summary!r}") + return new_summary + else: + logging.warning(f"GPT attempt {attempt + 1}/3 failed to insert link correctly: {new_summary}") + except Exception as e: + logging.error(f"Link insertion attempt {attempt + 1}/3 failed: {e}") - logging.warning(f"GPT failed to insert link correctly: {new_summary}. Using fallback.") + logging.warning(f"GPT failed to insert link after 3 attempts. Using fallback.") except Exception as e: logging.error(f"Link insertion failed: {e}") @@ -612,10 +619,10 @@ def insert_link_naturally(summary, source_name, source_url): target_para = random.choice([p for p in paragraphs if p.strip()]) link_pattern = f'{source_name}' phrases = [ - f"According to {link_pattern}", # Changed to a more neutral phrasing - f"{link_pattern} notes this insight", # Adjusted phrasing - f"Details shared by {link_pattern}", # Adjusted phrasing - f"Source: {link_pattern}" # Simple attribution + f"According to {link_pattern}", + f"{link_pattern} notes this insight", + f"Details shared by {link_pattern}", + f"Source: {link_pattern}" ] insertion_phrase = random.choice(phrases) @@ -864,42 +871,39 @@ used_images = set() # Load used images from file if it exists if os.path.exists(used_images_file): try: - with open(used_images_file, 'r') as f: - content = f.read().strip() - if not content: - logging.warning(f"Used images file {used_images_file} is empty. Resetting to empty list.") - data = [] + entries = load_json_file(used_images_file, IMAGE_EXPIRATION_DAYS * 24) # Use load_json_file for consistency + for entry in entries: + if isinstance(entry, dict) and "title" in entry and entry["title"].startswith('https://'): + used_images.add(entry["title"]) else: - data = json.loads(content) - if not isinstance(data, list): - logging.warning(f"Invalid format in {used_images_file}: expected a list, got {type(data)}. 
@@ -864,42 +871,39 @@
 used_images = set()
 
 # Load used images from file if it exists
 if os.path.exists(used_images_file):
     try:
-        with open(used_images_file, 'r') as f:
-            content = f.read().strip()
-            if not content:
-                logging.warning(f"Used images file {used_images_file} is empty. Resetting to empty list.")
-                data = []
+        entries = load_json_file(used_images_file, IMAGE_EXPIRATION_DAYS * 24)  # Use load_json_file for consistency
+        for entry in entries:
+            if isinstance(entry, dict) and "title" in entry and entry["title"].startswith('https://'):
+                used_images.add(entry["title"])
             else:
-                data = json.loads(content)
-                if not isinstance(data, list):
-                    logging.warning(f"Invalid format in {used_images_file}: expected a list, got {type(data)}. Converting to list.")
-                    if isinstance(data, dict):
-                        # If it's a dict, try to extract URLs from values
-                        data = [v for v in data.values() if isinstance(v, str) and v.startswith('https://')]
-                    else:
-                        logging.warning(f"Cannot convert {type(data)} to list. Resetting to empty list.")
-                        data = []
-                # Filter out non-string or non-URL entries
-                data = [item for item in data if isinstance(item, str) and item.startswith('https://')]
-                used_images.update(data)
+                logging.warning(f"Skipping invalid entry in {used_images_file}: {entry}")
         logging.info(f"Loaded {len(used_images)} used image URLs from {used_images_file}")
     except Exception as e:
         logging.warning(f"Failed to load used images from {used_images_file}: {e}. Resetting to empty set.")
         used_images = set()
         with open(used_images_file, 'w') as f:
-            json.dump([], f)
+            f.write("")
 
 # Function to save used_images to file
 def save_used_images():
     try:
-        # Ensure used_images contains only valid URLs
-        valid_urls = [url for url in used_images if isinstance(url, str) and url.startswith('https://')]
-        if len(valid_urls) != len(used_images):
-            logging.warning(f"Found {len(used_images) - len(valid_urls)} invalid URLs in used_images set")
+        # Load existing entries to preserve timestamps
+        entries = load_json_file(used_images_file, IMAGE_EXPIRATION_DAYS * 24)
+        existing_entries = {entry["title"]: entry for entry in entries if isinstance(entry, dict) and "title" in entry}
+
+        # Create new entries for used_images
+        timestamp = datetime.now(timezone.utc).isoformat()
+        updated_entries = []
+        for url in used_images:
+            if url in existing_entries:
+                updated_entries.append(existing_entries[url])
+            else:
+                updated_entries.append({"title": url, "timestamp": timestamp})
         with open(used_images_file, 'w') as f:
-            json.dump(valid_urls, f, indent=2)
-        logging.info(f"Saved {len(valid_urls)} used image URLs to {used_images_file}")
+            for entry in updated_entries:
+                f.write(json.dumps(entry) + '\n')
+        logging.info(f"Saved {len(updated_entries)} used image URLs to {used_images_file}")
     except Exception as e:
         logging.warning(f"Failed to save used images to {used_images_file}: {e}")
@@ -938,7 +942,7 @@ def process_photo(photo, search_query):
         page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
 
         used_images.add(img_url)
-        save_used_images()
+        save_used_images()  # This will now save in the correct format
 
         flickr_data = {
             "title": search_query,
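
Review note: the save path now writes JSON Lines, one {"title": ..., "timestamp": ...} object per line, which appears to be the format load_json_file expects. A minimal round-trip sketch of that format, standard library only; the file name is illustrative:

import json
from datetime import datetime, timezone

path = "used_images_demo.jsonl"  # hypothetical file name
urls = {"https://live.staticflickr.com/123/456.jpg"}

# Write one entry per line, mirroring save_used_images() above.
stamp = datetime.now(timezone.utc).isoformat()
with open(path, "w") as f:
    for url in urls:
        f.write(json.dumps({"title": url, "timestamp": stamp}) + "\n")

# Read back, keeping only dict entries whose title looks like an image URL,
# mirroring the new loading loop above.
loaded = set()
with open(path) as f:
    for line in f:
        entry = json.loads(line)
        if isinstance(entry, dict) and entry.get("title", "").startswith("https://"):
            loaded.add(entry["title"])

assert loaded == urls
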
@@ -1052,7 +1056,15 @@ def get_flickr_image(search_query, relevance_keywords):
 
     last_flickr_request_time = time.time()
 
-    # Step 1: Search DDG to find Flickr photo IDs
+    # Step 1: Search Flickr directly with the original query
+    logging.info(f"Searching Flickr directly with query: '{search_query}'")
+    photos = search_flickr(search_query)
+    for photo in photos:
+        result = process_photo(photo, search_query)
+        if result:
+            return result
+
+    # Step 2: Search DDG to find Flickr photo IDs
     logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
     photo_ids = search_ddg_for_flickr(search_query)
     if photo_ids:
@@ -1063,7 +1075,7 @@ def get_flickr_image(search_query, relevance_keywords):
         if result:
             return result
 
-    # Step 2: Break down the query into keywords and classify them for direct Flickr API search
+    # Step 3: Break down the query into keywords and classify them
     keywords = search_query.lower().split()
     if len(keywords) > 1:
         classifications = classify_keywords(keywords)
@@ -1080,7 +1092,7 @@ def get_flickr_image(search_query, relevance_keywords):
         if result:
             return result
 
-    # Step 3: Final fallback using relevance keywords
+    # Step 4: Final fallback using relevance keywords
     fallback_query = " ".join(relevance_keywords) if isinstance(relevance_keywords, list) else relevance_keywords
     logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
     photos = search_flickr(fallback_query)
@@ -1155,7 +1167,7 @@ def prepare_post_data(final_summary, original_title, context_info=""):
 
 def save_post_to_recent(post_title, post_url, author_username, timestamp):
     try:
-        recent_posts = load_json_file('/home/shane/foodie_automator/recent_posts.json')
+        recent_posts = load_json_file('/home/shane/foodie_automator/recent_posts.json', 24)  # Added expiration_hours
         entry = {
             "title": post_title,
             "url": post_url,
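
Review note: both call sites now pass expiration_hours (IMAGE_EXPIRATION_DAYS * 24 and 24), so the patch leans on load_json_file to age out old entries. Its implementation is not shown in this diff; the sketch below is only the contract the patch appears to assume, not the real helper:

import json
from datetime import datetime, timedelta, timezone

def load_json_file(path, expiration_hours):
    # Assumed contract: read JSON Lines, skip malformed entries, and drop
    # anything whose timezone-aware ISO-8601 timestamp is past the window.
    cutoff = datetime.now(timezone.utc) - timedelta(hours=expiration_hours)
    entries = []
    try:
        with open(path) as f:
            for line in f:
                try:
                    entry = json.loads(line)
                    stamp = datetime.fromisoformat(entry["timestamp"])
                    if stamp >= cutoff:
                        entries.append(entry)
                except (json.JSONDecodeError, KeyError, TypeError, ValueError):
                    continue
    except FileNotFoundError:
        pass
    return entries
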