diff --git a/foodie_utils.py b/foodie_utils.py index 6a8629d..2fa1a1e 100644 --- a/foodie_utils.py +++ b/foodie_utils.py @@ -62,10 +62,16 @@ def save_json_file(filename, key, value): # Remove duplicates by title data = [item for item in data if item["title"] != key] data.append(entry) - with open(filename, 'w') as f: - for item in data: - json.dump(item, f) - f.write('\n') + # Special handling for used_images.json to save as a flat list + if filename.endswith('used_images.json'): + flat_data = [item["title"] for item in data if isinstance(item, dict) and "title" in item] + with open(filename, 'w') as f: + json.dump(flat_data, f) + else: + with open(filename, 'w') as f: + for item in data: + json.dump(item, f) + f.write('\n') logging.info(f"Saved '{key}' to {filename}") print(f"DEBUG: Saved '{key}' to {filename}") loaded_data = load_json_file(filename, expiration_days=PRUNE_INTERVAL_DAYS) @@ -227,98 +233,165 @@ def get_image(search_query): last_flickr_request_time = time.time() - try: - # Try Flickr API first - photos = flickr_api.Photo.search( - text=search_query, - per_page=10, - sort='relevance', - safe_search=1, - media='photos', - license='4,5,9,10' # Commercial use licenses - ) + headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'} + + # Helper function to search Flickr with a given query + def search_flickr(query, per_page=20): + try: + photos = flickr_api.Photo.search( + text=query, + per_page=per_page, + sort='relevance', + safe_search=1, + media='photos', + license='4,5,9,10' + ) + return photos + except Exception as e: + logging.warning(f"Flickr API error for query '{query}': {e}") + return [] + + # Helper function to process a photo + def process_photo(photo): + tags = [tag.text.lower() for tag in photo.getTags()] + title = photo.title.lower() if photo.title else "" - headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'} + matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title] + if matched_keywords: + logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})") + return None - for photo in photos: - # Fetch photo metadata (tags and title) - tags = [tag.text.lower() for tag in photo.getTags()] - title = photo.title.lower() if photo.title else "" + img_url = photo.getPhotoFile(size_label='Medium') + if not img_url or img_url in used_images: + return None + + temp_file = None + try: + img_response = requests.get(img_url, headers=headers, timeout=10) + img_response.raise_for_status() + with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file: + temp_file.write(img_response.content) + temp_path = temp_file.name + + img = Image.open(temp_path) + text = pytesseract.image_to_string(img) + char_count = len(text.strip()) + logging.info(f"OCR processed {img_url}: {char_count} characters detected") + + if char_count > 200: + logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})") + return None + + uploader = photo.owner.username + page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}" - # Filter out images with unwanted keywords in tags or title - matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title] - if matched_keywords: - logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})") - continue + used_images.add(img_url) + save_used_images() - img_url = photo.getPhotoFile(size_label='Medium') - if not img_url: - continue - if img_url in used_images: - continue + flickr_data = { + "title": search_query, + "image_url": img_url, + "source": "Flickr", + "uploader": uploader, + "page_url": page_url, + "timestamp": datetime.now(timezone.utc).isoformat(), + "ocr_chars": char_count + } + flickr_file = "/home/shane/foodie_automator/flickr_images.json" + with open(flickr_file, 'a') as f: + json.dump(flickr_data, f) + f.write('\n') + logging.info(f"Saved Flickr image to {flickr_file}: {img_url}") - # Download the image and run OCR to check for excessive text - temp_file = None - try: - img_response = requests.get(img_url, headers=headers, timeout=10) - img_response.raise_for_status() - with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file: - temp_file.write(img_response.content) - temp_path = temp_file.name + logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})") + return img_url, "Flickr", uploader, page_url - img = Image.open(temp_path) - text = pytesseract.image_to_string(img) - char_count = len(text.strip()) - logging.info(f"OCR processed {img_url}: {char_count} characters detected") + except requests.exceptions.HTTPError as e: + if e.response.status_code == 429: + logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.") + return None + else: + logging.warning(f"Download failed for {img_url}: {e}") + return None + except Exception as e: + logging.warning(f"OCR processing failed for {img_url}: {e}") + return None + finally: + if temp_file and os.path.exists(temp_path): + os.unlink(temp_path) - if char_count > 200: - logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})") - continue + # Helper function to classify keywords as specific or generic + def classify_keywords(keywords): + prompt = ( + "Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities) or 'generic' (e.g., common or abstract terms). " + "Return a JSON object mapping each keyword to its classification.\n\n" + "Keywords: " + ", ".join(keywords) + "\n\n" + "Example output:\n" + "```json\n" + "{\n" + " \"Wingstop\": \"specific\",\n" + " \"Smart\": \"generic\",\n" + " \"Kitchen\": \"generic\"\n" + "}\n```" + ) + try: + response = client.chat.completions.create( + model=LIGHT_TASK_MODEL, + messages=[ + {"role": "system", "content": "You are a helpful assistant that classifies keywords."}, + {"role": "user", "content": prompt} + ], + max_tokens=100, + temperature=0.5 + ) + raw_response = response.choices[0].message.content + json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response) + if not json_match: + logging.warning(f"Failed to parse keyword classification JSON: {raw_response}") + return {kw: "specific" for kw in keywords} + + classifications = json.loads(json_match.group(1)) + return classifications + except Exception as e: + logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.") + return {kw: "specific" for kw in keywords} - uploader = photo.owner.username - page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}" - - # Add the image URL to used_images - used_images.add(img_url) - save_used_images() - - # Save Flickr image metadata - flickr_data = { - "title": search_query, - "image_url": img_url, - "source": "Flickr", - "uploader": uploader, - "page_url": page_url, - "timestamp": datetime.now(timezone.utc).isoformat(), - "ocr_chars": char_count - } - flickr_file = "/home/shane/foodie_automator/flickr_images.json" - with open(flickr_file, 'a') as f: - json.dump(flickr_data, f) - f.write('\n') - logging.info(f"Saved Flickr image to {flickr_file}: {img_url}") - - logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})") - return img_url, "Flickr", uploader, page_url + # Step 1: Try the original search query on Flickr + logging.info(f"Searching Flickr with original query: '{search_query}'") + photos = search_flickr(search_query) + for photo in photos: + result = process_photo(photo) + if result: + return result - except requests.exceptions.HTTPError as e: - if e.response.status_code == 429: - logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.") - return None, None, None, None - else: - logging.warning(f"Download failed for {img_url}: {e}") - continue - except Exception as e: - logging.warning(f"OCR processing failed for {img_url}: {e}") - continue - finally: - if temp_file and os.path.exists(temp_path): - os.unlink(temp_path) + # Step 2: Break down the query into keywords and classify them + keywords = search_query.lower().split() + if len(keywords) > 1: + classifications = classify_keywords(keywords) + logging.info(f"Keyword classifications: {classifications}") - logging.warning(f"No valid Flickr image found in fallback for query '{search_query}'. Trying Pixabay.") - - except Exception as e: - logging.warning(f"Fallback Flickr API error for query '{search_query}': {e}. Falling back to Pixabay.") + # Prioritize specific keywords + specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"] + if specific_keywords: + for keyword in specific_keywords: + logging.info(f"Searching Flickr with specific keyword: '{keyword}'") + photos = search_flickr(keyword) + for photo in photos: + result = process_photo(photo) + if result: + return result + + # Step 3: Final fallback to a generic food-related query + # Use a simple generic query derived from context (e.g., "food dining") + fallback_query = "food dining" # This could be further contextualized if needed + logging.info(f"No results found. Falling back to generic query: '{fallback_query}'") + photos = search_flickr(fallback_query) + for photo in photos: + result = process_photo(photo) + if result: + return result + + logging.warning(f"No valid Flickr image found in fallback for query '{search_query}'. Trying Pixabay.") # Fallback to Pixabay try: @@ -334,7 +407,6 @@ def get_image(search_query): uploader = hit.get('user', 'Unknown') page_url = hit.get('pageURL', img_url) - # Add the image URL to used_images used_images.add(img_url) save_used_images() @@ -350,14 +422,18 @@ def get_image(search_query): def generate_image_query(content): prompt = ( - "Given the following content, generate a concise image search query (max 5 words) that would likely yield relevant, visually appealing images on platforms like Flickr or Pixabay. Focus on concrete, visual concepts related to food, dining, or restaurants, avoiding overly abstract terms. Also provide relevance keywords (max 5 words) to filter results. Return the result as a JSON object with 'search' and 'relevance' keys.\n\n" + "Given the following content, generate a concise image search query (max 5 words) that would likely yield relevant, visually appealing images on platforms like Flickr or Pixabay. " + "Identify and prioritize specific entities like brand names or unique terms over abstract or generic concepts. " + "Focus on concrete, visual concepts related to food, dining, or restaurants. " + "Also provide relevance keywords (max 5 words) to filter results, using general themes related to the content. " + "Return the result as a JSON object with 'search' and 'relevance' keys.\n\n" "Content:\n" f"{content}\n\n" "Example output:\n" "```json\n" "{\n" - " \"search\": \"modern dining trends\",\n" - " \"relevance\": \"dining habits restaurant trends\"\n" + " \"search\": \"Wingstop dining\",\n" + " \"relevance\": \"fast food dining\"\n" "}\n```" ) @@ -379,18 +455,18 @@ def generate_image_query(content): json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response) if not json_match: logging.warning(f"Failed to parse image query JSON from GPT response: {raw_response}") - return "restaurant dining", "dining trends" + return "food dining", ["dining", "trends"] query_data = json.loads(json_match.group(1)) - search_query = query_data.get("search", "restaurant dining") - relevance_keywords = query_data.get("relevance", "dining trends") + search_query = query_data.get("search", "food dining") + relevance_keywords = query_data.get("relevance", ["dining", "trends"]) logging.debug(f"Image query from content: {query_data}") return search_query, relevance_keywords except Exception as e: logging.warning(f"Failed to generate image query: {e}. Using fallback.") - return "restaurant dining", "dining trends" + return "food dining", ["dining", "trends"] def smart_image_and_filter(title, summary): try: @@ -893,7 +969,18 @@ if os.path.exists(used_images_file): try: with open(used_images_file, 'r') as f: data = json.load(f) - used_images.update(data) + # Handle malformed format (list of lists) + if isinstance(data, list) and data and isinstance(data[0], list): + logging.warning(f"Fixing malformed used_images.json format: {data[:2]}...") + flat_data = [] + for item in data: + if isinstance(item, list): + flat_data.extend(item) + else: + flat_data.append(item) + used_images.update(flat_data) + else: + used_images.update(data) logging.info(f"Loaded {len(used_images)} used image URLs from {used_images_file}") except Exception as e: logging.warning(f"Failed to load used images from {used_images_file}: {e}") @@ -913,117 +1000,7 @@ def reset_flickr_request_count(): flickr_request_count = 0 flickr_request_start_time = time.time() -def get_flickr_image(search_query, relevance_keywords): - global last_flickr_request_time, flickr_request_count - - reset_flickr_request_count() - flickr_request_count += 1 - logging.info(f"Flickr request count: {flickr_request_count}/3600") - - # Enforce a minimum delay of 1 second between Flickr requests - current_time = time.time() - time_since_last_request = current_time - last_flickr_request_time - if time_since_last_request < 1: - time.sleep(1 - time_since_last_request) - - last_flickr_request_time = time.time() - - try: - # Search for photos on Flickr using the API - photos = flickr_api.Photo.search( - text=search_query, - per_page=10, - sort='relevance', - safe_search=1, - media='photos', - license='4,5,9,10' # Commercial use licenses (CC BY, CC BY-SA, etc.) - ) - - headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'} - - for photo in photos: - # Fetch photo metadata (tags and title) - tags = [tag.text.lower() for tag in photo.getTags()] - title = photo.title.lower() if photo.title else "" - - # Filter out images with unwanted keywords in tags or title - matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title] - if matched_keywords: - logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})") - continue - - img_url = photo.getPhotoFile(size_label='Large') - if not img_url: - img_url = photo.getPhotoFile(size_label='Medium') - if not img_url: - continue - if img_url in used_images: - continue - - # Download the image and run OCR to check for excessive text - temp_file = None - try: - img_response = requests.get(img_url, headers=headers, timeout=10) - img_response.raise_for_status() - with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file: - temp_file.write(img_response.content) - temp_path = temp_file.name - - img = Image.open(temp_path) - text = pytesseract.image_to_string(img) - char_count = len(text.strip()) - logging.info(f"OCR processed {img_url}: {char_count} characters detected") - - if char_count > 200: - logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})") - continue - - uploader = photo.owner.username - page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}" - - # Add the image URL to used_images - used_images.add(img_url) - save_used_images() - - # Save Flickr image metadata - flickr_data = { - "title": search_query, - "image_url": img_url, - "source": "Flickr", - "uploader": uploader, - "page_url": page_url, - "timestamp": datetime.now(timezone.utc).isoformat(), - "ocr_chars": char_count - } - flickr_file = "/home/shane/foodie_automator/flickr_images.json" - with open(flickr_file, 'a') as f: - json.dump(flickr_data, f) - f.write('\n') - logging.info(f"Saved Flickr image to {flickr_file}: {img_url}") - - logging.info(f"Fetched Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})") - return img_url, "Flickr", uploader, page_url - - except requests.exceptions.HTTPError as e: - if e.response.status_code == 429: - logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.") - return None, None, None, None - else: - logging.warning(f"Download failed for {img_url}: {e}") - continue - except Exception as e: - logging.warning(f"OCR processing failed for {img_url}: {e}") - continue - finally: - if temp_file and os.path.exists(temp_path): - os.unlink(temp_path) - - logging.warning(f"No valid Flickr image found for query '{search_query}'.") - return None, None, None, None - - except Exception as e: - logging.warning(f"Flickr API error for query '{search_query}': {e}. Falling back to Pixabay.") - return None, None, None, None +if keyword in ['smart', 'ai', 'ai-powered', 'kitchen', 'dining', 'experience']: def select_best_author(summary): try: