diff --git a/foodie_utils.py b/foodie_utils.py
index f222e86..8dfac31 100644
--- a/foodie_utils.py
+++ b/foodie_utils.py
@@ -1077,72 +1077,66 @@ def classify_keywords(keywords):
     return {kw: "specific" for kw in keywords}
 
 def get_flickr_image(search_query, relevance_keywords, main_topic):
-    global last_flickr_request_time, flickr_request_count
-
-    reset_flickr_request_count()
-    flickr_request_count += 1
-    logging.info(f"Flickr request count: {flickr_request_count}/3600")
-
-    current_time = time.time()
-    time_since_last_request = current_time - last_flickr_request_time
-    if time_since_last_request < 10:
-        time.sleep(10 - time_since_last_request)
+    global used_images
+    logger = logging.getLogger(__name__)
 
-    last_flickr_request_time = time.time()
+    def process_image(image_url, source_name, page_url):
+        """Download image, check for text with OCR, and validate."""
+        try:
+            headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
+            response = requests.get(image_url, headers=headers, timeout=10)
+            response.raise_for_status()
+            img = Image.open(io.BytesIO(response.content))
+
+            # OCR to detect text
+            text = pytesseract.image_to_string(img).strip()
+            word_count = len(text.split())
+            if word_count > 10:
+                logger.info(f"Skipping image with too much text: {image_url} ({word_count} words)")
+                return None
+
+            if image_url in used_images:
+                logger.info(f"Image already used: {image_url}")
+                return None
+
+            used_images.add(image_url)
+            save_used_images()
+            uploader = "Unknown"  # Most public domain sources don't provide uploader
+            logger.info(f"Selected image: {image_url} from {source_name}")
+            return image_url, source_name, uploader, page_url
+        except Exception as e:
+            logger.warning(f"Failed to process image {image_url}: {e}")
+            return None
 
-    # Step 1: Search Flickr directly with the original query
-    logging.info(f"Searching Flickr directly with query: '{search_query}'")
-    photos = search_flickr(search_query)
-    for photo in photos:
-        result = process_photo(photo, search_query)
-        if result:
-            return result
-
-    # Step 2: Search DDG to find Flickr photo IDs
-    logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
-    photo_ids = search_ddg_for_flickr(search_query)
-    if photo_ids:
-        for photo_id in photo_ids:
-            photo = fetch_photo_by_id(photo_id)
-            if photo:
-                result = process_photo(photo, search_query)
-                if result:
-                    return result
-
-    # Step 3: Break down the query into keywords and classify them
-    keywords = search_query.lower().split()
-    if len(keywords) > 1:
-        classifications = classify_keywords(keywords)
-        logging.info(f"Keyword classifications: {classifications}")
-
-        specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
-        if specific_keywords:
-            for keyword in specific_keywords:
-                logging.info(f"Searching Flickr with specific keyword: '{keyword}'")
-                photos = search_flickr(keyword)
-                for photo in photos:
-                    result = process_photo(photo, search_query)
+    # Step 1: Search DDG for public domain images
+    ddg_query = f"{search_query} license:public domain"
+    logger.info(f"Searching DDG with query: '{ddg_query}'")
+    try:
+        with DDGS() as ddgs:
+            results = ddgs.images(ddg_query, safesearch="on", max_results=10)
+            for result in results:
+                image_url = result.get("image")
+                page_url = result.get("url")
+                # Extract domain as source_name (e.g., unsplash.com -> Unsplash)
+                source_match = re.search(r'https?://(?:www\.)?([^/]+)', page_url)
+                source_name = source_match.group(1).capitalize() if source_match else "Public Domain"
+                if image_url and image_url.endswith(('.jpg', '.jpeg', '.png')):
+                    result = process_image(image_url, source_name, page_url)
                     if result:
                         return result
-
-    # Step 4: Fallback using main topic
-    logging.info(f"No results found. Falling back to main topic: '{main_topic}'")
-    photos = search_flickr(main_topic)
-    for photo in photos:
-        result = process_photo(photo, main_topic)
-        if result:
-            return result
-
-    # Step 5: Final fallback using relevance keywords
-    fallback_query = " ".join(relevance_keywords) if isinstance(relevance_keywords, list) else relevance_keywords
-    logging.info(f"No results with main topic. Falling back to relevance keywords: '{fallback_query}'")
-    photos = search_flickr(fallback_query)
-    for photo in photos:
-        result = process_photo(photo, search_query)
-        if result:
-            return result
-
-    logging.warning(f"No valid Flickr image found for query '{search_query}' after all attempts.")
+    except Exception as e:
+        logger.warning(f"DDG search failed for '{ddg_query}': {e}")
+
+    # Step 2: Fallback to Pixabay
+    logger.info(f"No valid DDG images, falling back to Pixabay for '{search_query}'")
+    image_url, source_name, uploader, page_url = get_image(search_query)
+    if image_url:
+        used_images.add(image_url)
+        save_used_images()
+        logger.info(f"Selected Pixabay image: {image_url}")
+        return image_url, source_name, uploader, page_url
+
+    logger.warning(f"No valid images found for query '{search_query}'")
     return None, None, None, None
 
 def select_best_author(content, interest_score):
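
Note: the rewritten function keeps its old name and its four-tuple return contract (image URL, source name, uploader, page URL, or all None), so existing call sites should not need changes. A minimal caller sketch follows; the query strings and the print-based handling are illustrative assumptions, not code from this change.

# Hypothetical caller showing how the four-tuple return is consumed.
from foodie_utils import get_flickr_image

image_url, source_name, uploader, page_url = get_flickr_image(
    "neapolitan pizza",         # search_query (example value)
    ["pizza", "italian food"],  # relevance_keywords (example value)
    "pizza",                    # main_topic (example value)
)
if image_url is None:
    print("No usable image found")
else:
    print(f"Using {image_url} (source: {source_name}, uploader: {uploader}, page: {page_url})")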