Stop using the Flickr API for images; search DDG for public domain images and fall back to Pixabay

2025-05-11 19:47:10 +10:00
parent 7c69b4a451
commit 071726f016
1 changed file with 56 additions and 62 deletions
@@ -1077,72 +1077,66 @@ def classify_keywords(keywords):
        return {kw: "specific" for kw in keywords}
 def get_flickr_image(search_query, relevance_keywords, main_topic):
-    global last_flickr_request_time, flickr_request_count
+    global used_images
    logger = logging.getLogger(__name__)
-    reset_flickr_request_count()
+    def process_image(image_url, source_name, page_url):
-    flickr_request_count += 1
+        """Download image, check for text with OCR, and validate."""
-    logging.info(f"Flickr request count: {flickr_request_count}/3600")
+        try:
            headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
            response = requests.get(image_url, headers=headers, timeout=10)
            response.raise_for_status()
            img = Image.open(io.BytesIO(response.content))
            # OCR to detect text
            text = pytesseract.image_to_string(img).strip()
            word_count = len(text.split())
            if word_count > 10:
                logger.info(f"Skipping image with too much text: {image_url} ({word_count} words)")
                return None
            if image_url in used_images:
                logger.info(f"Image already used: {image_url}")
                return None
            used_images.add(image_url)
            save_used_images()
            uploader = "Unknown"  # Most public domain sources don't provide uploader
            logger.info(f"Selected image: {image_url} from {source_name}")
            return image_url, source_name, uploader, page_url
        except Exception as e:
            logger.warning(f"Failed to process image {image_url}: {e}")
            return None
-    current_time = time.time()
+    # Step 1: Search DDG for public domain images
-    time_since_last_request = current_time - last_flickr_request_time
+    ddg_query = f"{search_query} license:public domain"
-    if time_since_last_request < 10:
+    logger.info(f"Searching DDG with query: '{ddg_query}'")
-        time.sleep(10 - time_since_last_request)
+    try:
-    
+        with DDGS() as ddgs:
-    last_flickr_request_time = time.time()
+            results = ddgs.images(ddg_query, safesearch="on", max_results=10)
-    
+            for result in results:
-    # Step 1: Search Flickr directly with the original query
+                image_url = result.get("image")
-    logging.info(f"Searching Flickr directly with query: '{search_query}'")
+                page_url = result.get("url")
-    photos = search_flickr(search_query)
+                # Extract domain as source_name (e.g., unsplash.com -> Unsplash)
-    for photo in photos:
+                source_match = re.search(r'https?://(?:www\.)?([^/]+)', page_url)
-        result = process_photo(photo, search_query)
+                source_name = source_match.group(1).capitalize() if source_match else "Public Domain"
-        if result:
+                if image_url and image_url.endswith(('.jpg', '.jpeg', '.png')):
-            return result
+                    result = process_image(image_url, source_name, page_url)
    # Step 2: Search DDG to find Flickr photo IDs
    logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
    photo_ids = search_ddg_for_flickr(search_query)
    if photo_ids:
        for photo_id in photo_ids:
            photo = fetch_photo_by_id(photo_id)
            if photo:
                result = process_photo(photo, search_query)
                if result:
                    return result
    # Step 3: Break down the query into keywords and classify them
    keywords = search_query.lower().split()
    if len(keywords) > 1:
        classifications = classify_keywords(keywords)
        logging.info(f"Keyword classifications: {classifications}")
        specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
        if specific_keywords:
            for keyword in specific_keywords:
                logging.info(f"Searching Flickr with specific keyword: '{keyword}'")
                photos = search_flickr(keyword)
                for photo in photos:
                    result = process_photo(photo, search_query)
                    if result:
                        return result
-
+    except Exception as e:
-    # Step 4: Fallback using main topic
+        logger.warning(f"DDG search failed for '{ddg_query}': {e}")
-    logging.info(f"No results found. Falling back to main topic: '{main_topic}'")
+    
-    photos = search_flickr(main_topic)
+    # Step 2: Fallback to Pixabay
-    for photo in photos:
+    logger.info(f"No valid DDG images, falling back to Pixabay for '{search_query}'")
-        result = process_photo(photo, main_topic)
+    image_url, source_name, uploader, page_url = get_image(search_query)
-        if result:
+    if image_url:
-            return result
+        used_images.add(image_url)
-
+        save_used_images()
-    # Step 5: Final fallback using relevance keywords
+        logger.info(f"Selected Pixabay image: {image_url}")
-    fallback_query = " ".join(relevance_keywords) if isinstance(relevance_keywords, list) else relevance_keywords
+        return image_url, source_name, uploader, page_url
-    logging.info(f"No results with main topic. Falling back to relevance keywords: '{fallback_query}'")
+    
-    photos = search_flickr(fallback_query)
+    logger.warning(f"No valid images found for query '{search_query}'")
    for photo in photos:
        result = process_photo(photo, search_query)
        if result:
            return result
    logging.warning(f"No valid Flickr image found for query '{search_query}' after all attempts.")
    return None, None, None, None
 def select_best_author(content, interest_score):