add back in DDG search for flickr

2025-05-01 17:31:27 +10:00
parent 86e208c07e
commit 6376129827
1 changed files with 258 additions and 20 deletions
@@ -225,7 +225,6 @@ def get_image(search_query):
    flickr_request_count += 1
    logging.info(f"Flickr request count: {flickr_request_count}/3600")
    
-    # Enforce a minimum delay of 1 second between Flickr requests
    current_time = time.time()
    time_since_last_request = current_time - last_flickr_request_time
    if time_since_last_request < 1:
@@ -235,7 +234,6 @@ def get_image(search_query):
    
    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
    
-    # Helper function to search Flickr with a given query
    def search_flickr(query, per_page=20):
        try:
            photos = flickr_api.Photo.search(
@@ -251,7 +249,14 @@ def get_image(search_query):
            logging.warning(f"Flickr API error for query '{query}': {e}")
            return []

-    # Helper function to process a photo
+    def fetch_photo_by_id(photo_id):
+        """Build a flickr_api Photo handle for photo_id.
+
+        Returns the Photo object, or None (after logging a warning) if
+        constructing it raises.
+        """
+        # NOTE(review): constructing the handle may not contact Flickr at all;
+        # confirm whether an unknown ID can actually fail here or only later,
+        # when the photo's attributes are first read.
+        try:
+            return flickr_api.Photo(id=photo_id)
+        except Exception as err:
+            logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {err}")
+            return None
+
    def process_photo(photo):
        tags = [tag.text.lower() for tag in photo.getTags()]
        title = photo.title.lower() if photo.title else ""
@@ -320,7 +325,28 @@ def get_image(search_query):
            if temp_file and os.path.exists(temp_path):
                os.unlink(temp_path)

-    # Helper function to classify keywords as specific or generic
+    def search_ddg_for_flickr(query):
+        """Scrape a DuckDuckGo results page for Flickr photo IDs matching query.
+
+        Returns a set of numeric photo-ID strings taken from anchors whose href
+        contains flickr.com/photos/<owner>/<id>. Returns an empty set when the
+        request or the parsing fails; the failure is logged, never raised.
+        """
+        # Function-scope import: only `quote` is visibly in scope at file level.
+        from urllib.parse import unquote
+
+        # Compiled once instead of being looked up again for every anchor.
+        photo_url_re = re.compile(r'flickr\.com/photos/[^/]+/(\d+)')
+
+        ddg_query = f"{query} site:flickr.com"
+        ddg_url = f"https://duckduckgo.com/?q={quote(ddg_query)}"
+        # NOTE(review): this endpoint may answer with a JavaScript-driven page
+        # that carries no result anchors in response.text; confirm that results
+        # are really present, or switch to DuckDuckGo's no-JS HTML endpoint.
+        try:
+            response = requests.get(ddg_url, headers=headers, timeout=10)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            photo_ids = set()
+            for link in soup.find_all('a', href=True):
+                href = link['href']
+                # Result anchors can be redirect links that carry the target URL
+                # percent-encoded in a query parameter. Try the raw href first
+                # (original behaviour), then the decoded form.
+                match = photo_url_re.search(href) or photo_url_re.search(unquote(href))
+                if match:
+                    photo_ids.add(match.group(1))
+
+            logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
+            return photo_ids
+        except Exception as e:
+            # Deliberately broad: the DDG lookup is best-effort and must never
+            # stop the caller from falling through to the Flickr API search.
+            logging.warning(f"DDG search failed for query '{ddg_query}': {e}")
+            return set()
+
    def classify_keywords(keywords):
        prompt = (
            "Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities) or 'generic' (e.g., common or abstract terms). "
@@ -330,15 +356,14 @@ def get_image(search_query):
            "```json\n"
            "{\n"
            "  \"Wingstop\": \"specific\",\n"
-            "  \"Smart\": \"generic\",\n"
-            "  \"Kitchen\": \"generic\"\n"
+            "  \"dining\": \"generic\"\n"
            "}\n```"
        )
        try:
            response = client.chat.completions.create(
                model=LIGHT_TASK_MODEL,
                messages=[
-                    {"role": "system", "content": "You are a helpful assistant that classifies keywords."},
+                    {"role": "system", "content": "You are a helper that classifies keywords."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=100,
@@ -356,21 +381,23 @@ def get_image(search_query):
            logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
            return {kw: "specific" for kw in keywords}

-    # Step 1: Try the original search query on Flickr
-    logging.info(f"Searching Flickr with original query: '{search_query}'")
-    photos = search_flickr(search_query)
-    for photo in photos:
-        result = process_photo(photo)
-        if result:
-            return result
+    # Step 1: Search DDG to find Flickr photo IDs
+    logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
+    photo_ids = search_ddg_for_flickr(search_query)
+    if photo_ids:
+        for photo_id in photo_ids:
+            photo = fetch_photo_by_id(photo_id)
+            if photo:
+                result = process_photo(photo)
+                if result:
+                    return result

-    # Step 2: Break down the query into keywords and classify them
+    # Step 2: Break down the query into keywords and classify them for direct Flickr API search
    keywords = search_query.lower().split()
    if len(keywords) > 1:
        classifications = classify_keywords(keywords)
        logging.info(f"Keyword classifications: {classifications}")
        
-        # Prioritize specific keywords
        specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
        if specific_keywords:
            for keyword in specific_keywords:
@@ -382,10 +409,8 @@ def get_image(search_query):
                        return result

    # Step 3: Final fallback to a generic food-related query
-    # Use a simple generic query derived from context (e.g., "food dining")
-    fallback_query = "food dining"  # This could be further contextualized if needed
-    logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
-    photos = search_flickr(fallback_query)
+    logging.info(f"No results found. Falling back to generic query: 'food dining'")
+    photos = search_flickr("food dining")
    for photo in photos:
        result = process_photo(photo)
        if result:
@@ -1000,6 +1025,219 @@ def reset_flickr_request_count():
        flickr_request_count = 0
        flickr_request_start_time = time.time()

+def get_flickr_image(search_query, relevance_keywords):
+    """Find a usable Flickr image for search_query.
+
+    Sources are tried in this order, stopping at the first accepted photo:
+      1. Flickr photo IDs scraped from a DuckDuckGo "site:flickr.com" search.
+      2. Flickr API searches for each keyword of search_query that the chat
+         model classifies as "specific" (only when the query has 2+ words).
+      3. A Flickr API search for relevance_keywords.
+
+    A photo is accepted when none of exclude_keywords appears in its tags or
+    title, its image URL is not already in used_images, it downloads
+    successfully, and OCR finds at most 200 characters of text in it. The
+    accepted URL is added to used_images and a JSON record is appended to the
+    flickr_images.json log file.
+
+    Args:
+        search_query: Free-text image query; also stored as the "title" of
+            the saved record.
+        relevance_keywords: Fallback query, either a string or a list/tuple
+            of strings (joined with spaces).
+
+    Returns:
+        (image_url, "Flickr", uploader, page_url) for the first accepted
+        photo, or (None, None, None, None) when nothing qualifies.
+    """
+    global last_flickr_request_time, flickr_request_count
+
+    reset_flickr_request_count()
+    flickr_request_count += 1
+    logging.info(f"Flickr request count: {flickr_request_count}/3600")
+
+    # Enforce a minimum delay of 1 second between calls.
+    # NOTE(review): the counter and the delay are applied once per call, while
+    # a single call can issue several Flickr requests (search, tags, sizes).
+    time_since_last_request = time.time() - last_flickr_request_time
+    if time_since_last_request < 1:
+        time.sleep(1 - time_since_last_request)
+    last_flickr_request_time = time.time()
+
+    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
+
+    def search_flickr(query, per_page=20):
+        """Run a Flickr text search; return the photos, or [] on API error."""
+        try:
+            return flickr_api.Photo.search(
+                text=query,
+                per_page=per_page,
+                sort='relevance',
+                safe_search=1,
+                media='photos',
+                license='4,5,9,10'
+            )
+        except Exception as e:
+            logging.warning(f"Flickr API error for query '{query}': {e}")
+            return []
+
+    def fetch_photo_by_id(photo_id):
+        """Build a Photo handle for photo_id; return None if that raises."""
+        try:
+            return flickr_api.Photo(id=photo_id)
+        except Exception as e:
+            logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}")
+            return None
+
+    def process_photo(photo):
+        """Vet one photo; return the result tuple, or None if it is rejected."""
+        tags = [tag.text.lower() for tag in photo.getTags()]
+        title = photo.title.lower() if photo.title else ""
+
+        matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
+        if matched_keywords:
+            logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
+            return None
+
+        # Prefer the 'Large' rendition, then 'Medium'. The size lookup may
+        # raise instead of returning a falsy value when a size is missing, so
+        # each attempt is guarded separately to keep the fallback reachable.
+        img_url = None
+        for size_label in ('Large', 'Medium'):
+            try:
+                img_url = photo.getPhotoFile(size_label=size_label)
+            except Exception as e:
+                logging.info(f"Size '{size_label}' unavailable for Flickr photo {photo.id}: {e}")
+                img_url = None
+            if img_url:
+                break
+        if not img_url or img_url in used_images:
+            return None
+
+        # Bound before the try so the finally clause can always test it; the
+        # original left it undefined when the temp-file write failed.
+        temp_path = None
+        try:
+            img_response = requests.get(img_url, headers=headers, timeout=10)
+            img_response.raise_for_status()
+            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
+                temp_path = temp_file.name
+                temp_file.write(img_response.content)
+
+            # Context manager releases the image's file handle after OCR.
+            with Image.open(temp_path) as img:
+                text = pytesseract.image_to_string(img)
+            char_count = len(text.strip())
+            logging.info(f"OCR processed {img_url}: {char_count} characters detected")
+
+            if char_count > 200:
+                logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
+                return None
+
+            uploader = photo.owner.username
+            page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
+
+            used_images.add(img_url)
+            save_used_images()
+
+            flickr_data = {
+                "title": search_query,
+                "image_url": img_url,
+                "source": "Flickr",
+                "uploader": uploader,
+                "page_url": page_url,
+                "timestamp": datetime.now(timezone.utc).isoformat(),
+                "ocr_chars": char_count
+            }
+            # One JSON object per line, appended.
+            flickr_file = "/home/shane/foodie_automator/flickr_images.json"
+            with open(flickr_file, 'a') as f:
+                json.dump(flickr_data, f)
+                f.write('\n')
+            logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
+
+            logging.info(f"Fetched Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
+            return img_url, "Flickr", uploader, page_url
+
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 429:
+                logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
+            else:
+                logging.warning(f"Download failed for {img_url}: {e}")
+            return None
+        except Exception as e:
+            logging.warning(f"OCR processing failed for {img_url}: {e}")
+            return None
+        finally:
+            if temp_path and os.path.exists(temp_path):
+                os.unlink(temp_path)
+
+    def first_usable(photos):
+        """Return the first non-empty process_photo result, or None."""
+        for photo in photos:
+            try:
+                result = process_photo(photo)
+            except Exception as e:
+                # One unreadable photo (e.g. a stale or private ID scraped
+                # from DDG) must not abort the whole search.
+                logging.warning(f"Skipping Flickr photo after unexpected error: {e}")
+                continue
+            if result:
+                return result
+        return None
+
+    def search_ddg_for_flickr(query):
+        """Scrape DuckDuckGo for Flickr photo IDs; return a set (empty on failure)."""
+        # Function-scope import: only `quote` is visibly in scope at file level.
+        from urllib.parse import unquote
+
+        # Match Flickr photo URLs like https://www.flickr.com/photos/username/1234567890
+        photo_url_re = re.compile(r'flickr\.com/photos/[^/]+/(\d+)')
+
+        ddg_query = f"{query} site:flickr.com"
+        ddg_url = f"https://duckduckgo.com/?q={quote(ddg_query)}"
+        # NOTE(review): this endpoint may answer with a JavaScript-driven page
+        # that carries no result anchors in response.text; confirm that results
+        # are really present, or switch to DuckDuckGo's no-JS HTML endpoint.
+        try:
+            response = requests.get(ddg_url, headers=headers, timeout=10)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            photo_ids = set()
+            for link in soup.find_all('a', href=True):
+                href = link['href']
+                # Redirect-style anchors carry the target URL percent-encoded;
+                # try the raw href first, then the decoded form.
+                match = photo_url_re.search(href) or photo_url_re.search(unquote(href))
+                if match:
+                    photo_ids.add(match.group(1))
+
+            logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
+            return photo_ids
+        except Exception as e:
+            logging.warning(f"DDG search failed for query '{ddg_query}': {e}")
+            return set()
+
+    def classify_keywords(keywords):
+        """Ask the chat model to label each keyword 'specific' or 'generic'.
+
+        Returns a dict mapping keyword to label. Every failure path (API
+        error, unparseable reply, reply that is not a JSON object) falls back
+        to labelling all keywords 'specific'.
+        """
+        prompt = (
+            "Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities) or 'generic' (e.g., common or abstract terms). "
+            "Return a JSON object mapping each keyword to its classification.\n\n"
+            "Keywords: " + ", ".join(keywords) + "\n\n"
+            "Example output:\n"
+            "```json\n"
+            "{\n"
+            "  \"Wingstop\": \"specific\",\n"
+            "  \"dining\": \"generic\"\n"
+            "}\n```"
+        )
+        try:
+            response = client.chat.completions.create(
+                model=LIGHT_TASK_MODEL,
+                messages=[
+                    {"role": "system", "content": "You are a helper that classifies keywords."},
+                    {"role": "user", "content": prompt}
+                ],
+                max_tokens=100,
+                temperature=0.5
+            )
+            raw_response = response.choices[0].message.content
+            json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response)
+            classifications = json.loads(json_match.group(1)) if json_match else None
+            # The caller iterates .items(), so anything but a dict is a parse failure.
+            if not isinstance(classifications, dict):
+                logging.warning(f"Failed to parse keyword classification JSON: {raw_response}")
+                return {kw: "specific" for kw in keywords}
+            return classifications
+        except Exception as e:
+            logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
+            return {kw: "specific" for kw in keywords}
+
+    # Step 1: Search DDG to find Flickr photo IDs
+    # NOTE(review): photos reached this way are not restricted to the license
+    # list that search_flickr passes to the API; confirm that is intended.
+    logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
+    ddg_photos = (fetch_photo_by_id(photo_id) for photo_id in search_ddg_for_flickr(search_query))
+    result = first_usable(photo for photo in ddg_photos if photo)
+    if result:
+        return result
+
+    # Step 2: Break down the query into keywords and classify them for direct Flickr API search
+    keywords = search_query.lower().split()
+    if len(keywords) > 1:
+        classifications = classify_keywords(keywords)
+        logging.info(f"Keyword classifications: {classifications}")
+
+        # Only keywords labelled "specific" are searched; generic ones are skipped.
+        specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
+        for keyword in specific_keywords:
+            logging.info(f"Searching Flickr with specific keyword: '{keyword}'")
+            result = first_usable(search_flickr(keyword))
+            if result:
+                return result
+
+    # Step 3: Final fallback using relevance keywords
+    if isinstance(relevance_keywords, (list, tuple)):
+        fallback_query = " ".join(relevance_keywords)
+    else:
+        fallback_query = relevance_keywords
+    logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
+    # An empty or missing fallback query cannot yield a relevant image, so the
+    # API call is skipped in that case.
+    if fallback_query:
+        result = first_usable(search_flickr(fallback_query))
+        if result:
+            return result
+
+    logging.warning(f"No valid Flickr image found for query '{search_query}' after all attempts.")
+    return None, None, None, None
+    
 def select_best_author(summary):
    try:
        response = client.chat.completions.create(