add back in DDG search for flickr

2025-05-01 17:31:27 +10:00
parent 86e208c07e
commit 6376129827
1 changed files with 258 additions and 20 deletions
@@ -225,7 +225,6 @@ def get_image(search_query):
    flickr_request_count += 1
    logging.info(f"Flickr request count: {flickr_request_count}/3600")
    # Enforce a minimum delay of 1 second between Flickr requests
    current_time = time.time()
    time_since_last_request = current_time - last_flickr_request_time
    if time_since_last_request < 1:
@@ -235,7 +234,6 @@ def get_image(search_query):
    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
    # Helper function to search Flickr with a given query
    def search_flickr(query, per_page=20):
        try:
            photos = flickr_api.Photo.search(
@@ -251,7 +249,14 @@ def get_image(search_query):
            logging.warning(f"Flickr API error for query '{query}': {e}")
            return []
-    # Helper function to process a photo
+    def fetch_photo_by_id(photo_id):
        try:
            photo = flickr_api.Photo(id=photo_id)
            return photo
        except Exception as e:
            logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}")
            return None
    def process_photo(photo):
        tags = [tag.text.lower() for tag in photo.getTags()]
        title = photo.title.lower() if photo.title else ""
@@ -320,7 +325,28 @@ def get_image(search_query):
            if temp_file and os.path.exists(temp_path):
                os.unlink(temp_path)
-    # Helper function to classify keywords as specific or generic
+    def search_ddg_for_flickr(query):
        ddg_query = f"{query} site:flickr.com"
        ddg_url = f"https://duckduckgo.com/?q={quote(ddg_query)}"
        try:
            response = requests.get(ddg_url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            photo_ids = set()
            for link in soup.find_all('a', href=True):
                href = link['href']
                match = re.search(r'flickr\.com/photos/[^/]+/(\d+)', href)
                if match:
                    photo_id = match.group(1)
                    photo_ids.add(photo_id)
            logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
            return photo_ids
        except Exception as e:
            logging.warning(f"DDG search failed for query '{ddg_query}': {e}")
            return set()
    def classify_keywords(keywords):
        prompt = (
            "Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities) or 'generic' (e.g., common or abstract terms). "
@@ -330,15 +356,14 @@ def get_image(search_query):
            "```json\n"
            "{\n"
            "  \"Wingstop\": \"specific\",\n"
-            "  \"Smart\": \"generic\",\n"
+            "  \"dining\": \"generic\"\n"
            "  \"Kitchen\": \"generic\"\n"
            "}\n```"
        )
        try:
            response = client.chat.completions.create(
                model=LIGHT_TASK_MODEL,
                messages=[
-                    {"role": "system", "content": "You are a helpful assistant that classifies keywords."},
+                    {"role": "system", "content": "You are a helper that classifies keywords."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=100,
@@ -356,21 +381,23 @@ def get_image(search_query):
            logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
            return {kw: "specific" for kw in keywords}
-    # Step 1: Try the original search query on Flickr
+    # Step 1: Search DDG to find Flickr photo IDs
-    logging.info(f"Searching Flickr with original query: '{search_query}'")
+    logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
-    photos = search_flickr(search_query)
+    photo_ids = search_ddg_for_flickr(search_query)
-    for photo in photos:
+    if photo_ids:
-        result = process_photo(photo)
+        for photo_id in photo_ids:
-        if result:
+            photo = fetch_photo_by_id(photo_id)
-            return result
+            if photo:
                result = process_photo(photo)
                if result:
                    return result
-    # Step 2: Break down the query into keywords and classify them
+    # Step 2: Break down the query into keywords and classify them for direct Flickr API search
    keywords = search_query.lower().split()
    if len(keywords) > 1:
        classifications = classify_keywords(keywords)
        logging.info(f"Keyword classifications: {classifications}")
        # Prioritize specific keywords
        specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
        if specific_keywords:
            for keyword in specific_keywords:
@@ -382,10 +409,8 @@ def get_image(search_query):
                        return result
    # Step 3: Final fallback to a generic food-related query
-    # Use a simple generic query derived from context (e.g., "food dining")
+    logging.info(f"No results found. Falling back to generic query: 'food dining'")
-    fallback_query = "food dining"  # This could be further contextualized if needed
+    photos = search_flickr("food dining")
    logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
    photos = search_flickr(fallback_query)
    for photo in photos:
        result = process_photo(photo)
        if result:
@@ -999,6 +1024,219 @@ def reset_flickr_request_count():
    if time.time() - flickr_request_start_time >= 3600:  # Reset every hour
        flickr_request_count = 0
        flickr_request_start_time = time.time()
 def get_flickr_image(search_query, relevance_keywords):
    """Find a usable Flickr image for *search_query*.

    Candidates are tried in three stages, stopping at the first photo that
    passes all checks:

    1. Flickr photo IDs scraped from a DuckDuckGo ``site:flickr.com`` search.
    2. Direct Flickr API searches for each keyword of the query that the
       language model classifies as "specific" (multi-word queries only).
    3. A Flickr API search built from *relevance_keywords*.

    A photo is rejected when its tags/title contain any of ``exclude_keywords``,
    when no Large/Medium file URL is available, when the URL is already in
    ``used_images``, when the download fails, or when OCR finds more than 200
    characters of text in it.

    Side effects on success: the URL is added to ``used_images`` (persisted via
    ``save_used_images()``) and a JSON line describing the image is appended to
    ``flickr_images.json``.

    Note: the request counter and the 1-second spacing are applied once per
    call to this function, not once per underlying Flickr API request.

    Args:
        search_query: Free-text query describing the wanted image.
        relevance_keywords: Fallback query; a list/tuple of words (joined with
            spaces) or a ready-made string. Empty/None skips the fallback.

    Returns:
        ``(image_url, "Flickr", uploader, page_url)`` on success, otherwise
        ``(None, None, None, None)``.
    """
    # Local import keeps this block self-contained; only `quote` is known to be
    # imported at file level.
    from urllib.parse import unquote

    global last_flickr_request_time, flickr_request_count

    reset_flickr_request_count()
    flickr_request_count += 1
    logging.info(f"Flickr request count: {flickr_request_count}/3600")

    # Enforce a minimum delay of 1 second between Flickr requests
    current_time = time.time()
    time_since_last_request = current_time - last_flickr_request_time
    if time_since_last_request < 1:
        time.sleep(1 - time_since_last_request)

    last_flickr_request_time = time.time()

    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}

    # Compiled once; matches URLs like https://www.flickr.com/photos/username/1234567890
    flickr_photo_re = re.compile(r'flickr\.com/photos/[^/]+/(\d+)')

    # Helper function to search Flickr with a given query
    def search_flickr(query, per_page=20):
        """Return Flickr search results for *query*, or [] on any API error."""
        try:
            photos = flickr_api.Photo.search(
                text=query,
                per_page=per_page,
                sort='relevance',
                safe_search=1,
                media='photos',
                license='4,5,9,10'
            )
            return photos
        except Exception as e:
            logging.warning(f"Flickr API error for query '{query}': {e}")
            return []

    # Helper function to fetch a Flickr photo by ID
    def fetch_photo_by_id(photo_id):
        """Build a Photo object for *photo_id*; None if construction fails."""
        # NOTE(review): photos reached this way bypass the license filter that
        # search_flickr() passes to the API -- confirm licensing is acceptable.
        try:
            photo = flickr_api.Photo(id=photo_id)
            return photo
        except Exception as e:
            logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}")
            return None

    # Helper function to process a photo
    def process_photo(photo):
        """Validate, download and OCR-check *photo*.

        Returns the 4-tuple result on success, or None if the photo must be
        skipped. Never raises: a bad candidate must not abort the whole search.
        """
        # Tag/title lookups may call the Flickr API (e.g. for a photo built only
        # from an ID), so a private or deleted photo can raise here.
        try:
            tags = [tag.text.lower() for tag in photo.getTags()]
            title = photo.title.lower() if photo.title else ""
        except Exception as e:
            logging.warning(f"Failed to load Flickr metadata for photo {getattr(photo, 'id', None)}: {e}")
            return None

        matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
        if matched_keywords:
            logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
            return None

        # Prefer Large, then Medium. A missing size may be reported either as a
        # falsy value or as an exception, so handle both.
        img_url = None
        for size_label in ('Large', 'Medium'):
            try:
                img_url = photo.getPhotoFile(size_label=size_label)
            except Exception as e:
                logging.warning(f"Flickr size '{size_label}' unavailable for photo {getattr(photo, 'id', None)}: {e}")
                img_url = None
            if img_url:
                break

        if not img_url or img_url in used_images:
            return None

        # Set as soon as the temp file exists so `finally` can always clean up.
        temp_path = None
        try:
            img_response = requests.get(img_url, headers=headers, timeout=10)
            img_response.raise_for_status()
            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
                temp_path = temp_file.name
                temp_file.write(img_response.content)

            # Close the image before the temp file is unlinked.
            with Image.open(temp_path) as img:
                text = pytesseract.image_to_string(img)
            char_count = len(text.strip())
            logging.info(f"OCR processed {img_url}: {char_count} characters detected")

            if char_count > 200:
                logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
                return None

            uploader = photo.owner.username
            page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"

            used_images.add(img_url)
            save_used_images()

            flickr_data = {
                "title": search_query,
                "image_url": img_url,
                "source": "Flickr",
                "uploader": uploader,
                "page_url": page_url,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "ocr_chars": char_count
            }
            flickr_file = "/home/shane/foodie_automator/flickr_images.json"
            # One JSON object per line (append-only log of accepted images).
            with open(flickr_file, 'a') as f:
                json.dump(flickr_data, f)
                f.write('\n')
            logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")

            logging.info(f"Fetched Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
            return img_url, "Flickr", uploader, page_url

        except requests.exceptions.HTTPError as e:
            if e.response is not None and e.response.status_code == 429:
                logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
            else:
                logging.warning(f"Download failed for {img_url}: {e}")
            return None
        except Exception as e:
            logging.warning(f"OCR processing failed for {img_url}: {e}")
            return None
        finally:
            if temp_path and os.path.exists(temp_path):
                os.unlink(temp_path)

    # Helper function to search DDG and extract Flickr photo IDs
    def search_ddg_for_flickr(query):
        """Return Flickr photo IDs found via DuckDuckGo, in result order.

        Returns an empty list when the request or parsing fails.
        """
        ddg_query = f"{query} site:flickr.com"
        # NOTE(review): this endpoint may serve a script-rendered page without
        # result links in the raw HTML -- verify it actually yields anchors.
        ddg_url = f"https://duckduckgo.com/?q={quote(ddg_query)}"
        try:
            response = requests.get(ddg_url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # A list plus a seen-set de-duplicates while keeping the search
            # engine's ranking (a plain set would iterate in arbitrary order).
            photo_ids = []
            seen_ids = set()
            # Look for Flickr URLs in the search results
            for link in soup.find_all('a', href=True):
                # Result links can be redirect-wrapped with the target URL
                # percent-encoded, so decode before matching.
                href = unquote(link['href'])
                match = flickr_photo_re.search(href)
                if not match:
                    continue
                photo_id = match.group(1)
                if photo_id not in seen_ids:
                    seen_ids.add(photo_id)
                    photo_ids.append(photo_id)

            logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
            return photo_ids
        except Exception as e:
            logging.warning(f"DDG search failed for query '{ddg_query}': {e}")
            return []

    # Helper function to classify keywords as specific or generic
    def classify_keywords(keywords):
        """Map each keyword to 'specific' or 'generic' using the chat model.

        Any failure (API error, unparseable or non-object JSON) falls back to
        treating every keyword as 'specific'.
        """
        prompt = (
            "Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities) or 'generic' (e.g., common or abstract terms). "
            "Return a JSON object mapping each keyword to its classification.\n\n"
            "Keywords: " + ", ".join(keywords) + "\n\n"
            "Example output:\n"
            "```json\n"
            "{\n"
            "  \"Wingstop\": \"specific\",\n"
            "  \"dining\": \"generic\"\n"
            "}\n```"
        )
        try:
            response = client.chat.completions.create(
                model=LIGHT_TASK_MODEL,
                messages=[
                    {"role": "system", "content": "You are a helper that classifies keywords."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=100,
                temperature=0.5
            )
            raw_response = response.choices[0].message.content
            json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response)
            if not json_match:
                logging.warning(f"Failed to parse keyword classification JSON: {raw_response}")
                return {kw: "specific" for kw in keywords}

            classifications = json.loads(json_match.group(1))
            # The caller iterates .items(); reject valid-but-wrong-shape JSON
            # (e.g. a list) here instead of crashing there.
            if not isinstance(classifications, dict):
                logging.warning(f"Failed to parse keyword classification JSON: {raw_response}")
                return {kw: "specific" for kw in keywords}
            return classifications
        except Exception as e:
            logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
            return {kw: "specific" for kw in keywords}

    # Step 1: Search DDG to find Flickr photo IDs
    logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
    for photo_id in search_ddg_for_flickr(search_query):
        photo = fetch_photo_by_id(photo_id)
        if photo:
            result = process_photo(photo)
            if result:
                return result

    # Step 2: Break down the query into keywords and classify them for direct Flickr API search
    keywords = search_query.lower().split()
    if len(keywords) > 1:
        classifications = classify_keywords(keywords)
        logging.info(f"Keyword classifications: {classifications}")

        # Prioritize specific keywords
        specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
        for keyword in specific_keywords:
            logging.info(f"Searching Flickr with specific keyword: '{keyword}'")
            photos = search_flickr(keyword)
            for photo in photos:
                result = process_photo(photo)
                if result:
                    return result

    # Step 3: Final fallback using relevance keywords
    if isinstance(relevance_keywords, (list, tuple)):
        fallback_query = " ".join(relevance_keywords)
    else:
        fallback_query = relevance_keywords
    # Skip the fallback when there is nothing meaningful to search for.
    if fallback_query:
        logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
        photos = search_flickr(fallback_query)
        for photo in photos:
            result = process_photo(photo)
            if result:
                return result

    logging.warning(f"No valid Flickr image found for query '{search_query}' after all attempts.")
    return None, None, None, None
 def select_best_author(summary):
    try: