update image flickr prompt

2025-05-01 17:06:51 +10:00
parent bdf09a3355
commit 163e50c2b3
1 changed files with 188 additions and 211 deletions
@@ -62,10 +62,16 @@ def save_json_file(filename, key, value):
        # Remove duplicates by title
        data = [item for item in data if item["title"] != key]
        data.append(entry)
-        with open(filename, 'w') as f:
+        # Special handling for used_images.json to save as a flat list
-            for item in data:
+        if filename.endswith('used_images.json'):
-                json.dump(item, f)
+            flat_data = [item["title"] for item in data if isinstance(item, dict) and "title" in item]
-                f.write('\n')
+            with open(filename, 'w') as f:
                json.dump(flat_data, f)
        else:
            with open(filename, 'w') as f:
                for item in data:
                    json.dump(item, f)
                    f.write('\n')
        logging.info(f"Saved '{key}' to {filename}")
        print(f"DEBUG: Saved '{key}' to {filename}")
        loaded_data = load_json_file(filename, expiration_days=PRUNE_INTERVAL_DAYS)
@@ -227,98 +233,165 @@ def get_image(search_query):
    last_flickr_request_time = time.time()
-    try:
+    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
-        # Try Flickr API first
+    
-        photos = flickr_api.Photo.search(
+    # Helper function to search Flickr with a given query
-            text=search_query,
+    def search_flickr(query, per_page=20):
-            per_page=10,
+        try:
-            sort='relevance',
+            photos = flickr_api.Photo.search(
-            safe_search=1,
+                text=query,
-            media='photos',
+                per_page=per_page,
-            license='4,5,9,10'  # Commercial use licenses
+                sort='relevance',
                safe_search=1,
                media='photos',
                license='4,5,9,10'
            )
            return photos
        except Exception as e:
            logging.warning(f"Flickr API error for query '{query}': {e}")
            return []
    # Helper function to process a photo
    def process_photo(photo):
        """Vet a single Flickr photo and return its details if it is usable.

        The photo is rejected (``None`` is returned) when any of these hold:
        one of ``exclude_keywords`` equals a lower-cased tag or is a substring
        of the lower-cased title; no 'Medium' file URL is available; the URL
        is already in ``used_images``; the download or OCR step fails; or OCR
        detects more than 200 characters of text in the image.

        On success the URL is added to ``used_images`` (persisted via
        ``save_used_images()``), one JSON line describing the image is
        appended to the Flickr metadata file, and the tuple
        ``(img_url, "Flickr", uploader, page_url)`` is returned.
        """
        tags = [tag.text.lower() for tag in photo.getTags()]
        title = photo.title.lower() if photo.title else ""
        matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
        if matched_keywords:
            logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
            return None
        img_url = photo.getPhotoFile(size_label='Medium')
        if not img_url or img_url in used_images:
            return None
        # Track the path (not the file object) so the cleanup in ``finally``
        # never touches an unbound name, whichever step raised.
        temp_path = None
        try:
            img_response = requests.get(img_url, headers=headers, timeout=10)
            img_response.raise_for_status()
            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
                # Record the path before writing: if the write fails, the
                # file already exists on disk and still has to be removed.
                temp_path = temp_file.name
                temp_file.write(img_response.content)
            # Close the image handle as soon as OCR is done.
            with Image.open(temp_path) as img:
                text = pytesseract.image_to_string(img)
            char_count = len(text.strip())
            logging.info(f"OCR processed {img_url}: {char_count} characters detected")
            if char_count > 200:
                logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
                return None
            uploader = photo.owner.username
            page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
            used_images.add(img_url)
            save_used_images()
            flickr_data = {
                "title": search_query,
                "image_url": img_url,
                "source": "Flickr",
                "uploader": uploader,
                "page_url": page_url,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "ocr_chars": char_count
            }
            flickr_file = "/home/shane/foodie_automator/flickr_images.json"
            # Append-only, one JSON object per line.
            with open(flickr_file, 'a') as f:
                json.dump(flickr_data, f)
                f.write('\n')
            logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
            logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
            return img_url, "Flickr", uploader, page_url
        except requests.exceptions.HTTPError as e:
            # Guard against an HTTPError that carries no response object.
            status_code = e.response.status_code if e.response is not None else None
            if status_code == 429:
                # NOTE(review): returning None only rejects this photo; the
                # loops that call this helper go on to the next photo rather
                # than switching to Pixabay immediately, despite the message.
                logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
            else:
                logging.warning(f"Download failed for {img_url}: {e}")
            return None
        except Exception as e:
            # Best-effort: any other failure just disqualifies this photo.
            logging.warning(f"OCR processing failed for {img_url}: {e}")
            return None
        finally:
            if temp_path and os.path.exists(temp_path):
                os.unlink(temp_path)
    # Helper function to classify keywords as specific or generic
    def classify_keywords(keywords):
        prompt = (
            "Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities) or 'generic' (e.g., common or abstract terms). "
            "Return a JSON object mapping each keyword to its classification.\n\n"
            "Keywords: " + ", ".join(keywords) + "\n\n"
            "Example output:\n"
            "```json\n"
            "{\n"
            "  \"Wingstop\": \"specific\",\n"
            "  \"Smart\": \"generic\",\n"
            "  \"Kitchen\": \"generic\"\n"
            "}\n```"
        )
        try:
            response = client.chat.completions.create(
                model=LIGHT_TASK_MODEL,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that classifies keywords."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=100,
                temperature=0.5
            )
            raw_response = response.choices[0].message.content
            json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response)
            if not json_match:
                logging.warning(f"Failed to parse keyword classification JSON: {raw_response}")
                return {kw: "specific" for kw in keywords}
-        headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
+            classifications = json.loads(json_match.group(1))
            return classifications
        except Exception as e:
            logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
            return {kw: "specific" for kw in keywords}
-        for photo in photos:
+    # Step 1: Try the original search query on Flickr
-            # Fetch photo metadata (tags and title)
+    logging.info(f"Searching Flickr with original query: '{search_query}'")
-            tags = [tag.text.lower() for tag in photo.getTags()]
+    photos = search_flickr(search_query)
-            title = photo.title.lower() if photo.title else ""
+    for photo in photos:
        result = process_photo(photo)
        if result:
            return result
-            # Filter out images with unwanted keywords in tags or title
+    # Step 2: Break down the query into keywords and classify them
-            matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
+    keywords = search_query.lower().split()
-            if matched_keywords:
+    if len(keywords) > 1:
-                logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
+        classifications = classify_keywords(keywords)
-                continue
+        logging.info(f"Keyword classifications: {classifications}")
-            img_url = photo.getPhotoFile(size_label='Medium')
+        # Prioritize specific keywords
-            if not img_url:
+        specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
-                continue
+        if specific_keywords:
-            if img_url in used_images:
+            for keyword in specific_keywords:
-                continue
+                logging.info(f"Searching Flickr with specific keyword: '{keyword}'")
                photos = search_flickr(keyword)
                for photo in photos:
                    result = process_photo(photo)
                    if result:
                        return result
-            # Download the image and run OCR to check for excessive text
+    # Step 3: Final fallback to a generic food-related query
-            temp_file = None
+    # Use a simple generic query derived from context (e.g., "food dining")
-            try:
+    fallback_query = "food dining"  # This could be further contextualized if needed
-                img_response = requests.get(img_url, headers=headers, timeout=10)
+    logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
-                img_response.raise_for_status()
+    photos = search_flickr(fallback_query)
-                with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
+    for photo in photos:
-                    temp_file.write(img_response.content)
+        result = process_photo(photo)
-                    temp_path = temp_file.name
+        if result:
            return result
-                img = Image.open(temp_path)
+    logging.warning(f"No valid Flickr image found in fallback for query '{search_query}'. Trying Pixabay.")
                text = pytesseract.image_to_string(img)
                char_count = len(text.strip())
                logging.info(f"OCR processed {img_url}: {char_count} characters detected")
                if char_count > 200:
                    logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
                    continue
                uploader = photo.owner.username
                page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
                # Add the image URL to used_images
                used_images.add(img_url)
                save_used_images()
                # Save Flickr image metadata
                flickr_data = {
                    "title": search_query,
                    "image_url": img_url,
                    "source": "Flickr",
                    "uploader": uploader,
                    "page_url": page_url,
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                    "ocr_chars": char_count
                }
                flickr_file = "/home/shane/foodie_automator/flickr_images.json"
                with open(flickr_file, 'a') as f:
                    json.dump(flickr_data, f)
                    f.write('\n')
                logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
                logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
                return img_url, "Flickr", uploader, page_url
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 429:
                    logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
                    return None, None, None, None
                else:
                    logging.warning(f"Download failed for {img_url}: {e}")
                    continue
            except Exception as e:
                logging.warning(f"OCR processing failed for {img_url}: {e}")
                continue
            finally:
                if temp_file and os.path.exists(temp_path):
                    os.unlink(temp_path)
        logging.warning(f"No valid Flickr image found in fallback for query '{search_query}'. Trying Pixabay.")
    except Exception as e:
        logging.warning(f"Fallback Flickr API error for query '{search_query}': {e}. Falling back to Pixabay.")
    # Fallback to Pixabay
    try:
@@ -334,7 +407,6 @@ def get_image(search_query):
            uploader = hit.get('user', 'Unknown')
            page_url = hit.get('pageURL', img_url)
            # Add the image URL to used_images
            used_images.add(img_url)
            save_used_images()
@@ -350,14 +422,18 @@ def get_image(search_query):
 def generate_image_query(content):
    prompt = (
-        "Given the following content, generate a concise image search query (max 5 words) that would likely yield relevant, visually appealing images on platforms like Flickr or Pixabay. Focus on concrete, visual concepts related to food, dining, or restaurants, avoiding overly abstract terms. Also provide relevance keywords (max 5 words) to filter results. Return the result as a JSON object with 'search' and 'relevance' keys.\n\n"
+        "Given the following content, generate a concise image search query (max 5 words) that would likely yield relevant, visually appealing images on platforms like Flickr or Pixabay. "
        "Identify and prioritize specific entities like brand names or unique terms over abstract or generic concepts. "
        "Focus on concrete, visual concepts related to food, dining, or restaurants. "
        "Also provide relevance keywords (max 5 words) to filter results, using general themes related to the content. "
        "Return the result as a JSON object with 'search' and 'relevance' keys.\n\n"
        "Content:\n"
        f"{content}\n\n"
        "Example output:\n"
        "```json\n"
        "{\n"
-        "  \"search\": \"modern dining trends\",\n"
+        "  \"search\": \"Wingstop dining\",\n"
-        "  \"relevance\": \"dining habits restaurant trends\"\n"
+        "  \"relevance\": \"fast food dining\"\n"
        "}\n```"
    )
@@ -379,18 +455,18 @@ def generate_image_query(content):
        json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response)
        if not json_match:
            logging.warning(f"Failed to parse image query JSON from GPT response: {raw_response}")
-            return "restaurant dining", "dining trends"
+            return "food dining", ["dining", "trends"]
        query_data = json.loads(json_match.group(1))
-        search_query = query_data.get("search", "restaurant dining")
+        search_query = query_data.get("search", "food dining")
-        relevance_keywords = query_data.get("relevance", "dining trends")
+        relevance_keywords = query_data.get("relevance", ["dining", "trends"])
        logging.debug(f"Image query from content: {query_data}")
        return search_query, relevance_keywords
    except Exception as e:
        logging.warning(f"Failed to generate image query: {e}. Using fallback.")
-        return "restaurant dining", "dining trends"
+        return "food dining", ["dining", "trends"]
 def smart_image_and_filter(title, summary):
    try:
@@ -893,7 +969,18 @@ if os.path.exists(used_images_file):
    try:
        with open(used_images_file, 'r') as f:
            data = json.load(f)
-            used_images.update(data)
+            # Handle malformed format (list of lists)
            if isinstance(data, list) and data and isinstance(data[0], list):
                logging.warning(f"Fixing malformed used_images.json format: {data[:2]}...")
                flat_data = []
                for item in data:
                    if isinstance(item, list):
                        flat_data.extend(item)
                    else:
                        flat_data.append(item)
                used_images.update(flat_data)
            else:
                used_images.update(data)
        logging.info(f"Loaded {len(used_images)} used image URLs from {used_images_file}")
    except Exception as e:
        logging.warning(f"Failed to load used images from {used_images_file}: {e}")
@@ -913,117 +1000,7 @@ def reset_flickr_request_count():
        flickr_request_count = 0
        flickr_request_start_time = time.time()
-def get_flickr_image(search_query, relevance_keywords):
+if keyword in ['smart', 'ai', 'ai-powered', 'kitchen', 'dining', 'experience']:
    global last_flickr_request_time, flickr_request_count
    reset_flickr_request_count()
    flickr_request_count += 1
    logging.info(f"Flickr request count: {flickr_request_count}/3600")
    # Enforce a minimum delay of 1 second between Flickr requests
    current_time = time.time()
    time_since_last_request = current_time - last_flickr_request_time
    if time_since_last_request < 1:
        time.sleep(1 - time_since_last_request)
    last_flickr_request_time = time.time()
    try:
        # Search for photos on Flickr using the API
        photos = flickr_api.Photo.search(
            text=search_query,
            per_page=10,
            sort='relevance',
            safe_search=1,
            media='photos',
            license='4,5,9,10'  # Commercial use licenses (CC BY, CC BY-SA, etc.)
        )
        headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
        for photo in photos:
            # Fetch photo metadata (tags and title)
            tags = [tag.text.lower() for tag in photo.getTags()]
            title = photo.title.lower() if photo.title else ""
            # Filter out images with unwanted keywords in tags or title
            matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
            if matched_keywords:
                logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
                continue
            img_url = photo.getPhotoFile(size_label='Large')
            if not img_url:
                img_url = photo.getPhotoFile(size_label='Medium')
            if not img_url:
                continue
            if img_url in used_images:
                continue
            # Download the image and run OCR to check for excessive text
            temp_file = None
            try:
                img_response = requests.get(img_url, headers=headers, timeout=10)
                img_response.raise_for_status()
                with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
                    temp_file.write(img_response.content)
                    temp_path = temp_file.name
                img = Image.open(temp_path)
                text = pytesseract.image_to_string(img)
                char_count = len(text.strip())
                logging.info(f"OCR processed {img_url}: {char_count} characters detected")
                if char_count > 200:
                    logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
                    continue
                uploader = photo.owner.username
                page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
                # Add the image URL to used_images
                used_images.add(img_url)
                save_used_images()
                # Save Flickr image metadata
                flickr_data = {
                    "title": search_query,
                    "image_url": img_url,
                    "source": "Flickr",
                    "uploader": uploader,
                    "page_url": page_url,
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                    "ocr_chars": char_count
                }
                flickr_file = "/home/shane/foodie_automator/flickr_images.json"
                with open(flickr_file, 'a') as f:
                    json.dump(flickr_data, f)
                    f.write('\n')
                logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
                logging.info(f"Fetched Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
                return img_url, "Flickr", uploader, page_url
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 429:
                    logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
                    return None, None, None, None
                else:
                    logging.warning(f"Download failed for {img_url}: {e}")
                    continue
            except Exception as e:
                logging.warning(f"OCR processing failed for {img_url}: {e}")
                continue
            finally:
                if temp_file and os.path.exists(temp_path):
                    os.unlink(temp_path)
        logging.warning(f"No valid Flickr image found for query '{search_query}'.")
        return None, None, None, None
    except Exception as e:
        logging.warning(f"Flickr API error for query '{search_query}': {e}. Falling back to Pixabay.")
        return None, None, None, None
 def select_best_author(summary):
    try: