From 07a68837a3ca09d76e9c04a9ed3604092144b171 Mon Sep 17 00:00:00 2001
From: Shane <shanehill@mail.com>
Date: Sun, 11 May 2025 21:21:50 +1000
Subject: [PATCH] Add 1280px minimum-resolution and screenshot filters for images

---
 foodie_utils.py | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/foodie_utils.py b/foodie_utils.py
index 0ff00ac..071a4e0 100644
--- a/foodie_utils.py
+++ b/foodie_utils.py
@@ -1086,14 +1086,27 @@ def get_flickr_image(search_query, relevance_keywords, main_topic):
     logger = logging.getLogger(__name__)
     
     def process_image(image_url, source_name, page_url):
-        """Download image, check for text with OCR, and validate."""
+        """Download image, check for text with OCR, validate resolution, and exclude screenshots."""
         try:
             headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
             response = requests.get(image_url, headers=headers, timeout=10)
             response.raise_for_status()
             img = Image.open(io.BytesIO(response.content))
             
-            # OCR to detect text
+            # Reject the image only when both width and height fall below the minimum
+            width, height = img.size
+            min_dimension = 1280  # Minimum width or height for high quality
+            if width < min_dimension and height < min_dimension:
+                logger.info(f"Skipping low-resolution image: {image_url} ({width}x{height})")
+                return None
+            
+            # Heuristic screenshot filter: near-square aspect ratio or "screenshot" in the URL
+            aspect_ratio = width / height
+            if (0.9 <= aspect_ratio <= 1.1) or "screenshot" in image_url.lower():
+                logger.info(f"Skipping potential screenshot: {image_url} (aspect ratio: {aspect_ratio})")
+                return None
+            
+            # Run OCR and count the words of text found in the image
             text = pytesseract.image_to_string(img).strip()
             word_count = len(text.split())
             if word_count > 10:
@@ -1107,7 +1120,7 @@ def get_flickr_image(search_query, relevance_keywords, main_topic):
             used_images.add(image_url)
             save_used_images()
             uploader = "Unknown"
-            logger.info(f"Selected image: {image_url} from {source_name}")
+            logger.info(f"Selected image: {image_url} from {source_name} ({width}x{height})")
             return image_url, source_name, uploader, page_url
         except Exception as e:
             logger.warning(f"Failed to process image {image_url}: {e}")
@@ -1118,16 +1131,14 @@ def get_flickr_image(search_query, relevance_keywords, main_topic):
     logger.info(f"Searching DDG with query: '{ddg_query}'")
     try:
         with DDGS() as ddgs:
-            results = ddgs.images(ddg_query, safesearch="on", max_results=10)
+            results = ddgs.images(ddg_query, safesearch="on", max_results=20)  # Increased to 20 for more options
             for result in results:
                 image_url = result.get("image")
                 page_url = result.get("url")
-                # Extract domain and remove top-level domain (e.g., .cn, .com)
                 source_match = re.search(r'https?://(?:www\.)?([^/]+)', page_url)
                 if source_match:
-                    domain = source_match.group(1)  # e.g., shine.cn
-                    # Split on last dot and take the first part, then capitalize
-                    source_name = domain.rsplit('.', 1)[0].capitalize()  # e.g., Shine
+                    domain = source_match.group(1)
+                    source_name = domain.rsplit('.', 1)[0].capitalize()
                 else:
                     source_name = "Public Domain"
                 if image_url and image_url.endswith(('.jpg', '.jpeg', '.png')):
@@ -1144,7 +1155,6 @@ def get_flickr_image(search_query, relevance_keywords, main_topic):
         used_images.add(image_url)
         save_used_images()
         logger.info(f"Selected Pixabay image: {image_url}")
-        # For Pixabay, source_name is already set to "Pixabay", which is fine
         return image_url, source_name, uploader, page_url
     
     logger.warning(f"No valid images found for query '{search_query}'")