From 07a68837a3ca09d76e9c04a9ed3604092144b171 Mon Sep 17 00:00:00 2001 From: Shane Date: Sun, 11 May 2025 21:21:50 +1000 Subject: [PATCH] Minimum Resolution Filter 1280px --- foodie_utils.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/foodie_utils.py b/foodie_utils.py index 0ff00ac..071a4e0 100644 --- a/foodie_utils.py +++ b/foodie_utils.py @@ -1086,14 +1086,27 @@ def get_flickr_image(search_query, relevance_keywords, main_topic): logger = logging.getLogger(__name__) def process_image(image_url, source_name, page_url): - """Download image, check for text with OCR, and validate.""" + """Download image, check for text with OCR, validate resolution, and exclude screenshots.""" try: headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'} response = requests.get(image_url, headers=headers, timeout=10) response.raise_for_status() img = Image.open(io.BytesIO(response.content)) - # OCR to detect text + # Check image resolution + width, height = img.size + min_dimension = 1280 # Minimum width or height for high quality + if width < min_dimension and height < min_dimension: + logger.info(f"Skipping low-resolution image: {image_url} ({width}x{height})") + return None + + # Attempt to detect screenshots via aspect ratio or naming + aspect_ratio = width / height + if (0.9 <= aspect_ratio <= 1.1) or "screenshot" in image_url.lower(): + logger.info(f"Skipping potential screenshot: {image_url} (aspect ratio: {aspect_ratio})") + return None + + # OCR to detect text (unchanged) text = pytesseract.image_to_string(img).strip() word_count = len(text.split()) if word_count > 10: @@ -1107,7 +1120,7 @@ def get_flickr_image(search_query, relevance_keywords, main_topic): used_images.add(image_url) save_used_images() uploader = "Unknown" - logger.info(f"Selected image: {image_url} from {source_name}") + logger.info(f"Selected image: {image_url} from {source_name} ({width}x{height})") return image_url, source_name, uploader, page_url except Exception as e: logger.warning(f"Failed to process image {image_url}: {e}") @@ -1118,16 +1131,14 @@ def get_flickr_image(search_query, relevance_keywords, main_topic): logger.info(f"Searching DDG with query: '{ddg_query}'") try: with DDGS() as ddgs: - results = ddgs.images(ddg_query, safesearch="on", max_results=10) + results = ddgs.images(ddg_query, safesearch="on", max_results=20) # Increased to 20 for more options for result in results: image_url = result.get("image") page_url = result.get("url") - # Extract domain and remove top-level domain (e.g., .cn, .com) source_match = re.search(r'https?://(?:www\.)?([^/]+)', page_url) if source_match: - domain = source_match.group(1) # e.g., shine.cn - # Split on last dot and take the first part, then capitalize - source_name = domain.rsplit('.', 1)[0].capitalize() # e.g., Shine + domain = source_match.group(1) + source_name = domain.rsplit('.', 1)[0].capitalize() else: source_name = "Public Domain" if image_url and image_url.endswith(('.jpg', '.jpeg', '.png')): @@ -1144,7 +1155,6 @@ def get_flickr_image(search_query, relevance_keywords, main_topic): used_images.add(image_url) save_used_images() logger.info(f"Selected Pixabay image: {image_url}") - # For Pixabay, source_name is already set to "Pixabay", which is fine return image_url, source_name, uploader, page_url logger.warning(f"No valid images found for query '{search_query}'")