From 765967fb8cb345e8c1ed7b2b06b896aec83bfb4e Mon Sep 17 00:00:00 2001 From: Shane Date: Sun, 11 May 2025 22:17:59 +1000 Subject: [PATCH] skip youtube --- foodie_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/foodie_utils.py b/foodie_utils.py index 905a196..ba2a4bb 100644 --- a/foodie_utils.py +++ b/foodie_utils.py @@ -1086,8 +1086,14 @@ def get_flickr_image(search_query, relevance_keywords, main_topic): logger = logging.getLogger(__name__) def process_image(image_url, source_name, page_url): - """Download image, check for text with OCR, validate resolution, exclude screenshots and watermarks.""" + """Download image, check for text with OCR, validate resolution, exclude screenshots, watermarks, and YouTube images.""" try: + # Check for YouTube images via URL or page URL + youtube_domains = ['youtube.com', 'ytimg.com'] + if any(domain in image_url.lower() or domain in page_url.lower() for domain in youtube_domains): + logger.info(f"Skipping YouTube image: {image_url}") + return None + headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'} response = requests.get(image_url, headers=headers, timeout=10) response.raise_for_status() @@ -1126,7 +1132,7 @@ def get_flickr_image(search_query, relevance_keywords, main_topic): return None word_count = len(text.split()) - if word_count > 5: # Lowered threshold for stricter filtering + if word_count > 5: logger.info(f"Skipping image with too much text: {image_url} ({word_count} words)") return None