OCR images url filter

9 months ago · 5f38374abd
parent 6e0f8b4759
commit 5f38374abd
1 changed file with 6 additions and 0 deletions
--- a/foodie_utils.py
+++ b/foodie_utils.py
@@ -1177,6 +1177,12 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
                logger.info(f"Skipping watermarked image: {image_url} (detected: {text})")
                return None
            
+            # Check for website URLs in the image text
+            url_pattern = r'(?:https?://)?(?:www\.)?[\w-]+\.(?:com|org|net|edu|gov|co\.uk|io)(?:/[\w-./?%&=]*)?'
+            if re.search(url_pattern, text):
+                logger.info(f"Skipping image with embedded website URL: {image_url} (detected: {text})")
+                return None
+            
            word_count = len(text.split())
            if word_count > 5:
                logger.info(f"Skipping image with too much text: {image_url} ({word_count} words)")