diff --git a/foodie_utils.py b/foodie_utils.py index 7cfbca8..874470b 100644 --- a/foodie_utils.py +++ b/foodie_utils.py @@ -1177,6 +1177,12 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term logger.info(f"Skipping watermarked image: {image_url} (detected: {text})") return None + # Check for website URLs in the image text + url_pattern = r'(?:https?://)?(?:www\.)?[\w-]+\.(?:com|org|net|edu|gov|co\.uk|io)(?:/[\w-./?%&=]*)?' + if re.search(url_pattern, text): + logger.info(f"Skipping image with embedded website URL: {image_url} (detected: {text})") + return None + word_count = len(text.split()) if word_count > 5: logger.info(f"Skipping image with too much text: {image_url} ({word_count} words)")