OCR images url filter
This commit is contained in:
@@ -1177,6 +1177,12 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
|
|||||||
logger.info(f"Skipping watermarked image: {image_url} (detected: {text})")
|
logger.info(f"Skipping watermarked image: {image_url} (detected: {text})")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Check for website URLs in the image text
|
||||||
|
# Regex for a website URL embedded in OCR text: optional scheme, optional
# "www.", a host label, one of a fixed set of TLDs, and an optional path.
# The hyphen is placed LAST in the path character class so it is a literal;
# written as `[\w-./...]` Python's `re` raises "bad character range \w-.".
# The `\b` after the TLD keeps ordinary words such as "welcome.community"
# from being matched as "welcome.com".
url_pattern = r'(?:https?://)?(?:www\.)?[\w-]+\.(?:com|org|net|edu|gov|co\.uk|io)\b(?:/[\w./?%&=-]*)?'
|
||||||
|
if re.search(url_pattern, text):
|
||||||
|
logger.info(f"Skipping image with embedded website URL: {image_url} (detected: {text})")
|
||||||
|
return None
|
||||||
|
|
||||||
word_count = len(text.split())
|
word_count = len(text.split())
|
||||||
if word_count > 5:
|
if word_count > 5:
|
||||||
logger.info(f"Skipping image with too much text: {image_url} ({word_count} words)")
|
logger.info(f"Skipping image with too much text: {image_url} ({word_count} words)")
|
||||||
|
|||||||
Reference in New Issue
Block a user