From eeff0d986180a2b47922b399ec6e935db452b1c5 Mon Sep 17 00:00:00 2001
From: Shane
Date: Sun, 11 May 2025 21:54:33 +1000
Subject: [PATCH] watermark detection

---
Review notes (text between "---" and the diffstat is not part of the commit):

* get_image(): the nested process_image() returned `img_url` and `uploader`,
  which are locals of the sibling fetch_pixabay_image() closure. Unless
  module-level globals with those names exist, that return raises NameError,
  the surrounding `except Exception` swallows it, and every Pixabay hit is
  rejected. process_image() now returns (width, height) and
  fetch_pixabay_image() builds the (url, source, uploader, page) tuple itself.
* The "Selected Pixabay image" log derived a resolution by splitting the image
  URL on 'x'; it now logs the real width and height.
* NOTE(review): line breaks, indentation and blank context lines were
  reconstructed from a whitespace-collapsed copy of this patch, and hunk
  counts/offsets were recomputed from the lines below. Verify the context
  against foodie_utils.py (or apply with --ignore-whitespace); the index
  hashes predate this amendment.

 foodie_utils.py | 118 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 112 insertions(+), 6 deletions(-)

diff --git a/foodie_utils.py b/foodie_utils.py
index 071a4e0..905a196 100644
--- a/foodie_utils.py
+++ b/foodie_utils.py
@@ -1086,7 +1086,7 @@ def get_flickr_image(search_query, relevance_keywords, main_topic):
     logger = logging.getLogger(__name__)
 
     def process_image(image_url, source_name, page_url):
-        """Download image, check for text with OCR, validate resolution, and exclude screenshots."""
+        """Download image, check for text with OCR, validate resolution, exclude screenshots and watermarks."""
         try:
             headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
             response = requests.get(image_url, headers=headers, timeout=10)
@@ -1095,7 +1095,7 @@ def get_flickr_image(search_query, relevance_keywords, main_topic):
 
             # Check image resolution
             width, height = img.size
-            min_dimension = 1280  # Minimum width or height for high quality
+            min_dimension = 1280
             if width < min_dimension and height < min_dimension:
                 logger.info(f"Skipping low-resolution image: {image_url} ({width}x{height})")
                 return None
@@ -1106,10 +1106,28 @@ def get_flickr_image(search_query, relevance_keywords, main_topic):
                 logger.info(f"Skipping potential screenshot: {image_url} (aspect ratio: {aspect_ratio})")
                 return None
 
-            # OCR to detect text (unchanged)
-            text = pytesseract.image_to_string(img).strip()
+            # Check for watermarks in URL or page URL
+            watermark_domains = [
+                'shutterstock.com', 'gettyimages.com', 'istockphoto.com', 'adobestock.com',
+                '123rf.com', 'dreamstime.com', 'alamy.com', 'stock.adobe.com'
+            ]
+            if any(domain in image_url.lower() or domain in page_url.lower() for domain in watermark_domains):
+                logger.info(f"Skipping image from stock photo site (potential watermark): {image_url}")
+                return None
+
+            # OCR to detect text and watermarks
+            text = pytesseract.image_to_string(img).strip().lower()
+            watermark_phrases = [
+                'shutterstock', 'getty images', 'istock', 'adobe stock', 'watermark',
+                '123rf', 'dreamstime', 'alamy', 'preview', 'stock photo'
+            ]
+            if any(phrase in text for phrase in watermark_phrases):
+                logger.info(f"Skipping watermarked image: {image_url} (detected: {text})")
+                return None
+
+
             word_count = len(text.split())
-            if word_count > 10:
+            if word_count > 5:  # Lowered threshold for stricter filtering
                 logger.info(f"Skipping image with too much text: {image_url} ({word_count} words)")
                 return None
 
@@ -1131,7 +1149,7 @@ def get_flickr_image(search_query, relevance_keywords, main_topic):
     logger.info(f"Searching DDG with query: '{ddg_query}'")
     try:
         with DDGS() as ddgs:
-            results = ddgs.images(ddg_query, safesearch="on", max_results=20)  # Increased to 20 for more options
+            results = ddgs.images(ddg_query, safesearch="on", max_results=20)
             for result in results:
                 image_url = result.get("image")
                 page_url = result.get("url")
@@ -1159,6 +1177,94 @@ def get_flickr_image(search_query, relevance_keywords, main_topic):
     logger.warning(f"No valid images found for query '{search_query}'")
     return None, None, None, None
 
+
+def get_image(search_query):
+    """Return a Pixabay image as (image_url, source_name, uploader, page_url).
+
+    Tries search_query first, then the generic query "food dining"; returns
+    (None, None, None, None) when no hit passes the checks.
+    """
+    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
+
+    def process_image(image_url):
+        """Return (width, height) if the image passes the resolution and OCR checks, else None."""
+        try:
+            response = requests.get(image_url, headers=headers, timeout=10)
+            response.raise_for_status()
+            img = Image.open(io.BytesIO(response.content))
+
+            # Check resolution
+            width, height = img.size
+            min_dimension = 1280
+            if width < min_dimension and height < min_dimension:
+                logger.info(f"Skipping low-resolution Pixabay image: {image_url} ({width}x{height})")
+                return None
+
+            # Check for watermarks via OCR
+            text = pytesseract.image_to_string(img).strip().lower()
+            watermark_phrases = [
+                'shutterstock', 'getty images', 'istock', 'adobe stock', 'watermark',
+                '123rf', 'dreamstime', 'alamy', 'preview', 'stock photo'
+            ]
+            if any(phrase in text for phrase in watermark_phrases):
+                logger.info(f"Skipping watermarked Pixabay image: {image_url} (detected: {text})")
+                return None
+
+            word_count = len(text.split())
+            if word_count > 5:
+                logger.info(f"Skipping Pixabay image with too much text: {image_url} ({word_count} words)")
+                return None
+
+            return width, height
+        except Exception as e:
+            logger.warning(f"Failed to process Pixabay image {image_url}: {e}")
+            return None
+
+    def fetch_pixabay_image(query):
+        try:
+            pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(query)}&image_type=photo&per_page=20"
+            response = requests.get(pixabay_url, headers=headers, timeout=10)
+            response.raise_for_status()
+            data = response.json()
+
+            for hit in data.get('hits', []):
+                img_url = hit.get('largeImageURL')
+                if not img_url or img_url in used_images:
+                    continue
+
+                uploader = hit.get('user', 'Unknown')
+                page_url = hit.get('pageURL', img_url)
+
+                # process_image yields the pixel size only for images that pass every check
+                size = process_image(img_url)
+                if size:
+                    width, height = size
+                    used_images.add(img_url)
+                    save_used_images()
+                    logger.info(f"Selected Pixabay image: {img_url} by {uploader} for query '{query}' ({width}x{height})")
+                    return img_url, "Pixabay", uploader, page_url
+
+            logger.info(f"No valid Pixabay image found for query '{query}'. Trying fallback query.")
+            return None, None, None, None
+
+        except Exception as e:
+            logger.warning(f"Pixabay image fetch failed for query '{query}': {e}")
+            return None, None, None, None
+
+    # Try with the original query
+    image_url, source_name, uploader, page_url = fetch_pixabay_image(search_query)
+    if image_url:
+        return image_url, source_name, uploader, page_url
+
+    # Fallback to a generic query
+    fallback_query = "food dining"
+    image_url, source_name, uploader, page_url = fetch_pixabay_image(fallback_query)
+    if image_url:
+        return image_url, source_name, uploader, page_url
+
+    logger.error(f"All image fetch attempts failed for query '{search_query}'. Returning None.")
+    return None, None, None, None
+
 
 def select_best_author(content, interest_score):
     try: