|
|
|
|
@ -1086,7 +1086,7 @@ def get_flickr_image(search_query, relevance_keywords, main_topic): |
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
|
def process_image(image_url, source_name, page_url): |
|
|
|
|
"""Download image, check for text with OCR, validate resolution, and exclude screenshots.""" |
|
|
|
|
"""Download image, check for text with OCR, validate resolution, exclude screenshots and watermarks.""" |
|
|
|
|
try: |
|
|
|
|
headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'} |
|
|
|
|
response = requests.get(image_url, headers=headers, timeout=10) |
|
|
|
|
@ -1095,7 +1095,7 @@ def get_flickr_image(search_query, relevance_keywords, main_topic): |
|
|
|
|
|
|
|
|
|
# Check image resolution |
|
|
|
|
width, height = img.size |
|
|
|
|
min_dimension = 1280 # Minimum width or height for high quality |
|
|
|
|
min_dimension = 1280 |
|
|
|
|
if width < min_dimension and height < min_dimension: |
|
|
|
|
logger.info(f"Skipping low-resolution image: {image_url} ({width}x{height})") |
|
|
|
|
return None |
|
|
|
|
@ -1106,10 +1106,27 @@ def get_flickr_image(search_query, relevance_keywords, main_topic): |
|
|
|
|
logger.info(f"Skipping potential screenshot: {image_url} (aspect ratio: {aspect_ratio})") |
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
# OCR to detect text (unchanged) |
|
|
|
|
text = pytesseract.image_to_string(img).strip() |
|
|
|
|
# Check for watermarks in URL or page URL |
|
|
|
|
watermark_domains = [ |
|
|
|
|
'shutterstock.com', 'gettyimages.com', 'istockphoto.com', 'adobestock.com', |
|
|
|
|
'123rf.com', 'dreamstime.com', 'alamy.com', 'stock.adobe.com' |
|
|
|
|
] |
|
|
|
|
if any(domain in image_url.lower() or domain in page_url.lower() for domain in watermark_domains): |
|
|
|
|
logger.info(f"Skipping image from stock photo site (potential watermark): {image_url}") |
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
# OCR to detect text and watermarks |
|
|
|
|
text = pytesseract.image_to_string(img).strip().lower() |
|
|
|
|
watermark_phrases = [ |
|
|
|
|
'shutterstock', 'getty images', 'istock', 'adobe stock', 'watermark', |
|
|
|
|
'123rf', 'dreamstime', 'alamy', 'preview', 'stock photo' |
|
|
|
|
] |
|
|
|
|
if any(phrase in text for phrase in watermark_phrases): |
|
|
|
|
logger.info(f"Skipping watermarked image: {image_url} (detected: {text})") |
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
word_count = len(text.split()) |
|
|
|
|
if word_count > 10: |
|
|
|
|
if word_count > 5: # Lowered threshold for stricter filtering |
|
|
|
|
logger.info(f"Skipping image with too much text: {image_url} ({word_count} words)") |
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
@ -1131,7 +1148,7 @@ def get_flickr_image(search_query, relevance_keywords, main_topic): |
|
|
|
|
logger.info(f"Searching DDG with query: '{ddg_query}'") |
|
|
|
|
try: |
|
|
|
|
with DDGS() as ddgs: |
|
|
|
|
results = ddgs.images(ddg_query, safesearch="on", max_results=20) # Increased to 20 for more options |
|
|
|
|
results = ddgs.images(ddg_query, safesearch="on", max_results=20) |
|
|
|
|
for result in results: |
|
|
|
|
image_url = result.get("image") |
|
|
|
|
page_url = result.get("url") |
|
|
|
|
@ -1159,6 +1176,87 @@ def get_flickr_image(search_query, relevance_keywords, main_topic): |
|
|
|
|
|
|
|
|
|
logger.warning(f"No valid images found for query '{search_query}'") |
|
|
|
|
return None, None, None, None |
|
|
|
|
|
|
|
|
|
def get_image(search_query):
    """Fetch a high-quality, watermark-free image from Pixabay for *search_query*.

    Tries the original query first, then falls back to a generic "food dining"
    query so callers are never left without a candidate unless everything fails.

    Returns:
        tuple: (image_url, source_name, uploader, page_url) on success, or
        (None, None, None, None) when no usable image is found.
    """
    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}

    def process_image(image_url, source_name, page_url):
        """Validate one Pixabay candidate: download it, then reject low
        resolution, stock-agency watermark text, or text-heavy images.

        Returns:
            tuple | None: (width, height) when the image passes every check,
            otherwise None. (Fix: the previous version returned names that
            were undefined in this scope, raising NameError on success.)
        """
        try:
            response = requests.get(image_url, headers=headers, timeout=10)
            response.raise_for_status()
            img = Image.open(io.BytesIO(response.content))

            # Require at least one dimension >= 1280 px for high quality.
            width, height = img.size
            min_dimension = 1280
            if width < min_dimension and height < min_dimension:
                logger.info(f"Skipping low-resolution Pixabay image: {image_url} ({width}x{height})")
                return None

            # OCR the image; stock-agency phrases indicate a watermark overlay.
            text = pytesseract.image_to_string(img).strip().lower()
            watermark_phrases = [
                'shutterstock', 'getty images', 'istock', 'adobe stock', 'watermark',
                '123rf', 'dreamstime', 'alamy', 'preview', 'stock photo'
            ]
            if any(phrase in text for phrase in watermark_phrases):
                logger.info(f"Skipping watermarked Pixabay image: {image_url} (detected: {text})")
                return None

            # Reject images dominated by text (likely memes or infographics).
            word_count = len(text.split())
            if word_count > 5:
                logger.info(f"Skipping Pixabay image with too much text: {image_url} ({word_count} words)")
                return None

            return width, height
        except Exception as e:
            # Best-effort: a failed candidate is skipped, not fatal.
            logger.warning(f"Failed to process Pixabay image {image_url}: {e}")
            return None

    def fetch_pixabay_image(query):
        """Query the Pixabay API and return the first hit that passes
        process_image(), as (image_url, source_name, uploader, page_url);
        (None, None, None, None) when no hit qualifies or the request fails.
        """
        try:
            pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(query)}&image_type=photo&per_page=20"
            response = requests.get(pixabay_url, headers=headers, timeout=10)
            response.raise_for_status()
            data = response.json()

            for hit in data.get('hits', []):
                img_url = hit.get('largeImageURL')
                # Skip hits without a large image and ones already used before.
                if not img_url or img_url in used_images:
                    continue

                uploader = hit.get('user', 'Unknown')
                page_url = hit.get('pageURL', img_url)

                # Validate resolution / watermark / text content.
                dimensions = process_image(img_url, "Pixabay", page_url)
                if dimensions:
                    used_images.add(img_url)
                    save_used_images()
                    width, height = dimensions
                    # Fix: log the real pixel dimensions instead of splitting
                    # the URL string on 'x', which produced garbage.
                    logger.info(f"Selected Pixabay image: {img_url} by {uploader} for query '{query}' ({width}x{height})")
                    return img_url, "Pixabay", uploader, page_url

            logger.info(f"No valid Pixabay image found for query '{query}'. Trying fallback query.")
            return None, None, None, None

        except Exception as e:
            logger.warning(f"Pixabay image fetch failed for query '{query}': {e}")
            return None, None, None, None

    # Try with the original query
    image_url, source_name, uploader, page_url = fetch_pixabay_image(search_query)
    if image_url:
        return image_url, source_name, uploader, page_url

    # Fallback to a generic query
    fallback_query = "food dining"
    image_url, source_name, uploader, page_url = fetch_pixabay_image(fallback_query)
    if image_url:
        return image_url, source_name, uploader, page_url

    logger.error(f"All image fetch attempts failed for query '{search_query}'. Returning None.")
    return None, None, None, None
|
|
|
|
|
|
|
|
|
def select_best_author(content, interest_score): |
|
|
|
|
try: |
|
|
|
|
|