Minimum Resolution Filter 1280px
This commit is contained in:
+19
-9
@@ -1086,14 +1086,27 @@ def get_flickr_image(search_query, relevance_keywords, main_topic):
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def process_image(image_url, source_name, page_url):
|
def process_image(image_url, source_name, page_url):
|
||||||
"""Download image, check for text with OCR, and validate."""
|
"""Download image, check for text with OCR, validate resolution, and exclude screenshots."""
|
||||||
try:
|
try:
|
||||||
headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
|
headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
|
||||||
response = requests.get(image_url, headers=headers, timeout=10)
|
response = requests.get(image_url, headers=headers, timeout=10)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
img = Image.open(io.BytesIO(response.content))
|
img = Image.open(io.BytesIO(response.content))
|
||||||
|
|
||||||
# OCR to detect text
|
# Check image resolution
|
||||||
|
width, height = img.size
|
||||||
|
min_dimension = 1280 # Minimum width or height for high quality
|
||||||
|
if width < min_dimension and height < min_dimension:
|
||||||
|
logger.info(f"Skipping low-resolution image: {image_url} ({width}x{height})")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Attempt to detect screenshots via aspect ratio or naming
|
||||||
|
aspect_ratio = width / height
|
||||||
|
if (0.9 <= aspect_ratio <= 1.1) or "screenshot" in image_url.lower():
|
||||||
|
logger.info(f"Skipping potential screenshot: {image_url} (aspect ratio: {aspect_ratio})")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# OCR to detect text (unchanged)
|
||||||
text = pytesseract.image_to_string(img).strip()
|
text = pytesseract.image_to_string(img).strip()
|
||||||
word_count = len(text.split())
|
word_count = len(text.split())
|
||||||
if word_count > 10:
|
if word_count > 10:
|
||||||
@@ -1107,7 +1120,7 @@ def get_flickr_image(search_query, relevance_keywords, main_topic):
|
|||||||
used_images.add(image_url)
|
used_images.add(image_url)
|
||||||
save_used_images()
|
save_used_images()
|
||||||
uploader = "Unknown"
|
uploader = "Unknown"
|
||||||
logger.info(f"Selected image: {image_url} from {source_name}")
|
logger.info(f"Selected image: {image_url} from {source_name} ({width}x{height})")
|
||||||
return image_url, source_name, uploader, page_url
|
return image_url, source_name, uploader, page_url
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Failed to process image {image_url}: {e}")
|
logger.warning(f"Failed to process image {image_url}: {e}")
|
||||||
@@ -1118,16 +1131,14 @@ def get_flickr_image(search_query, relevance_keywords, main_topic):
|
|||||||
logger.info(f"Searching DDG with query: '{ddg_query}'")
|
logger.info(f"Searching DDG with query: '{ddg_query}'")
|
||||||
try:
|
try:
|
||||||
with DDGS() as ddgs:
|
with DDGS() as ddgs:
|
||||||
results = ddgs.images(ddg_query, safesearch="on", max_results=10)
|
results = ddgs.images(ddg_query, safesearch="on", max_results=20) # Increased to 20 for more options
|
||||||
for result in results:
|
for result in results:
|
||||||
image_url = result.get("image")
|
image_url = result.get("image")
|
||||||
page_url = result.get("url")
|
page_url = result.get("url")
|
||||||
# Extract domain and remove top-level domain (e.g., .cn, .com)
|
|
||||||
source_match = re.search(r'https?://(?:www\.)?([^/]+)', page_url)
|
source_match = re.search(r'https?://(?:www\.)?([^/]+)', page_url)
|
||||||
if source_match:
|
if source_match:
|
||||||
domain = source_match.group(1) # e.g., shine.cn
|
domain = source_match.group(1)
|
||||||
# Split on last dot and take the first part, then capitalize
|
source_name = domain.rsplit('.', 1)[0].capitalize()
|
||||||
source_name = domain.rsplit('.', 1)[0].capitalize() # e.g., Shine
|
|
||||||
else:
|
else:
|
||||||
source_name = "Public Domain"
|
source_name = "Public Domain"
|
||||||
if image_url and image_url.endswith(('.jpg', '.jpeg', '.png')):
|
if image_url and image_url.endswith(('.jpg', '.jpeg', '.png')):
|
||||||
@@ -1144,7 +1155,6 @@ def get_flickr_image(search_query, relevance_keywords, main_topic):
|
|||||||
used_images.add(image_url)
|
used_images.add(image_url)
|
||||||
save_used_images()
|
save_used_images()
|
||||||
logger.info(f"Selected Pixabay image: {image_url}")
|
logger.info(f"Selected Pixabay image: {image_url}")
|
||||||
# For Pixabay, source_name is already set to "Pixabay", which is fine
|
|
||||||
return image_url, source_name, uploader, page_url
|
return image_url, source_name, uploader, page_url
|
||||||
|
|
||||||
logger.warning(f"No valid images found for query '{search_query}'")
|
logger.warning(f"No valid images found for query '{search_query}'")
|
||||||
|
|||||||
Reference in New Issue
Block a user