stop using flickr API for images

main
Shane 7 months ago
parent 7c69b4a451
commit 071726f016
      foodie_utils.py

@@ -1077,72 +1077,66 @@ def classify_keywords(keywords):
     return {kw: "specific" for kw in keywords}

 def get_flickr_image(search_query, relevance_keywords, main_topic):
-    global last_flickr_request_time, flickr_request_count
-    reset_flickr_request_count()
-    flickr_request_count += 1
-    logging.info(f"Flickr request count: {flickr_request_count}/3600")
-    current_time = time.time()
-    time_since_last_request = current_time - last_flickr_request_time
-    if time_since_last_request < 10:
-        time.sleep(10 - time_since_last_request)
-    last_flickr_request_time = time.time()
-
-    # Step 1: Search Flickr directly with the original query
-    logging.info(f"Searching Flickr directly with query: '{search_query}'")
-    photos = search_flickr(search_query)
-    for photo in photos:
-        result = process_photo(photo, search_query)
-        if result:
-            return result
-
-    # Step 2: Search DDG to find Flickr photo IDs
-    logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
-    photo_ids = search_ddg_for_flickr(search_query)
-    if photo_ids:
-        for photo_id in photo_ids:
-            photo = fetch_photo_by_id(photo_id)
-            if photo:
-                result = process_photo(photo, search_query)
-                if result:
-                    return result
-
-    # Step 3: Break down the query into keywords and classify them
-    keywords = search_query.lower().split()
-    if len(keywords) > 1:
-        classifications = classify_keywords(keywords)
-        logging.info(f"Keyword classifications: {classifications}")
-
-        specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
-        if specific_keywords:
-            for keyword in specific_keywords:
-                logging.info(f"Searching Flickr with specific keyword: '{keyword}'")
-                photos = search_flickr(keyword)
-                for photo in photos:
-                    result = process_photo(photo, search_query)
-                    if result:
-                        return result
-
-    # Step 4: Fallback using main topic
-    logging.info(f"No results found. Falling back to main topic: '{main_topic}'")
-    photos = search_flickr(main_topic)
-    for photo in photos:
-        result = process_photo(photo, main_topic)
-        if result:
-            return result
-
-    # Step 5: Final fallback using relevance keywords
-    fallback_query = " ".join(relevance_keywords) if isinstance(relevance_keywords, list) else relevance_keywords
-    logging.info(f"No results with main topic. Falling back to relevance keywords: '{fallback_query}'")
-    photos = search_flickr(fallback_query)
-    for photo in photos:
-        result = process_photo(photo, search_query)
-        if result:
-            return result
-
-    logging.warning(f"No valid Flickr image found for query '{search_query}' after all attempts.")
+    global used_images
+    logger = logging.getLogger(__name__)
+
+    def process_image(image_url, source_name, page_url):
+        """Download image, check for text with OCR, and validate."""
+        try:
+            headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
+            response = requests.get(image_url, headers=headers, timeout=10)
+            response.raise_for_status()
+            img = Image.open(io.BytesIO(response.content))
+
+            # OCR to detect text
+            text = pytesseract.image_to_string(img).strip()
+            word_count = len(text.split())
+            if word_count > 10:
+                logger.info(f"Skipping image with too much text: {image_url} ({word_count} words)")
+                return None
+
+            if image_url in used_images:
+                logger.info(f"Image already used: {image_url}")
+                return None
+
+            used_images.add(image_url)
+            save_used_images()
+            uploader = "Unknown"  # Most public domain sources don't provide uploader
+            logger.info(f"Selected image: {image_url} from {source_name}")
+            return image_url, source_name, uploader, page_url
+        except Exception as e:
+            logger.warning(f"Failed to process image {image_url}: {e}")
+            return None
+
+    # Step 1: Search DDG for public domain images
+    ddg_query = f"{search_query} license:public domain"
+    logger.info(f"Searching DDG with query: '{ddg_query}'")
+    try:
+        with DDGS() as ddgs:
+            results = ddgs.images(ddg_query, safesearch="on", max_results=10)
+            for result in results:
+                image_url = result.get("image")
+                page_url = result.get("url")
+                # Extract domain as source_name (e.g., unsplash.com -> Unsplash)
+                source_match = re.search(r'https?://(?:www\.)?([^/]+)', page_url)
+                source_name = source_match.group(1).capitalize() if source_match else "Public Domain"
+                if image_url and image_url.endswith(('.jpg', '.jpeg', '.png')):
+                    result = process_image(image_url, source_name, page_url)
+                    if result:
+                        return result
+    except Exception as e:
+        logger.warning(f"DDG search failed for '{ddg_query}': {e}")
+
+    # Step 2: Fallback to Pixabay
+    logger.info(f"No valid DDG images, falling back to Pixabay for '{search_query}'")
+    image_url, source_name, uploader, page_url = get_image(search_query)
+    if image_url:
+        used_images.add(image_url)
+        save_used_images()
+        logger.info(f"Selected Pixabay image: {image_url}")
+        return image_url, source_name, uploader, page_url
+
+    logger.warning(f"No valid images found for query '{search_query}'")
     return None, None, None, None

 def select_best_author(content, interest_score):
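
For reference, here is a minimal standalone sketch of the lookup path this commit switches to (DDG public-domain image search plus an OCR text filter). It assumes the duckduckgo_search, requests, Pillow, and pytesseract packages; the function name find_public_domain_image is illustrative only, and the repo-specific pieces (the used_images dedup set, save_used_images, and the Pixabay get_image fallback) are deliberately left out.

import io
import re
import logging

import requests
import pytesseract
from PIL import Image
from duckduckgo_search import DDGS

logger = logging.getLogger(__name__)

def find_public_domain_image(search_query, max_results=10):
    """Return (image_url, source_name, page_url) for the first usable hit, else None."""
    ddg_query = f"{search_query} license:public domain"
    with DDGS() as ddgs:
        for hit in ddgs.images(ddg_query, safesearch="on", max_results=max_results):
            image_url = hit.get("image")
            page_url = hit.get("url")
            if not (image_url and image_url.endswith((".jpg", ".jpeg", ".png"))):
                continue
            try:
                # Download the candidate and open it with Pillow.
                response = requests.get(image_url, timeout=10)
                response.raise_for_status()
                img = Image.open(io.BytesIO(response.content))
            except Exception as e:
                logger.warning(f"Failed to fetch {image_url}: {e}")
                continue
            # Skip images that are mostly text (menus, infographics, memes).
            # Note: pytesseract needs the tesseract binary installed.
            if len(pytesseract.image_to_string(img).split()) > 10:
                continue
            # Use the hosting domain as the attribution source name.
            match = re.search(r"https?://(?:www\.)?([^/]+)", page_url or "")
            source_name = match.group(1).capitalize() if match else "Public Domain"
            return image_url, source_name, page_url
    return None

Callers that unpack four values from get_flickr_image still go through that wrapper unchanged; this sketch only isolates the search-and-filter step.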
