add back in DDG search for flickr

my-fix-branch
Shane 7 months ago
parent 86e208c07e
commit 6376129827
  1. 280
      foodie_utils.py

@ -225,7 +225,6 @@ def get_image(search_query):
flickr_request_count += 1 flickr_request_count += 1
logging.info(f"Flickr request count: {flickr_request_count}/3600") logging.info(f"Flickr request count: {flickr_request_count}/3600")
# Enforce a minimum delay of 1 second between Flickr requests
current_time = time.time() current_time = time.time()
time_since_last_request = current_time - last_flickr_request_time time_since_last_request = current_time - last_flickr_request_time
if time_since_last_request < 1: if time_since_last_request < 1:
@ -235,7 +234,6 @@ def get_image(search_query):
headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'} headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
# Helper function to search Flickr with a given query
def search_flickr(query, per_page=20): def search_flickr(query, per_page=20):
try: try:
photos = flickr_api.Photo.search( photos = flickr_api.Photo.search(
@ -251,7 +249,14 @@ def get_image(search_query):
logging.warning(f"Flickr API error for query '{query}': {e}") logging.warning(f"Flickr API error for query '{query}': {e}")
return [] return []
# Helper function to process a photo def fetch_photo_by_id(photo_id):
try:
photo = flickr_api.Photo(id=photo_id)
return photo
except Exception as e:
logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}")
return None
def process_photo(photo): def process_photo(photo):
tags = [tag.text.lower() for tag in photo.getTags()] tags = [tag.text.lower() for tag in photo.getTags()]
title = photo.title.lower() if photo.title else "" title = photo.title.lower() if photo.title else ""
@ -320,7 +325,28 @@ def get_image(search_query):
if temp_file and os.path.exists(temp_path): if temp_file and os.path.exists(temp_path):
os.unlink(temp_path) os.unlink(temp_path)
# Helper function to classify keywords as specific or generic def search_ddg_for_flickr(query):
ddg_query = f"{query} site:flickr.com"
ddg_url = f"https://duckduckgo.com/?q={quote(ddg_query)}"
try:
response = requests.get(ddg_url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
photo_ids = set()
for link in soup.find_all('a', href=True):
href = link['href']
match = re.search(r'flickr\.com/photos/[^/]+/(\d+)', href)
if match:
photo_id = match.group(1)
photo_ids.add(photo_id)
logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
return photo_ids
except Exception as e:
logging.warning(f"DDG search failed for query '{ddg_query}': {e}")
return set()
def classify_keywords(keywords): def classify_keywords(keywords):
prompt = ( prompt = (
"Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities) or 'generic' (e.g., common or abstract terms). " "Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities) or 'generic' (e.g., common or abstract terms). "
@ -330,15 +356,14 @@ def get_image(search_query):
"```json\n" "```json\n"
"{\n" "{\n"
" \"Wingstop\": \"specific\",\n" " \"Wingstop\": \"specific\",\n"
" \"Smart\": \"generic\",\n" " \"dining\": \"generic\"\n"
" \"Kitchen\": \"generic\"\n"
"}\n```" "}\n```"
) )
try: try:
response = client.chat.completions.create( response = client.chat.completions.create(
model=LIGHT_TASK_MODEL, model=LIGHT_TASK_MODEL,
messages=[ messages=[
{"role": "system", "content": "You are a helpful assistant that classifies keywords."}, {"role": "system", "content": "You are a helper that classifies keywords."},
{"role": "user", "content": prompt} {"role": "user", "content": prompt}
], ],
max_tokens=100, max_tokens=100,
@ -356,21 +381,23 @@ def get_image(search_query):
logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.") logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
return {kw: "specific" for kw in keywords} return {kw: "specific" for kw in keywords}
# Step 1: Try the original search query on Flickr # Step 1: Search DDG to find Flickr photo IDs
logging.info(f"Searching Flickr with original query: '{search_query}'") logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
photos = search_flickr(search_query) photo_ids = search_ddg_for_flickr(search_query)
for photo in photos: if photo_ids:
result = process_photo(photo) for photo_id in photo_ids:
if result: photo = fetch_photo_by_id(photo_id)
return result if photo:
result = process_photo(photo)
# Step 2: Break down the query into keywords and classify them if result:
return result
# Step 2: Break down the query into keywords and classify them for direct Flickr API search
keywords = search_query.lower().split() keywords = search_query.lower().split()
if len(keywords) > 1: if len(keywords) > 1:
classifications = classify_keywords(keywords) classifications = classify_keywords(keywords)
logging.info(f"Keyword classifications: {classifications}") logging.info(f"Keyword classifications: {classifications}")
# Prioritize specific keywords
specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"] specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
if specific_keywords: if specific_keywords:
for keyword in specific_keywords: for keyword in specific_keywords:
@ -382,10 +409,8 @@ def get_image(search_query):
return result return result
# Step 3: Final fallback to a generic food-related query # Step 3: Final fallback to a generic food-related query
# Use a simple generic query derived from context (e.g., "food dining") logging.info(f"No results found. Falling back to generic query: 'food dining'")
fallback_query = "food dining" # This could be further contextualized if needed photos = search_flickr("food dining")
logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
photos = search_flickr(fallback_query)
for photo in photos: for photo in photos:
result = process_photo(photo) result = process_photo(photo)
if result: if result:
@ -1000,6 +1025,219 @@ def reset_flickr_request_count():
flickr_request_count = 0 flickr_request_count = 0
flickr_request_start_time = time.time() flickr_request_start_time = time.time()
def get_flickr_image(search_query, relevance_keywords):
    """Find a usable, appropriately licensed Flickr image for *search_query*.

    Strategy (first usable photo wins):
      1. DuckDuckGo search restricted to flickr.com, then fetch each photo ID.
      2. Direct Flickr API search on LLM-classified "specific" keywords.
      3. Generic fallback search built from *relevance_keywords*.

    Args:
        search_query: Free-text query describing the desired image.
        relevance_keywords: List of fallback keywords (or a pre-joined string)
            used for the final generic search.

    Returns:
        Tuple ``(img_url, source, uploader, page_url)`` on success, or
        ``(None, None, None, None)`` when no usable image is found.
    """
    global last_flickr_request_time, flickr_request_count
    reset_flickr_request_count()
    flickr_request_count += 1
    logging.info(f"Flickr request count: {flickr_request_count}/3600")
    # Enforce a minimum delay of 1 second between Flickr requests
    current_time = time.time()
    time_since_last_request = current_time - last_flickr_request_time
    if time_since_last_request < 1:
        time.sleep(1 - time_since_last_request)
    last_flickr_request_time = time.time()
    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}

    # Helper function to search Flickr with a given query
    def search_flickr(query, per_page=20):
        try:
            photos = flickr_api.Photo.search(
                text=query,
                per_page=per_page,
                sort='relevance',
                safe_search=1,
                media='photos',
                license='4,5,9,10'  # CC-BY / CC-BY-SA / public-domain-style licenses
            )
            return photos
        except Exception as e:
            logging.warning(f"Flickr API error for query '{query}': {e}")
            return []

    # Helper function to fetch a Flickr photo by ID
    def fetch_photo_by_id(photo_id):
        try:
            photo = flickr_api.Photo(id=photo_id)
            return photo
        except Exception as e:
            logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}")
            return None

    # Helper function to vet, download, OCR-filter, and record a photo.
    # Returns (img_url, "Flickr", uploader, page_url) or None to skip.
    def process_photo(photo):
        tags = [tag.text.lower() for tag in photo.getTags()]
        title = photo.title.lower() if photo.title else ""
        matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
        if matched_keywords:
            logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
            return None
        img_url = photo.getPhotoFile(size_label='Large')
        if not img_url:
            img_url = photo.getPhotoFile(size_label='Medium')
        if not img_url or img_url in used_images:
            return None
        temp_file = None
        # Pre-initialize so the `finally` block cannot raise NameError when the
        # download or the temp-file write fails before temp_path is assigned.
        temp_path = None
        try:
            img_response = requests.get(img_url, headers=headers, timeout=10)
            img_response.raise_for_status()
            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
                temp_file.write(img_response.content)
                temp_path = temp_file.name
            # Reject text-heavy images (memes, menus, infographics) via OCR.
            img = Image.open(temp_path)
            text = pytesseract.image_to_string(img)
            char_count = len(text.strip())
            logging.info(f"OCR processed {img_url}: {char_count} characters detected")
            if char_count > 200:
                logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
                return None
            uploader = photo.owner.username
            page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
            used_images.add(img_url)
            save_used_images()
            # Append an audit record (JSON Lines) for every accepted image.
            flickr_data = {
                "title": search_query,
                "image_url": img_url,
                "source": "Flickr",
                "uploader": uploader,
                "page_url": page_url,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "ocr_chars": char_count
            }
            flickr_file = "/home/shane/foodie_automator/flickr_images.json"
            with open(flickr_file, 'a') as f:
                json.dump(flickr_data, f)
                f.write('\n')
            logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
            logging.info(f"Fetched Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
            return img_url, "Flickr", uploader, page_url
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:
                logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
                return None
            else:
                logging.warning(f"Download failed for {img_url}: {e}")
                return None
        except Exception as e:
            logging.warning(f"OCR processing failed for {img_url}: {e}")
            return None
        finally:
            if temp_path and os.path.exists(temp_path):
                os.unlink(temp_path)

    # Helper function to search DDG and extract Flickr photo IDs
    def search_ddg_for_flickr(query):
        ddg_query = f"{query} site:flickr.com"
        # Use the static HTML endpoint: the main duckduckgo.com page renders
        # its results with JavaScript, so a plain requests.get against it
        # returns a shell page with no result links to parse.
        ddg_url = f"https://html.duckduckgo.com/html/?q={quote(ddg_query)}"
        try:
            response = requests.get(ddg_url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            photo_ids = set()
            # Look for Flickr photo URLs like flickr.com/photos/<user>/<id>
            for link in soup.find_all('a', href=True):
                href = link['href']
                match = re.search(r'flickr\.com/photos/[^/]+/(\d+)', href)
                if match:
                    photo_id = match.group(1)
                    photo_ids.add(photo_id)
            logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
            return photo_ids
        except Exception as e:
            logging.warning(f"DDG search failed for query '{ddg_query}': {e}")
            return set()

    # Helper function to classify keywords as specific or generic via the LLM.
    # Falls back to treating everything as "specific" on any failure.
    def classify_keywords(keywords):
        prompt = (
            "Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities) or 'generic' (e.g., common or abstract terms). "
            "Return a JSON object mapping each keyword to its classification.\n\n"
            "Keywords: " + ", ".join(keywords) + "\n\n"
            "Example output:\n"
            "```json\n"
            "{\n"
            " \"Wingstop\": \"specific\",\n"
            " \"dining\": \"generic\"\n"
            "}\n```"
        )
        try:
            response = client.chat.completions.create(
                model=LIGHT_TASK_MODEL,
                messages=[
                    {"role": "system", "content": "You are a helper that classifies keywords."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=100,
                temperature=0.5
            )
            raw_response = response.choices[0].message.content
            # The model is asked to wrap its answer in a ```json fence.
            json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response)
            if not json_match:
                logging.warning(f"Failed to parse keyword classification JSON: {raw_response}")
                return {kw: "specific" for kw in keywords}
            classifications = json.loads(json_match.group(1))
            return classifications
        except Exception as e:
            logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
            return {kw: "specific" for kw in keywords}

    # Step 1: Search DDG to find Flickr photo IDs
    logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
    photo_ids = search_ddg_for_flickr(search_query)
    if photo_ids:
        for photo_id in photo_ids:
            photo = fetch_photo_by_id(photo_id)
            if photo:
                result = process_photo(photo)
                if result:
                    return result
    # Step 2: Break down the query into keywords and classify them for direct Flickr API search
    keywords = search_query.lower().split()
    if len(keywords) > 1:
        classifications = classify_keywords(keywords)
        logging.info(f"Keyword classifications: {classifications}")
        # Prioritize specific keywords
        specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
        if specific_keywords:
            for keyword in specific_keywords:
                logging.info(f"Searching Flickr with specific keyword: '{keyword}'")
                photos = search_flickr(keyword)
                for photo in photos:
                    result = process_photo(photo)
                    if result:
                        return result
    # Step 3: Final fallback using relevance keywords
    fallback_query = " ".join(relevance_keywords) if isinstance(relevance_keywords, list) else relevance_keywords
    logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
    photos = search_flickr(fallback_query)
    for photo in photos:
        result = process_photo(photo)
        if result:
            return result
    logging.warning(f"No valid Flickr image found for query '{search_query}' after all attempts.")
    return None, None, None, None
def select_best_author(summary): def select_best_author(summary):
try: try:
response = client.chat.completions.create( response = client.chat.completions.create(

Loading…
Cancel
Save