update image flickr prompt

my-fix-branch
Shane 7 months ago
parent bdf09a3355
commit 163e50c2b3
  1. 389
      foodie_utils.py

@ -62,10 +62,16 @@ def save_json_file(filename, key, value):
# Remove duplicates by title
data = [item for item in data if item["title"] != key]
data.append(entry)
with open(filename, 'w') as f:
for item in data:
json.dump(item, f)
f.write('\n')
# Special handling for used_images.json to save as a flat list
if filename.endswith('used_images.json'):
flat_data = [item["title"] for item in data if isinstance(item, dict) and "title" in item]
with open(filename, 'w') as f:
json.dump(flat_data, f)
else:
with open(filename, 'w') as f:
for item in data:
json.dump(item, f)
f.write('\n')
logging.info(f"Saved '{key}' to {filename}")
print(f"DEBUG: Saved '{key}' to {filename}")
loaded_data = load_json_file(filename, expiration_days=PRUNE_INTERVAL_DAYS)
@ -227,98 +233,165 @@ def get_image(search_query):
last_flickr_request_time = time.time()
try:
# Try Flickr API first
photos = flickr_api.Photo.search(
text=search_query,
per_page=10,
sort='relevance',
safe_search=1,
media='photos',
license='4,5,9,10' # Commercial use licenses
)
headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
# Helper function to search Flickr with a given query
def search_flickr(query, per_page=20):
    """Search Flickr photos for *query*, returning [] when the call raises.

    Args:
        query: Free-text search string passed to Flickr as ``text``.
        per_page: Number of results requested from the API.

    Returns:
        Whatever ``flickr_api.Photo.search`` returns, or an empty list if
        the search raises any exception (the error is logged as a warning).
    """
    search_params = {
        "text": query,
        "per_page": per_page,
        "sort": 'relevance',
        "safe_search": 1,
        "media": 'photos',
        "license": '4,5,9,10',
    }
    try:
        return flickr_api.Photo.search(**search_params)
    except Exception as e:
        # Best-effort lookup: callers iterate the result, so an empty list
        # simply means "no candidates from this query".
        logging.warning(f"Flickr API error for query '{query}': {e}")
        return []
# Helper function to process a photo
def process_photo(photo):
tags = [tag.text.lower() for tag in photo.getTags()]
title = photo.title.lower() if photo.title else ""
headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
if matched_keywords:
logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
return None
for photo in photos:
# Fetch photo metadata (tags and title)
tags = [tag.text.lower() for tag in photo.getTags()]
title = photo.title.lower() if photo.title else ""
img_url = photo.getPhotoFile(size_label='Medium')
if not img_url or img_url in used_images:
return None
temp_file = None
try:
img_response = requests.get(img_url, headers=headers, timeout=10)
img_response.raise_for_status()
with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
temp_file.write(img_response.content)
temp_path = temp_file.name
img = Image.open(temp_path)
text = pytesseract.image_to_string(img)
char_count = len(text.strip())
logging.info(f"OCR processed {img_url}: {char_count} characters detected")
if char_count > 200:
logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
return None
uploader = photo.owner.username
page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
# Filter out images with unwanted keywords in tags or title
matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
if matched_keywords:
logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
continue
used_images.add(img_url)
save_used_images()
img_url = photo.getPhotoFile(size_label='Medium')
if not img_url:
continue
if img_url in used_images:
continue
flickr_data = {
"title": search_query,
"image_url": img_url,
"source": "Flickr",
"uploader": uploader,
"page_url": page_url,
"timestamp": datetime.now(timezone.utc).isoformat(),
"ocr_chars": char_count
}
flickr_file = "/home/shane/foodie_automator/flickr_images.json"
with open(flickr_file, 'a') as f:
json.dump(flickr_data, f)
f.write('\n')
logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
# Download the image and run OCR to check for excessive text
temp_file = None
try:
img_response = requests.get(img_url, headers=headers, timeout=10)
img_response.raise_for_status()
with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
temp_file.write(img_response.content)
temp_path = temp_file.name
logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
return img_url, "Flickr", uploader, page_url
img = Image.open(temp_path)
text = pytesseract.image_to_string(img)
char_count = len(text.strip())
logging.info(f"OCR processed {img_url}: {char_count} characters detected")
except requests.exceptions.HTTPError as e:
if e.response.status_code == 429:
logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
return None
else:
logging.warning(f"Download failed for {img_url}: {e}")
return None
except Exception as e:
logging.warning(f"OCR processing failed for {img_url}: {e}")
return None
finally:
if temp_file and os.path.exists(temp_path):
os.unlink(temp_path)
if char_count > 200:
logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
continue
# Helper function to classify keywords as specific or generic
def classify_keywords(keywords):
    """Classify each search keyword as 'specific' or 'generic' via the chat model.

    Sends the keywords to ``client.chat.completions.create`` (model
    ``LIGHT_TASK_MODEL``) and parses the fenced JSON object from the reply.

    Args:
        keywords: Keyword strings taken from the image search query.

    Returns:
        A dict with exactly one entry per input keyword. The value is
        "specific" or "generic"; a keyword the model omitted or labelled
        with anything else defaults to "specific". If the request fails or
        the reply cannot be parsed as a JSON object, every keyword maps to
        "specific". An empty input yields an empty dict without any request.
    """
    keywords = list(keywords)
    if not keywords:
        # Nothing to classify: skip the API round trip entirely.
        return {}

    all_specific = {kw: "specific" for kw in keywords}

    prompt = (
        "Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities) or 'generic' (e.g., common or abstract terms). "
        "Return a JSON object mapping each keyword to its classification.\n\n"
        "Keywords: " + ", ".join(keywords) + "\n\n"
        "Example output:\n"
        "```json\n"
        "{\n"
        " \"Wingstop\": \"specific\",\n"
        " \"Smart\": \"generic\",\n"
        " \"Kitchen\": \"generic\"\n"
        "}\n```"
    )
    try:
        response = client.chat.completions.create(
            model=LIGHT_TASK_MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that classifies keywords."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=100,
            temperature=0.5
        )
        raw_response = response.choices[0].message.content
        json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response)
        if not json_match:
            logging.warning(f"Failed to parse keyword classification JSON: {raw_response}")
            return all_specific

        parsed = json.loads(json_match.group(1))
        if not isinstance(parsed, dict):
            # Callers iterate ``.items()``; a list or scalar would break them.
            logging.warning(f"Failed to parse keyword classification JSON: {raw_response}")
            return all_specific

        # The model may change the case of a keyword or echo keys from the
        # prompt's example, so look results up case-insensitively and keep
        # only the keywords that were actually asked about.
        labels_by_keyword = {
            str(name).lower(): str(label).lower() for name, label in parsed.items()
        }
        classifications = {}
        for kw in keywords:
            label = labels_by_keyword.get(kw.lower())
            classifications[kw] = label if label in ("specific", "generic") else "specific"
        return classifications
    except Exception as e:
        logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
        return all_specific
uploader = photo.owner.username
page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
# Add the image URL to used_images
used_images.add(img_url)
save_used_images()
# Save Flickr image metadata
flickr_data = {
"title": search_query,
"image_url": img_url,
"source": "Flickr",
"uploader": uploader,
"page_url": page_url,
"timestamp": datetime.now(timezone.utc).isoformat(),
"ocr_chars": char_count
}
flickr_file = "/home/shane/foodie_automator/flickr_images.json"
with open(flickr_file, 'a') as f:
json.dump(flickr_data, f)
f.write('\n')
logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
return img_url, "Flickr", uploader, page_url
# Step 1: Try the original search query on Flickr
logging.info(f"Searching Flickr with original query: '{search_query}'")
photos = search_flickr(search_query)
for photo in photos:
result = process_photo(photo)
if result:
return result
except requests.exceptions.HTTPError as e:
if e.response.status_code == 429:
logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
return None, None, None, None
else:
logging.warning(f"Download failed for {img_url}: {e}")
continue
except Exception as e:
logging.warning(f"OCR processing failed for {img_url}: {e}")
continue
finally:
if temp_file and os.path.exists(temp_path):
os.unlink(temp_path)
# Step 2: Break down the query into keywords and classify them
keywords = search_query.lower().split()
if len(keywords) > 1:
classifications = classify_keywords(keywords)
logging.info(f"Keyword classifications: {classifications}")
logging.warning(f"No valid Flickr image found in fallback for query '{search_query}'. Trying Pixabay.")
except Exception as e:
logging.warning(f"Fallback Flickr API error for query '{search_query}': {e}. Falling back to Pixabay.")
# Prioritize specific keywords
specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
if specific_keywords:
for keyword in specific_keywords:
logging.info(f"Searching Flickr with specific keyword: '{keyword}'")
photos = search_flickr(keyword)
for photo in photos:
result = process_photo(photo)
if result:
return result
# Step 3: Final fallback to a generic food-related query
# Use a simple generic query derived from context (e.g., "food dining")
fallback_query = "food dining" # This could be further contextualized if needed
logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
photos = search_flickr(fallback_query)
for photo in photos:
result = process_photo(photo)
if result:
return result
logging.warning(f"No valid Flickr image found in fallback for query '{search_query}'. Trying Pixabay.")
# Fallback to Pixabay
try:
@ -334,7 +407,6 @@ def get_image(search_query):
uploader = hit.get('user', 'Unknown')
page_url = hit.get('pageURL', img_url)
# Add the image URL to used_images
used_images.add(img_url)
save_used_images()
@ -350,14 +422,18 @@ def get_image(search_query):
def generate_image_query(content):
prompt = (
"Given the following content, generate a concise image search query (max 5 words) that would likely yield relevant, visually appealing images on platforms like Flickr or Pixabay. Focus on concrete, visual concepts related to food, dining, or restaurants, avoiding overly abstract terms. Also provide relevance keywords (max 5 words) to filter results. Return the result as a JSON object with 'search' and 'relevance' keys.\n\n"
"Given the following content, generate a concise image search query (max 5 words) that would likely yield relevant, visually appealing images on platforms like Flickr or Pixabay. "
"Identify and prioritize specific entities like brand names or unique terms over abstract or generic concepts. "
"Focus on concrete, visual concepts related to food, dining, or restaurants. "
"Also provide relevance keywords (max 5 words) to filter results, using general themes related to the content. "
"Return the result as a JSON object with 'search' and 'relevance' keys.\n\n"
"Content:\n"
f"{content}\n\n"
"Example output:\n"
"```json\n"
"{\n"
" \"search\": \"modern dining trends\",\n"
" \"relevance\": \"dining habits restaurant trends\"\n"
" \"search\": \"Wingstop dining\",\n"
" \"relevance\": \"fast food dining\"\n"
"}\n```"
)
@ -379,18 +455,18 @@ def generate_image_query(content):
json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response)
if not json_match:
logging.warning(f"Failed to parse image query JSON from GPT response: {raw_response}")
return "restaurant dining", "dining trends"
return "food dining", ["dining", "trends"]
query_data = json.loads(json_match.group(1))
search_query = query_data.get("search", "restaurant dining")
relevance_keywords = query_data.get("relevance", "dining trends")
search_query = query_data.get("search", "food dining")
relevance_keywords = query_data.get("relevance", ["dining", "trends"])
logging.debug(f"Image query from content: {query_data}")
return search_query, relevance_keywords
except Exception as e:
logging.warning(f"Failed to generate image query: {e}. Using fallback.")
return "restaurant dining", "dining trends"
return "food dining", ["dining", "trends"]
def smart_image_and_filter(title, summary):
try:
@ -893,7 +969,18 @@ if os.path.exists(used_images_file):
try:
with open(used_images_file, 'r') as f:
data = json.load(f)
used_images.update(data)
# Handle malformed format (list of lists)
if isinstance(data, list) and data and isinstance(data[0], list):
logging.warning(f"Fixing malformed used_images.json format: {data[:2]}...")
flat_data = []
for item in data:
if isinstance(item, list):
flat_data.extend(item)
else:
flat_data.append(item)
used_images.update(flat_data)
else:
used_images.update(data)
logging.info(f"Loaded {len(used_images)} used image URLs from {used_images_file}")
except Exception as e:
logging.warning(f"Failed to load used images from {used_images_file}: {e}")
@ -913,117 +1000,7 @@ def reset_flickr_request_count():
flickr_request_count = 0
flickr_request_start_time = time.time()
def get_flickr_image(search_query, relevance_keywords):
    """Return the first acceptable Flickr photo for *search_query*.

    A candidate is skipped when its tags or title match ``exclude_keywords``,
    when it has no Large/Medium file URL, when the URL is already in
    ``used_images``, or when OCR finds more than 200 characters of text in
    the downloaded image. The accepted URL is added to ``used_images``,
    persisted via ``save_used_images()``, and its metadata is appended as one
    JSON line to the Flickr metadata file.

    NOTE(review): the block nesting below is reconstructed from the order of
    the ``try``/``except``/``finally`` clauses - confirm against the real file.

    Args:
        search_query: Text passed to the Flickr photo search.
        relevance_keywords: Not used by this function body; kept so the
            signature stays unchanged for callers.

    Returns:
        ``(img_url, "Flickr", uploader, page_url)`` for the accepted photo, or
        ``(None, None, None, None)`` when no photo qualifies, the image
        download returns HTTP 429, or the Flickr search itself raises.
    """
    global last_flickr_request_time, flickr_request_count

    reset_flickr_request_count()
    flickr_request_count += 1
    logging.info(f"Flickr request count: {flickr_request_count}/3600")

    # Enforce a minimum delay of 1 second between Flickr requests
    current_time = time.time()
    time_since_last_request = current_time - last_flickr_request_time
    if time_since_last_request < 1:
        time.sleep(1 - time_since_last_request)
    last_flickr_request_time = time.time()

    try:
        # Search for photos on Flickr using the API
        photos = flickr_api.Photo.search(
            text=search_query,
            per_page=10,
            sort='relevance',
            safe_search=1,
            media='photos',
            license='4,5,9,10'  # Commercial use licenses (CC BY, CC BY-SA, etc.)
        )

        headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}

        for photo in photos:
            # Fetch photo metadata (tags and title)
            tags = [tag.text.lower() for tag in photo.getTags()]
            title = photo.title.lower() if photo.title else ""

            # Filter out images with unwanted keywords in tags or title
            matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
            if matched_keywords:
                logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
                continue

            # Prefer the Large rendition, falling back to Medium
            img_url = photo.getPhotoFile(size_label='Large')
            if not img_url:
                img_url = photo.getPhotoFile(size_label='Medium')
            if not img_url:
                continue
            if img_url in used_images:
                continue

            # Download the image and run OCR to check for excessive text
            temp_path = None
            try:
                img_response = requests.get(img_url, headers=headers, timeout=10)
                img_response.raise_for_status()

                with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
                    # Record the path before writing so the cleanup below
                    # still removes the file if the write itself fails.
                    temp_path = temp_file.name
                    temp_file.write(img_response.content)

                # Close the image handle as soon as OCR is done instead of
                # leaving it open until garbage collection.
                with Image.open(temp_path) as img:
                    text = pytesseract.image_to_string(img)
                char_count = len(text.strip())
                logging.info(f"OCR processed {img_url}: {char_count} characters detected")

                if char_count > 200:
                    logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
                    continue

                uploader = photo.owner.username
                page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"

                # Add the image URL to used_images
                used_images.add(img_url)
                save_used_images()

                # Save Flickr image metadata
                flickr_data = {
                    "title": search_query,
                    "image_url": img_url,
                    "source": "Flickr",
                    "uploader": uploader,
                    "page_url": page_url,
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                    "ocr_chars": char_count
                }
                flickr_file = "/home/shane/foodie_automator/flickr_images.json"
                with open(flickr_file, 'a') as f:
                    json.dump(flickr_data, f)
                    f.write('\n')
                logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")

                logging.info(f"Fetched Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
                return img_url, "Flickr", uploader, page_url

            except requests.exceptions.HTTPError as e:
                # Guard against an HTTPError that carries no response object.
                if e.response is not None and e.response.status_code == 429:
                    logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
                    return None, None, None, None
                else:
                    logging.warning(f"Download failed for {img_url}: {e}")
                    continue
            except Exception as e:
                logging.warning(f"OCR processing failed for {img_url}: {e}")
                continue
            finally:
                # Always remove the downloaded temp file, on every exit path.
                if temp_path and os.path.exists(temp_path):
                    os.unlink(temp_path)

        logging.warning(f"No valid Flickr image found for query '{search_query}'.")
        return None, None, None, None

    except Exception as e:
        logging.warning(f"Flickr API error for query '{search_query}': {e}. Falling back to Pixabay.")
        return None, None, None, None
if keyword in ['smart', 'ai', 'ai-powered', 'kitchen', 'dining', 'experience']:
def select_best_author(summary):
try:

Loading…
Cancel
Save