Compare commits

..

9 Commits

Author SHA1 Message Date
Shane 427a5cb919 update whole file 2025-05-03 15:53:22 +10:00
Shane 6d945dae67 fix 2025-05-03 15:21:08 +10:00
Shane 1fd1ad361b Removed Redundant process_photo 2025-05-03 15:05:16 +10:00
Shane a5182bdfb9 fix 2025-05-03 14:49:06 +10:00
Shane be6514e4e3 fix 2025-05-03 14:35:07 +10:00
Shane c936555741 fix flickr image large issue 2025-05-03 14:22:46 +10:00
Shane cdc54f3f14 remove cta import 2025-05-03 14:06:58 +10:00
Shane aabc989e1c remove cta 2025-05-03 13:59:01 +10:00
Shane b025afe9f3 Revert "remove double cta"
This reverts commit e2c47a1a05.
2025-05-03 13:58:44 +10:00
4 changed files with 199 additions and 376 deletions
+2 -14
View File
@@ -256,9 +256,6 @@ def curate_from_google_trends(geo_list=['US']):
if not image_url: if not image_url:
image_url, image_source, uploader, page_url = get_image(image_query) image_url, image_source, uploader, page_url = get_image(image_query)
# Log the fetched image details
logging.info(f"Fetched image for '{post_data['title']}': URL={image_url}, Source={image_source}, Uploader={uploader}, Page URL={page_url}")
hook = get_dynamic_hook(post_data["title"]).strip() hook = get_dynamic_hook(post_data["title"]).strip()
# Generate viral share prompt # Generate viral share prompt
@@ -294,7 +291,8 @@ def curate_from_google_trends(geo_list=['US']):
share_text_encoded = quote(share_text) share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url) post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded) share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
is_posting = True is_posting = True
try: try:
post_to_wp( post_to_wp(
@@ -320,16 +318,6 @@ def curate_from_google_trends(geo_list=['US']):
logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}") logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
if image_url: if image_url:
# Check if image is already used
used_images_list = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
used_image_urls = {entry["title"] for entry in used_images_list}
if image_url in used_image_urls:
logging.warning(f"Image '{image_url}' already used, attempting to fetch a new image")
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
if not image_url:
image_url, image_source, uploader, page_url = get_image(image_query)
logging.info(f"New image fetched: URL={image_url}, Source={image_source}, Uploader={uploader}, Page URL={page_url}")
save_json_file(USED_IMAGES_FILE, image_url, timestamp) save_json_file(USED_IMAGES_FILE, image_url, timestamp)
used_images.add(image_url) used_images.add(image_url)
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}") logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
+5 -16
View File
@@ -211,7 +211,7 @@ def curate_from_reddit():
if not articles: if not articles:
print("No Reddit posts available") print("No Reddit posts available")
logging.info("No Reddit posts available") logging.info("No Reddit posts available")
return None, None, random.randint(600, 1800) return None, None, None
articles.sort(key=lambda x: x["upvotes"], reverse=True) articles.sort(key=lambda x: x["upvotes"], reverse=True)
@@ -299,10 +299,8 @@ def curate_from_reddit():
if not image_url: if not image_url:
image_url, image_source, uploader, page_url = get_image(image_query) image_url, image_source, uploader, page_url = get_image(image_query)
# Log the fetched image details
logging.info(f"Fetched image for '{post_data['title']}': URL={image_url}, Source={image_source}, Uploader={uploader}, Page URL={page_url}")
hook = get_dynamic_hook(post_data["title"]).strip() hook = get_dynamic_hook(post_data["title"]).strip()
# Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=None)
# Generate viral share prompt # Generate viral share prompt
share_prompt = get_viral_share_prompt(post_data["title"], final_summary) share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
@@ -311,7 +309,7 @@ def curate_from_reddit():
f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> ' f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>' f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
) )
post_data["content"] = f"{final_summary}\n\n{share_links_template}" post_data["content"] = f"{final_summary}\n\n{share_links_template}" # Removed cta from content
global is_posting global is_posting
is_posting = True is_posting = True
@@ -337,7 +335,8 @@ def curate_from_reddit():
share_text_encoded = quote(share_text) share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url) post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded) share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
is_posting = True is_posting = True
try: try:
post_to_wp( post_to_wp(
@@ -363,16 +362,6 @@ def curate_from_reddit():
logging.info(f"Successfully saved '{raw_title}' to {POSTED_TITLES_FILE} with timestamp {timestamp}") logging.info(f"Successfully saved '{raw_title}' to {POSTED_TITLES_FILE} with timestamp {timestamp}")
if image_url: if image_url:
# Check if image is already used
used_images_list = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
used_image_urls = {entry["title"] for entry in used_images_list}
if image_url in used_image_urls:
logging.warning(f"Image '{image_url}' already used, attempting to fetch a new image")
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
if not image_url:
image_url, image_source, uploader, page_url = get_image(image_query)
logging.info(f"New image fetched: URL={image_url}, Source={image_source}, Uploader={uploader}, Page URL={page_url}")
save_json_file(USED_IMAGES_FILE, image_url, timestamp) save_json_file(USED_IMAGES_FILE, image_url, timestamp)
used_images.add(image_url) used_images.add(image_url)
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE} with timestamp {timestamp}") logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE} with timestamp {timestamp}")
+8 -19
View File
@@ -32,10 +32,6 @@ from dotenv import load_dotenv
load_dotenv() load_dotenv()
# Log script version to ensure it's the latest
SCRIPT_VERSION = "1.2.0"
logging.info(f"Starting foodie_automator_rss.py version {SCRIPT_VERSION}")
is_posting = False is_posting = False
def signal_handler(sig, frame): def signal_handler(sig, frame):
@@ -273,10 +269,12 @@ def curate_from_rss():
# Fetch image # Fetch image
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords) image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
if not image_url: if not image_url:
logging.info(f"Flickr fetch failed for '{image_query}'. Falling back to Pixabay.")
image_url, image_source, uploader, page_url = get_image(image_query) image_url, image_source, uploader, page_url = get_image(image_query)
if not image_url:
# Log the fetched image details logging.info(f"Pixabay fetch failed for '{image_query}'. Skipping article '{title}'.")
logging.info(f"Fetched image for '{post_data['title']}': URL={image_url}, Source={image_source}, Uploader={uploader}, Page URL={page_url}") attempts += 1
continue
hook = get_dynamic_hook(post_data["title"]).strip() hook = get_dynamic_hook(post_data["title"]).strip()
@@ -287,7 +285,7 @@ def curate_from_rss():
f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> ' f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>' f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
) )
post_data["content"] = f"{final_summary}\n\n{share_links_template}" post_data["content"] = f"{final_summary}\n\n{share_links_template}" # Removed cta from content
global is_posting global is_posting
is_posting = True is_posting = True
@@ -313,7 +311,8 @@ def curate_from_rss():
share_text_encoded = quote(share_text) share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url) post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded) share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
is_posting = True is_posting = True
try: try:
post_to_wp( post_to_wp(
@@ -339,16 +338,6 @@ def curate_from_rss():
logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}") logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
if image_url: if image_url:
# Check if image is already used
used_images_list = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
used_image_urls = {entry["title"] for entry in used_images_list}
if image_url in used_image_urls:
logging.warning(f"Image '{image_url}' already used, attempting to fetch a new image")
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
if not image_url:
image_url, image_source, uploader, page_url = get_image(image_query)
logging.info(f"New image fetched: URL={image_url}, Source={image_source}, Uploader={uploader}, Page URL={page_url}")
save_json_file(USED_IMAGES_FILE, image_url, timestamp) save_json_file(USED_IMAGES_FILE, image_url, timestamp)
used_images.add(image_url) used_images.add(image_url)
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}") logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
+183 -326
View File
@@ -236,178 +236,12 @@ def select_best_persona(interest_score, content=""):
return random.choice(personas) return random.choice(personas)
def get_image(search_query): def get_image(search_query):
global last_flickr_request_time, flickr_request_count
reset_flickr_request_count()
flickr_request_count += 1
logging.info(f"Flickr request count: {flickr_request_count}/3600")
current_time = time.time()
time_since_last_request = current_time - last_flickr_request_time
if time_since_last_request < 10:
time.sleep(10 - time_since_last_request)
last_flickr_request_time = time.time()
headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'} headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
def search_flickr(query, per_page=5): # Try Pixabay with the original query
try:
photos = flickr_api.Photo.search(
text=query,
per_page=per_page,
sort='relevance',
safe_search=1,
media='photos',
license='4,5,9,10'
)
return photos
except Exception as e:
logging.warning(f"Flickr API error for query '{query}': {e}")
return []
def fetch_photo_by_id(photo_id):
try:
photo = flickr_api.Photo(id=photo_id)
return photo
except Exception as e:
logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}")
return None
def process_photo(photo):
tags = [tag.text.lower() for tag in photo.getTags()]
title = photo.title.lower() if photo.title else ""
matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
if matched_keywords:
logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
return None
img_url = photo.getPhotoFile(size_label='Medium')
if not img_url or img_url in used_images:
return None
uploader = photo.owner.username
page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
used_images.add(img_url)
save_used_images()
flickr_data = {
"title": search_query,
"image_url": img_url,
"source": "Flickr",
"uploader": uploader,
"page_url": page_url,
"timestamp": datetime.now(timezone.utc).isoformat()
}
flickr_file = "/home/shane/foodie_automator/flickr_images.json"
with open(flickr_file, 'a') as f:
json.dump(flickr_data, f)
f.write('\n')
logging.info(f"Saved Flickr image metadata to {flickr_file}: {img_url}")
logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
return img_url, "Flickr", uploader, page_url
def search_ddg_for_flickr(query):
ddg_query = f"{query} site:flickr.com"
ddg_url = f"https://duckduckgo.com/?q={quote(ddg_query)}"
try:
response = requests.get(ddg_url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
photo_ids = set()
for link in soup.find_all('a', href=True):
href = link['href']
match = re.search(r'flickr\.com/photos/[^/]+/(\d+)', href)
if match:
photo_id = match.group(1)
photo_ids.add(photo_id)
photo_ids = list(photo_ids)[:2] # Limit to 2 IDs
logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
return photo_ids
except Exception as e:
logging.warning(f"DDG search failed for query '{ddg_query}': {e}")
return set()
def classify_keywords(keywords):
prompt = (
"Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities like 'Taco Bell' or 'Paris') or 'generic' (e.g., common or abstract terms like 'dining' or 'trends'). "
"Return a JSON object mapping each keyword to its classification.\n\n"
"Keywords: " + ", ".join(keywords) + "\n\n"
"Example output format (do not use these exact keywords in your response):\n"
"```json\n"
"{\n"
" \"keyword1\": \"specific\",\n"
" \"keyword2\": \"generic\"\n"
"}\n```"
)
try:
response = client.chat.completions.create(
model=LIGHT_TASK_MODEL,
messages=[
{"role": "system", "content": "You are a helper that classifies keywords."},
{"role": "user", "content": prompt}
],
max_tokens=100,
temperature=0.5
)
raw_response = response.choices[0].message.content
json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response)
if not json_match:
logging.warning(f"Failed to parse keyword classification JSON: {raw_response}")
return {kw: "specific" for kw in keywords}
classifications = json.loads(json_match.group(1))
return classifications
except Exception as e:
logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
return {kw: "specific" for kw in keywords}
# Step 1: Search DDG to find Flickr photo IDs
logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
photo_ids = search_ddg_for_flickr(search_query)
if photo_ids:
for photo_id in photo_ids:
photo = fetch_photo_by_id(photo_id)
if photo:
result = process_photo(photo)
if result:
return result
# Step 2: Break down the query into keywords and classify them for direct Flickr API search
keywords = search_query.lower().split()
if len(keywords) > 1:
classifications = classify_keywords(keywords)
logging.info(f"Keyword classifications: {classifications}")
specific_keywords = [kw for kw, classification in classifications.items() if classification == "specific"]
if specific_keywords:
for keyword in specific_keywords:
logging.info(f"Searching Flickr with specific keyword: '{keyword}'")
photos = search_flickr(keyword)
for photo in photos:
result = process_photo(photo)
if result:
return result
# Step 3: Final fallback to a generic food-related query
logging.info(f"No results found. Falling back to generic query: 'food dining'")
photos = search_flickr("food dining")
for photo in photos:
result = process_photo(photo)
if result:
return result
logging.warning(f"No valid Flickr image found in fallback for query '{search_query}'. Trying Pixabay.")
# Fallback to Pixabay
try: try:
pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(search_query)}&image_type=photo&per_page=10" pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(search_query)}&image_type=photo&per_page=10"
response = requests.get(pixabay_url, timeout=10) response = requests.get(pixabay_url, headers=headers, timeout=10)
response.raise_for_status() response.raise_for_status()
data = response.json() data = response.json()
@@ -421,15 +255,43 @@ def get_image(search_query):
used_images.add(img_url) used_images.add(img_url)
save_used_images() save_used_images()
logging.debug(f"Image selected for query '{search_query}': {img_url}") logging.info(f"Selected Pixabay image: {img_url} by {uploader} for query '{search_query}'")
return img_url, "Pixabay", uploader, page_url return img_url, "Pixabay", uploader, page_url
logging.warning(f"No valid Pixabay image found for query '{search_query}'.") logging.info(f"No valid Pixabay image found for query '{search_query}'. Trying fallback query.")
return None, None, None, None
except Exception as e: except Exception as e:
logging.error(f"Pixabay image fetch failed for query '{search_query}': {e}") logging.warning(f"Pixabay image fetch failed for query '{search_query}': {e}")
return None, None, None, None
# Fallback to a generic query
fallback_query = "food dining"
try:
pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(fallback_query)}&image_type=photo&per_page=10"
response = requests.get(pixabay_url, headers=headers, timeout=10)
response.raise_for_status()
data = response.json()
for hit in data.get('hits', []):
img_url = hit.get('webformatURL')
if not img_url or img_url in used_images:
continue
uploader = hit.get('user', 'Unknown')
page_url = hit.get('pageURL', img_url)
used_images.add(img_url)
save_used_images()
logging.info(f"Selected Pixabay fallback image: {img_url} by {uploader} for query '{fallback_query}'")
return img_url, "Pixabay", uploader, page_url
logging.warning(f"No valid Pixabay image found for fallback query '{fallback_query}'.")
except Exception as e:
logging.warning(f"Pixabay fallback image fetch failed for query '{fallback_query}': {e}")
# Ultimate fallback: return None but log clearly
logging.error(f"All image fetch attempts failed for query '{search_query}'. Returning None.")
return None, None, None, None
def generate_image_query(title, summary): def generate_image_query(title, summary):
try: try:
@@ -1010,24 +872,19 @@ if os.path.exists(used_images_file):
else: else:
data = json.loads(content) data = json.loads(content)
if not isinstance(data, list): if not isinstance(data, list):
logging.warning(f"Invalid format in {used_images_file}: expected a list, got {type(data)}. Resetting.") logging.warning(f"Invalid format in {used_images_file}: expected a list, got {type(data)}. Converting to list.")
data = [] if isinstance(data, dict):
else: # If it's a dict, try to extract URLs from values
# Handle malformed format (list of lists or invalid entries) data = [v for v in data.values() if isinstance(v, str) and v.startswith('https://')]
flat_data = [] else:
for item in data: logging.warning(f"Cannot convert {type(data)} to list. Resetting to empty list.")
if isinstance(item, str) and item.startswith('https://'): data = []
flat_data.append(item) # Filter out non-string or non-URL entries
elif isinstance(item, list): data = [item for item in data if isinstance(item, str) and item.startswith('https://')]
logging.warning(f"Fixing malformed entry in {used_images_file}: {item}")
flat_data.extend([sub_item for sub_item in item if isinstance(sub_item, str) and sub_item.startswith('https://')])
else:
logging.warning(f"Skipping invalid entry in {used_images_file}: {item}")
data = flat_data
used_images.update(data) used_images.update(data)
logging.info(f"Loaded {len(used_images)} used image URLs from {used_images_file}") logging.info(f"Loaded {len(used_images)} used image URLs from {used_images_file}")
except Exception as e: except Exception as e:
logging.warning(f"Failed to load used images from {used_images_file}: {e}. Resetting file.") logging.warning(f"Failed to load used images from {used_images_file}: {e}. Resetting to empty set.")
used_images = set() used_images = set()
with open(used_images_file, 'w') as f: with open(used_images_file, 'w') as f:
json.dump([], f) json.dump([], f)
@@ -1035,17 +892,14 @@ if os.path.exists(used_images_file):
# Function to save used_images to file # Function to save used_images to file
def save_used_images(): def save_used_images():
try: try:
# Ensure used_images contains only valid URLs
valid_urls = [url for url in used_images if isinstance(url, str) and url.startswith('https://')]
if len(valid_urls) != len(used_images):
logging.warning(f"Found {len(used_images) - len(valid_urls)} invalid URLs in used_images set")
with open(used_images_file, 'w') as f: with open(used_images_file, 'w') as f:
f.write('[\n') json.dump(valid_urls, f, indent=2)
urls = list(used_images) logging.info(f"Saved {len(valid_urls)} used image URLs to {used_images_file}")
for i, url in enumerate(urls):
f.write(f'"{url}"')
if i < len(urls) - 1:
f.write(',\n')
else:
f.write('\n')
f.write(']')
logging.info(f"Saved {len(used_images)} used image URLs to {used_images_file}")
except Exception as e: except Exception as e:
logging.warning(f"Failed to save used images to {used_images_file}: {e}") logging.warning(f"Failed to save used images to {used_images_file}: {e}")
@@ -1055,6 +909,134 @@ def reset_flickr_request_count():
flickr_request_count = 0 flickr_request_count = 0
flickr_request_start_time = time.time() flickr_request_start_time = time.time()
def process_photo(photo, search_query):
tags = [tag.text.lower() for tag in photo.getTags()]
title = photo.title.lower() if photo.title else ""
matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
if matched_keywords:
logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
return None
# Try 'Large' size first, fall back to 'Medium' if unavailable
img_url = None
try:
img_url = photo.getPhotoFile(size_label='Large')
except flickr_api.flickrerrors.FlickrError as e:
logging.info(f"Large size not available for photo {photo.id}: {e}, trying Medium")
try:
img_url = photo.getPhotoFile(size_label='Medium')
except flickr_api.flickrerrors.FlickrError as e:
logging.warning(f"Medium size not available for photo {photo.id}: {e}")
return None
if not img_url or img_url in used_images:
logging.info(f"Image URL invalid or already used for photo {photo.id}: {img_url}")
return None
uploader = photo.owner.username
page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
used_images.add(img_url)
save_used_images()
flickr_data = {
"title": search_query,
"image_url": img_url,
"source": "Flickr",
"uploader": uploader,
"page_url": page_url,
"timestamp": datetime.now(timezone.utc).isoformat()
}
flickr_file = "/home/shane/foodie_automator/flickr_images.json"
with open(flickr_file, 'a') as f:
json.dump(flickr_data, f)
f.write('\n')
logging.info(f"Saved Flickr image metadata to {flickr_file}: {img_url}")
logging.info(f"Selected Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
return img_url, "Flickr", uploader, page_url
def search_flickr(query, per_page=5):
try:
photos = flickr_api.Photo.search(
text=query,
per_page=per_page,
sort='relevance',
safe_search=1,
media='photos',
license='4,5,9,10'
)
return photos
except Exception as e:
logging.warning(f"Flickr API error for query '{query}': {e}")
return []
def fetch_photo_by_id(photo_id):
try:
photo = flickr_api.Photo(id=photo_id)
return photo
except Exception as e:
logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}")
return None
def search_ddg_for_flickr(query):
ddg_query = f"{query} site:flickr.com"
ddg_url = f"https://duckduckgo.com/?q={quote(ddg_query)}"
try:
response = requests.get(ddg_url, headers={'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
photo_ids = set()
for link in soup.find_all('a', href=True):
href = link['href']
match = re.search(r'flickr\.com/photos/[^/]+/(\d+)', href)
if match:
photo_id = match.group(1)
photo_ids.add(photo_id)
photo_ids = list(photo_ids)[:2] # Limit to 2 IDs
logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
return photo_ids
except Exception as e:
logging.warning(f"DDG search failed for query '{ddg_query}': {e}")
return set()
def classify_keywords(keywords):
prompt = (
"Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities like 'Taco Bell' or 'Paris') or 'generic' (e.g., common or abstract terms like 'dining' or 'trends'). "
"Return a JSON object mapping each keyword to its classification.\n\n"
"Keywords: " + ", ".join(keywords) + "\n\n"
"Example output format (do not use these exact keywords in your response):\n"
"```json\n"
"{\n"
" \"keyword1\": \"specific\",\n"
" \"keyword2\": \"generic\"\n"
"}\n```"
)
try:
response = client.chat.completions.create(
model=LIGHT_TASK_MODEL,
messages=[
{"role": "system", "content": "You are a helper that classifies keywords."},
{"role": "user", "content": prompt}
],
max_tokens=100,
temperature=0.5
)
raw_response = response.choices[0].message.content
json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response)
if not json_match:
logging.warning(f"Failed to parse keyword classification JSON: {raw_response}")
return {kw: "specific" for kw in keywords}
classifications = json.loads(json_match.group(1))
return classifications
except Exception as e:
logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
return {kw: "specific" for kw in keywords}
def get_flickr_image(search_query, relevance_keywords): def get_flickr_image(search_query, relevance_keywords):
global last_flickr_request_time, flickr_request_count global last_flickr_request_time, flickr_request_count
@@ -1070,131 +1052,6 @@ def get_flickr_image(search_query, relevance_keywords):
last_flickr_request_time = time.time() last_flickr_request_time = time.time()
headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
# Helper function to search Flickr with a given query
def search_flickr(query, per_page=5): # Reduced per_page to limit results
try:
photos = flickr_api.Photo.search(
text=query,
per_page=per_page,
sort='relevance',
safe_search=1,
media='photos',
license='4,5,9,10'
)
return photos
except Exception as e:
logging.warning(f"Flickr API error for query '{query}': {e}")
return []
# Helper function to fetch a Flickr photo by ID
def fetch_photo_by_id(photo_id):
try:
photo = flickr_api.Photo(id=photo_id)
return photo
except Exception as e:
logging.warning(f"Failed to fetch Flickr photo ID {photo_id}: {e}")
return None
# Helper function to process a photo (fetch URL and metadata only)
def process_photo(photo):
tags = [tag.text.lower() for tag in photo.getTags()]
title = photo.title.lower() if photo.title else ""
matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
if matched_keywords:
logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
return None
img_url = photo.getPhotoFile(size_label='Large')
if not img_url:
img_url = photo.getPhotoFile(size_label='Medium')
if not img_url or img_url in used_images:
return None
uploader = photo.owner.username
page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
used_images.add(img_url)
save_used_images()
flickr_data = {
"title": search_query,
"image_url": img_url,
"source": "Flickr",
"uploader": uploader,
"page_url": page_url,
"timestamp": datetime.now(timezone.utc).isoformat()
}
flickr_file = "/home/shane/foodie_automator/flickr_images.json"
with open(flickr_file, 'a') as f:
json.dump(flickr_data, f)
f.write('\n')
logging.info(f"Saved Flickr image metadata to {flickr_file}: {img_url}")
logging.info(f"Selected Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
return img_url, "Flickr", uploader, page_url
# Helper function to search DDG and extract Flickr photo IDs
def search_ddg_for_flickr(query):
ddg_query = f"{query} site:flickr.com"
ddg_url = f"https://duckduckgo.com/?q={quote(ddg_query)}"
try:
response = requests.get(ddg_url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
photo_ids = set()
for link in soup.find_all('a', href=True):
href = link['href']
match = re.search(r'flickr\.com/photos/[^/]+/(\d+)', href)
if match:
photo_id = match.group(1)
photo_ids.add(photo_id)
photo_ids = list(photo_ids)[:2] # Limit to 2 IDs
logging.info(f"Found {len(photo_ids)} Flickr photo IDs via DDG: {photo_ids}")
return photo_ids
except Exception as e:
logging.warning(f"DDG search failed for query '{ddg_query}': {e}")
return set()
# Helper function to classify keywords as specific or generic
def classify_keywords(keywords):
prompt = (
"Given the following keywords from an image search query, classify each as 'specific' (e.g., brand names, unique entities like 'Taco Bell' or 'Paris') or 'generic' (e.g., common or abstract terms like 'dining' or 'trends'). "
"Return a JSON object mapping each keyword to its classification.\n\n"
"Keywords: " + ", ".join(keywords) + "\n\n"
"Example output format (do not use these exact keywords in your response):\n"
"```json\n"
"{\n"
" \"keyword1\": \"specific\",\n"
" \"keyword2\": \"generic\"\n"
"}\n```"
)
try:
response = client.chat.completions.create(
model=LIGHT_TASK_MODEL,
messages=[
{"role": "system", "content": "You are a helper that classifies keywords."},
{"role": "user", "content": prompt}
],
max_tokens=100,
temperature=0.5
)
raw_response = response.choices[0].message.content
json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response)
if not json_match:
logging.warning(f"Failed to parse keyword classification JSON: {raw_response}")
return {kw: "specific" for kw in keywords}
classifications = json.loads(json_match.group(1))
return classifications
except Exception as e:
logging.warning(f"Keyword classification failed: {e}. Defaulting to all specific.")
return {kw: "specific" for kw in keywords}
# Step 1: Search DDG to find Flickr photo IDs # Step 1: Search DDG to find Flickr photo IDs
logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'") logging.info(f"Searching DDG with query: '{search_query} site:flickr.com'")
photo_ids = search_ddg_for_flickr(search_query) photo_ids = search_ddg_for_flickr(search_query)
@@ -1202,7 +1059,7 @@ def get_flickr_image(search_query, relevance_keywords):
for photo_id in photo_ids: for photo_id in photo_ids:
photo = fetch_photo_by_id(photo_id) photo = fetch_photo_by_id(photo_id)
if photo: if photo:
result = process_photo(photo) result = process_photo(photo, search_query)
if result: if result:
return result return result
@@ -1219,7 +1076,7 @@ def get_flickr_image(search_query, relevance_keywords):
logging.info(f"Searching Flickr with specific keyword: '{keyword}'") logging.info(f"Searching Flickr with specific keyword: '{keyword}'")
photos = search_flickr(keyword) photos = search_flickr(keyword)
for photo in photos: for photo in photos:
result = process_photo(photo) result = process_photo(photo, search_query)
if result: if result:
return result return result
@@ -1228,7 +1085,7 @@ def get_flickr_image(search_query, relevance_keywords):
logging.info(f"No results found. Falling back to generic query: '{fallback_query}'") logging.info(f"No results found. Falling back to generic query: '{fallback_query}'")
photos = search_flickr(fallback_query) photos = search_flickr(fallback_query)
for photo in photos: for photo in photos:
result = process_photo(photo) result = process_photo(photo, search_query)
if result: if result:
return result return result