use flickr API

my-fix-branch
Shane 7 months ago
parent 17a5bef6b7
commit 76d5c47079
Changed files (6, lines changed):
  foodie_automator_google.py (4)
  foodie_automator_reddit.py (8)
  foodie_automator_rss.py (4)
  foodie_config.py (2)
  foodie_utils.py (387)
  requirements.txt (3)

foodie_automator_google.py

@@ -27,7 +27,7 @@ from foodie_utils import (
     upload_image_to_wp, select_best_persona, determine_paragraph_count,
     is_interesting, generate_title_from_summary, summarize_with_gpt4o,
     generate_category_from_summary, post_to_wp, prepare_post_data,
-    smart_image_and_filter, insert_link_naturally, get_flickr_image_via_ddg
+    smart_image_and_filter, insert_link_naturally, get_flickr_image  # Updated function name
 )
 from foodie_hooks import get_dynamic_hook, select_best_cta
 from dotenv import load_dotenv
@@ -259,7 +259,7 @@ def curate_from_google_trends(geo_list=['US']):
                 continue
             # Fetch image
-            image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
+            image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
             if not image_url:
                 image_url, image_source, uploader, page_url = get_image(image_query)

foodie_automator_reddit.py

@@ -14,6 +14,7 @@ from urllib.parse import quote
 from requests.packages.urllib3.util.retry import Retry
 from requests.adapters import HTTPAdapter
 import praw
+from dotenv import load_dotenv
 from foodie_config import (
     AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
     PERSONA_CONFIGS, CATEGORIES, CTAS, get_clean_source_name,
@@ -25,10 +26,12 @@ from foodie_utils import (
     upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
     summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
     prepare_post_data, select_best_author, smart_image_and_filter,
-    get_flickr_image_via_ddg
+    get_flickr_image  # Updated function name
 )
 from foodie_hooks import get_dynamic_hook, select_best_cta
+load_dotenv()

 # Flag to indicate if we're in the middle of posting
 is_posting = False
@@ -294,7 +297,8 @@ def curate_from_reddit():
             attempts += 1
             continue
-        image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
+        # Fetch image
+        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
         if not image_url:
             image_url, image_source, uploader, page_url = get_image(image_query)

foodie_automator_rss.py

@@ -24,7 +24,7 @@ from foodie_utils import (
     upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
     is_interesting, generate_title_from_summary, summarize_with_gpt4o,
     generate_category_from_summary, post_to_wp, prepare_post_data,
-    select_best_author, smart_image_and_filter, get_flickr_image_via_ddg
+    select_best_author, smart_image_and_filter, get_flickr_image  # Updated function name
 )
 from foodie_hooks import get_dynamic_hook, select_best_cta
 from dotenv import load_dotenv
@@ -247,7 +247,7 @@ def curate_from_rss():
             continue
         # Fetch image
-        image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
+        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
         if not image_url:
             image_url, image_source, uploader, page_url = get_image(image_query)

foodie_config.py

@@ -6,6 +6,8 @@ import os
 load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY")
+FLICKR_API_KEY = os.getenv("FLICKR_API_KEY")
+FLICKR_API_SECRET = os.getenv("FLICKR_API_SECRET")
 AUTHORS = [
     {
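Deployments need matching entries in the project's .env file for the two new settings. A minimal sketch of the relevant lines, with placeholder values (the existing keys are shown only for context):

    # .env (placeholder values, not real credentials)
    OPENAI_API_KEY=sk-...
    PIXABAY_API_KEY=your-pixabay-key
    FLICKR_API_KEY=your-flickr-api-key
    FLICKR_API_SECRET=your-flickr-api-secret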

foodie_utils.py

@@ -15,14 +15,15 @@ from dotenv import load_dotenv
 from datetime import datetime, timezone, timedelta
 from openai import OpenAI
 from urllib.parse import quote
-from duckduckgo_search import DDGS
 from bs4 import BeautifulSoup
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 import tweepy
+import flickr_api
 from foodie_config import (
     RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS,
-    get_clean_source_name, AUTHORS, LIGHT_TASK_MODEL, SUMMARY_MODEL, X_API_CREDENTIALS
+    get_clean_source_name, AUTHORS, LIGHT_TASK_MODEL, SUMMARY_MODEL, X_API_CREDENTIALS,
+    FLICKR_API_KEY, FLICKR_API_SECRET, PIXABAY_API_KEY
 )
 load_dotenv()
@@ -212,53 +213,131 @@ def select_best_persona(interest_score, content=""):
     return random.choice(personas)

 def get_image(search_query):
-    api_key = "14836528-999c19a033d77d463113b1fb8"
-    base_url = "https://pixabay.com/api/"
-    queries = [search_query.split()[:2], search_query.split()]
-    for query in queries:
-        short_query = " ".join(query)
-        params = {
-            "key": api_key,
-            "q": short_query,
-            "image_type": "photo",
-            "safesearch": True,
-            "per_page": 20
-        }
-        try:
-            logging.info(f"Fetching Pixabay image for query '{short_query}'")
-            response = requests.get(base_url, params=params, timeout=10)
-            response.raise_for_status()
-            data = response.json()
-            if not data.get("hits"):
-                logging.warning(f"No image hits for query '{short_query}'")
-                continue
-            valid_images = [
-                hit for hit in data["hits"]
-                if all(tag not in hit.get("tags", "").lower() for tag in ["dog", "cat", "family", "child", "baby"])
-            ]
-            if not valid_images:
-                logging.warning(f"No valid images for query '{short_query}' after filtering")
-                continue
-            image = random.choice(valid_images)
-            image_url = image["webformatURL"]
-            image_source = "Pixabay"
-            uploader = image.get("user", "Unknown")
-            pixabay_url = image["pageURL"]
-            logging.info(f"Fetched image URL: {image_url} by {uploader} for query '{short_query}'")
-            print(f"DEBUG: Image selected for query '{short_query}': {image_url}")
-            return image_url, image_source, uploader, pixabay_url
-        except requests.exceptions.RequestException as e:
-            logging.error(f"Image fetch failed for query '{short_query}': {e}")
-            continue
-    logging.error(f"All Pixabay image queries failed: {queries}")
-    return None, None, None, None
+    global last_flickr_request_time, flickr_request_count
+    reset_flickr_request_count()
+    flickr_request_count += 1
+    logging.info(f"Flickr request count: {flickr_request_count}/3600")
+
+    # Enforce a minimum delay of 1 second between Flickr requests
+    current_time = time.time()
+    time_since_last_request = current_time - last_flickr_request_time
+    if time_since_last_request < 1:
+        time.sleep(1 - time_since_last_request)
+    last_flickr_request_time = time.time()
+
+    try:
+        # Try Flickr API first
+        photos = flickr_api.Photo.search(
+            text=search_query,
+            per_page=10,
+            sort='relevance',
+            safe_search=1,
+            media='photos',
+            license='4,5,9,10' # Commercial use licenses
+        )
+        headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
+        for photo in photos:
+            # Fetch photo metadata (tags and title)
+            tags = [tag.text.lower() for tag in photo.getTags()]
+            title = photo.title.lower() if photo.title else ""
+            # Filter out images with unwanted keywords in tags or title
+            matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
+            if matched_keywords:
+                logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
+                continue
+            img_url = photo.getPhotoFile(size_label='Medium')
+            if not img_url:
+                continue
+            if img_url in used_images:
+                continue
+            # Download the image and run OCR to check for excessive text
+            temp_file = None
+            try:
+                img_response = requests.get(img_url, headers=headers, timeout=10)
+                img_response.raise_for_status()
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
+                    temp_file.write(img_response.content)
+                    temp_path = temp_file.name
+                img = Image.open(temp_path)
+                text = pytesseract.image_to_string(img)
+                char_count = len(text.strip())
+                logging.info(f"OCR processed {img_url}: {char_count} characters detected")
+                if char_count > 200:
+                    logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
+                    continue
+                uploader = photo.owner.username
+                page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
+                # Save Flickr image metadata
+                flickr_data = {
+                    "title": search_query,
+                    "image_url": img_url,
+                    "source": "Flickr",
+                    "uploader": uploader,
+                    "page_url": page_url,
+                    "timestamp": datetime.now(timezone.utc).isoformat(),
+                    "ocr_chars": char_count
+                }
+                flickr_file = "/home/shane/foodie_automator/flickr_images.json"
+                with open(flickr_file, 'a') as f:
+                    json.dump(flickr_data, f)
+                    f.write('\n')
+                logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
+                logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
+                return img_url, "Flickr", uploader, page_url
+            except requests.exceptions.HTTPError as e:
+                if e.response.status_code == 429:
+                    logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
+                    return None, None, None, None
+                else:
+                    logging.warning(f"Download failed for {img_url}: {e}")
+                    continue
+            except Exception as e:
+                logging.warning(f"OCR processing failed for {img_url}: {e}")
+                continue
+            finally:
+                if temp_file and os.path.exists(temp_path):
+                    os.unlink(temp_path)
+        logging.warning(f"No valid Flickr image found in fallback for query '{search_query}'. Trying Pixabay.")
+    except Exception as e:
+        logging.warning(f"Fallback Flickr API error for query '{search_query}': {e}. Falling back to Pixabay.")
+
+    # Fallback to Pixabay
+    try:
+        pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(search_query)}&image_type=photo&per_page=10"
+        response = requests.get(pixabay_url, timeout=10)
+        response.raise_for_status()
+        data = response.json()
+        for hit in data.get('hits', []):
+            img_url = hit.get('webformatURL')
+            if not img_url or img_url in used_images:
+                continue
+            uploader = hit.get('user', 'Unknown')
+            page_url = hit.get('pageURL', img_url)
+            logging.debug(f"Image selected for query '{search_query}': {img_url}")
+            return img_url, "Pixabay", uploader, page_url
+        logging.warning(f"No valid Pixabay image found for query '{search_query}'.")
+        return None, None, None, None
+    except Exception as e:
+        logging.error(f"Pixabay image fetch failed for query '{search_query}': {e}")
+        return None, None, None, None

 def generate_image_query(content):
     try:
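With this change get_image becomes Flickr-first, keeping the old Pixabay lookup as a fallback; both branches return the same (image_url, image_source, uploader, page_url) tuple, all None on failure. A minimal usage sketch (the query string is an invented example):

    # Hypothetical caller; the 4-tuple shape matches both branches above
    image_url, image_source, uploader, page_url = get_image("ramen noodles")
    if image_url:
        print(f"Using {image_source} photo by {uploader}: {image_url} ({page_url})")
    else:
        print("No usable image found on Flickr or Pixabay")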
@@ -781,119 +860,135 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
         print(f"WP Error: {e}")
         return None, None

-def get_flickr_image_via_ddg(search_query, relevance_keywords):
-    try:
-        with DDGS() as ddgs:
-            results = ddgs.images(
-                f"{search_query} flickr site:flickr.com -poster -infographic -chart -graph -data -stats -text -typography",
-                license_image="sharecommercially",
-                max_results=30
-            )
-        if not results:
-            logging.warning(f"No Flickr images found via DDG for query '{search_query}'")
-            return None, None, None, None
-        headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
-        candidates = []
-        for r in results:
-            image_url = r.get("image", "")
-            page_url = r.get("url", "")
-            if not image_url or "live.staticflickr.com" not in image_url:
-                continue
-            try:
-                response = requests.get(page_url, headers=headers, timeout=10)
-                response.raise_for_status()
-                soup = BeautifulSoup(response.content, 'html.parser')
-                time.sleep(1)
-                tags_elem = soup.find_all('a', class_='tag')
-                tags = [tag.text.strip().lower() for tag in tags_elem] if tags_elem else []
-                title_elem = soup.find('h1', class_='photo-title')
-                title = title_elem.text.strip().lower() if title_elem else r.get("title", "").lower()
-                exclude_keywords = [
-                    "poster", "infographic", "chart", "graph", "data", "stats", "text", "typography",
-                    "design", "advertisement", "illustration", "diagram", "layout", "print"
-                ]
-                matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
-                if matched_keywords:
-                    logging.info(f"Skipping text-heavy image: {image_url} (tags: {tags}, title: {title}, matched: {matched_keywords})")
-                    continue
-                uploader = soup.find('a', class_='owner-name')
-                uploader = uploader.text.strip() if uploader else "Flickr User"
-                candidates.append({
-                    "image_url": image_url,
-                    "page_url": page_url,
-                    "uploader": uploader,
-                    "tags": tags,
-                    "title": title
-                })
-            except requests.exceptions.RequestException as e:
-                logging.info(f"Skipping unavailable image: {image_url} (page: {page_url}, error: {e})")
-                continue
-        if not candidates:
-            logging.warning(f"No valid candidate images after filtering for '{search_query}'")
-            return None, None, None, None
-        result = random.choice(candidates)
-        image_url = result["image_url"]
-        temp_file = None
-        try:
-            img_response = requests.get(image_url, headers=headers, timeout=10)
-            img_response.raise_for_status()
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
-                temp_file.write(img_response.content)
-                temp_path = temp_file.name
-            img = Image.open(temp_path)
-            text = pytesseract.image_to_string(img)
-            char_count = len(text.strip())
-            logging.info(f"OCR processed {image_url}: {char_count} characters detected")
-            if char_count > 200:
-                logging.info(f"Skipping text-heavy image (OCR): {image_url} (char_count: {char_count})")
-                return None, None, None, None
-            flickr_data = {
-                "title": search_query,
-                "image_url": image_url,
-                "source": "Flickr",
-                "uploader": result["uploader"],
-                "page_url": result["page_url"],
-                "timestamp": datetime.now().isoformat(),
-                "ocr_chars": char_count
-            }
-            flickr_file = "/home/shane/foodie_automator/flickr_images.json"
-            with open(flickr_file, 'a') as f:
-                json.dump(flickr_data, f)
-                f.write('\n')
-            logging.info(f"Saved Flickr image to {flickr_file}: {image_url}")
-            logging.info(f"Fetched Flickr image URL: {image_url} by {result['uploader']} for query '{search_query}' (tags: {result['tags']})")
-            print(f"DEBUG: Flickr image selected: {image_url}")
-            return image_url, "Flickr", result["uploader"], result["page_url"]
-        except requests.exceptions.HTTPError as e:
-            if e.response.status_code == 429:
-                logging.warning(f"Rate limit hit for {image_url}. Falling back to Pixabay.")
-                return None, None, None, None
-            else:
-                logging.warning(f"Download failed for {image_url}: {e}")
-                return None, None, None, None
-        except Exception as e:
-            logging.warning(f"OCR processing failed for {image_url}: {e}")
-            return None, None, None, None
-        finally:
-            if temp_file and os.path.exists(temp_path):
-                os.unlink(temp_path)
-    except Exception as e:
-        logging.error(f"Flickr/DDG image fetch failed for '{search_query}': {e}")
-        return None, None, None, None
+# Configure Flickr API with credentials
+flickr_api.set_keys(api_key=FLICKR_API_KEY, api_secret=FLICKR_API_SECRET)
+logging.info(f"Flickr API configured with key: {FLICKR_API_KEY[:4]}... and secret: {FLICKR_API_SECRET[:4]}...")
+
+# Global variable to track the last Flickr request time
+last_flickr_request_time = 0
+# Flickr request counter
+flickr_request_count = 0
+flickr_request_start_time = time.time()
+
+# Define exclude keywords for filtering unwanted image types
+exclude_keywords = [
+    "poster", "infographic", "chart", "graph", "data", "stats", "text", "typography",
+    "design", "advertisement", "illustration", "diagram", "layout", "print"
+]
+
+def reset_flickr_request_count():
+    global flickr_request_count, flickr_request_start_time
+    if time.time() - flickr_request_start_time >= 3600: # Reset every hour
+        flickr_request_count = 0
+        flickr_request_start_time = time.time()
+
+def get_flickr_image(search_query, relevance_keywords):
+    global last_flickr_request_time, flickr_request_count
+    reset_flickr_request_count()
+    flickr_request_count += 1
+    logging.info(f"Flickr request count: {flickr_request_count}/3600")
+    # Enforce a minimum delay of 1 second between Flickr requests
+    current_time = time.time()
+    time_since_last_request = current_time - last_flickr_request_time
+    if time_since_last_request < 1:
+        time.sleep(1 - time_since_last_request)
+    last_flickr_request_time = time.time()
+    try:
+        # Search for photos on Flickr using the API
+        photos = flickr_api.Photo.search(
+            text=search_query,
+            per_page=10,
+            sort='relevance',
+            safe_search=1,
+            media='photos',
+            license='4,5,9,10' # Commercial use licenses (CC BY, CC BY-SA, etc.)
+        )
+        headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
+        for photo in photos:
+            # Fetch photo metadata (tags and title)
+            tags = [tag.text.lower() for tag in photo.getTags()]
+            title = photo.title.lower() if photo.title else ""
+            # Filter out images with unwanted keywords in tags or title
+            matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
+            if matched_keywords:
+                logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
+                continue
+            img_url = photo.getPhotoFile(size_label='Large')
+            if not img_url:
+                img_url = photo.getPhotoFile(size_label='Medium')
+            if not img_url:
+                continue
+            if img_url in used_images:
+                continue
+            # Download the image and run OCR to check for excessive text
+            temp_file = None
+            try:
+                img_response = requests.get(img_url, headers=headers, timeout=10)
+                img_response.raise_for_status()
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
+                    temp_file.write(img_response.content)
+                    temp_path = temp_file.name
+                img = Image.open(temp_path)
+                text = pytesseract.image_to_string(img)
+                char_count = len(text.strip())
+                logging.info(f"OCR processed {img_url}: {char_count} characters detected")
+                if char_count > 200:
+                    logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
+                    continue
+                uploader = photo.owner.username
+                page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
+                # Save Flickr image metadata
+                flickr_data = {
+                    "title": search_query,
+                    "image_url": img_url,
+                    "source": "Flickr",
+                    "uploader": uploader,
+                    "page_url": page_url,
+                    "timestamp": datetime.now(timezone.utc).isoformat(),
+                    "ocr_chars": char_count
+                }
+                flickr_file = "/home/shane/foodie_automator/flickr_images.json"
+                with open(flickr_file, 'a') as f:
+                    json.dump(flickr_data, f)
+                    f.write('\n')
+                logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
+                logging.info(f"Fetched Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
+                return img_url, "Flickr", uploader, page_url
+            except requests.exceptions.HTTPError as e:
+                if e.response.status_code == 429:
+                    logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
+                    return None, None, None, None
+                else:
+                    logging.warning(f"Download failed for {img_url}: {e}")
+                    continue
+            except Exception as e:
+                logging.warning(f"OCR processing failed for {img_url}: {e}")
+                continue
+            finally:
+                if temp_file and os.path.exists(temp_path):
+                    os.unlink(temp_path)
+        logging.warning(f"No valid Flickr image found for query '{search_query}'.")
+        return None, None, None, None
+    except Exception as e:
+        logging.warning(f"Flickr API error for query '{search_query}': {e}. Falling back to Pixabay.")
+        return None, None, None, None

 def select_best_author(summary):
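The new get_flickr_image throttles itself to at most one call per second and tracks requests against an hourly budget (the 3600/hour figure matches Flickr's published API limit). A self-contained sketch of the same throttle pattern with the Flickr specifics removed (all names here are illustrative):

    import time

    last_request_time = 0.0
    request_count = 0
    window_start = time.time()

    def throttled(do_request):
        """Space calls at least 1s apart and reset the counter hourly (sketch)."""
        global last_request_time, request_count, window_start
        if time.time() - window_start >= 3600:  # start a new one-hour window
            request_count = 0
            window_start = time.time()
        wait = 1 - (time.time() - last_request_time)
        if wait > 0:
            time.sleep(wait)  # enforce the 1-second minimum spacing
        last_request_time = time.time()
        request_count += 1
        return do_request()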

requirements.txt

@@ -9,4 +9,5 @@ pytesseract==0.3.13
 feedparser==6.0.11
 webdriver-manager==4.0.2
 tweepy==4.14.0
 python-dotenv==1.0.1
+flickr-api==0.7.1
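After installing the pinned dependency, a one-off search is a quick way to confirm the credentials load. This hypothetical smoke test reuses only calls that appear in the diff (set_keys and Photo.search):

    # Hypothetical smoke test; run once after `pip install flickr-api==0.7.1`
    import os
    import flickr_api
    from dotenv import load_dotenv

    load_dotenv()
    flickr_api.set_keys(api_key=os.getenv("FLICKR_API_KEY"),
                        api_secret=os.getenv("FLICKR_API_SECRET"))
    photos = flickr_api.Photo.search(text="sourdough bread", per_page=3,
                                     media='photos', license='4,5,9,10')
    for photo in photos:
        print(photo.id, photo.title)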