use flickr API

Branch: my-fix-branch
Shane, 7 months ago
parent 17a5bef6b7
commit 76d5c47079
Changed files:
  1. foodie_automator_google.py (4 changed lines)
  2. foodie_automator_reddit.py (8 changed lines)
  3. foodie_automator_rss.py (4 changed lines)
  4. foodie_config.py (2 changed lines)
  5. foodie_utils.py (307 changed lines)
  6. requirements.txt (1 changed line)
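In short: this commit replaces the DuckDuckGo-scraping helper get_flickr_image_via_ddg() with a direct flickr_api client (get_flickr_image), adds an hourly request counter plus a 1-second delay between Flickr calls, rewrites get_image() in foodie_utils.py to try Flickr first and fall back to Pixabay, and wires the new FLICKR_API_KEY / FLICKR_API_SECRET settings through foodie_config.py.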

foodie_automator_google.py
@@ -27,7 +27,7 @@ from foodie_utils import (
     upload_image_to_wp, select_best_persona, determine_paragraph_count,
     is_interesting, generate_title_from_summary, summarize_with_gpt4o,
     generate_category_from_summary, post_to_wp, prepare_post_data,
-    smart_image_and_filter, insert_link_naturally, get_flickr_image_via_ddg
+    smart_image_and_filter, insert_link_naturally, get_flickr_image  # Updated function name
 )
 from foodie_hooks import get_dynamic_hook, select_best_cta
 from dotenv import load_dotenv
@@ -259,7 +259,7 @@ def curate_from_google_trends(geo_list=['US']):
             continue
         # Fetch image
-        image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
+        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
         if not image_url:
             image_url, image_source, uploader, page_url = get_image(image_query)

foodie_automator_reddit.py
@@ -14,6 +14,7 @@ from urllib.parse import quote
 from requests.packages.urllib3.util.retry import Retry
 from requests.adapters import HTTPAdapter
 import praw
+from dotenv import load_dotenv
 from foodie_config import (
     AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
     PERSONA_CONFIGS, CATEGORIES, CTAS, get_clean_source_name,
@@ -25,10 +26,12 @@ from foodie_utils import (
     upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
     summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
     prepare_post_data, select_best_author, smart_image_and_filter,
-    get_flickr_image_via_ddg
+    get_flickr_image  # Updated function name
 )
 from foodie_hooks import get_dynamic_hook, select_best_cta
+
+load_dotenv()

 # Flag to indicate if we're in the middle of posting
 is_posting = False
@@ -294,7 +297,8 @@ def curate_from_reddit():
                 attempts += 1
                 continue
-            image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
+            # Fetch image
+            image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
             if not image_url:
                 image_url, image_source, uploader, page_url = get_image(image_query)

foodie_automator_rss.py
@@ -24,7 +24,7 @@ from foodie_utils import (
     upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
     is_interesting, generate_title_from_summary, summarize_with_gpt4o,
     generate_category_from_summary, post_to_wp, prepare_post_data,
-    select_best_author, smart_image_and_filter, get_flickr_image_via_ddg
+    select_best_author, smart_image_and_filter, get_flickr_image  # Updated function name
 )
 from foodie_hooks import get_dynamic_hook, select_best_cta
 from dotenv import load_dotenv
@@ -247,7 +247,7 @@ def curate_from_rss():
             continue
         # Fetch image
-        image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
+        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
         if not image_url:
             image_url, image_source, uploader, page_url = get_image(image_query)

foodie_config.py
@@ -6,6 +6,8 @@ import os
 load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY")
+FLICKR_API_KEY = os.getenv("FLICKR_API_KEY")
+FLICKR_API_SECRET = os.getenv("FLICKR_API_SECRET")
 AUTHORS = [
     {
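For local runs, the two new settings have to be present in the environment that load_dotenv() reads; a minimal .env sketch with placeholder values (the key names come from this diff, the values are hypothetical):

    FLICKR_API_KEY=your-flickr-api-key
    FLICKR_API_SECRET=your-flickr-api-secret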

foodie_utils.py
@@ -15,14 +15,15 @@ from dotenv import load_dotenv
 from datetime import datetime, timezone, timedelta
 from openai import OpenAI
 from urllib.parse import quote
-from duckduckgo_search import DDGS
 from bs4 import BeautifulSoup
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 import tweepy
+import flickr_api
 from foodie_config import (
     RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS,
-    get_clean_source_name, AUTHORS, LIGHT_TASK_MODEL, SUMMARY_MODEL, X_API_CREDENTIALS
+    get_clean_source_name, AUTHORS, LIGHT_TASK_MODEL, SUMMARY_MODEL, X_API_CREDENTIALS,
+    FLICKR_API_KEY, FLICKR_API_SECRET, PIXABAY_API_KEY
 )
 load_dotenv()
@@ -212,52 +213,130 @@ def select_best_persona(interest_score, content=""):
     return random.choice(personas)

 def get_image(search_query):
-    api_key = "14836528-999c19a033d77d463113b1fb8"
-    base_url = "https://pixabay.com/api/"
-    queries = [search_query.split()[:2], search_query.split()]
-    for query in queries:
-        short_query = " ".join(query)
-        params = {
-            "key": api_key,
-            "q": short_query,
-            "image_type": "photo",
-            "safesearch": True,
-            "per_page": 20
-        }
-        try:
-            logging.info(f"Fetching Pixabay image for query '{short_query}'")
-            response = requests.get(base_url, params=params, timeout=10)
-            response.raise_for_status()
-            data = response.json()
-            if not data.get("hits"):
-                logging.warning(f"No image hits for query '{short_query}'")
-                continue
-            valid_images = [
-                hit for hit in data["hits"]
-                if all(tag not in hit.get("tags", "").lower() for tag in ["dog", "cat", "family", "child", "baby"])
-            ]
-            if not valid_images:
-                logging.warning(f"No valid images for query '{short_query}' after filtering")
-                continue
-            image = random.choice(valid_images)
-            image_url = image["webformatURL"]
-            image_source = "Pixabay"
-            uploader = image.get("user", "Unknown")
-            pixabay_url = image["pageURL"]
-            logging.info(f"Fetched image URL: {image_url} by {uploader} for query '{short_query}'")
-            print(f"DEBUG: Image selected for query '{short_query}': {image_url}")
-            return image_url, image_source, uploader, pixabay_url
-        except requests.exceptions.RequestException as e:
-            logging.error(f"Image fetch failed for query '{short_query}': {e}")
-            continue
-    logging.error(f"All Pixabay image queries failed: {queries}")
-    return None, None, None, None
+    global last_flickr_request_time, flickr_request_count
+
+    reset_flickr_request_count()
+    flickr_request_count += 1
+    logging.info(f"Flickr request count: {flickr_request_count}/3600")
+
+    # Enforce a minimum delay of 1 second between Flickr requests
+    current_time = time.time()
+    time_since_last_request = current_time - last_flickr_request_time
+    if time_since_last_request < 1:
+        time.sleep(1 - time_since_last_request)
+    last_flickr_request_time = time.time()
+
+    try:
+        # Try Flickr API first
+        photos = flickr_api.Photo.search(
+            text=search_query,
+            per_page=10,
+            sort='relevance',
+            safe_search=1,
+            media='photos',
+            license='4,5,9,10'  # Commercial use licenses
+        )
+        headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
+        for photo in photos:
+            # Fetch photo metadata (tags and title)
+            tags = [tag.text.lower() for tag in photo.getTags()]
+            title = photo.title.lower() if photo.title else ""
+            # Filter out images with unwanted keywords in tags or title
+            matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
+            if matched_keywords:
+                logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
+                continue
+            img_url = photo.getPhotoFile(size_label='Medium')
+            if not img_url:
+                continue
+            if img_url in used_images:
+                continue
+            # Download the image and run OCR to check for excessive text
+            temp_file = None
+            try:
+                img_response = requests.get(img_url, headers=headers, timeout=10)
+                img_response.raise_for_status()
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
+                    temp_file.write(img_response.content)
+                    temp_path = temp_file.name
+                img = Image.open(temp_path)
+                text = pytesseract.image_to_string(img)
+                char_count = len(text.strip())
+                logging.info(f"OCR processed {img_url}: {char_count} characters detected")
+                if char_count > 200:
+                    logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
+                    continue
+                uploader = photo.owner.username
+                page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
+                # Save Flickr image metadata
+                flickr_data = {
+                    "title": search_query,
+                    "image_url": img_url,
+                    "source": "Flickr",
+                    "uploader": uploader,
+                    "page_url": page_url,
+                    "timestamp": datetime.now(timezone.utc).isoformat(),
+                    "ocr_chars": char_count
+                }
+                flickr_file = "/home/shane/foodie_automator/flickr_images.json"
+                with open(flickr_file, 'a') as f:
+                    json.dump(flickr_data, f)
+                    f.write('\n')
+                logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
+                logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
+                return img_url, "Flickr", uploader, page_url
+            except requests.exceptions.HTTPError as e:
+                if e.response.status_code == 429:
+                    logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
+                    return None, None, None, None
+                else:
+                    logging.warning(f"Download failed for {img_url}: {e}")
+                    continue
+            except Exception as e:
+                logging.warning(f"OCR processing failed for {img_url}: {e}")
+                continue
+            finally:
+                if temp_file and os.path.exists(temp_path):
+                    os.unlink(temp_path)
+        logging.warning(f"No valid Flickr image found in fallback for query '{search_query}'. Trying Pixabay.")
+    except Exception as e:
+        logging.warning(f"Fallback Flickr API error for query '{search_query}': {e}. Falling back to Pixabay.")
+
+    # Fallback to Pixabay
+    try:
+        pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(search_query)}&image_type=photo&per_page=10"
+        response = requests.get(pixabay_url, timeout=10)
+        response.raise_for_status()
+        data = response.json()
+        for hit in data.get('hits', []):
+            img_url = hit.get('webformatURL')
+            if not img_url or img_url in used_images:
+                continue
+            uploader = hit.get('user', 'Unknown')
+            page_url = hit.get('pageURL', img_url)
+            logging.debug(f"Image selected for query '{search_query}': {img_url}")
+            return img_url, "Pixabay", uploader, page_url
+        logging.warning(f"No valid Pixabay image found for query '{search_query}'.")
+        return None, None, None, None
+    except Exception as e:
+        logging.error(f"Pixabay image fetch failed for query '{search_query}': {e}")
+        return None, None, None, None

 def generate_image_query(content):
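Note that callers keep the same four-value contract; only the provider order changes (Flickr first, then Pixabay inside get_image() itself). A minimal usage sketch; the query string is made up:

    # Hypothetical query; get_image() returns (url, source, uploader, page_url)
    image_url, image_source, uploader, page_url = get_image("sourdough starter")
    if image_url:
        print(f"{image_source}: {image_url} by {uploader} ({page_url})")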
@@ -781,71 +860,80 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
         print(f"WP Error: {e}")
         return None, None

-def get_flickr_image_via_ddg(search_query, relevance_keywords):
-    try:
-        with DDGS() as ddgs:
-            results = ddgs.images(
-                f"{search_query} flickr site:flickr.com -poster -infographic -chart -graph -data -stats -text -typography",
-                license_image="sharecommercially",
-                max_results=30
-            )
-        if not results:
-            logging.warning(f"No Flickr images found via DDG for query '{search_query}'")
-            return None, None, None, None
-        headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
-        candidates = []
-        for r in results:
-            image_url = r.get("image", "")
-            page_url = r.get("url", "")
-            if not image_url or "live.staticflickr.com" not in image_url:
-                continue
-            try:
-                response = requests.get(page_url, headers=headers, timeout=10)
-                response.raise_for_status()
-                soup = BeautifulSoup(response.content, 'html.parser')
-                time.sleep(1)
-                tags_elem = soup.find_all('a', class_='tag')
-                tags = [tag.text.strip().lower() for tag in tags_elem] if tags_elem else []
-                title_elem = soup.find('h1', class_='photo-title')
-                title = title_elem.text.strip().lower() if title_elem else r.get("title", "").lower()
-                exclude_keywords = [
-                    "poster", "infographic", "chart", "graph", "data", "stats", "text", "typography",
-                    "design", "advertisement", "illustration", "diagram", "layout", "print"
-                ]
-                matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
-                if matched_keywords:
-                    logging.info(f"Skipping text-heavy image: {image_url} (tags: {tags}, title: {title}, matched: {matched_keywords})")
-                    continue
-                uploader = soup.find('a', class_='owner-name')
-                uploader = uploader.text.strip() if uploader else "Flickr User"
-                candidates.append({
-                    "image_url": image_url,
-                    "page_url": page_url,
-                    "uploader": uploader,
-                    "tags": tags,
-                    "title": title
-                })
-            except requests.exceptions.RequestException as e:
-                logging.info(f"Skipping unavailable image: {image_url} (page: {page_url}, error: {e})")
-                continue
-        if not candidates:
-            logging.warning(f"No valid candidate images after filtering for '{search_query}'")
-            return None, None, None, None
-        result = random.choice(candidates)
-        image_url = result["image_url"]
-        temp_file = None
-        try:
-            img_response = requests.get(image_url, headers=headers, timeout=10)
-            img_response.raise_for_status()
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
-                temp_file.write(img_response.content)
+# Configure Flickr API with credentials
+flickr_api.set_keys(api_key=FLICKR_API_KEY, api_secret=FLICKR_API_SECRET)
+logging.info(f"Flickr API configured with key: {FLICKR_API_KEY[:4]}... and secret: {FLICKR_API_SECRET[:4]}...")
+
+# Global variable to track the last Flickr request time
+last_flickr_request_time = 0
+
+# Flickr request counter
+flickr_request_count = 0
+flickr_request_start_time = time.time()
+
+# Define exclude keywords for filtering unwanted image types
+exclude_keywords = [
+    "poster", "infographic", "chart", "graph", "data", "stats", "text", "typography",
+    "design", "advertisement", "illustration", "diagram", "layout", "print"
+]
+
+def reset_flickr_request_count():
+    global flickr_request_count, flickr_request_start_time
+    if time.time() - flickr_request_start_time >= 3600:  # Reset every hour
+        flickr_request_count = 0
+        flickr_request_start_time = time.time()
+
+def get_flickr_image(search_query, relevance_keywords):
+    global last_flickr_request_time, flickr_request_count
+    reset_flickr_request_count()
+    flickr_request_count += 1
+    logging.info(f"Flickr request count: {flickr_request_count}/3600")
+    # Enforce a minimum delay of 1 second between Flickr requests
+    current_time = time.time()
+    time_since_last_request = current_time - last_flickr_request_time
+    if time_since_last_request < 1:
+        time.sleep(1 - time_since_last_request)
+    last_flickr_request_time = time.time()
+    try:
+        # Search for photos on Flickr using the API
+        photos = flickr_api.Photo.search(
+            text=search_query,
+            per_page=10,
+            sort='relevance',
+            safe_search=1,
+            media='photos',
+            license='4,5,9,10'  # Commercial use licenses (CC BY, CC BY-SA, etc.)
+        )
+        headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
+        for photo in photos:
+            # Fetch photo metadata (tags and title)
+            tags = [tag.text.lower() for tag in photo.getTags()]
+            title = photo.title.lower() if photo.title else ""
+            # Filter out images with unwanted keywords in tags or title
+            matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
+            if matched_keywords:
+                logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
+                continue
+            img_url = photo.getPhotoFile(size_label='Large')
+            if not img_url:
+                img_url = photo.getPhotoFile(size_label='Medium')
+            if not img_url:
+                continue
+            if img_url in used_images:
+                continue
+            # Download the image and run OCR to check for excessive text
+            temp_file = None
+            try:
+                img_response = requests.get(img_url, headers=headers, timeout=10)
+                img_response.raise_for_status()
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
+                    temp_file.write(img_response.content)
@@ -854,46 +942,53 @@ def get_flickr_image_via_ddg(search_query, relevance_keywords):
-            img = Image.open(temp_path)
-            text = pytesseract.image_to_string(img)
-            char_count = len(text.strip())
-            logging.info(f"OCR processed {image_url}: {char_count} characters detected")
-            if char_count > 200:
-                logging.info(f"Skipping text-heavy image (OCR): {image_url} (char_count: {char_count})")
-                return None, None, None, None
-            flickr_data = {
-                "title": search_query,
-                "image_url": image_url,
-                "source": "Flickr",
-                "uploader": result["uploader"],
-                "page_url": result["page_url"],
-                "timestamp": datetime.now().isoformat(),
-                "ocr_chars": char_count
-            }
-            flickr_file = "/home/shane/foodie_automator/flickr_images.json"
-            with open(flickr_file, 'a') as f:
-                json.dump(flickr_data, f)
-                f.write('\n')
-            logging.info(f"Saved Flickr image to {flickr_file}: {image_url}")
-            logging.info(f"Fetched Flickr image URL: {image_url} by {result['uploader']} for query '{search_query}' (tags: {result['tags']})")
-            print(f"DEBUG: Flickr image selected: {image_url}")
-            return image_url, "Flickr", result["uploader"], result["page_url"]
-        except requests.exceptions.HTTPError as e:
-            if e.response.status_code == 429:
-                logging.warning(f"Rate limit hit for {image_url}. Falling back to Pixabay.")
-                return None, None, None, None
-            else:
-                logging.warning(f"Download failed for {image_url}: {e}")
-                return None, None, None, None
-        except Exception as e:
-            logging.warning(f"OCR processing failed for {image_url}: {e}")
-            return None, None, None, None
-        finally:
-            if temp_file and os.path.exists(temp_path):
-                os.unlink(temp_path)
-    except Exception as e:
-        logging.error(f"Flickr/DDG image fetch failed for '{search_query}': {e}")
-        return None, None, None, None
+                img = Image.open(temp_path)
+                text = pytesseract.image_to_string(img)
+                char_count = len(text.strip())
+                logging.info(f"OCR processed {img_url}: {char_count} characters detected")
+                if char_count > 200:
+                    logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
+                    continue
+                uploader = photo.owner.username
+                page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
+                # Save Flickr image metadata
+                flickr_data = {
+                    "title": search_query,
+                    "image_url": img_url,
+                    "source": "Flickr",
+                    "uploader": uploader,
+                    "page_url": page_url,
+                    "timestamp": datetime.now(timezone.utc).isoformat(),
+                    "ocr_chars": char_count
+                }
+                flickr_file = "/home/shane/foodie_automator/flickr_images.json"
+                with open(flickr_file, 'a') as f:
+                    json.dump(flickr_data, f)
+                    f.write('\n')
+                logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
+                logging.info(f"Fetched Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
+                return img_url, "Flickr", uploader, page_url
+            except requests.exceptions.HTTPError as e:
+                if e.response.status_code == 429:
+                    logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
+                    return None, None, None, None
+                else:
+                    logging.warning(f"Download failed for {img_url}: {e}")
+                    continue
+            except Exception as e:
+                logging.warning(f"OCR processing failed for {img_url}: {e}")
+                continue
+            finally:
+                if temp_file and os.path.exists(temp_path):
+                    os.unlink(temp_path)
+        logging.warning(f"No valid Flickr image found for query '{search_query}'.")
+        return None, None, None, None
+    except Exception as e:
+        logging.warning(f"Flickr API error for query '{search_query}': {e}. Falling back to Pixabay.")
+        return None, None, None, None

 def select_best_author(summary):
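For anyone verifying the new dependency in isolation, this is roughly the flickr_api flow the change relies on; a standalone sketch with placeholder credentials and a made-up query, using only calls that appear in the diff above:

    import flickr_api

    # Placeholder keys; the real values come from .env via foodie_config
    flickr_api.set_keys(api_key="xxxx", api_secret="yyyy")

    # Same parameters as get_flickr_image(); license '4,5,9,10' restricts
    # results to commercial-use Creative Commons licenses
    photos = flickr_api.Photo.search(text="ramen", per_page=10, sort='relevance',
                                     safe_search=1, media='photos', license='4,5,9,10')
    for photo in photos:
        print(photo.title, photo.getPhotoFile(size_label='Medium'))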

requirements.txt
@@ -10,3 +10,4 @@ feedparser==6.0.11
 webdriver-manager==4.0.2
 tweepy==4.14.0
 python-dotenv==1.0.1
+flickr-api==0.7.1
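After checking out my-fix-branch, reinstalling from the pinned requirements should pick up the new package:

    pip install -r requirements.txt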