diff --git a/foodie_automator_google.py b/foodie_automator_google.py
index a70fa93..1a29d48 100644
--- a/foodie_automator_google.py
+++ b/foodie_automator_google.py
@@ -27,7 +27,7 @@ from foodie_utils import (
     upload_image_to_wp, select_best_persona, determine_paragraph_count,
     is_interesting, generate_title_from_summary, summarize_with_gpt4o,
     generate_category_from_summary, post_to_wp, prepare_post_data,
-    smart_image_and_filter, insert_link_naturally, get_flickr_image_via_ddg
+    smart_image_and_filter, insert_link_naturally, get_flickr_image  # Updated function name
 )
 from foodie_hooks import get_dynamic_hook, select_best_cta
 from dotenv import load_dotenv
@@ -259,7 +259,7 @@ def curate_from_google_trends(geo_list=['US']):
             continue
 
         # Fetch image
-        image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
+        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
         if not image_url:
             image_url, image_source, uploader, page_url = get_image(image_query)
 
diff --git a/foodie_automator_reddit.py b/foodie_automator_reddit.py
index 396f6b8..3845ab2 100644
--- a/foodie_automator_reddit.py
+++ b/foodie_automator_reddit.py
@@ -14,6 +14,7 @@ from urllib.parse import quote
 from requests.packages.urllib3.util.retry import Retry
 from requests.adapters import HTTPAdapter
 import praw
+from dotenv import load_dotenv
 from foodie_config import (
     AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
     PERSONA_CONFIGS, CATEGORIES, CTAS, get_clean_source_name,
@@ -25,10 +26,12 @@ from foodie_utils import (
     upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
     summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
     prepare_post_data, select_best_author, smart_image_and_filter,
-    get_flickr_image_via_ddg
+    get_flickr_image  # Updated function name
 )
 from foodie_hooks import get_dynamic_hook, select_best_cta
 
+load_dotenv()
+
 # Flag to indicate if we're in the middle of posting
 is_posting = False
 
@@ -294,7 +297,8 @@ def curate_from_reddit():
             attempts += 1
             continue
 
-        image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
+        # Fetch image
+        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
         if not image_url:
             image_url, image_source, uploader, page_url = get_image(image_query)
 
diff --git a/foodie_automator_rss.py b/foodie_automator_rss.py
index 61537a3..7862e5b 100644
--- a/foodie_automator_rss.py
+++ b/foodie_automator_rss.py
@@ -24,7 +24,7 @@ from foodie_utils import (
     upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
     is_interesting, generate_title_from_summary, summarize_with_gpt4o,
     generate_category_from_summary, post_to_wp, prepare_post_data,
-    select_best_author, smart_image_and_filter, get_flickr_image_via_ddg
+    select_best_author, smart_image_and_filter, get_flickr_image  # Updated function name
 )
 from foodie_hooks import get_dynamic_hook, select_best_cta
 from dotenv import load_dotenv
@@ -247,7 +247,7 @@ def curate_from_rss():
             continue
 
         # Fetch image
-        image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
+        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
         if not image_url:
             image_url, image_source, uploader, page_url = get_image(image_query)
 
diff --git a/foodie_config.py b/foodie_config.py
index 86e3d0a..6be0e1d 100644
--- a/foodie_config.py
+++ b/foodie_config.py
@@ -6,6 +6,8 @@
 import os
 load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY")
+FLICKR_API_KEY = os.getenv("FLICKR_API_KEY")
+FLICKR_API_SECRET = os.getenv("FLICKR_API_SECRET")
 AUTHORS = [
     {
diff --git a/foodie_utils.py b/foodie_utils.py
index 8ac38fd..22c1f60 100644
--- a/foodie_utils.py
+++ b/foodie_utils.py
@@ -15,14 +15,15 @@ from dotenv import load_dotenv
 from datetime import datetime, timezone, timedelta
 from openai import OpenAI
 from urllib.parse import quote
-from duckduckgo_search import DDGS
 from bs4 import BeautifulSoup
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 import tweepy
+import flickr_api
 from foodie_config import (
     RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS,
-    get_clean_source_name, AUTHORS, LIGHT_TASK_MODEL, SUMMARY_MODEL, X_API_CREDENTIALS
+    get_clean_source_name, AUTHORS, LIGHT_TASK_MODEL, SUMMARY_MODEL, X_API_CREDENTIALS,
+    FLICKR_API_KEY, FLICKR_API_SECRET, PIXABAY_API_KEY
 )
 
 load_dotenv()
@@ -212,53 +213,131 @@ def select_best_persona(interest_score, content=""):
     return random.choice(personas)
 
 def get_image(search_query):
-    api_key = "14836528-999c19a033d77d463113b1fb8"
-    base_url = "https://pixabay.com/api/"
-    queries = [search_query.split()[:2], search_query.split()]
-
-    for query in queries:
-        short_query = " ".join(query)
-        params = {
-            "key": api_key,
-            "q": short_query,
-            "image_type": "photo",
-            "safesearch": True,
-            "per_page": 20
-        }
-        try:
-            logging.info(f"Fetching Pixabay image for query '{short_query}'")
-            response = requests.get(base_url, params=params, timeout=10)
-            response.raise_for_status()
-            data = response.json()
+    global last_flickr_request_time, flickr_request_count
+
+    reset_flickr_request_count()
+    flickr_request_count += 1
+    logging.info(f"Flickr request count: {flickr_request_count}/3600")
+
+    # Enforce a minimum delay of 1 second between Flickr requests
+    current_time = time.time()
+    time_since_last_request = current_time - last_flickr_request_time
+    if time_since_last_request < 1:
+        time.sleep(1 - time_since_last_request)
+
+    last_flickr_request_time = time.time()
+
+    try:
+        # Try Flickr API first
+        photos = flickr_api.Photo.search(
+            text=search_query,
+            per_page=10,
+            sort='relevance',
+            safe_search=1,
+            media='photos',
+            license='4,5,9,10'  # Commercial use licenses
+        )
+
+        headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
+
+        for photo in photos:
+            # Fetch photo metadata (tags and title)
+            tags = [tag.text.lower() for tag in photo.getTags()]
+            title = photo.title.lower() if photo.title else ""
 
-            if not data.get("hits"):
-                logging.warning(f"No image hits for query '{short_query}'")
+            # Filter out images with unwanted keywords in tags or title
+            matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
+            if matched_keywords:
+                logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
                 continue
 
-            valid_images = [
-                hit for hit in data["hits"]
-                if all(tag not in hit.get("tags", "").lower() for tag in ["dog", "cat", "family", "child", "baby"])
-            ]
-
-            if not valid_images:
-                logging.warning(f"No valid images for query '{short_query}' after filtering")
+            img_url = photo.getPhotoFile(size_label='Medium')
+            if not img_url:
+                continue
+            if img_url in used_images:
                 continue
 
-            image = random.choice(valid_images)
-            image_url = image["webformatURL"]
-            image_source = "Pixabay"
-            uploader = image.get("user", "Unknown")
-            pixabay_url = image["pageURL"]
-
-            logging.info(f"Fetched image URL: {image_url} by {uploader} for query '{short_query}'")
-            print(f"DEBUG: Image selected for query '{short_query}': {image_url}")
-            return image_url, image_source, uploader, pixabay_url
-        except requests.exceptions.RequestException as e:
-            logging.error(f"Image fetch failed for query '{short_query}': {e}")
-            continue
-
-    logging.error(f"All Pixabay image queries failed: {queries}")
-    return None, None, None, None
+            # Download the image and run OCR to check for excessive text
+            temp_file = None
+            try:
+                img_response = requests.get(img_url, headers=headers, timeout=10)
+                img_response.raise_for_status()
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
+                    temp_file.write(img_response.content)
+                    temp_path = temp_file.name
+
+                img = Image.open(temp_path)
+                text = pytesseract.image_to_string(img)
+                char_count = len(text.strip())
+                logging.info(f"OCR processed {img_url}: {char_count} characters detected")
+
+                if char_count > 200:
+                    logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
+                    continue
+
+                uploader = photo.owner.username
+                page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
+
+                # Save Flickr image metadata
+                flickr_data = {
+                    "title": search_query,
+                    "image_url": img_url,
+                    "source": "Flickr",
+                    "uploader": uploader,
+                    "page_url": page_url,
+                    "timestamp": datetime.now(timezone.utc).isoformat(),
+                    "ocr_chars": char_count
+                }
+                flickr_file = "/home/shane/foodie_automator/flickr_images.json"
+                with open(flickr_file, 'a') as f:
+                    json.dump(flickr_data, f)
+                    f.write('\n')
+                logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
+
+                logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
+                return img_url, "Flickr", uploader, page_url
+
+            except requests.exceptions.HTTPError as e:
+                if e.response.status_code == 429:
+                    logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
+                    break  # exit the Flickr loop so the Pixabay fallback below still runs
+                else:
+                    logging.warning(f"Download failed for {img_url}: {e}")
+                    continue
+            except Exception as e:
+                logging.warning(f"OCR processing failed for {img_url}: {e}")
+                continue
+            finally:
+                if temp_file and os.path.exists(temp_path):
+                    os.unlink(temp_path)
+
+        logging.warning(f"No valid Flickr image found in fallback for query '{search_query}'. Trying Pixabay.")
+
+    except Exception as e:
+        logging.warning(f"Fallback Flickr API error for query '{search_query}': {e}. Falling back to Pixabay.")
Falling back to Pixabay.") + + # Fallback to Pixabay + try: + pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(search_query)}&image_type=photo&per_page=10" + response = requests.get(pixabay_url, timeout=10) + response.raise_for_status() + data = response.json() + + for hit in data.get('hits', []): + img_url = hit.get('webformatURL') + if not img_url or img_url in used_images: + continue + uploader = hit.get('user', 'Unknown') + page_url = hit.get('pageURL', img_url) + logging.debug(f"Image selected for query '{search_query}': {img_url}") + return img_url, "Pixabay", uploader, page_url + + logging.warning(f"No valid Pixabay image found for query '{search_query}'.") + return None, None, None, None + + except Exception as e: + logging.error(f"Pixabay image fetch failed for query '{search_query}': {e}") + return None, None, None, None def generate_image_query(content): try: @@ -781,119 +860,135 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im print(f"WP Error: {e}") return None, None -def get_flickr_image_via_ddg(search_query, relevance_keywords): - try: - with DDGS() as ddgs: - results = ddgs.images( - f"{search_query} flickr site:flickr.com -poster -infographic -chart -graph -data -stats -text -typography", - license_image="sharecommercially", - max_results=30 - ) - if not results: - logging.warning(f"No Flickr images found via DDG for query '{search_query}'") - return None, None, None, None +# Configure Flickr API with credentials +flickr_api.set_keys(api_key=FLICKR_API_KEY, api_secret=FLICKR_API_SECRET) +logging.info(f"Flickr API configured with key: {FLICKR_API_KEY[:4]}... and secret: {FLICKR_API_SECRET[:4]}...") - headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'} - candidates = [] +# Global variable to track the last Flickr request time +last_flickr_request_time = 0 - for r in results: - image_url = r.get("image", "") - page_url = r.get("url", "") - if not image_url or "live.staticflickr.com" not in image_url: - continue +# Flickr request counter +flickr_request_count = 0 +flickr_request_start_time = time.time() - try: - response = requests.get(page_url, headers=headers, timeout=10) - response.raise_for_status() - soup = BeautifulSoup(response.content, 'html.parser') - time.sleep(1) - - tags_elem = soup.find_all('a', class_='tag') - tags = [tag.text.strip().lower() for tag in tags_elem] if tags_elem else [] - title_elem = soup.find('h1', class_='photo-title') - title = title_elem.text.strip().lower() if title_elem else r.get("title", "").lower() - - exclude_keywords = [ - "poster", "infographic", "chart", "graph", "data", "stats", "text", "typography", - "design", "advertisement", "illustration", "diagram", "layout", "print" - ] - matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title] - if matched_keywords: - logging.info(f"Skipping text-heavy image: {image_url} (tags: {tags}, title: {title}, matched: {matched_keywords})") - continue +# Define exclude keywords for filtering unwanted image types +exclude_keywords = [ + "poster", "infographic", "chart", "graph", "data", "stats", "text", "typography", + "design", "advertisement", "illustration", "diagram", "layout", "print" +] - uploader = soup.find('a', class_='owner-name') - uploader = uploader.text.strip() if uploader else "Flickr User" - candidates.append({ - "image_url": image_url, - "page_url": page_url, - "uploader": uploader, - "tags": tags, - "title": title - }) +def reset_flickr_request_count(): 
+    global flickr_request_count, flickr_request_start_time
+    if time.time() - flickr_request_start_time >= 3600:  # Reset every hour
+        flickr_request_count = 0
+        flickr_request_start_time = time.time()
 
-            except requests.exceptions.RequestException as e:
-                logging.info(f"Skipping unavailable image: {image_url} (page: {page_url}, error: {e})")
+def get_flickr_image(search_query, relevance_keywords):
+    global last_flickr_request_time, flickr_request_count
+
+    reset_flickr_request_count()
+    flickr_request_count += 1
+    logging.info(f"Flickr request count: {flickr_request_count}/3600")
+
+    # Enforce a minimum delay of 1 second between Flickr requests
+    current_time = time.time()
+    time_since_last_request = current_time - last_flickr_request_time
+    if time_since_last_request < 1:
+        time.sleep(1 - time_since_last_request)
+
+    last_flickr_request_time = time.time()
+
+    try:
+        # Search for photos on Flickr using the API
+        photos = flickr_api.Photo.search(
+            text=search_query,
+            per_page=10,
+            sort='relevance',
+            safe_search=1,
+            media='photos',
+            license='4,5,9,10'  # Commercial use licenses (CC BY, CC BY-SA, etc.)
+        )
+
+        headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
+
+        for photo in photos:
+            # Fetch photo metadata (tags and title)
+            tags = [tag.text.lower() for tag in photo.getTags()]
+            title = photo.title.lower() if photo.title else ""
+
+            # Filter out images with unwanted keywords in tags or title
+            matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
+            if matched_keywords:
+                logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
                 continue
+
+            img_url = photo.getPhotoFile(size_label='Large')
+            if not img_url:
+                img_url = photo.getPhotoFile(size_label='Medium')
+            if not img_url:
+                continue
+            if img_url in used_images:
+                continue
+
+            # Download the image and run OCR to check for excessive text
+            temp_file = None
+            try:
+                img_response = requests.get(img_url, headers=headers, timeout=10)
+                img_response.raise_for_status()
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
+                    temp_file.write(img_response.content)
+                    temp_path = temp_file.name
 
-        if not candidates:
-            logging.warning(f"No valid candidate images after filtering for '{search_query}'")
-            return None, None, None, None
+                img = Image.open(temp_path)
+                text = pytesseract.image_to_string(img)
+                char_count = len(text.strip())
+                logging.info(f"OCR processed {img_url}: {char_count} characters detected")
 
-        result = random.choice(candidates)
-        image_url = result["image_url"]
+                if char_count > 200:
+                    logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
+                    continue
 
-        temp_file = None
-        try:
-            img_response = requests.get(image_url, headers=headers, timeout=10)
-            img_response.raise_for_status()
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
-                temp_file.write(img_response.content)
-                temp_path = temp_file.name
-
-            img = Image.open(temp_path)
-            text = pytesseract.image_to_string(img)
-            char_count = len(text.strip())
-            logging.info(f"OCR processed {image_url}: {char_count} characters detected")
-
-            if char_count > 200:
-                logging.info(f"Skipping text-heavy image (OCR): {image_url} (char_count: {char_count})")
-                return None, None, None, None
-
-            flickr_data = {
-                "title": search_query,
-                "image_url": image_url,
-                "source": "Flickr",
-                "uploader": result["uploader"],
-                "page_url": result["page_url"],
-                "timestamp": datetime.now().isoformat(),
-                "ocr_chars": char_count
-            }
-            flickr_file = "/home/shane/foodie_automator/flickr_images.json"
-            with open(flickr_file, 'a') as f:
-                json.dump(flickr_data, f)
-                f.write('\n')
-            logging.info(f"Saved Flickr image to {flickr_file}: {image_url}")
-            logging.info(f"Fetched Flickr image URL: {image_url} by {result['uploader']} for query '{search_query}' (tags: {result['tags']})")
-            print(f"DEBUG: Flickr image selected: {image_url}")
-            return image_url, "Flickr", result["uploader"], result["page_url"]
-
-        except requests.exceptions.HTTPError as e:
-            if e.response.status_code == 429:
-                logging.warning(f"Rate limit hit for {image_url}. Falling back to Pixabay.")
-                return None, None, None, None
-            else:
-                logging.warning(f"Download failed for {image_url}: {e}")
-                return None, None, None, None
-        except Exception as e:
-            logging.warning(f"OCR processing failed for {image_url}: {e}")
-            return None, None, None, None
-        finally:
-            if temp_file and os.path.exists(temp_path):
-                os.unlink(temp_path)
+                uploader = photo.owner.username
+                page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"
+
+                # Save Flickr image metadata
+                flickr_data = {
+                    "title": search_query,
+                    "image_url": img_url,
+                    "source": "Flickr",
+                    "uploader": uploader,
+                    "page_url": page_url,
+                    "timestamp": datetime.now(timezone.utc).isoformat(),
+                    "ocr_chars": char_count
+                }
+                flickr_file = "/home/shane/foodie_automator/flickr_images.json"
+                with open(flickr_file, 'a') as f:
+                    json.dump(flickr_data, f)
+                    f.write('\n')
+                logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
+
+                logging.info(f"Fetched Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
+                return img_url, "Flickr", uploader, page_url
+            except requests.exceptions.HTTPError as e:
+                if e.response.status_code == 429:
+                    logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
+                    return None, None, None, None
+                else:
+                    logging.warning(f"Download failed for {img_url}: {e}")
+                    continue
+            except Exception as e:
+                logging.warning(f"OCR processing failed for {img_url}: {e}")
+                continue
+            finally:
+                if temp_file and os.path.exists(temp_path):
+                    os.unlink(temp_path)
+
+        logging.warning(f"No valid Flickr image found for query '{search_query}'.")
+        return None, None, None, None
+
     except Exception as e:
-        logging.error(f"Flickr/DDG image fetch failed for '{search_query}': {e}")
+        logging.warning(f"Flickr API error for query '{search_query}': {e}. Falling back to Pixabay.")
         return None, None, None, None
 
 def select_best_author(summary):
diff --git a/requirements.txt b/requirements.txt
index 6930e91..b58c5c3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,5 @@ pytesseract==0.3.13
 feedparser==6.0.11
 webdriver-manager==4.0.2
 tweepy==4.14.0
-python-dotenv==1.0.1
\ No newline at end of file
+python-dotenv==1.0.1
+flickr-api==0.7.1
\ No newline at end of file