@@ -15,14 +15,15 @@ from dotenv import load_dotenv
 from datetime import datetime, timezone, timedelta
 from openai import OpenAI
 from urllib.parse import quote
-from duckduckgo_search import DDGS
 from bs4 import BeautifulSoup
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 import tweepy
+import flickr_api
 from foodie_config import (
     RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS,
-    get_clean_source_name, AUTHORS, LIGHT_TASK_MODEL, SUMMARY_MODEL, X_API_CREDENTIALS
+    get_clean_source_name, AUTHORS, LIGHT_TASK_MODEL, SUMMARY_MODEL, X_API_CREDENTIALS,
+    FLICKR_API_KEY, FLICKR_API_SECRET, PIXABAY_API_KEY
 )

 load_dotenv()
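The import hunk assumes foodie_config now exposes the Flickr and Pixabay credentials alongside the existing constants. A minimal sketch of what that credential block might look like (hypothetical: foodie_config.py is not part of this diff, and the variable names are taken only from the import list above):

    # foodie_config.py (sketch), assuming the keys live in .env
    import os
    from dotenv import load_dotenv

    load_dotenv()

    FLICKR_API_KEY = os.getenv("FLICKR_API_KEY", "")
    FLICKR_API_SECRET = os.getenv("FLICKR_API_SECRET", "")
    PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY", "")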
@ -212,53 +213,131 @@ def select_best_persona(interest_score, content=""): |
|
|
|
|
return random.choice(personas) |
|
|
|
|
|
|
|
|
|
def get_image(search_query): |
|
|
|
|
api_key = "14836528-999c19a033d77d463113b1fb8" |
|
|
|
|
base_url = "https://pixabay.com/api/" |
|
|
|
|
queries = [search_query.split()[:2], search_query.split()] |
|
|
|
|
|
|
|
|
|
for query in queries: |
|
|
|
|
short_query = " ".join(query) |
|
|
|
|
params = { |
|
|
|
|
"key": api_key, |
|
|
|
|
"q": short_query, |
|
|
|
|
"image_type": "photo", |
|
|
|
|
"safesearch": True, |
|
|
|
|
"per_page": 20 |
|
|
|
|
} |
|
|
|
|
try: |
|
|
|
|
logging.info(f"Fetching Pixabay image for query '{short_query}'") |
|
|
|
|
response = requests.get(base_url, params=params, timeout=10) |
|
|
|
|
response.raise_for_status() |
|
|
|
|
data = response.json() |
|
|
|
|
global last_flickr_request_time, flickr_request_count |
|
|
|
|
|
|
|
|
|
reset_flickr_request_count() |
|
|
|
|
flickr_request_count += 1 |
|
|
|
|
logging.info(f"Flickr request count: {flickr_request_count}/3600") |
|
|
|
|
|
|
|
|
|
# Enforce a minimum delay of 1 second between Flickr requests |
|
|
|
|
current_time = time.time() |
|
|
|
|
time_since_last_request = current_time - last_flickr_request_time |
|
|
|
|
if time_since_last_request < 1: |
|
|
|
|
time.sleep(1 - time_since_last_request) |
|
|
|
|
|
|
|
|
|
last_flickr_request_time = time.time() |
|
|
|
|
|
|
|
|
|
try: |
|
|
|
|
# Try Flickr API first |
|
|
|
|
photos = flickr_api.Photo.search( |
|
|
|
|
text=search_query, |
|
|
|
|
per_page=10, |
|
|
|
|
sort='relevance', |
|
|
|
|
safe_search=1, |
|
|
|
|
media='photos', |
|
|
|
|
license='4,5,9,10' # Commercial use licenses |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
if not data.get("hits"): |
|
|
|
|
logging.warning(f"No image hits for query '{short_query}'") |
|
|
|
|
headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'} |
|
|
|
|
|
|
|
|
|
for photo in photos: |
|
|
|
|
# Fetch photo metadata (tags and title) |
|
|
|
|
tags = [tag.text.lower() for tag in photo.getTags()] |
|
|
|
|
title = photo.title.lower() if photo.title else "" |
|
|
|
|
|
|
|
|
|
# Filter out images with unwanted keywords in tags or title |
|
|
|
|
matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title] |
|
|
|
|
if matched_keywords: |
|
|
|
|
logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})") |
|
|
|
|
continue |
|
|
|
|
|
|
|
|
|
img_url = photo.getPhotoFile(size_label='Medium') |
|
|
|
|
if not img_url: |
|
|
|
|
continue |
|
|
|
|
if img_url in used_images: |
|
|
|
|
continue |
|
|
|
|
|
|
|
|
|
valid_images = [ |
|
|
|
|
hit for hit in data["hits"] |
|
|
|
|
if all(tag not in hit.get("tags", "").lower() for tag in ["dog", "cat", "family", "child", "baby"]) |
|
|
|
|
] |
|
|
|
|
# Download the image and run OCR to check for excessive text |
|
|
|
|
temp_file = None |
|
|
|
|
try: |
|
|
|
|
img_response = requests.get(img_url, headers=headers, timeout=10) |
|
|
|
|
img_response.raise_for_status() |
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file: |
|
|
|
|
temp_file.write(img_response.content) |
|
|
|
|
temp_path = temp_file.name |
|
|
|
|
|
|
|
|
|
img = Image.open(temp_path) |
|
|
|
|
text = pytesseract.image_to_string(img) |
|
|
|
|
char_count = len(text.strip()) |
|
|
|
|
logging.info(f"OCR processed {img_url}: {char_count} characters detected") |
|
|
|
|
|
|
|
|
|
if char_count > 200: |
|
|
|
|
logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})") |
|
|
|
|
continue |
|
|
|
|
|
|
|
|
|
if not valid_images: |
|
|
|
|
logging.warning(f"No valid images for query '{short_query}' after filtering") |
|
|
|
|
uploader = photo.owner.username |
|
|
|
|
page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}" |
|
|
|
|
|
|
|
|
|
# Save Flickr image metadata |
|
|
|
|
flickr_data = { |
|
|
|
|
"title": search_query, |
|
|
|
|
"image_url": img_url, |
|
|
|
|
"source": "Flickr", |
|
|
|
|
"uploader": uploader, |
|
|
|
|
"page_url": page_url, |
|
|
|
|
"timestamp": datetime.now(timezone.utc).isoformat(), |
|
|
|
|
"ocr_chars": char_count |
|
|
|
|
} |
|
|
|
|
flickr_file = "/home/shane/foodie_automator/flickr_images.json" |
|
|
|
|
with open(flickr_file, 'a') as f: |
|
|
|
|
json.dump(flickr_data, f) |
|
|
|
|
f.write('\n') |
|
|
|
|
logging.info(f"Saved Flickr image to {flickr_file}: {img_url}") |
|
|
|
|
|
|
|
|
|
logging.info(f"Fallback Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})") |
|
|
|
|
return img_url, "Flickr", uploader, page_url |
|
|
|
|
|
|
|
|
|
except requests.exceptions.HTTPError as e: |
|
|
|
|
if e.response.status_code == 429: |
|
|
|
|
logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.") |
|
|
|
|
return None, None, None, None |
|
|
|
|
else: |
|
|
|
|
logging.warning(f"Download failed for {img_url}: {e}") |
|
|
|
|
continue |
|
|
|
|
except Exception as e: |
|
|
|
|
logging.warning(f"OCR processing failed for {img_url}: {e}") |
|
|
|
|
continue |
|
|
|
|
finally: |
|
|
|
|
if temp_file and os.path.exists(temp_path): |
|
|
|
|
os.unlink(temp_path) |
|
|
|
|
|
|
|
|
|
image = random.choice(valid_images) |
|
|
|
|
image_url = image["webformatURL"] |
|
|
|
|
image_source = "Pixabay" |
|
|
|
|
uploader = image.get("user", "Unknown") |
|
|
|
|
pixabay_url = image["pageURL"] |
|
|
|
|
logging.warning(f"No valid Flickr image found in fallback for query '{search_query}'. Trying Pixabay.") |
|
|
|
|
|
|
|
|
|
logging.info(f"Fetched image URL: {image_url} by {uploader} for query '{short_query}'") |
|
|
|
|
print(f"DEBUG: Image selected for query '{short_query}': {image_url}") |
|
|
|
|
return image_url, image_source, uploader, pixabay_url |
|
|
|
|
except requests.exceptions.RequestException as e: |
|
|
|
|
logging.error(f"Image fetch failed for query '{short_query}': {e}") |
|
|
|
|
continue |
|
|
|
|
except Exception as e: |
|
|
|
|
logging.warning(f"Fallback Flickr API error for query '{search_query}': {e}. Falling back to Pixabay.") |
|
|
|
|
|
|
|
|
|
logging.error(f"All Pixabay image queries failed: {queries}") |
|
|
|
|
return None, None, None, None |
|
|
|
|
# Fallback to Pixabay |
|
|
|
|
try: |
|
|
|
|
pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(search_query)}&image_type=photo&per_page=10" |
|
|
|
|
response = requests.get(pixabay_url, timeout=10) |
|
|
|
|
response.raise_for_status() |
|
|
|
|
data = response.json() |
|
|
|
|
|
|
|
|
|
for hit in data.get('hits', []): |
|
|
|
|
img_url = hit.get('webformatURL') |
|
|
|
|
if not img_url or img_url in used_images: |
|
|
|
|
continue |
|
|
|
|
uploader = hit.get('user', 'Unknown') |
|
|
|
|
page_url = hit.get('pageURL', img_url) |
|
|
|
|
logging.debug(f"Image selected for query '{search_query}': {img_url}") |
|
|
|
|
return img_url, "Pixabay", uploader, page_url |
|
|
|
|
|
|
|
|
|
logging.warning(f"No valid Pixabay image found for query '{search_query}'.") |
|
|
|
|
return None, None, None, None |
|
|
|
|
|
|
|
|
|
except Exception as e: |
|
|
|
|
logging.error(f"Pixabay image fetch failed for query '{search_query}': {e}") |
|
|
|
|
return None, None, None, None |
|
|
|
|
|
|
|
|
|
def generate_image_query(content): |
|
|
|
|
try: |
|
|
|
|
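The rewritten get_image() above and the get_flickr_image() variant in the next hunk share one return contract: a four-tuple of (image_url, source, uploader, page_url), or (None, None, None, None) when nothing usable is found. A hypothetical caller, shown only to illustrate how that tuple is meant to be unpacked (the query string is invented for illustration):

    # Sketch of a caller; assumes the module's logging setup
    img_url, source, uploader, page_url = get_image("rustic sourdough loaf")
    if img_url is None:
        logging.warning("No usable image found; skipping featured image")
    else:
        logging.info(f"Using {source} image by {uploader}: {img_url} ({page_url})")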
@@ -781,119 +860,135 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
         print(f"WP Error: {e}")
         return None, None

-def get_flickr_image_via_ddg(search_query, relevance_keywords):
+# Configure Flickr API with credentials
+flickr_api.set_keys(api_key=FLICKR_API_KEY, api_secret=FLICKR_API_SECRET)
+logging.info(f"Flickr API configured with key: {FLICKR_API_KEY[:4]}... and secret: {FLICKR_API_SECRET[:4]}...")
+
+# Global variable to track the last Flickr request time
+last_flickr_request_time = 0
+
+# Flickr request counter
+flickr_request_count = 0
+flickr_request_start_time = time.time()
+
+# Define exclude keywords for filtering unwanted image types
+exclude_keywords = [
+    "poster", "infographic", "chart", "graph", "data", "stats", "text", "typography",
+    "design", "advertisement", "illustration", "diagram", "layout", "print"
+]
+
+def reset_flickr_request_count():
+    global flickr_request_count, flickr_request_start_time
+    if time.time() - flickr_request_start_time >= 3600:  # Reset every hour
+        flickr_request_count = 0
+        flickr_request_start_time = time.time()
+
+def get_flickr_image(search_query, relevance_keywords):
+    global last_flickr_request_time, flickr_request_count
+
+    reset_flickr_request_count()
+    flickr_request_count += 1
+    logging.info(f"Flickr request count: {flickr_request_count}/3600")
+
+    # Enforce a minimum delay of 1 second between Flickr requests
+    current_time = time.time()
+    time_since_last_request = current_time - last_flickr_request_time
+    if time_since_last_request < 1:
+        time.sleep(1 - time_since_last_request)
+
+    last_flickr_request_time = time.time()

     try:
-        with DDGS() as ddgs:
-            results = ddgs.images(
-                f"{search_query} flickr site:flickr.com -poster -infographic -chart -graph -data -stats -text -typography",
-                license_image="sharecommercially",
-                max_results=30
-            )
-        if not results:
-            logging.warning(f"No Flickr images found via DDG for query '{search_query}'")
-            return None, None, None, None
+        # Search for photos on Flickr using the API
+        photos = flickr_api.Photo.search(
+            text=search_query,
+            per_page=10,
+            sort='relevance',
+            safe_search=1,
+            media='photos',
+            license='4,5,9,10'  # Commercial use licenses (CC BY, CC BY-SA, etc.)
+        )

         headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
-        candidates = []
-
-        for r in results:
-            image_url = r.get("image", "")
-            page_url = r.get("url", "")
-            if not image_url or "live.staticflickr.com" not in image_url:
+        for photo in photos:
+            # Fetch photo metadata (tags and title)
+            tags = [tag.text.lower() for tag in photo.getTags()]
+            title = photo.title.lower() if photo.title else ""
+
+            # Filter out images with unwanted keywords in tags or title
+            matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
+            if matched_keywords:
+                logging.info(f"Skipping image with unwanted keywords: {photo.id} (tags: {tags}, title: {title}, matched: {matched_keywords})")
                 continue

+            img_url = photo.getPhotoFile(size_label='Large')
+            if not img_url:
+                img_url = photo.getPhotoFile(size_label='Medium')
+            if not img_url:
+                continue
+            if img_url in used_images:
+                continue

             # Download the image and run OCR to check for excessive text
             temp_file = None
             try:
-                response = requests.get(page_url, headers=headers, timeout=10)
-                response.raise_for_status()
-                soup = BeautifulSoup(response.content, 'html.parser')
-                time.sleep(1)
-
-                tags_elem = soup.find_all('a', class_='tag')
-                tags = [tag.text.strip().lower() for tag in tags_elem] if tags_elem else []
-                title_elem = soup.find('h1', class_='photo-title')
-                title = title_elem.text.strip().lower() if title_elem else r.get("title", "").lower()
-
-                exclude_keywords = [
-                    "poster", "infographic", "chart", "graph", "data", "stats", "text", "typography",
-                    "design", "advertisement", "illustration", "diagram", "layout", "print"
-                ]
-                matched_keywords = [kw for kw in exclude_keywords if kw in tags or kw in title]
-                if matched_keywords:
-                    logging.info(f"Skipping text-heavy image: {image_url} (tags: {tags}, title: {title}, matched: {matched_keywords})")
+                img_response = requests.get(img_url, headers=headers, timeout=10)
+                img_response.raise_for_status()
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
+                    temp_file.write(img_response.content)
+                    temp_path = temp_file.name
+
+                img = Image.open(temp_path)
+                text = pytesseract.image_to_string(img)
+                char_count = len(text.strip())
+                logging.info(f"OCR processed {img_url}: {char_count} characters detected")
+
+                if char_count > 200:
+                    logging.info(f"Skipping text-heavy image (OCR): {img_url} (char_count: {char_count})")
                     continue

-                uploader = soup.find('a', class_='owner-name')
-                uploader = uploader.text.strip() if uploader else "Flickr User"
-                candidates.append({
-                    "image_url": image_url,
-                    "page_url": page_url,
-                    "uploader": uploader,
-                    "tags": tags,
-                    "title": title
-                })
+                uploader = photo.owner.username
+                page_url = f"https://www.flickr.com/photos/{photo.owner.nsid}/{photo.id}"

-            except requests.exceptions.RequestException as e:
-                logging.info(f"Skipping unavailable image: {image_url} (page: {page_url}, error: {e})")
+                # Save Flickr image metadata
+                flickr_data = {
+                    "title": search_query,
+                    "image_url": img_url,
+                    "source": "Flickr",
+                    "uploader": uploader,
+                    "page_url": page_url,
+                    "timestamp": datetime.now(timezone.utc).isoformat(),
+                    "ocr_chars": char_count
+                }
+                flickr_file = "/home/shane/foodie_automator/flickr_images.json"
+                with open(flickr_file, 'a') as f:
+                    json.dump(flickr_data, f)
+                    f.write('\n')
+                logging.info(f"Saved Flickr image to {flickr_file}: {img_url}")
+
+                logging.info(f"Fetched Flickr image: {img_url} by {uploader} for query '{search_query}' (tags: {tags})")
+                return img_url, "Flickr", uploader, page_url
+
+            except requests.exceptions.HTTPError as e:
+                if e.response.status_code == 429:
+                    logging.warning(f"Rate limit hit for {img_url}. Falling back to Pixabay.")
+                    return None, None, None, None
+                else:
+                    logging.warning(f"Download failed for {img_url}: {e}")
+                    continue
+            except Exception as e:
+                logging.warning(f"OCR processing failed for {img_url}: {e}")
+                continue
+            finally:
+                if temp_file and os.path.exists(temp_path):
+                    os.unlink(temp_path)

-        if not candidates:
-            logging.warning(f"No valid candidate images after filtering for '{search_query}'")
-            return None, None, None, None
-
-        result = random.choice(candidates)
-        image_url = result["image_url"]
-
-        temp_file = None
-        try:
-            img_response = requests.get(image_url, headers=headers, timeout=10)
-            img_response.raise_for_status()
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
-                temp_file.write(img_response.content)
-                temp_path = temp_file.name
-
-            img = Image.open(temp_path)
-            text = pytesseract.image_to_string(img)
-            char_count = len(text.strip())
-            logging.info(f"OCR processed {image_url}: {char_count} characters detected")
-
-            if char_count > 200:
-                logging.info(f"Skipping text-heavy image (OCR): {image_url} (char_count: {char_count})")
-                return None, None, None, None
-
-            flickr_data = {
-                "title": search_query,
-                "image_url": image_url,
-                "source": "Flickr",
-                "uploader": result["uploader"],
-                "page_url": result["page_url"],
-                "timestamp": datetime.now().isoformat(),
-                "ocr_chars": char_count
-            }
-            flickr_file = "/home/shane/foodie_automator/flickr_images.json"
-            with open(flickr_file, 'a') as f:
-                json.dump(flickr_data, f)
-                f.write('\n')
-            logging.info(f"Saved Flickr image to {flickr_file}: {image_url}")
-            logging.info(f"Fetched Flickr image URL: {image_url} by {result['uploader']} for query '{search_query}' (tags: {result['tags']})")
-            print(f"DEBUG: Flickr image selected: {image_url}")
-            return image_url, "Flickr", result["uploader"], result["page_url"]
-
-        except requests.exceptions.HTTPError as e:
-            if e.response.status_code == 429:
-                logging.warning(f"Rate limit hit for {image_url}. Falling back to Pixabay.")
-                return None, None, None, None
-            else:
-                logging.warning(f"Download failed for {image_url}: {e}")
-                return None, None, None, None
-        except Exception as e:
-            logging.warning(f"OCR processing failed for {image_url}: {e}")
-            return None, None, None, None
-        finally:
-            if temp_file and os.path.exists(temp_path):
-                os.unlink(temp_path)
+        logging.warning(f"No valid Flickr image found for query '{search_query}'.")
+        return None, None, None, None

     except Exception as e:
-        logging.error(f"Flickr/DDG image fetch failed for '{search_query}': {e}")
+        logging.warning(f"Flickr API error for query '{search_query}': {e}. Falling back to Pixabay.")
         return None, None, None, None

 def select_best_author(summary):
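The throttling added in this hunk combines two mechanisms: an hourly request counter (reset_flickr_request_count, sized against Flickr's documented limit of 3600 requests per hour) and a minimum one-second gap between consecutive calls. A standalone sketch of that pattern, separate from the module-level globals used above (illustrative only, not the module's actual implementation):

    import time

    class HourlyThrottle:
        """Hourly request counter plus a minimum gap between calls."""

        def __init__(self, max_per_hour=3600, min_interval=1.0):
            self.max_per_hour = max_per_hour
            self.min_interval = min_interval
            self.count = 0
            self.window_start = time.time()
            self.last_call = 0.0

        def wait(self):
            now = time.time()
            # Start a fresh counting window once an hour has elapsed
            if now - self.window_start >= 3600:
                self.count = 0
                self.window_start = now
            self.count += 1
            if self.count > self.max_per_hour:
                # Sleep until the current window ends, then reset it
                time.sleep(max(0.0, self.window_start + 3600 - now))
                self.count = 1
                self.window_start = time.time()
            # Enforce the minimum gap between consecutive calls
            elapsed = time.time() - self.last_call
            if elapsed < self.min_interval:
                time.sleep(self.min_interval - elapsed)
            self.last_call = time.time()

Calling throttle.wait() before each flickr_api request would reproduce the same behavior as the reset_flickr_request_count() plus sleep logic above, without module-level globals.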