|
|
|
@@ -44,12 +44,9 @@ IMAGE_UPLOAD_TIMEOUT = 30 # Added to fix NameError
 IMAGE_EXPIRATION_DAYS = 7 # 7 days, consistent with foodie_automator_rss.py
 
 
 def load_json_file(file_path, expiration_hours=None, default=None):
-    """
-    Load JSON file, optionally filtering expired entries and returning default if invalid.
-    """
     logger = logging.getLogger(__name__)
     if default is None:
-        default = [] # Default to list for posted_rss_titles.json and used_images.json
+        default = []
     if not os.path.exists(file_path):
         logger.info(f"File {file_path} does not exist. Returning default: {default}")
|
|
|
@@ -59,15 +56,34 @@ def load_json_file(file_path, expiration_hours=None, default=None):
     with open(file_path, 'r') as f:
         data = json.load(f)
+    if not isinstance(data, list):
+        logger.warning(f"Data in {file_path} is not a list, resetting to default")
+        return default
     if expiration_hours is not None:
-        cutoff = datetime.now(timezone.utc) - timedelta(hours=expiration_hours)
-        filtered_data = [
-            entry for entry in data
-            if datetime.fromisoformat(entry['timestamp']) > cutoff
-        ]
+        # Use days for used_images.json, hours for others
+        if "used_images" in file_path:
+            expiration_delta = timedelta(days=expiration_hours)
+        else:
+            expiration_delta = timedelta(hours=expiration_hours)
+        cutoff = datetime.now(timezone.utc) - expiration_delta
+        filtered_data = []
+        for entry in data:
+            if not isinstance(entry, dict) or "title" not in entry or "timestamp" not in entry:
+                logger.warning(f"Skipping malformed entry in {file_path}: {entry}")
+                continue
+            try:
+                timestamp = datetime.fromisoformat(entry["timestamp"])
+                if timestamp > cutoff:
+                    filtered_data.append(entry)
+            except ValueError as e:
+                logger.warning(f"Invalid timestamp in {file_path} entry {entry}: {e}")
+                continue
         if len(filtered_data) < len(data):
             logger.info(f"Filtered {len(data) - len(filtered_data)} expired entries from {file_path}")
-            save_json_file(file_path, filtered_data) # Save filtered data
+            save_json_file(file_path, filtered_data)
             data = filtered_data
     logger.info(f"Loaded {len(data)} valid entries from {file_path}")
|
|
|
@@ -254,64 +270,6 @@ def select_best_persona(interest_score, content=""):
         return random.choice(personas[2:])
     return random.choice(personas)
 
-
-def get_image(search_query):
-    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
-
-    # Try Pixabay with the original query
-    try:
-        pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(search_query)}&image_type=photo&per_page=10"
-        response = requests.get(pixabay_url, headers=headers, timeout=10)
-        response.raise_for_status()
-        data = response.json()
-
-        for hit in data.get('hits', []):
-            img_url = hit.get('webformatURL')
-            if not img_url or img_url in used_images:
-                continue
-            uploader = hit.get('user', 'Unknown')
-            page_url = hit.get('pageURL', img_url)
-
-            used_images.add(img_url)
-            save_used_images()
-
-            logging.info(f"Selected Pixabay image: {img_url} by {uploader} for query '{search_query}'")
-            return img_url, "Pixabay", uploader, page_url
-
-        logging.info(f"No valid Pixabay image found for query '{search_query}'. Trying fallback query.")
-
-    except Exception as e:
-        logging.warning(f"Pixabay image fetch failed for query '{search_query}': {e}")
-
-    # Fallback to a generic query
-    fallback_query = "food dining"
-    try:
-        pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(fallback_query)}&image_type=photo&per_page=10"
-        response = requests.get(pixabay_url, headers=headers, timeout=10)
-        response.raise_for_status()
-        data = response.json()
-
-        for hit in data.get('hits', []):
-            img_url = hit.get('webformatURL')
-            if not img_url or img_url in used_images:
-                continue
-            uploader = hit.get('user', 'Unknown')
-            page_url = hit.get('pageURL', img_url)
-
-            used_images.add(img_url)
-            save_used_images()
-
-            logging.info(f"Selected Pixabay fallback image: {img_url} by {uploader} for query '{fallback_query}'")
-            return img_url, "Pixabay", uploader, page_url
-
-        logging.warning(f"No valid Pixabay image found for fallback query '{fallback_query}'.")
-
-    except Exception as e:
-        logging.warning(f"Pixabay fallback image fetch failed for query '{fallback_query}': {e}")
-
-    # Ultimate fallback: return None but log clearly
-    logging.error(f"All image fetch attempts failed for query '{search_query}'. Returning None.")
-    return None, None, None, None
-
-
 def generate_image_query(title, summary):
     try:
         prompt = (
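
The single-argument get_image removed above appears to be superseded rather than dropped outright: a later hunk in this excerpt calls get_image(search_query, specific_term). A hypothetical signature for that replacement, inferred only from the call site (the real definition is not part of this excerpt):

    # Hypothetical: inferred from the two-argument call in get_flickr_image below.
    def get_image(search_query, specific_term=None):
        ...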
|
|
|
@@ -425,7 +383,7 @@ def smart_image_and_filter(title, summary):
     relevance_keywords = result["relevance"]
     main_topic = result.get("main_topic", extract_main_topic(title.lower() + " " + summary.lower()))
     skip_flag = (
-        result["aison"] == "SKIP" or
+        result["action"] == "SKIP" or # Fixed typo: "aison" → "action"
        "[homemade]" in title.lower() or
        "homemade" in title.lower() or
        "homemade" in summary.lower() or
|
|
|
@@ -1180,9 +1138,7 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
     logger = logging.getLogger(__name__)
 
     def process_image(image_url, source_name, page_url):
-        """Download image, check for text with OCR, validate resolution, exclude screenshots, watermarks, and YouTube images."""
         try:
-            # Check for YouTube images via URL or page URL
             youtube_domains = ['youtube.com', 'ytimg.com']
             if any(domain in image_url.lower() or domain in page_url.lower() for domain in youtube_domains):
                 logger.info(f"Skipping YouTube image: {image_url}")
|
|
|
@@ -1193,20 +1149,17 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
             response.raise_for_status()
             img = Image.open(io.BytesIO(response.content))
 
-            # Check image resolution
             width, height = img.size
             min_dimension = 1280
             if width < min_dimension and height < min_dimension:
                 logger.info(f"Skipping low-resolution image: {image_url} ({width}x{height})")
                 return None
 
-            # Attempt to detect screenshots via aspect ratio or naming
             aspect_ratio = width / height
             if (0.9 <= aspect_ratio <= 1.1) or "screenshot" in image_url.lower():
                 logger.info(f"Skipping potential screenshot: {image_url} (aspect ratio: {aspect_ratio})")
                 return None
 
-            # Check for watermarks in URL or page URL
             watermark_domains = [
                 'shutterstock.com', 'gettyimages.com', 'istockphoto.com', 'adobestock.com',
                 '123rf.com', 'dreamstime.com', 'alamy.com', 'stock.adobe.com'
|
|
|
@@ -1215,7 +1168,6 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
                 logger.info(f"Skipping image from stock photo site (potential watermark): {image_url}")
                 return None
 
-            # OCR to detect text and watermarks
             text = pytesseract.image_to_string(img).strip().lower()
             watermark_phrases = [
                 'shutterstock', 'getty images', 'istock', 'adobe stock', 'watermark',
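
The context above lays out process_image's screening order: YouTube-hosted images, low resolution, square-ish aspect ratios or "screenshot" URLs, known stock-photo domains, and finally OCR text checked against a list of watermark phrases. A condensed, self-contained restatement of just the dimension heuristics, using the thresholds visible in this diff (a sketch, not the shipped helper):

    from PIL import Image

    def passes_dimension_checks(img: Image.Image, image_url: str, min_dimension: int = 1280) -> bool:
        # Mirrors the resolution and screenshot heuristics shown above (illustrative only).
        width, height = img.size
        if width < min_dimension and height < min_dimension:
            return False  # too small on both axes
        aspect_ratio = width / height
        if 0.9 <= aspect_ratio <= 1.1 or "screenshot" in image_url.lower():
            return False  # likely a screenshot or thumbnail
        return True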
|
|
|
@@ -1243,12 +1195,13 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
             logger.warning(f"Failed to process image {image_url}: {e}")
             return None
 
-    # Step 1: Search DDG for public domain images
     ddg_query = f"{search_query} license:public domain"
     logger.info(f"Searching DDG with query: '{ddg_query}'")
     try:
         with DDGS() as ddgs:
             results = ddgs.images(ddg_query, safesearch="on", max_results=20)
+            prioritized_results = []
+            other_results = []
             for result in results:
                 image_url = result.get("image")
                 page_url = result.get("url")
|
|
|
@@ -1258,14 +1211,23 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
                     source_name = domain.rsplit('.', 1)[0].capitalize()
                 else:
                     source_name = "Public Domain"
-                if image_url and image_url.endswith(('.jpg', '.jpeg', '.png')):
+                if not image_url or not image_url.endswith(('.jpg', '.jpeg', '.png')):
+                    continue
+                image_metadata = f"{result.get('title', '').lower()} {page_url.lower()}"
+                if specific_term and specific_term.lower() in image_metadata:
+                    prioritized_results.append((image_url, source_name, page_url))
+                else:
+                    other_results.append((image_url, source_name, page_url))
+
+            for image_url, source_name, page_url in prioritized_results + other_results:
                 result = process_image(image_url, source_name, page_url)
                 if result:
                     return result
     except Exception as e:
         logger.warning(f"DDG search failed for '{ddg_query}': {e}")
 
-    # Step 2: Fallback to Pixabay with specific term
     logger.info(f"No valid DDG images, falling back to Pixabay for '{search_query}'")
     image_url, source_name, uploader, page_url = get_image(search_query, specific_term)
     if image_url:
|
|
|
|