fix better images
This commit is contained in:
+55
-22
@@ -70,6 +70,24 @@ MAX_RETRIES = 3
|
|||||||
RETRY_BACKOFF = 2
|
RETRY_BACKOFF = 2
|
||||||
|
|
||||||
def setup_logging():
|
def setup_logging():
|
||||||
|
try:
|
||||||
|
# Ensure log directory exists
|
||||||
|
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
|
||||||
|
logging.debug(f"Log directory created/verified: {os.path.dirname(LOG_FILE)}")
|
||||||
|
|
||||||
|
# Check write permissions
|
||||||
|
if not os.access(os.path.dirname(LOG_FILE), os.W_OK):
|
||||||
|
raise PermissionError(f"No write permission for {os.path.dirname(LOG_FILE)}")
|
||||||
|
|
||||||
|
# Test write to log file
|
||||||
|
try:
|
||||||
|
with open(LOG_FILE, 'a') as f:
|
||||||
|
f.write("")
|
||||||
|
logging.debug(f"Confirmed write access to {LOG_FILE}")
|
||||||
|
except Exception as e:
|
||||||
|
raise PermissionError(f"Cannot write to {LOG_FILE}: {e}")
|
||||||
|
|
||||||
|
# Prune old logs
|
||||||
if os.path.exists(LOG_FILE):
|
if os.path.exists(LOG_FILE):
|
||||||
with open(LOG_FILE, 'r') as f:
|
with open(LOG_FILE, 'r') as f:
|
||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
@@ -102,17 +120,35 @@ def setup_logging():
|
|||||||
|
|
||||||
with open(LOG_FILE, 'w') as f:
|
with open(LOG_FILE, 'w') as f:
|
||||||
f.writelines(pruned_entries)
|
f.writelines(pruned_entries)
|
||||||
|
logging.debug(f"Log file pruned: {LOG_FILE}")
|
||||||
|
|
||||||
logger = logging.getLogger()
|
# Configure logging
|
||||||
logger.setLevel(logging.INFO)
|
logging.basicConfig(
|
||||||
file_handler = logging.FileHandler(LOG_FILE, mode='a')
|
filename=LOG_FILE,
|
||||||
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
level=logging.INFO,
|
||||||
logger.addHandler(file_handler)
|
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||||
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
|
force=True # Ensure this config takes precedence
|
||||||
|
)
|
||||||
console_handler = logging.StreamHandler()
|
console_handler = logging.StreamHandler()
|
||||||
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
||||||
logger.addHandler(console_handler)
|
logging.getLogger().addHandler(console_handler)
|
||||||
logging.info("Logging initialized for foodie_automator_google.py")
|
logging.info("Logging initialized for foodie_automator_google.py")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
# Fallback to console logging if file logging fails
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||||
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
|
force=True
|
||||||
|
)
|
||||||
|
logging.error(f"Failed to setup file logging for {LOG_FILE}: {e}. Using console logging.")
|
||||||
|
console_handler = logging.StreamHandler()
|
||||||
|
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
||||||
|
logging.getLogger().addHandler(console_handler)
|
||||||
|
logging.info("Console logging initialized as fallback for foodie_automator_google.py")
|
||||||
|
|
||||||
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
||||||
|
|
||||||
def acquire_lock():
|
def acquire_lock():
|
||||||
@@ -253,11 +289,9 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
|
|||||||
try:
|
try:
|
||||||
logging.debug(f"Using {len(posted_titles)} posted titles and {len(used_images)} used images")
|
logging.debug(f"Using {len(posted_titles)} posted titles and {len(used_images)} used images")
|
||||||
|
|
||||||
# Define regions to scrape
|
|
||||||
regions = ['US', 'GB', 'AU']
|
regions = ['US', 'GB', 'AU']
|
||||||
all_trends = []
|
all_trends = []
|
||||||
|
|
||||||
# Scrape trends for each region
|
|
||||||
for geo in regions:
|
for geo in regions:
|
||||||
logging.info(f"Scraping Google Trends for geo={geo}")
|
logging.info(f"Scraping Google Trends for geo={geo}")
|
||||||
trends = scrape_google_trends(geo=geo)
|
trends = scrape_google_trends(geo=geo)
|
||||||
@@ -267,7 +301,6 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
|
|||||||
else:
|
else:
|
||||||
logging.warning(f"No trends collected for geo={geo}")
|
logging.warning(f"No trends collected for geo={geo}")
|
||||||
|
|
||||||
# Remove duplicates by title and sort by search volume
|
|
||||||
unique_trends = []
|
unique_trends = []
|
||||||
seen_titles = set()
|
seen_titles = set()
|
||||||
for trend in all_trends:
|
for trend in all_trends:
|
||||||
@@ -277,10 +310,9 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
|
|||||||
|
|
||||||
if not unique_trends:
|
if not unique_trends:
|
||||||
logging.info("No Google Trends data available across regions")
|
logging.info("No Google Trends data available across regions")
|
||||||
sleep_time = random.randint(1200, 1800) # 20–30 minutes
|
sleep_time = random.randint(1200, 1800)
|
||||||
return None, None, sleep_time
|
return None, None, sleep_time
|
||||||
|
|
||||||
# Sort trends by search volume in descending order
|
|
||||||
unique_trends.sort(key=lambda x: x["search_volume"], reverse=True)
|
unique_trends.sort(key=lambda x: x["search_volume"], reverse=True)
|
||||||
logging.info(f"Total unique trends collected: {len(unique_trends)}")
|
logging.info(f"Total unique trends collected: {len(unique_trends)}")
|
||||||
|
|
||||||
@@ -293,14 +325,13 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
|
|||||||
summary = trend.get("summary", "")
|
summary = trend.get("summary", "")
|
||||||
source_name = trend.get("source", "Google Trends")
|
source_name = trend.get("source", "Google Trends")
|
||||||
original_source = f'<a href="{link}">{source_name}</a>'
|
original_source = f'<a href="{link}">{source_name}</a>'
|
||||||
original_url = link # Store for fallback
|
original_url = link
|
||||||
|
|
||||||
if title in posted_titles:
|
if title in posted_titles:
|
||||||
logging.info(f"Skipping already posted trend: {title}")
|
logging.info(f"Skipping already posted trend: {title}")
|
||||||
attempts += 1
|
attempts += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check author availability before GPT calls
|
|
||||||
author = get_next_author_round_robin()
|
author = get_next_author_round_robin()
|
||||||
if not author:
|
if not author:
|
||||||
logging.info(f"Skipping trend '{title}' due to tweet rate limits for all authors")
|
logging.info(f"Skipping trend '{title}' due to tweet rate limits for all authors")
|
||||||
@@ -312,8 +343,12 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
|
|||||||
|
|
||||||
logging.info(f"Trying Google Trend: {title} from {source_name}")
|
logging.info(f"Trying Google Trend: {title} from {source_name}")
|
||||||
|
|
||||||
|
# Fetch DuckDuckGo context early to enhance smart_image_and_filter
|
||||||
|
ddg_context = fetch_duckduckgo_news_context(title)
|
||||||
|
enhanced_summary = summary + "\n\nAdditional Context: " + ddg_context if summary else ddg_context
|
||||||
|
|
||||||
try:
|
try:
|
||||||
image_query, relevance_keywords, main_topic, skip, specific_term = smart_image_and_filter(title, summary)
|
image_query, relevance_keywords, main_topic, skip, specific_term = smart_image_and_filter(title, enhanced_summary)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Failed to process smart_image_and_filter for '{title}': {e}")
|
logging.warning(f"Failed to process smart_image_and_filter for '{title}': {e}")
|
||||||
attempts += 1
|
attempts += 1
|
||||||
@@ -324,7 +359,6 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
|
|||||||
attempts += 1
|
attempts += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
ddg_context = fetch_duckduckgo_news_context(title)
|
|
||||||
scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
|
scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
|
||||||
interest_score = is_interesting(scoring_content)
|
interest_score = is_interesting(scoring_content)
|
||||||
logging.info(f"Interest score for '{title}': {interest_score}")
|
logging.info(f"Interest score for '{title}': {interest_score}")
|
||||||
@@ -405,11 +439,10 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
|
|||||||
)
|
)
|
||||||
if not post_id:
|
if not post_id:
|
||||||
logging.warning(f"Failed to post to WordPress for '{title}', using original URL: {original_url}")
|
logging.warning(f"Failed to post to WordPress for '{title}', using original URL: {original_url}")
|
||||||
post_url = original_url # Fallback to original trend URL
|
post_url = original_url
|
||||||
else:
|
else:
|
||||||
logging.info(f"Posted to WordPress for {author_username}: {post_url}")
|
logging.info(f"Posted to WordPress for {author_username}: {post_url}")
|
||||||
|
|
||||||
# Update post with actual post_url
|
|
||||||
post_url_encoded = quote(post_url)
|
post_url_encoded = quote(post_url)
|
||||||
share_links = share_links_template.format(post_url=post_url_encoded)
|
share_links = share_links_template.format(post_url=post_url_encoded)
|
||||||
post_data["content"] = f"{final_summary}\n\n{share_links}"
|
post_data["content"] = f"{final_summary}\n\n{share_links}"
|
||||||
@@ -420,7 +453,7 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
|
|||||||
category=category,
|
category=category,
|
||||||
link=link,
|
link=link,
|
||||||
author=author,
|
author=author,
|
||||||
image_url=None, # Skip image re-upload
|
image_url=None,
|
||||||
original_source=original_source,
|
original_source=original_source,
|
||||||
image_source=image_source,
|
image_source=image_source,
|
||||||
uploader=uploader,
|
uploader=uploader,
|
||||||
@@ -431,7 +464,7 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
|
|||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
|
logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
|
||||||
post_url = original_url # Fallback to original trend URL
|
post_url = original_url
|
||||||
finally:
|
finally:
|
||||||
is_posting = False
|
is_posting = False
|
||||||
|
|
||||||
@@ -446,15 +479,15 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
|
|||||||
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
|
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
|
||||||
|
|
||||||
logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id or 'N/A'}) from Google Trends *****")
|
logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id or 'N/A'}) from Google Trends *****")
|
||||||
sleep_time = random.randint(1200, 1800) # 20–30 minutes
|
sleep_time = random.randint(1200, 1800)
|
||||||
return post_data, category, sleep_time
|
return post_data, category, sleep_time
|
||||||
|
|
||||||
logging.info("No interesting Google Trend found after attempts")
|
logging.info("No interesting Google Trend found after attempts")
|
||||||
sleep_time = random.randint(1200, 1800) # 20–30 minutes
|
sleep_time = random.randint(1200, 1800)
|
||||||
return None, None, sleep_time
|
return None, None, sleep_time
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Unexpected error in curate_from_google_trends: {e}", exc_info=True)
|
logging.error(f"Unexpected error in curate_from_google_trends: {e}", exc_info=True)
|
||||||
sleep_time = random.randint(1200, 1800) # 20–30 minutes
|
sleep_time = random.randint(1200, 1800)
|
||||||
return None, None, sleep_time
|
return None, None, sleep_time
|
||||||
|
|
||||||
def run_google_trends_automator():
|
def run_google_trends_automator():
|
||||||
|
|||||||
+13
-10
@@ -346,7 +346,7 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
|
|||||||
posts = fetch_reddit_posts()
|
posts = fetch_reddit_posts()
|
||||||
if not posts:
|
if not posts:
|
||||||
logging.info("No Reddit posts available")
|
logging.info("No Reddit posts available")
|
||||||
sleep_time = random.randint(1200, 1800) # 20–30 minutes
|
sleep_time = random.randint(1200, 1800)
|
||||||
return None, None, sleep_time
|
return None, None, sleep_time
|
||||||
|
|
||||||
attempts = 0
|
attempts = 0
|
||||||
@@ -379,8 +379,13 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
|
|||||||
|
|
||||||
logging.info(f"Trying Reddit Post: {title} from {source_name}")
|
logging.info(f"Trying Reddit Post: {title} from {source_name}")
|
||||||
|
|
||||||
|
# Combine summary and top comments for smart_image_and_filter
|
||||||
|
enhanced_summary = summary
|
||||||
|
if top_comments:
|
||||||
|
enhanced_summary += "\n\nTop Comments:\n" + "\n".join(top_comments)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
image_query, relevance_keywords, main_topic, skip, specific_term = smart_image_and_filter(title, summary)
|
image_query, relevance_keywords, main_topic, skip, specific_term = smart_image_and_filter(title, enhanced_summary)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Failed to process smart_image_and_filter for '{title}': {e}")
|
logging.warning(f"Failed to process smart_image_and_filter for '{title}': {e}")
|
||||||
attempts += 1
|
attempts += 1
|
||||||
@@ -392,7 +397,6 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
ddg_context = fetch_duckduckgo_news_context(title)
|
ddg_context = fetch_duckduckgo_news_context(title)
|
||||||
# Log full scoring content for debugging
|
|
||||||
scoring_content = f"Title: {title}\n\nContent: {summary}\n\nTop Comments: {top_comments}\n\nAdditional Context: {ddg_context}"
|
scoring_content = f"Title: {title}\n\nContent: {summary}\n\nTop Comments: {top_comments}\n\nAdditional Context: {ddg_context}"
|
||||||
logging.debug(f"Scoring content for '{title}': {scoring_content}")
|
logging.debug(f"Scoring content for '{title}': {scoring_content}")
|
||||||
interest_score = is_interesting_reddit(title, summary, upvotes, comment_count, top_comments)
|
interest_score = is_interesting_reddit(title, summary, upvotes, comment_count, top_comments)
|
||||||
@@ -474,11 +478,10 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
|
|||||||
)
|
)
|
||||||
if not post_id:
|
if not post_id:
|
||||||
logging.warning(f"Failed to post to WordPress for '{title}', using original URL: {original_url}")
|
logging.warning(f"Failed to post to WordPress for '{title}', using original URL: {original_url}")
|
||||||
post_url = original_url # Fallback to original Reddit post URL
|
post_url = original_url
|
||||||
else:
|
else:
|
||||||
logging.info(f"Posted to WordPress for {author_username}: {post_url}")
|
logging.info(f"Posted to WordPress for {author_username}: {post_url}")
|
||||||
|
|
||||||
# Update post with actual post_url
|
|
||||||
post_url_encoded = quote(post_url)
|
post_url_encoded = quote(post_url)
|
||||||
share_links = share_links_template.format(post_url=post_url_encoded)
|
share_links = share_links_template.format(post_url=post_url_encoded)
|
||||||
post_data["content"] = f"{final_summary}\n\n{share_links}"
|
post_data["content"] = f"{final_summary}\n\n{share_links}"
|
||||||
@@ -489,7 +492,7 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
|
|||||||
category=category,
|
category=category,
|
||||||
link=link,
|
link=link,
|
||||||
author=author,
|
author=author,
|
||||||
image_url=None, # Skip image re-upload
|
image_url=None,
|
||||||
original_source=original_source,
|
original_source=original_source,
|
||||||
image_source=image_source,
|
image_source=image_source,
|
||||||
uploader=uploader,
|
uploader=uploader,
|
||||||
@@ -500,7 +503,7 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
|
|||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
|
logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
|
||||||
post_url = original_url # Fallback to original Reddit post URL
|
post_url = original_url
|
||||||
finally:
|
finally:
|
||||||
is_posting = False
|
is_posting = False
|
||||||
|
|
||||||
@@ -515,15 +518,15 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
|
|||||||
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
|
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
|
||||||
|
|
||||||
logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id or 'N/A'}) from Reddit *****")
|
logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id or 'N/A'}) from Reddit *****")
|
||||||
sleep_time = random.randint(1200, 1800) # 20–30 minutes
|
sleep_time = random.randint(1200, 1800)
|
||||||
return post_data, category, sleep_time
|
return post_data, category, sleep_time
|
||||||
|
|
||||||
logging.info("No interesting Reddit post found after attempts")
|
logging.info("No interesting Reddit post found after attempts")
|
||||||
sleep_time = random.randint(1200, 1800) # 20–30 minutes
|
sleep_time = random.randint(1200, 1800)
|
||||||
return None, None, sleep_time
|
return None, None, sleep_time
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Unexpected error in curate_from_reddit: {e}", exc_info=True)
|
logging.error(f"Unexpected error in curate_from_reddit: {e}", exc_info=True)
|
||||||
sleep_time = random.randint(1200, 1800) # 20–30 minutes
|
sleep_time = random.randint(1200, 1800)
|
||||||
return None, None, sleep_time
|
return None, None, sleep_time
|
||||||
|
|
||||||
def run_reddit_automator():
|
def run_reddit_automator():
|
||||||
|
|||||||
@@ -364,7 +364,6 @@ def curate_from_rss(posted_titles_data, posted_titles, used_images_data, used_im
|
|||||||
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
|
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
|
||||||
)
|
)
|
||||||
|
|
||||||
# Embed placeholder share links; update after getting post_url
|
|
||||||
post_data["content"] = f"{final_summary}\n\n{share_links_template.format(post_url='{post_url}', share_text=share_text_encoded)}"
|
post_data["content"] = f"{final_summary}\n\n{share_links_template.format(post_url='{post_url}', share_text=share_text_encoded)}"
|
||||||
|
|
||||||
global is_posting
|
global is_posting
|
||||||
@@ -390,7 +389,6 @@ def curate_from_rss(posted_titles_data, posted_titles, used_images_data, used_im
|
|||||||
else:
|
else:
|
||||||
logging.info(f"Posted to WordPress for {author_username}: {post_url}")
|
logging.info(f"Posted to WordPress for {author_username}: {post_url}")
|
||||||
|
|
||||||
# Update content with actual post_url
|
|
||||||
post_url_encoded = quote(post_url)
|
post_url_encoded = quote(post_url)
|
||||||
post_data["content"] = f"{final_summary}\n\n{share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)}"
|
post_data["content"] = f"{final_summary}\n\n{share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)}"
|
||||||
if post_id:
|
if post_id:
|
||||||
|
|||||||
+40
-78
@@ -44,12 +44,9 @@ IMAGE_UPLOAD_TIMEOUT = 30 # Added to fix NameError
|
|||||||
IMAGE_EXPIRATION_DAYS = 7 # 7 days, consistent with foodie_automator_rss.py
|
IMAGE_EXPIRATION_DAYS = 7 # 7 days, consistent with foodie_automator_rss.py
|
||||||
|
|
||||||
def load_json_file(file_path, expiration_hours=None, default=None):
|
def load_json_file(file_path, expiration_hours=None, default=None):
|
||||||
"""
|
|
||||||
Load JSON file, optionally filtering expired entries and returning default if invalid.
|
|
||||||
"""
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
if default is None:
|
if default is None:
|
||||||
default = [] # Default to list for posted_rss_titles.json and used_images.json
|
default = []
|
||||||
|
|
||||||
if not os.path.exists(file_path):
|
if not os.path.exists(file_path):
|
||||||
logger.info(f"File {file_path} does not exist. Returning default: {default}")
|
logger.info(f"File {file_path} does not exist. Returning default: {default}")
|
||||||
@@ -59,15 +56,34 @@ def load_json_file(file_path, expiration_hours=None, default=None):
|
|||||||
with open(file_path, 'r') as f:
|
with open(file_path, 'r') as f:
|
||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
|
|
||||||
|
if not isinstance(data, list):
|
||||||
|
logger.warning(f"Data in {file_path} is not a list, resetting to default")
|
||||||
|
return default
|
||||||
|
|
||||||
if expiration_hours is not None:
|
if expiration_hours is not None:
|
||||||
cutoff = datetime.now(timezone.utc) - timedelta(hours=expiration_hours)
|
# Use days for used_images.json, hours for others
|
||||||
filtered_data = [
|
if "used_images" in file_path:
|
||||||
entry for entry in data
|
expiration_delta = timedelta(days=expiration_hours)
|
||||||
if datetime.fromisoformat(entry['timestamp']) > cutoff
|
else:
|
||||||
]
|
expiration_delta = timedelta(hours=expiration_hours)
|
||||||
|
|
||||||
|
cutoff = datetime.now(timezone.utc) - expiration_delta
|
||||||
|
filtered_data = []
|
||||||
|
for entry in data:
|
||||||
|
if not isinstance(entry, dict) or "title" not in entry or "timestamp" not in entry:
|
||||||
|
logger.warning(f"Skipping malformed entry in {file_path}: {entry}")
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
timestamp = datetime.fromisoformat(entry["timestamp"])
|
||||||
|
if timestamp > cutoff:
|
||||||
|
filtered_data.append(entry)
|
||||||
|
except ValueError as e:
|
||||||
|
logger.warning(f"Invalid timestamp in {file_path} entry {entry}: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
if len(filtered_data) < len(data):
|
if len(filtered_data) < len(data):
|
||||||
logger.info(f"Filtered {len(data) - len(filtered_data)} expired entries from {file_path}")
|
logger.info(f"Filtered {len(data) - len(filtered_data)} expired entries from {file_path}")
|
||||||
save_json_file(file_path, filtered_data) # Save filtered data
|
save_json_file(file_path, filtered_data)
|
||||||
data = filtered_data
|
data = filtered_data
|
||||||
|
|
||||||
logger.info(f"Loaded {len(data)} valid entries from {file_path}")
|
logger.info(f"Loaded {len(data)} valid entries from {file_path}")
|
||||||
@@ -254,64 +270,6 @@ def select_best_persona(interest_score, content=""):
|
|||||||
return random.choice(personas[2:])
|
return random.choice(personas[2:])
|
||||||
return random.choice(personas)
|
return random.choice(personas)
|
||||||
|
|
||||||
def get_image(search_query):
|
|
||||||
headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
|
|
||||||
|
|
||||||
# Try Pixabay with the original query
|
|
||||||
try:
|
|
||||||
pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(search_query)}&image_type=photo&per_page=10"
|
|
||||||
response = requests.get(pixabay_url, headers=headers, timeout=10)
|
|
||||||
response.raise_for_status()
|
|
||||||
data = response.json()
|
|
||||||
|
|
||||||
for hit in data.get('hits', []):
|
|
||||||
img_url = hit.get('webformatURL')
|
|
||||||
if not img_url or img_url in used_images:
|
|
||||||
continue
|
|
||||||
uploader = hit.get('user', 'Unknown')
|
|
||||||
page_url = hit.get('pageURL', img_url)
|
|
||||||
|
|
||||||
used_images.add(img_url)
|
|
||||||
save_used_images()
|
|
||||||
|
|
||||||
logging.info(f"Selected Pixabay image: {img_url} by {uploader} for query '{search_query}'")
|
|
||||||
return img_url, "Pixabay", uploader, page_url
|
|
||||||
|
|
||||||
logging.info(f"No valid Pixabay image found for query '{search_query}'. Trying fallback query.")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logging.warning(f"Pixabay image fetch failed for query '{search_query}': {e}")
|
|
||||||
|
|
||||||
# Fallback to a generic query
|
|
||||||
fallback_query = "food dining"
|
|
||||||
try:
|
|
||||||
pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(fallback_query)}&image_type=photo&per_page=10"
|
|
||||||
response = requests.get(pixabay_url, headers=headers, timeout=10)
|
|
||||||
response.raise_for_status()
|
|
||||||
data = response.json()
|
|
||||||
|
|
||||||
for hit in data.get('hits', []):
|
|
||||||
img_url = hit.get('webformatURL')
|
|
||||||
if not img_url or img_url in used_images:
|
|
||||||
continue
|
|
||||||
uploader = hit.get('user', 'Unknown')
|
|
||||||
page_url = hit.get('pageURL', img_url)
|
|
||||||
|
|
||||||
used_images.add(img_url)
|
|
||||||
save_used_images()
|
|
||||||
|
|
||||||
logging.info(f"Selected Pixabay fallback image: {img_url} by {uploader} for query '{fallback_query}'")
|
|
||||||
return img_url, "Pixabay", uploader, page_url
|
|
||||||
|
|
||||||
logging.warning(f"No valid Pixabay image found for fallback query '{fallback_query}'.")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logging.warning(f"Pixabay fallback image fetch failed for query '{fallback_query}': {e}")
|
|
||||||
|
|
||||||
# Ultimate fallback: return None but log clearly
|
|
||||||
logging.error(f"All image fetch attempts failed for query '{search_query}'. Returning None.")
|
|
||||||
return None, None, None, None
|
|
||||||
|
|
||||||
def generate_image_query(title, summary):
|
def generate_image_query(title, summary):
|
||||||
try:
|
try:
|
||||||
prompt = (
|
prompt = (
|
||||||
@@ -425,7 +383,7 @@ def smart_image_and_filter(title, summary):
|
|||||||
relevance_keywords = result["relevance"]
|
relevance_keywords = result["relevance"]
|
||||||
main_topic = result.get("main_topic", extract_main_topic(title.lower() + " " + summary.lower()))
|
main_topic = result.get("main_topic", extract_main_topic(title.lower() + " " + summary.lower()))
|
||||||
skip_flag = (
|
skip_flag = (
|
||||||
result["aison"] == "SKIP" or
|
result["action"] == "SKIP" or # Fixed typo: "aison" → "action"
|
||||||
"[homemade]" in title.lower() or
|
"[homemade]" in title.lower() or
|
||||||
"homemade" in title.lower() or
|
"homemade" in title.lower() or
|
||||||
"homemade" in summary.lower() or
|
"homemade" in summary.lower() or
|
||||||
@@ -1180,9 +1138,7 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def process_image(image_url, source_name, page_url):
|
def process_image(image_url, source_name, page_url):
|
||||||
"""Download image, check for text with OCR, validate resolution, exclude screenshots, watermarks, and YouTube images."""
|
|
||||||
try:
|
try:
|
||||||
# Check for YouTube images via URL or page URL
|
|
||||||
youtube_domains = ['youtube.com', 'ytimg.com']
|
youtube_domains = ['youtube.com', 'ytimg.com']
|
||||||
if any(domain in image_url.lower() or domain in page_url.lower() for domain in youtube_domains):
|
if any(domain in image_url.lower() or domain in page_url.lower() for domain in youtube_domains):
|
||||||
logger.info(f"Skipping YouTube image: {image_url}")
|
logger.info(f"Skipping YouTube image: {image_url}")
|
||||||
@@ -1193,20 +1149,17 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
|
|||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
img = Image.open(io.BytesIO(response.content))
|
img = Image.open(io.BytesIO(response.content))
|
||||||
|
|
||||||
# Check image resolution
|
|
||||||
width, height = img.size
|
width, height = img.size
|
||||||
min_dimension = 1280
|
min_dimension = 1280
|
||||||
if width < min_dimension and height < min_dimension:
|
if width < min_dimension and height < min_dimension:
|
||||||
logger.info(f"Skipping low-resolution image: {image_url} ({width}x{height})")
|
logger.info(f"Skipping low-resolution image: {image_url} ({width}x{height})")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Attempt to detect screenshots via aspect ratio or naming
|
|
||||||
aspect_ratio = width / height
|
aspect_ratio = width / height
|
||||||
if (0.9 <= aspect_ratio <= 1.1) or "screenshot" in image_url.lower():
|
if (0.9 <= aspect_ratio <= 1.1) or "screenshot" in image_url.lower():
|
||||||
logger.info(f"Skipping potential screenshot: {image_url} (aspect ratio: {aspect_ratio})")
|
logger.info(f"Skipping potential screenshot: {image_url} (aspect ratio: {aspect_ratio})")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Check for watermarks in URL or page URL
|
|
||||||
watermark_domains = [
|
watermark_domains = [
|
||||||
'shutterstock.com', 'gettyimages.com', 'istockphoto.com', 'adobestock.com',
|
'shutterstock.com', 'gettyimages.com', 'istockphoto.com', 'adobestock.com',
|
||||||
'123rf.com', 'dreamstime.com', 'alamy.com', 'stock.adobe.com'
|
'123rf.com', 'dreamstime.com', 'alamy.com', 'stock.adobe.com'
|
||||||
@@ -1215,7 +1168,6 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
|
|||||||
logger.info(f"Skipping image from stock photo site (potential watermark): {image_url}")
|
logger.info(f"Skipping image from stock photo site (potential watermark): {image_url}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# OCR to detect text and watermarks
|
|
||||||
text = pytesseract.image_to_string(img).strip().lower()
|
text = pytesseract.image_to_string(img).strip().lower()
|
||||||
watermark_phrases = [
|
watermark_phrases = [
|
||||||
'shutterstock', 'getty images', 'istock', 'adobe stock', 'watermark',
|
'shutterstock', 'getty images', 'istock', 'adobe stock', 'watermark',
|
||||||
@@ -1243,12 +1195,13 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
|
|||||||
logger.warning(f"Failed to process image {image_url}: {e}")
|
logger.warning(f"Failed to process image {image_url}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Step 1: Search DDG for public domain images
|
|
||||||
ddg_query = f"{search_query} license:public domain"
|
ddg_query = f"{search_query} license:public domain"
|
||||||
logger.info(f"Searching DDG with query: '{ddg_query}'")
|
logger.info(f"Searching DDG with query: '{ddg_query}'")
|
||||||
try:
|
try:
|
||||||
with DDGS() as ddgs:
|
with DDGS() as ddgs:
|
||||||
results = ddgs.images(ddg_query, safesearch="on", max_results=20)
|
results = ddgs.images(ddg_query, safesearch="on", max_results=20)
|
||||||
|
prioritized_results = []
|
||||||
|
other_results = []
|
||||||
for result in results:
|
for result in results:
|
||||||
image_url = result.get("image")
|
image_url = result.get("image")
|
||||||
page_url = result.get("url")
|
page_url = result.get("url")
|
||||||
@@ -1258,14 +1211,23 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
|
|||||||
source_name = domain.rsplit('.', 1)[0].capitalize()
|
source_name = domain.rsplit('.', 1)[0].capitalize()
|
||||||
else:
|
else:
|
||||||
source_name = "Public Domain"
|
source_name = "Public Domain"
|
||||||
if image_url and image_url.endswith(('.jpg', '.jpeg', '.png')):
|
|
||||||
|
if not image_url or not image_url.endswith(('.jpg', '.jpeg', '.png')):
|
||||||
|
continue
|
||||||
|
|
||||||
|
image_metadata = f"{result.get('title', '').lower()} {page_url.lower()}"
|
||||||
|
if specific_term and specific_term.lower() in image_metadata:
|
||||||
|
prioritized_results.append((image_url, source_name, page_url))
|
||||||
|
else:
|
||||||
|
other_results.append((image_url, source_name, page_url))
|
||||||
|
|
||||||
|
for image_url, source_name, page_url in prioritized_results + other_results:
|
||||||
result = process_image(image_url, source_name, page_url)
|
result = process_image(image_url, source_name, page_url)
|
||||||
if result:
|
if result:
|
||||||
return result
|
return result
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"DDG search failed for '{ddg_query}': {e}")
|
logger.warning(f"DDG search failed for '{ddg_query}': {e}")
|
||||||
|
|
||||||
# Step 2: Fallback to Pixabay with specific term
|
|
||||||
logger.info(f"No valid DDG images, falling back to Pixabay for '{search_query}'")
|
logger.info(f"No valid DDG images, falling back to Pixabay for '{search_query}'")
|
||||||
image_url, source_name, uploader, page_url = get_image(search_query, specific_term)
|
image_url, source_name, uploader, page_url = get_image(search_query, specific_term)
|
||||||
if image_url:
|
if image_url:
|
||||||
|
|||||||
Reference in New Issue
Block a user