From 6e0f8b47592e7b0771fb599cf18fe7b0cd009baf Mon Sep 17 00:00:00 2001
From: Shane
Date: Tue, 13 May 2025 08:51:52 +1000
Subject: [PATCH] Improve image selection and logging setup

---
 foodie_automator_google.py | 141 +++++++++++++++++++++++--------------
 foodie_automator_reddit.py |  23 +++---
 foodie_automator_rss.py    |   2 -
 foodie_utils.py            | 124 +++++++++++---------------------
 4 files changed, 143 insertions(+), 147 deletions(-)

diff --git a/foodie_automator_google.py b/foodie_automator_google.py
index fcbc162..9ddb4c1 100644
--- a/foodie_automator_google.py
+++ b/foodie_automator_google.py
@@ -70,48 +70,84 @@ MAX_RETRIES = 3
 RETRY_BACKOFF = 2
 
 def setup_logging():
-    if os.path.exists(LOG_FILE):
-        with open(LOG_FILE, 'r') as f:
-            lines = f.readlines()
-
-        log_entries = []
-        current_entry = []
-        timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
-
-        for line in lines:
-            if timestamp_pattern.match(line):
-                if current_entry:
-                    log_entries.append(''.join(current_entry))
-                current_entry = [line]
-            else:
-                current_entry.append(line)
+    try:
+        # Ensure log directory exists
+        os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
+        logging.debug(f"Log directory created/verified: {os.path.dirname(LOG_FILE)}")
 
-        if current_entry:
-            log_entries.append(''.join(current_entry))
+        # Check write permissions
+        if not os.access(os.path.dirname(LOG_FILE), os.W_OK):
+            raise PermissionError(f"No write permission for {os.path.dirname(LOG_FILE)}")
 
-        cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
-        pruned_entries = []
-        for entry in log_entries:
-            try:
-                timestamp = datetime.strptime(entry[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
-                if timestamp > cutoff:
-                    pruned_entries.append(entry)
-            except ValueError:
-                logging.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
-                continue
+        # Test write to log file
+        try:
+            with open(LOG_FILE, 'a') as f:
+                f.write("")
+            logging.debug(f"Confirmed write access to {LOG_FILE}")
+        except Exception as e:
+            raise PermissionError(f"Cannot write to {LOG_FILE}: {e}")
+
+        # Prune old logs
+        if os.path.exists(LOG_FILE):
+            with open(LOG_FILE, 'r') as f:
+                lines = f.readlines()
+
+            log_entries = []
+            current_entry = []
+            timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
+
+            for line in lines:
+                if timestamp_pattern.match(line):
+                    if current_entry:
+                        log_entries.append(''.join(current_entry))
+                    current_entry = [line]
+                else:
+                    current_entry.append(line)
+
+            if current_entry:
+                log_entries.append(''.join(current_entry))
+
+            cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
+            pruned_entries = []
+            for entry in log_entries:
+                try:
+                    timestamp = datetime.strptime(entry[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
+                    if timestamp > cutoff:
+                        pruned_entries.append(entry)
+                except ValueError:
+                    logging.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
+                    continue
+
+            with open(LOG_FILE, 'w') as f:
+                f.writelines(pruned_entries)
+            logging.debug(f"Log file pruned: {LOG_FILE}")
 
-        with open(LOG_FILE, 'w') as f:
-            f.writelines(pruned_entries)
+        # Configure logging
+        logging.basicConfig(
+            filename=LOG_FILE,
+            level=logging.INFO,
+            format="%(asctime)s - %(levelname)s - %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+            force=True  # Ensure this config takes precedence
+        )
+        console_handler = logging.StreamHandler()
+        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+        logging.getLogger().addHandler(console_handler)
+        logging.info("Logging initialized for foodie_automator_google.py")
-    logger = logging.getLogger()
-    logger.setLevel(logging.INFO)
-    file_handler = logging.FileHandler(LOG_FILE, mode='a')
-    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
-    logger.addHandler(file_handler)
-    console_handler = logging.StreamHandler()
-    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
-    logger.addHandler(console_handler)
-    logging.info("Logging initialized for foodie_automator_google.py")
+    except Exception as e:
+        # Fallback to console logging if file logging fails
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s - %(levelname)s - %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+            force=True
+        )
+        logging.error(f"Failed to setup file logging for {LOG_FILE}: {e}. Using console logging.")
+        console_handler = logging.StreamHandler()
+        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+        logging.getLogger().addHandler(console_handler)
+        logging.info("Console logging initialized as fallback for foodie_automator_google.py")
 
 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
@@ -253,11 +289,9 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
     try:
         logging.debug(f"Using {len(posted_titles)} posted titles and {len(used_images)} used images")
 
-        # Define regions to scrape
         regions = ['US', 'GB', 'AU']
         all_trends = []
 
-        # Scrape trends for each region
         for geo in regions:
             logging.info(f"Scraping Google Trends for geo={geo}")
             trends = scrape_google_trends(geo=geo)
@@ -267,7 +301,6 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
             else:
                 logging.warning(f"No trends collected for geo={geo}")
 
-        # Remove duplicates by title and sort by search volume
         unique_trends = []
         seen_titles = set()
         for trend in all_trends:
@@ -277,10 +310,9 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
 
         if not unique_trends:
            logging.info("No Google Trends data available across regions")
-            sleep_time = random.randint(1200, 1800)  # 20–30 minutes
+            sleep_time = random.randint(1200, 1800)
             return None, None, sleep_time
 
-        # Sort trends by search volume in descending order
         unique_trends.sort(key=lambda x: x["search_volume"], reverse=True)
         logging.info(f"Total unique trends collected: {len(unique_trends)}")
 
@@ -293,14 +325,13 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
             summary = trend.get("summary", "")
             source_name = trend.get("source", "Google Trends")
             original_source = f'{source_name}'
-            original_url = link  # Store for fallback
+            original_url = link
 
             if title in posted_titles:
                 logging.info(f"Skipping already posted trend: {title}")
                 attempts += 1
                 continue
 
-            # Check author availability before GPT calls
             author = get_next_author_round_robin()
             if not author:
                 logging.info(f"Skipping trend '{title}' due to tweet rate limits for all authors")
@@ -312,8 +343,12 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
 
             logging.info(f"Trying Google Trend: {title} from {source_name}")
 
+            # Fetch DuckDuckGo context early to enhance smart_image_and_filter
+            ddg_context = fetch_duckduckgo_news_context(title)
+            enhanced_summary = summary + "\n\nAdditional Context: " + ddg_context if summary else ddg_context
+
             try:
-                image_query, relevance_keywords, main_topic, skip, specific_term = smart_image_and_filter(title, summary)
+                image_query, relevance_keywords, main_topic, skip, specific_term = smart_image_and_filter(title, enhanced_summary)
             except Exception as e:
                 logging.warning(f"Failed to process smart_image_and_filter for '{title}': {e}")
                 attempts += 1
                 continue
@@ -324,7 +359,6 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
                 attempts += 1
                 continue
 
-            ddg_context = fetch_duckduckgo_news_context(title)
             scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
             interest_score = is_interesting(scoring_content)
             logging.info(f"Interest score for '{title}': {interest_score}")
@@ -405,11 +439,10 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
             )
             if not post_id:
                 logging.warning(f"Failed to post to WordPress for '{title}', using original URL: {original_url}")
-                post_url = original_url  # Fallback to original trend URL
+                post_url = original_url
             else:
                 logging.info(f"Posted to WordPress for {author_username}: {post_url}")
 
-                # Update post with actual post_url
                 post_url_encoded = quote(post_url)
                 share_links = share_links_template.format(post_url=post_url_encoded)
                 post_data["content"] = f"{final_summary}\n\n{share_links}"
@@ -420,7 +453,7 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
                     category=category,
                     link=link,
                     author=author,
-                    image_url=None,  # Skip image re-upload
+                    image_url=None,
                     original_source=original_source,
                     image_source=image_source,
                     uploader=uploader,
@@ -431,7 +464,7 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
                 )
         except Exception as e:
             logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
-            post_url = original_url  # Fallback to original trend URL
+            post_url = original_url
         finally:
             is_posting = False
 
@@ -446,15 +479,15 @@ def curate_from_google_trends(posted_titles_data, posted_titles, used_images_dat
             logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
 
         logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id or 'N/A'}) from Google Trends *****")
-        sleep_time = random.randint(1200, 1800)  # 20–30 minutes
+        sleep_time = random.randint(1200, 1800)
         return post_data, category, sleep_time
 
        logging.info("No interesting Google Trend found after attempts")
-        sleep_time = random.randint(1200, 1800)  # 20–30 minutes
+        sleep_time = random.randint(1200, 1800)
         return None, None, sleep_time
     except Exception as e:
         logging.error(f"Unexpected error in curate_from_google_trends: {e}", exc_info=True)
-        sleep_time = random.randint(1200, 1800)  # 20–30 minutes
+        sleep_time = random.randint(1200, 1800)
         return None, None, sleep_time
 
 def run_google_trends_automator():
diff --git a/foodie_automator_reddit.py b/foodie_automator_reddit.py
index 95f507a..9d0aed4 100644
--- a/foodie_automator_reddit.py
+++ b/foodie_automator_reddit.py
@@ -346,7 +346,7 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
         posts = fetch_reddit_posts()
         if not posts:
             logging.info("No Reddit posts available")
-            sleep_time = random.randint(1200, 1800)  # 20–30 minutes
+            sleep_time = random.randint(1200, 1800)
             return None, None, sleep_time
 
         attempts = 0
@@ -379,8 +379,13 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
 
             logging.info(f"Trying Reddit Post: {title} from {source_name}")
 
+            # Combine summary and top comments for smart_image_and_filter
+            enhanced_summary = summary
+            if top_comments:
+                enhanced_summary += "\n\nTop Comments:\n" + "\n".join(top_comments)
+
             try:
-                image_query, relevance_keywords, main_topic, skip, specific_term = smart_image_and_filter(title, summary)
+                image_query, relevance_keywords, main_topic, skip, specific_term = smart_image_and_filter(title, enhanced_summary)
             except Exception as e:
                 logging.warning(f"Failed to process smart_image_and_filter for '{title}': {e}")
                 attempts += 1
                 continue
@@ -392,7 +397,6 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
                 continue
 
             ddg_context = fetch_duckduckgo_news_context(title)
-            # Log full scoring content for debugging
             scoring_content = f"Title: {title}\n\nContent: {summary}\n\nTop Comments: {top_comments}\n\nAdditional Context: {ddg_context}"
             logging.debug(f"Scoring content for '{title}': {scoring_content}")
             interest_score = is_interesting_reddit(title, summary, upvotes, comment_count, top_comments)
@@ -474,11 +478,10 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
             )
             if not post_id:
                 logging.warning(f"Failed to post to WordPress for '{title}', using original URL: {original_url}")
-                post_url = original_url  # Fallback to original Reddit post URL
+                post_url = original_url
             else:
                 logging.info(f"Posted to WordPress for {author_username}: {post_url}")
 
-                # Update post with actual post_url
                 post_url_encoded = quote(post_url)
                 share_links = share_links_template.format(post_url=post_url_encoded)
                 post_data["content"] = f"{final_summary}\n\n{share_links}"
@@ -489,7 +492,7 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
                     category=category,
                     link=link,
                     author=author,
-                    image_url=None,  # Skip image re-upload
+                    image_url=None,
                     original_source=original_source,
                     image_source=image_source,
                     uploader=uploader,
@@ -500,7 +503,7 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
                 )
         except Exception as e:
             logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
-            post_url = original_url  # Fallback to original Reddit post URL
+            post_url = original_url
         finally:
             is_posting = False
 
@@ -515,15 +518,15 @@ def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used
             logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
 
         logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id or 'N/A'}) from Reddit *****")
-        sleep_time = random.randint(1200, 1800)  # 20–30 minutes
+        sleep_time = random.randint(1200, 1800)
         return post_data, category, sleep_time
 
        logging.info("No interesting Reddit post found after attempts")
-        sleep_time = random.randint(1200, 1800)  # 20–30 minutes
+        sleep_time = random.randint(1200, 1800)
         return None, None, sleep_time
     except Exception as e:
         logging.error(f"Unexpected error in curate_from_reddit: {e}", exc_info=True)
-        sleep_time = random.randint(1200, 1800)  # 20–30 minutes
+        sleep_time = random.randint(1200, 1800)
         return None, None, sleep_time
 
 def run_reddit_automator():
diff --git a/foodie_automator_rss.py b/foodie_automator_rss.py
index e84c59d..0893281 100644
--- a/foodie_automator_rss.py
+++ b/foodie_automator_rss.py
@@ -364,7 +364,6 @@ def curate_from_rss(posted_titles_data, posted_titles, used_images_data, used_im
             f'

'
         )
 
-        # Embed placeholder share links; update after getting post_url
         post_data["content"] = f"{final_summary}\n\n{share_links_template.format(post_url='{post_url}', share_text=share_text_encoded)}"
 
         global is_posting
@@ -390,7 +389,6 @@ def curate_from_rss(posted_titles_data, posted_titles, used_images_data, used_im
         else:
             logging.info(f"Posted to WordPress for {author_username}: {post_url}")
 
-            # Update content with actual post_url
             post_url_encoded = quote(post_url)
             post_data["content"] = f"{final_summary}\n\n{share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)}"
             if post_id:
diff --git a/foodie_utils.py b/foodie_utils.py
index 49a1e6f..7cfbca8 100644
--- a/foodie_utils.py
+++ b/foodie_utils.py
@@ -44,12 +44,9 @@ IMAGE_UPLOAD_TIMEOUT = 30  # Added to fix NameError
 IMAGE_EXPIRATION_DAYS = 7  # 7 days, consistent with foodie_automator_rss.py
 
 def load_json_file(file_path, expiration_hours=None, default=None):
-    """
-    Load JSON file, optionally filtering expired entries and returning default if invalid.
-    """
     logger = logging.getLogger(__name__)
     if default is None:
-        default = []  # Default to list for posted_rss_titles.json and used_images.json
+        default = []
 
     if not os.path.exists(file_path):
         logger.info(f"File {file_path} does not exist. Returning default: {default}")
@@ -59,15 +56,34 @@ def load_json_file(file_path, expiration_hours=None, default=None):
         with open(file_path, 'r') as f:
             data = json.load(f)
 
+        if not isinstance(data, list):
+            logger.warning(f"Data in {file_path} is not a list, resetting to default")
+            return default
+
         if expiration_hours is not None:
-            cutoff = datetime.now(timezone.utc) - timedelta(hours=expiration_hours)
-            filtered_data = [
-                entry for entry in data
-                if datetime.fromisoformat(entry['timestamp']) > cutoff
-            ]
+            # Use days for used_images.json, hours for others
+            if "used_images" in file_path:
+                expiration_delta = timedelta(days=expiration_hours)
+            else:
+                expiration_delta = timedelta(hours=expiration_hours)
+
+            cutoff = datetime.now(timezone.utc) - expiration_delta
+            filtered_data = []
+            for entry in data:
+                if not isinstance(entry, dict) or "title" not in entry or "timestamp" not in entry:
+                    logger.warning(f"Skipping malformed entry in {file_path}: {entry}")
+                    continue
+                try:
+                    timestamp = datetime.fromisoformat(entry["timestamp"])
+                    if timestamp > cutoff:
+                        filtered_data.append(entry)
+                except ValueError as e:
+                    logger.warning(f"Invalid timestamp in {file_path} entry {entry}: {e}")
+                    continue
+
             if len(filtered_data) < len(data):
                 logger.info(f"Filtered {len(data) - len(filtered_data)} expired entries from {file_path}")
-                save_json_file(file_path, filtered_data)  # Save filtered data
+                save_json_file(file_path, filtered_data)
             data = filtered_data
 
         logger.info(f"Loaded {len(data)} valid entries from {file_path}")
@@ -254,64 +270,6 @@ def select_best_persona(interest_score, content=""):
         return random.choice(personas[2:])
     return random.choice(personas)
 
-def get_image(search_query):
-    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
-
-    # Try Pixabay with the original query
-    try:
-        pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(search_query)}&image_type=photo&per_page=10"
-        response = requests.get(pixabay_url, headers=headers, timeout=10)
-        response.raise_for_status()
-        data = response.json()
-
-        for hit in data.get('hits', []):
-            img_url = hit.get('webformatURL')
-            if not img_url or img_url in used_images:
-                continue
-            uploader = hit.get('user', 'Unknown')
-            page_url = hit.get('pageURL', img_url)
-
-            used_images.add(img_url)
-            save_used_images()
-
-            logging.info(f"Selected Pixabay image: {img_url} by {uploader} for query '{search_query}'")
-            return img_url, "Pixabay", uploader, page_url
-
-        logging.info(f"No valid Pixabay image found for query '{search_query}'. Trying fallback query.")
-
-    except Exception as e:
-        logging.warning(f"Pixabay image fetch failed for query '{search_query}': {e}")
-
-    # Fallback to a generic query
-    fallback_query = "food dining"
-    try:
-        pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(fallback_query)}&image_type=photo&per_page=10"
-        response = requests.get(pixabay_url, headers=headers, timeout=10)
-        response.raise_for_status()
-        data = response.json()
-
-        for hit in data.get('hits', []):
-            img_url = hit.get('webformatURL')
-            if not img_url or img_url in used_images:
-                continue
-            uploader = hit.get('user', 'Unknown')
-            page_url = hit.get('pageURL', img_url)
-
-            used_images.add(img_url)
-            save_used_images()
-
-            logging.info(f"Selected Pixabay fallback image: {img_url} by {uploader} for query '{fallback_query}'")
-            return img_url, "Pixabay", uploader, page_url
-
-        logging.warning(f"No valid Pixabay image found for fallback query '{fallback_query}'.")
-
-    except Exception as e:
-        logging.warning(f"Pixabay fallback image fetch failed for query '{fallback_query}': {e}")
-
-    # Ultimate fallback: return None but log clearly
-    logging.error(f"All image fetch attempts failed for query '{search_query}'. Returning None.")
-    return None, None, None, None
-
 def generate_image_query(title, summary):
     try:
         prompt = (
@@ -425,7 +383,7 @@ def smart_image_and_filter(title, summary):
         relevance_keywords = result["relevance"]
         main_topic = result.get("main_topic", extract_main_topic(title.lower() + " " + summary.lower()))
         skip_flag = (
-            result["aison"] == "SKIP" or
+            result["action"] == "SKIP" or  # Fixed typo: "aison" → "action"
             "[homemade]" in title.lower() or
             "homemade" in title.lower() or
             "homemade" in summary.lower() or
@@ -1180,9 +1138,7 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
     logger = logging.getLogger(__name__)
 
     def process_image(image_url, source_name, page_url):
-        """Download image, check for text with OCR, validate resolution, exclude screenshots, watermarks, and YouTube images."""
         try:
-            # Check for YouTube images via URL or page URL
             youtube_domains = ['youtube.com', 'ytimg.com']
             if any(domain in image_url.lower() or domain in page_url.lower() for domain in youtube_domains):
                 logger.info(f"Skipping YouTube image: {image_url}")
@@ -1193,20 +1149,17 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
             response.raise_for_status()
             img = Image.open(io.BytesIO(response.content))
 
-            # Check image resolution
             width, height = img.size
             min_dimension = 1280
             if width < min_dimension and height < min_dimension:
                 logger.info(f"Skipping low-resolution image: {image_url} ({width}x{height})")
                 return None
 
-            # Attempt to detect screenshots via aspect ratio or naming
             aspect_ratio = width / height
             if (0.9 <= aspect_ratio <= 1.1) or "screenshot" in image_url.lower():
                 logger.info(f"Skipping potential screenshot: {image_url} (aspect ratio: {aspect_ratio})")
                 return None
 
-            # Check for watermarks in URL or page URL
             watermark_domains = [
                 'shutterstock.com', 'gettyimages.com', 'istockphoto.com',
                 'adobestock.com', '123rf.com', 'dreamstime.com', 'alamy.com', 'stock.adobe.com'
             ]
@@ -1215,7 +1168,6 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
                 logger.info(f"Skipping image from stock photo site (potential watermark): {image_url}")
                 return None
 
-            # OCR to detect text and watermarks
             text = pytesseract.image_to_string(img).strip().lower()
             watermark_phrases = [
                 'shutterstock', 'getty images', 'istock', 'adobe stock', 'watermark',
@@ -1243,12 +1195,13 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
             logger.warning(f"Failed to process image {image_url}: {e}")
             return None
 
-    # Step 1: Search DDG for public domain images
     ddg_query = f"{search_query} license:public domain"
     logger.info(f"Searching DDG with query: '{ddg_query}'")
     try:
         with DDGS() as ddgs:
             results = ddgs.images(ddg_query, safesearch="on", max_results=20)
+            prioritized_results = []
+            other_results = []
             for result in results:
                 image_url = result.get("image")
                 page_url = result.get("url")
@@ -1258,14 +1211,23 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
                     source_name = domain.rsplit('.', 1)[0].capitalize()
                 else:
                     source_name = "Public Domain"
-                if image_url and image_url.endswith(('.jpg', '.jpeg', '.png')):
-                    result = process_image(image_url, source_name, page_url)
-                    if result:
-                        return result
+
+                if not image_url or not image_url.endswith(('.jpg', '.jpeg', '.png')):
+                    continue
+
+                image_metadata = f"{result.get('title', '').lower()} {page_url.lower()}"
+                if specific_term and specific_term.lower() in image_metadata:
+                    prioritized_results.append((image_url, source_name, page_url))
+                else:
+                    other_results.append((image_url, source_name, page_url))
+
+            for image_url, source_name, page_url in prioritized_results + other_results:
+                result = process_image(image_url, source_name, page_url)
+                if result:
+                    return result
     except Exception as e:
         logger.warning(f"DDG search failed for '{ddg_query}': {e}")
 
-    # Step 2: Fallback to Pixabay with specific term
     logger.info(f"No valid DDG images, falling back to Pixabay for '{search_query}'")
     image_url, source_name, uploader, page_url = get_image(search_query, specific_term)
     if image_url: