Fix image handling: harden JSON loading and prioritize relevant image results

2025-05-13 08:51:52 +10:00
parent 9870d276a3
commit 6e0f8b4759
4 changed files with 143 additions and 147 deletions
@@ -44,12 +44,9 @@ IMAGE_UPLOAD_TIMEOUT = 30  # Added to fix NameError
 IMAGE_EXPIRATION_DAYS = 7  # 7 days, consistent with foodie_automator_rss.py

 def load_json_file(file_path, expiration_hours=None, default=None):
-    """
-    Load JSON file, optionally filtering expired entries and returning default if invalid.
-    """
    logger = logging.getLogger(__name__)
    if default is None:
-        default = []  # Default to list for posted_rss_titles.json and used_images.json
+        default = []
    
    if not os.path.exists(file_path):
        logger.info(f"File {file_path} does not exist. Returning default: {default}")
@@ -59,15 +56,34 @@ def load_json_file(file_path, expiration_hours=None, default=None):
        with open(file_path, 'r') as f:
            data = json.load(f)
        
+        if not isinstance(data, list):
+            logger.warning(f"Data in {file_path} is not a list, resetting to default")
+            return default
+        
        if expiration_hours is not None:
-            cutoff = datetime.now(timezone.utc) - timedelta(hours=expiration_hours)
-            filtered_data = [
-                entry for entry in data
-                if datetime.fromisoformat(entry['timestamp']) > cutoff
-            ]
+            # NOTE: despite its name, expiration_hours is treated as days when the path contains "used_images", and as hours for every other file
+            if "used_images" in file_path:
+                expiration_delta = timedelta(days=expiration_hours)
+            else:
+                expiration_delta = timedelta(hours=expiration_hours)
+            
+            cutoff = datetime.now(timezone.utc) - expiration_delta
+            filtered_data = []
+            for entry in data:
+                if not isinstance(entry, dict) or "title" not in entry or "timestamp" not in entry:
+                    logger.warning(f"Skipping malformed entry in {file_path}: {entry}")
+                    continue
+                try:
+                    timestamp = datetime.fromisoformat(entry["timestamp"])
+                    if timestamp > cutoff:
+                        filtered_data.append(entry)
+                except ValueError as e:
+                    logger.warning(f"Invalid timestamp in {file_path} entry {entry}: {e}")
+                    continue
+            
            if len(filtered_data) < len(data):
                logger.info(f"Filtered {len(data) - len(filtered_data)} expired entries from {file_path}")
-                save_json_file(file_path, filtered_data)  # Save filtered data
+                save_json_file(file_path, filtered_data)
            data = filtered_data
        
        logger.info(f"Loaded {len(data)} valid entries from {file_path}")
@@ -254,64 +270,6 @@ def select_best_persona(interest_score, content=""):
        return random.choice(personas[2:])
    return random.choice(personas)

-def get_image(search_query):
-    headers = {'User-Agent': 'InsiderFoodieBot/1.0 (https://insiderfoodie.com; contact@insiderfoodie.com)'}
-    
-    # Try Pixabay with the original query
-    try:
-        pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(search_query)}&image_type=photo&per_page=10"
-        response = requests.get(pixabay_url, headers=headers, timeout=10)
-        response.raise_for_status()
-        data = response.json()
-        
-        for hit in data.get('hits', []):
-            img_url = hit.get('webformatURL')
-            if not img_url or img_url in used_images:
-                continue
-            uploader = hit.get('user', 'Unknown')
-            page_url = hit.get('pageURL', img_url)
-            
-            used_images.add(img_url)
-            save_used_images()
-            
-            logging.info(f"Selected Pixabay image: {img_url} by {uploader} for query '{search_query}'")
-            return img_url, "Pixabay", uploader, page_url
-        
-        logging.info(f"No valid Pixabay image found for query '{search_query}'. Trying fallback query.")
-    
-    except Exception as e:
-        logging.warning(f"Pixabay image fetch failed for query '{search_query}': {e}")
-    
-    # Fallback to a generic query
-    fallback_query = "food dining"
-    try:
-        pixabay_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={quote(fallback_query)}&image_type=photo&per_page=10"
-        response = requests.get(pixabay_url, headers=headers, timeout=10)
-        response.raise_for_status()
-        data = response.json()
-        
-        for hit in data.get('hits', []):
-            img_url = hit.get('webformatURL')
-            if not img_url or img_url in used_images:
-                continue
-            uploader = hit.get('user', 'Unknown')
-            page_url = hit.get('pageURL', img_url)
-            
-            used_images.add(img_url)
-            save_used_images()
-            
-            logging.info(f"Selected Pixabay fallback image: {img_url} by {uploader} for query '{fallback_query}'")
-            return img_url, "Pixabay", uploader, page_url
-        
-        logging.warning(f"No valid Pixabay image found for fallback query '{fallback_query}'.")
-    
-    except Exception as e:
-        logging.warning(f"Pixabay fallback image fetch failed for query '{fallback_query}': {e}")
-    
-    # Ultimate fallback: return None but log clearly
-    logging.error(f"All image fetch attempts failed for query '{search_query}'. Returning None.")
-    return None, None, None, None
-
 def generate_image_query(title, summary):
    try:
        prompt = (
@@ -425,7 +383,7 @@ def smart_image_and_filter(title, summary):
        relevance_keywords = result["relevance"]
        main_topic = result.get("main_topic", extract_main_topic(title.lower() + " " + summary.lower()))
        skip_flag = (
-            result["aison"] == "SKIP" or 
+            result["action"] == "SKIP" or  # Fixed typo: "aison" → "action"
            "[homemade]" in title.lower() or 
            "homemade" in title.lower() or 
            "homemade" in summary.lower() or 
@@ -1180,9 +1138,7 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
    logger = logging.getLogger(__name__)
    
    def process_image(image_url, source_name, page_url):
-        """Download image, check for text with OCR, validate resolution, exclude screenshots, watermarks, and YouTube images."""
        try:
-            # Check for YouTube images via URL or page URL
            youtube_domains = ['youtube.com', 'ytimg.com']
            if any(domain in image_url.lower() or domain in page_url.lower() for domain in youtube_domains):
                logger.info(f"Skipping YouTube image: {image_url}")
@@ -1193,20 +1149,17 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
            response.raise_for_status()
            img = Image.open(io.BytesIO(response.content))
            
-            # Check image resolution
            width, height = img.size
            min_dimension = 1280
            if width < min_dimension and height < min_dimension:
                logger.info(f"Skipping low-resolution image: {image_url} ({width}x{height})")
                return None
            
-            # Attempt to detect screenshots via aspect ratio or naming
            aspect_ratio = width / height
            if (0.9 <= aspect_ratio <= 1.1) or "screenshot" in image_url.lower():
                logger.info(f"Skipping potential screenshot: {image_url} (aspect ratio: {aspect_ratio})")
                return None
            
-            # Check for watermarks in URL or page URL
            watermark_domains = [
                'shutterstock.com', 'gettyimages.com', 'istockphoto.com', 'adobestock.com',
                '123rf.com', 'dreamstime.com', 'alamy.com', 'stock.adobe.com'
@@ -1215,7 +1168,6 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
                logger.info(f"Skipping image from stock photo site (potential watermark): {image_url}")
                return None
            
-            # OCR to detect text and watermarks
            text = pytesseract.image_to_string(img).strip().lower()
            watermark_phrases = [
                'shutterstock', 'getty images', 'istock', 'adobe stock', 'watermark',
@@ -1243,12 +1195,13 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
            logger.warning(f"Failed to process image {image_url}: {e}")
            return None
    
-    # Step 1: Search DDG for public domain images
    ddg_query = f"{search_query} license:public domain"
    logger.info(f"Searching DDG with query: '{ddg_query}'")
    try:
        with DDGS() as ddgs:
            results = ddgs.images(ddg_query, safesearch="on", max_results=20)
+            prioritized_results = []
+            other_results = []
            for result in results:
                image_url = result.get("image")
                page_url = result.get("url")
@@ -1258,14 +1211,23 @@ def get_flickr_image(search_query, relevance_keywords, main_topic, specific_term
                    source_name = domain.rsplit('.', 1)[0].capitalize()
                else:
                    source_name = "Public Domain"
-                if image_url and image_url.endswith(('.jpg', '.jpeg', '.png')):
-                    result = process_image(image_url, source_name, page_url)
-                    if result:
-                        return result
+                
+                if not image_url or not image_url.endswith(('.jpg', '.jpeg', '.png')):
+                    continue
+                
+                image_metadata = f"{result.get('title', '').lower()} {page_url.lower()}"
+                if specific_term and specific_term.lower() in image_metadata:
+                    prioritized_results.append((image_url, source_name, page_url))
+                else:
+                    other_results.append((image_url, source_name, page_url))
+            
+            for image_url, source_name, page_url in prioritized_results + other_results:
+                result = process_image(image_url, source_name, page_url)
+                if result:
+                    return result
    except Exception as e:
        logger.warning(f"DDG search failed for '{ddg_query}': {e}")
    
-    # Step 2: Fallback to Pixabay with specific term
    logger.info(f"No valid DDG images, falling back to Pixabay for '{search_query}'")
    image_url, source_name, uploader, page_url = get_image(search_query, specific_term)
    if image_url: