try image mismatch

2025-05-01 21:55:24 +10:00
parent e2c47a1a05
commit 163e4e50ec
5 changed files with 125 additions and 92 deletions
@@ -256,8 +256,10 @@ def curate_from_google_trends(geo_list=['US']):
        if not image_url:
            image_url, image_source, uploader, page_url = get_image(image_query)

+        # Log the fetched image details
+        logging.info(f"Fetched image for '{post_data['title']}': URL={image_url}, Source={image_source}, Uploader={uploader}, Page URL={page_url}")
+
        hook = get_dynamic_hook(post_data["title"]).strip()
-        # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=None)

        # Generate viral share prompt
        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
@@ -266,7 +268,7 @@ def curate_from_google_trends(geo_list=['US']):
            f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
            f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
        )
-        post_data["content"] = f"{final_summary}\n\n{share_links_template}"  # Removed cta from content
+        post_data["content"] = f"{final_summary}\n\n{share_links_template}"

        global is_posting
        is_posting = True
@@ -292,8 +294,7 @@ def curate_from_google_trends(geo_list=['US']):
            share_text_encoded = quote(share_text)
            post_url_encoded = quote(post_url)
            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
-            # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
-            post_data["content"] = f"{final_summary}\n\n{share_links}"  # Removed cta from content
+            post_data["content"] = f"{final_summary}\n\n{share_links}"
            is_posting = True
            try:
                post_to_wp(
@@ -319,6 +320,16 @@ def curate_from_google_trends(geo_list=['US']):
            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")

            if image_url:
+                # Check if image is already used
+                used_images_list = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
+                used_image_urls = {entry["title"] for entry in used_images_list}
+                if image_url in used_image_urls:
+                    logging.warning(f"Image '{image_url}' already used, attempting to fetch a new image")
+                    image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
+                    if not image_url:
+                        image_url, image_source, uploader, page_url = get_image(image_query)
+                    logging.info(f"New image fetched: URL={image_url}, Source={image_source}, Uploader={uploader}, Page URL={page_url}")
+
                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                used_images.add(image_url)
                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
@@ -211,7 +211,7 @@ def curate_from_reddit():
    if not articles:
        print("No Reddit posts available")
        logging.info("No Reddit posts available")
-        return None, None, None
+        return None, None, random.randint(600, 1800)

    articles.sort(key=lambda x: x["upvotes"], reverse=True)
    
@@ -299,8 +299,10 @@ def curate_from_reddit():
        if not image_url:
            image_url, image_source, uploader, page_url = get_image(image_query)
        
+        # Log the fetched image details
+        logging.info(f"Fetched image for '{post_data['title']}': URL={image_url}, Source={image_source}, Uploader={uploader}, Page URL={page_url}")
+        
        hook = get_dynamic_hook(post_data["title"]).strip()
-        # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=None)
        
        # Generate viral share prompt
        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
@@ -309,7 +311,7 @@ def curate_from_reddit():
            f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
            f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
        )
-        post_data["content"] = f"{final_summary}\n\n{share_links_template}"  # Removed cta from content
+        post_data["content"] = f"{final_summary}\n\n{share_links_template}"
        
        global is_posting
        is_posting = True
@@ -335,8 +337,7 @@ def curate_from_reddit():
            share_text_encoded = quote(share_text)
            post_url_encoded = quote(post_url)
            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
-            # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
-            post_data["content"] = f"{final_summary}\n\n{share_links}"  # Removed cta from content
+            post_data["content"] = f"{final_summary}\n\n{share_links}"
            is_posting = True
            try:
                post_to_wp(
@@ -362,6 +363,16 @@ def curate_from_reddit():
            logging.info(f"Successfully saved '{raw_title}' to {POSTED_TITLES_FILE} with timestamp {timestamp}")
            
            if image_url:
+                # Check if image is already used
+                used_images_list = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
+                used_image_urls = {entry["title"] for entry in used_images_list}
+                if image_url in used_image_urls:
+                    logging.warning(f"Image '{image_url}' already used, attempting to fetch a new image")
+                    image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
+                    if not image_url:
+                        image_url, image_source, uploader, page_url = get_image(image_query)
+                    logging.info(f"New image fetched: URL={image_url}, Source={image_source}, Uploader={uploader}, Page URL={page_url}")
+
                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                used_images.add(image_url)
                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE} with timestamp {timestamp}")
@@ -27,11 +27,15 @@ from foodie_utils import (
    generate_category_from_summary, post_to_wp, prepare_post_data,
    select_best_author, smart_image_and_filter, get_flickr_image
 )
-from foodie_hooks import get_dynamic_hook, get_viral_share_prompt  # Removed select_best_cta import
+from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
 from dotenv import load_dotenv

 load_dotenv()

+# Log script version to ensure it's the latest
+SCRIPT_VERSION = "1.2.0"
+logging.info(f"Starting foodie_automator_rss.py version {SCRIPT_VERSION}")
+
 is_posting = False

 def signal_handler(sig, frame):
@@ -271,8 +275,10 @@ def curate_from_rss():
        if not image_url:
            image_url, image_source, uploader, page_url = get_image(image_query)

+        # Log the fetched image details
+        logging.info(f"Fetched image for '{post_data['title']}': URL={image_url}, Source={image_source}, Uploader={uploader}, Page URL={page_url}")
+
        hook = get_dynamic_hook(post_data["title"]).strip()
-        # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=None)

        # Generate viral share prompt
        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
@@ -281,7 +287,7 @@ def curate_from_rss():
            f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
            f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
        )
-        post_data["content"] = f"{final_summary}\n\n{share_links_template}"  # Removed cta from content
+        post_data["content"] = f"{final_summary}\n\n{share_links_template}"

        global is_posting
        is_posting = True
@@ -307,8 +313,7 @@ def curate_from_rss():
            share_text_encoded = quote(share_text)
            post_url_encoded = quote(post_url)
            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
-            # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
-            post_data["content"] = f"{final_summary}\n\n{share_links}"  # Removed cta from content
+            post_data["content"] = f"{final_summary}\n\n{share_links}"
            is_posting = True
            try:
                post_to_wp(
@@ -334,6 +339,16 @@ def curate_from_rss():
            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")

            if image_url:
+                # Check if image is already used
+                used_images_list = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
+                used_image_urls = {entry["title"] for entry in used_images_list}
+                if image_url in used_image_urls:
+                    logging.warning(f"Image '{image_url}' already used, attempting to fetch a new image")
+                    image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
+                    if not image_url:
+                        image_url, image_source, uploader, page_url = get_image(image_query)
+                    logging.info(f"New image fetched: URL={image_url}, Source={image_source}, Uploader={uploader}, Page URL={page_url}")
+
                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                used_images.add(image_url)
                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
@@ -260,14 +260,6 @@ CATEGORIES = [
    "Lifestyle", "Buzz", "Culture", "Health", "Drink", "Food", "Eats"
 ]

-CTAS = [
-    "Love This Take? Share It On <a href='{share_url}'><i class=\"tsi tsi-twitter\"></i></a>!",
-    "Dig This Scoop? Post It On <a href='{share_url}'><i class=\"tsi tsi-twitter\"></i></a>!",
-    "Wild For This? Spread It On <a href='{share_url}'><i class=\"tsi tsi-twitter\"></i></a>!",
-    "Crave This Read? Tweet It On <a href='{share_url}'><i class=\"tsi tsi-twitter\"></i></a>!",
-    "Buzzing Over This? Share On <a href='{share_url}'><i class=\"tsi tsi-twitter\"></i></a>!"
-]
-
 REDDIT_CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
 REDDIT_CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET")
 REDDIT_USER_AGENT = os.getenv("REDDIT_USER_AGENT")
@@ -29,63 +29,62 @@ from foodie_config import (
 load_dotenv()
 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

-def load_json_file(filename, expiration_days=None):
-    data = []
-    if os.path.exists(filename):
-        try:
-            with open(filename, 'r') as f:
-                lines = f.readlines()
-                for i, line in enumerate(lines, 1):
-                    if line.strip():
-                        try:
-                            entry = json.loads(line.strip())
-                            if not isinstance(entry, dict) or "title" not in entry or "timestamp" not in entry:
-                                logging.warning(f"Skipping malformed entry in {filename} at line {i}: {entry}")
-                                continue
-                            data.append(entry)
-                        except json.JSONDecodeError as e:
-                            logging.warning(f"Skipping invalid JSON line in {filename} at line {i}: {e}")
-            if expiration_days:
-                cutoff = (datetime.now(timezone.utc) - timedelta(days=expiration_days)).isoformat()
-                data = [entry for entry in data if entry["timestamp"] > cutoff]
-            logging.info(f"Loaded {len(data)} entries from {filename}, {len(data)} valid after expiration check")
-        except Exception as e:
-            logging.error(f"Failed to load {filename}: {e}")
-            data = []  # Reset to empty on failure
-    return data
-
-def save_json_file(filename, key, value):
-    entry = {"title": key, "timestamp": value}
-    PRUNE_INTERVAL_DAYS = 180
+def load_json_file(file_path, expiration_hours):
+    """Load the unexpired entries of a JSON-lines file.
+
+    Each non-blank line is expected to hold one JSON object with a "title"
+    and an ISO-8601 "timestamp". Lines that are not valid JSON, lack either
+    key, or carry an unparsable timestamp are logged and skipped.
+
+    Args:
+        file_path: Path of the JSON-lines file to read.
+        expiration_hours: Size of the retention window, in HOURS. Entries
+            whose timestamp is not newer than now minus this window are
+            dropped.
+
+    Returns:
+        A list of the entry dicts that are still inside the window. The list
+        is empty when the file does not exist, and holds whatever was
+        collected so far when reading fails part-way.
+    """
+    entries = []
+    cutoff = datetime.now(timezone.utc) - timedelta(hours=expiration_hours)
+
+    if not os.path.exists(file_path):
+        logging.info(f"File {file_path} does not exist, returning empty list")
+        return entries
+
    try:
-        data = load_json_file(filename, expiration_days=PRUNE_INTERVAL_DAYS)
-        # Remove duplicates by title
-        data = [item for item in data if item["title"] != key]
-        data.append(entry)
-        # Special handling for used_images.json to save as a flat list with one URL per line
-        if filename.endswith('used_images.json'):
-            used_images.add(key)
-            with open(filename, 'w') as f:
-                f.write('[\n')
-                urls = list(used_images)
-                for i, url in enumerate(urls):
-                    f.write(f'"{url}"')
-                    if i < len(urls) - 1:
-                        f.write(',\n')
-                    else:
-                        f.write('\n')
-                f.write(']')
-        else:
-            with open(filename, 'w') as f:
-                for item in data:
-                    json.dump(item, f)
-                    f.write('\n')
-        logging.info(f"Saved '{key}' to {filename}")
-        print(f"DEBUG: Saved '{key}' to {filename}")
-        loaded_data = load_json_file(filename, expiration_days=PRUNE_INTERVAL_DAYS)
-        logging.info(f"Pruned {filename} to {len(loaded_data)} entries (older than {PRUNE_INTERVAL_DAYS} days removed)")
+        with open(file_path, 'r') as f:
+            lines = f.readlines()
+
+        well_formed = 0  # entries that parsed cleanly, expired or not
+        for i, line in enumerate(lines, 1):
+            line = line.strip()
+            if not line:
+                # A blank line is a separator, not a corrupt record.
+                continue
+            try:
+                entry = json.loads(line)
+                if not isinstance(entry, dict) or "title" not in entry or "timestamp" not in entry:
+                    logging.warning(f"Skipping malformed entry in {file_path} at line {i}: {line}")
+                    continue
+
+                timestamp = datetime.fromisoformat(entry["timestamp"])
+                if timestamp.tzinfo is None:
+                    # Comparing a naive datetime with the aware cutoff raises
+                    # TypeError; naive values are assumed to be UTC here.
+                    timestamp = timestamp.replace(tzinfo=timezone.utc)
+                well_formed += 1
+                if timestamp > cutoff:
+                    entries.append(entry)
+                else:
+                    logging.debug(f"Entry expired in {file_path}: {entry['title']}")
+            except json.JSONDecodeError as e:
+                logging.warning(f"Skipping invalid JSON line in {file_path} at line {i}: {e}")
+                continue
+            except (TypeError, ValueError) as e:
+                # Timestamp was not a string or not ISO-8601.
+                logging.warning(f"Skipping malformed entry in {file_path} at line {i}: {line} ({e})")
+                continue
+
+        logging.info(f"Loaded {well_formed} entries from {file_path}, {len(entries)} valid after expiration check")
+        return entries
    except Exception as e:
-        logging.error(f"Failed to save or prune {filename}: {e}")
+        logging.error(f"Failed to load {file_path}: {e}")
+        return entries
+
+def save_json_file(file_path, title, timestamp):
+    """Append one {"title", "timestamp"} record and rewrite the file pruned.
+
+    The retention window is chosen from the path: 24 hours when the path
+    contains "posted_", otherwise 7 days. Failures are logged, not raised.
+
+    Args:
+        file_path: Path of the JSON-lines file to rewrite.
+        title: Value stored under "title".
+        timestamp: ISO-8601 string stored under "timestamp".
+    """
+    # 24 hours for titles, 7 days for images
+    expiration_hours = 24 if "posted_" in file_path else 7 * 24
+    try:
+        entries = load_json_file(file_path, expiration_hours)
+        entries.append({"title": title, "timestamp": timestamp})
+
+        # Prune entries older than expiration period
+        cutoff = datetime.now(timezone.utc) - timedelta(hours=expiration_hours)
+        pruned_entries = []
+        for item in entries:
+            stamp = datetime.fromisoformat(item["timestamp"])
+            if stamp.tzinfo is None:
+                # Naive values are assumed to be UTC; without this the
+                # comparison raises TypeError and the whole save is lost.
+                stamp = stamp.replace(tzinfo=timezone.utc)
+            if stamp > cutoff:
+                pruned_entries.append(item)
+
+        with open(file_path, 'w') as f:
+            f.writelines(json.dumps(item) + '\n' for item in pruned_entries)
+
+        logging.info(f"Saved '{title}' to {file_path}")
+        logging.info(f"Pruned {file_path} to {len(pruned_entries)} entries (older than {expiration_hours//24} days removed)")
+    except Exception as e:
+        logging.error(f"Failed to save to {file_path}: {e}")

 def load_post_counts():
    counts = []
@@ -886,7 +885,8 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
        if content is None:
            logging.error(f"Post content is None for title '{post_data['title']}' - using fallback")
            content = "Content unavailable. Check the original source for details."
-        formatted_content = "\n".join(f"<p>{para}</p>" for para in content.split('\n') if para.strip())        
+        formatted_content = "\n".join(f"<p>{para}</p>" for para in content.split('\n') if para.strip())
+        
        author_id_map = {
            "owenjohnson": 10,
            "javiermorales": 2,
@@ -897,6 +897,20 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
        }
        author_id = author_id_map.get(author["username"], 5)
        
+        # Handle image upload
+        image_id = None
+        if image_url:
+            logging.info(f"Attempting image upload for '{post_data['title']}', URL: {image_url}, source: {image_source}")
+            image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, pixabay_url)
+            if not image_id:
+                logging.info(f"Flickr upload failed for '{post_data['title']}', falling back to Pixabay")
+                pixabay_query = post_data["title"][:50]
+                image_url, image_source, uploader, pixabay_url = get_image(pixabay_query)
+                if image_url:
+                    image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, pixabay_url)
+            if not image_id:
+                logging.warning(f"All image uploads failed for '{post_data['title']}' - posting without image")
+        
        payload = {
            "title": post_data["title"],
            "content": formatted_content,
@@ -911,19 +925,9 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
            }
        }
        
-        if image_url and not post_id:
-            logging.info(f"Attempting image upload for '{post_data['title']}', URL: {image_url}, source: {image_source}")
-            image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, pixabay_url)
-            if not image_id:
-                logging.info(f"Flickr upload failed for '{post_data['title']}', falling back to Pixabay")
-                pixabay_query = post_data["title"][:50]
-                image_url, image_source, uploader, pixabay_url = get_image(pixabay_query)
-                if image_url:
-                    image_id = upload_image_to_wp(image_url, post_data["title"], wp_base_url, wp_username, wp_password, image_source, uploader, pixabay_url)
-            if image_id:
-                payload["featured_media"] = image_id
-            else:
-                logging.warning(f"All image uploads failed for '{post_data['title']}' - posting without image")
+        if image_id:
+            payload["featured_media"] = image_id
+            logging.info(f"Set featured image for post '{post_data['title']}': Media ID={image_id}")
        
        endpoint = f"{wp_base_url}/posts/{post_id}" if post_id else f"{wp_base_url}/posts"
        method = requests.post
@@ -951,7 +955,7 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
            try:
                post = {"title": post_data["title"], "url": post_url}
                tweet = generate_article_tweet(author, post, author["persona"])
-                if post_tweet(author, tweet):  # Use the actual post_tweet function
+                if post_tweet(author, tweet):
                    logging.info(f"Successfully posted article tweet for {author['username']} on X")
                else:
                    logging.warning(f"Failed to post article tweet for {author['username']} on X")