Initial commit of foodie automator scripts

2025-04-20 20:12:00 +10:00
commit d4f098639e
7 changed files with 2120 additions and 0 deletions
@@ -0,0 +1,294 @@
+import requests
+import random
+import time
+import logging
+import re
+import os
+import json
+from datetime import datetime, timedelta, timezone
+from openai import OpenAI
+from urllib.parse import quote
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+from selenium.common.exceptions import TimeoutException
+from duckduckgo_search import DDGS
+from foodie_config import (
+    AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
+    SUMMARY_PERSONA_PROMPTS, CATEGORIES, CTAS, get_clean_source_name
+)
+from foodie_utils import (
+    load_json_file, save_json_file, get_image, generate_image_query,
+    upload_image_to_wp, select_best_persona, determine_paragraph_count, is_interesting,
+    generate_title_from_summary, summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
+    prepare_post_data, smart_image_and_filter, insert_link_naturally, get_flickr_image_via_ddg
+)
+from foodie_hooks import get_dynamic_hook, select_best_cta
+
+# Configure the root logger at INFO level so that every logging.* call in this
+# module is written both to the log file (opened in append mode) and to the
+# console, using one shared line format.
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+
+_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
+file_handler = logging.FileHandler('/tmp/foodie_automator_google_trends.log', mode='a')
+console_handler = logging.StreamHandler()
+# The file handler is attached first, then the console handler.
+for _handler in (file_handler, console_handler):
+    _handler.setFormatter(logging.Formatter(_LOG_FORMAT))
+    logger.addHandler(_handler)
+logging.info("Logging initialized for foodie_automator_google.py")
+
+# SECURITY: never embed the OpenAI API key in source control. The key is read
+# from the OPENAI_API_KEY environment variable instead; any key that was ever
+# committed to this repository must be treated as compromised and revoked.
+# NOTE(review): when the variable is unset the OpenAI client is expected to
+# fail at construction time rather than silently run unauthenticated - confirm
+# against the installed openai package version.
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+# On-disk JSON stores used for de-duplication, plus the expiry values handed
+# to load_json_file() as its second argument.
+# NOTE(review): the exact expiry semantics live in foodie_utils.load_json_file;
+# the units below are taken from the constant names only - confirm there.
+POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json'
+USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
+EXPIRATION_HOURS = 24
+IMAGE_EXPIRATION_DAYS = 7
+
+# Trend titles that have already been posted (every entry must carry "title").
+posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
+posted_titles = {entry["title"] for entry in posted_titles_data}
+# Used-image records are keyed by "title" as well; entries without that key
+# are ignored.
+used_images = {
+    entry["title"]
+    for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
+    if "title" in entry
+}
+
+def parse_search_volume(volume_text):
+    """Convert a search-volume cell such as ``"20K+"`` into a number.
+
+    Only the first line of ``volume_text`` is considered. It is lower-cased,
+    ``+`` signs and thousands separators (``,``) are removed, and a trailing
+    ``k`` / ``m`` suffix multiplies the value by 1,000 / 1,000,000.
+
+    Args:
+        volume_text: Raw cell text, e.g. ``"200K+\\n1,000%"``.
+
+    Returns:
+        The volume as a float, or ``0`` when the text cannot be parsed
+        (including ``None`` or an empty string); a warning is logged then.
+    """
+    multipliers = {'k': 1_000, 'm': 1_000_000}
+    try:
+        volume_part = volume_text.split('\n')[0].lower().strip()
+        # Drop the "+" marker and any digit grouping so "2,000+" parses too.
+        volume_part = volume_part.replace('+', '').replace(',', '').strip()
+        suffix = volume_part[-1:]
+        if suffix in multipliers:
+            return float(volume_part[:-1]) * multipliers[suffix]
+        return float(volume_part)
+    except (ValueError, AttributeError, TypeError) as e:
+        logging.warning(f"Could not parse search volume from '{volume_text}': {e}")
+        return 0
+
+def scrape_google_trends(geo='US'):
+    """Scrape the Google Trends "trending" table for one region.
+
+    Starts a headless Chrome session, loads the trending page for ``geo``
+    (``hours=24``, ``sort=search-volume``, ``category=5``) and reads every
+    ``//tbody/tr`` row. In each row the second cell is taken as the trend
+    title and the third cell as the search-volume text.
+
+    Args:
+        geo: Region code placed into the ``geo`` query parameter.
+
+    Returns:
+        A list of ``{"title", "link", "search_volume"}`` dicts for trends
+        whose parsed volume is at least 20000, sorted by volume descending.
+        An empty list is returned when the page times out three times or no
+        row qualifies.
+
+    Notes:
+        Only ``TimeoutException`` is retried; any other error raised while
+        loading the page propagates to the caller. The Chrome driver is
+        always quit before returning or raising.
+    """
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36")
+
+    driver = webdriver.Chrome(options=chrome_options)
+    try:
+        for attempt in range(3):
+            try:
+                # Randomised pause before every page load.
+                # NOTE(review): presumably to look less like an automated
+                # client - confirm intent.
+                time.sleep(random.uniform(2, 5))
+                url = f"https://trends.google.com/trending?geo={geo}&hours=24&sort=search-volume&category=5"
+                logging.info(f"Navigating to {url} (attempt {attempt + 1})")
+                driver.get(url)
+
+                logging.info("Waiting for page to load...")
+                WebDriverWait(driver, 60).until(
+                    EC.presence_of_element_located((By.TAG_NAME, "tbody"))
+                )
+                break
+            except TimeoutException:
+                logging.warning(f"Timeout on attempt {attempt + 1} for geo={geo}")
+                if attempt == 2:
+                    logging.error(f"Failed after 3 attempts for geo={geo}")
+                    return []
+                time.sleep(5)
+
+        # Scroll to the bottom and give the page a moment before reading rows.
+        # NOTE(review): presumably triggers lazy-loaded rows - confirm.
+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        time.sleep(2)
+
+        trends = []
+        rows = driver.find_elements(By.XPATH, "//tbody/tr")
+        logging.info(f"Found {len(rows)} rows in tbody for geo={geo}")
+
+        for row in rows:
+            # A failure in one row must not abort the whole scrape.
+            try:
+                columns = row.find_elements(By.TAG_NAME, "td")
+                if len(columns) < 3:
+                    logging.info(f"Skipping row with insufficient columns: {len(columns)}")
+                    continue
+
+                title = columns[1].text.strip()
+                search_volume_text = columns[2].text.strip()
+                search_volume = parse_search_volume(search_volume_text)
+                logging.info(f"Parsed trend: {title} with search volume: {search_volume}")
+
+                if not (title and search_volume >= 20000):
+                    logging.info(f"Skipping trend: {title} (volume: {search_volume} < 20K or no title)")
+                    continue
+
+                link = f"https://trends.google.com/trends/explore?q={quote(title)}&geo={geo}"
+                trends.append({
+                    "title": title,
+                    "link": link,
+                    "search_volume": search_volume
+                })
+                logging.info(f"Added trend: {title} with search volume: {search_volume}")
+            except Exception as e:
+                logging.warning(f"Row processing error: {e}")
+                continue
+
+        if trends:
+            trends.sort(key=lambda x: x["search_volume"], reverse=True)
+            logging.info(f"Extracted {len(trends)} trends for geo={geo}: {[t['title'] for t in trends]}")
+            print(f"Raw trends fetched for geo={geo}: {[t['title'] for t in trends]}")
+        else:
+            logging.warning(f"No valid trends found with search volume >= 20K for geo={geo}")
+        return trends
+    finally:
+        driver.quit()
+        logging.info(f"Chrome driver closed for geo={geo}")
+
+def fetch_duckduckgo_news_context(trend_title, hours=24):
+    """Collect recent news headlines about a trend to use as context.
+
+    Queries DuckDuckGo News for ``"<trend_title> news"`` (past-day time
+    limit, at most 5 results) and keeps the lower-cased titles of results
+    published within the last ``hours`` hours.
+
+    Args:
+        trend_title: The trend to search for.
+        hours: Maximum age of a result in hours. The search itself is
+            limited to the past day (``timelimit="d"``), so values above 24
+            cannot widen the window.
+
+    Returns:
+        The kept titles joined by single spaces, a "No recent news found"
+        message when none qualify, or ``trend_title`` itself when the search
+        fails.
+    """
+    try:
+        cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
+        with DDGS() as ddgs:
+            results = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5)
+            titles = []
+            for r in results:
+                date_str = r.get("date")
+                try:
+                    # fromisoformat() only accepts a trailing "Z" from Python
+                    # 3.11 on, so normalise it to an explicit UTC offset.
+                    dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
+                    if dt.tzinfo is None:
+                        # Treat naive timestamps as UTC so the comparison
+                        # with the aware cutoff cannot raise.
+                        dt = dt.replace(tzinfo=timezone.utc)
+                except (ValueError, TypeError, AttributeError) as e:
+                    # A missing or malformed date skips this result only.
+                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
+                    continue
+                if dt > cutoff:
+                    titles.append(r["title"].lower())
+            context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
+            logging.info(f"DuckDuckGo News context for '{trend_title}': {context}")
+            return context
+    except Exception as e:
+        # Best effort: fall back to the bare trend title as context.
+        logging.warning(f"DuckDuckGo News context fetch failed for '{trend_title}': {e}")
+        return trend_title
+
+def curate_from_google_trends(geo_list=None):
+    """Turn the first suitable Google trend into a published WordPress post.
+
+    Regions are tried in order. Within a region, trends are taken from the
+    highest search volume down and each one passes through these gates:
+    not already posted, not rejected by ``smart_image_and_filter``, interest
+    score of at least 6, a non-empty summary, and usable post data. The first
+    trend that is posted successfully ends the run.
+
+    Args:
+        geo_list: Region codes to try in order. Defaults to ``['US']``.
+
+    Returns:
+        ``(post_data, category, delay)`` with ``delay`` a random integer in
+        0..1800 after a successful post, or ``(None, None, delay)`` with
+        ``delay`` in 600..1200 when nothing was posted.
+    """
+    # Avoid a mutable default argument; None stands for the original ['US'].
+    if geo_list is None:
+        geo_list = ['US']
+
+    original_source = '<a href="https://trends.google.com/">Google Trends</a>'
+    for geo in geo_list:
+        trends = scrape_google_trends(geo=geo)
+        if not trends:
+            print(f"No trends available for geo={geo}")
+            logging.info(f"No trends available for geo={geo}")
+            continue
+
+        attempts = 0
+        max_attempts = 10
+        while attempts < max_attempts and trends:
+            trend = trends.pop(0)  # Take highest-volume trend
+            title = trend["title"]
+            link = trend["link"]
+            search_volume = trend["search_volume"]
+            print(f"Trying Trend: {title} with search volume: {search_volume} for geo={geo}")
+            logging.info(f"Trying Trend: {title} with search volume: {search_volume} for geo={geo}")
+
+            if title in posted_titles:
+                print(f"Skipping already posted trend: {title}")
+                logging.info(f"Skipping already posted trend: {title}")
+                attempts += 1
+                continue
+
+            image_query, relevance_keywords, skip = smart_image_and_filter(title, "")
+            if skip:
+                print(f"Skipping unwanted trend: {title}")
+                logging.info(f"Skipping unwanted trend: {title}")
+                attempts += 1
+                continue
+
+            # Score the trend on its title plus recent news headlines.
+            context = fetch_duckduckgo_news_context(title)
+            scoring_content = f"{title}\n\n{context}"
+            interest_score = is_interesting(scoring_content)
+            logging.info(f"Interest score for '{title}' in geo={geo}: {interest_score}")
+            if interest_score < 6:
+                print(f"Trend Interest Too Low: {interest_score}")
+                logging.info(f"Trend Interest Too Low: {interest_score}")
+                attempts += 1
+                continue
+
+            num_paragraphs = determine_paragraph_count(interest_score)
+            extra_prompt = (
+                f"Generate exactly {num_paragraphs} paragraphs. "
+                f"Do not mention Google Trends, Google, or include any links. "
+                f"Summarize as a standalone food industry trend, focusing on '{title}' and its context."
+            )
+            final_summary = summarize_with_gpt4o(
+                scoring_content,
+                source_name="Google Trends",
+                source_url=link,
+                interest_score=interest_score,
+                extra_prompt=extra_prompt
+            )
+            if not final_summary:
+                logging.info(f"Summary failed for '{title}'")
+                attempts += 1
+                continue
+
+            final_summary = insert_link_naturally(final_summary, "Google Trends", link)
+            post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
+            if not post_data:
+                attempts += 1
+                continue
+
+            # The image chosen by prepare_post_data() is replaced here: Flickr
+            # (via DuckDuckGo) first, then get_image() as a fallback.
+            # NOTE(review): page_url is never used, while pixabay_url from
+            # prepare_post_data() is still passed to post_to_wp() - confirm
+            # which URL should accompany the final image.
+            image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
+            if not image_url:
+                image_url, image_source, uploader, page_url = get_image(image_query)
+
+            # NOTE(review): hook is computed but never used below - confirm
+            # whether it was meant to be part of the post content.
+            hook = get_dynamic_hook(post_data["title"]).strip()
+            cta = select_best_cta(post_data["title"], final_summary, post_url=None)
+            post_data["content"] = f"{final_summary}\n\n{cta}"
+
+            # First pass creates the post so that its URL becomes known.
+            post_id, post_url = post_to_wp(
+                post_data=post_data,
+                category=category,
+                link=link,
+                author=author,
+                image_url=image_url,
+                original_source=original_source,
+                image_source=image_source,
+                uploader=uploader,
+                pixabay_url=pixabay_url,
+                interest_score=interest_score
+            )
+
+            if not post_id:
+                # Count a failed publish like every other failure path so
+                # max_attempts bounds the work done per region.
+                logging.warning(f"Posting failed for '{title}'")
+                attempts += 1
+                continue
+
+            # Second pass updates the same post with a CTA that can reference
+            # the now-known post URL.
+            cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
+            post_data["content"] = f"{final_summary}\n\n{cta}"
+            post_to_wp(
+                post_data=post_data,
+                category=category,
+                link=link,
+                author=author,
+                image_url=image_url,
+                original_source=original_source,
+                image_source=image_source,
+                uploader=uploader,
+                pixabay_url=pixabay_url,
+                interest_score=interest_score,
+                post_id=post_id
+            )
+
+            # Record the trend (and image) so later runs can skip them.
+            timestamp = datetime.now(timezone.utc).isoformat()
+            save_json_file(POSTED_TITLES_FILE, title, timestamp)
+            posted_titles.add(title)
+            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
+
+            if image_url:
+                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
+                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
+
+            print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from trend for geo={geo} *****")
+            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from trend for geo={geo} *****")
+            return post_data, category, random.randint(0, 1800)
+
+        print(f"No interesting trend found for geo={geo}")
+        logging.info(f"No interesting trend found for geo={geo}")
+
+    print(f"No interesting trend found across regions {geo_list}")
+    logging.info(f"No interesting trend found across regions {geo_list}")
+    return None, None, random.randint(600, 1200)
+
+def run_google_trends_automator():
+    """Run one curation pass over US, GB and AU, then sleep.
+
+    Calls curate_from_google_trends() for the three regions, announces the
+    delay it returned, and blocks for that many seconds.
+
+    Returns:
+        The ``(post_data, category, sleep_time)`` values produced by
+        curate_from_google_trends().
+    """
+    logging.info("***** Google Trends Automator Launched *****")
+    regions = ['US', 'GB', 'AU']
+    outcome = curate_from_google_trends(geo_list=regions)
+    post_data, category, sleep_time = outcome
+    print(f"Sleeping for {sleep_time}s")
+    logging.info(f"Completed run with sleep time: {sleep_time} seconds")
+    time.sleep(sleep_time)
+    return post_data, category, sleep_time
+
+# Script entry point: perform a single automator run when executed directly.
+if __name__ == "__main__":
+    run_google_trends_automator()