foodie-automator/foodie_automator_google.py

# foodie_automator_google.py
import requests
import random
import time
import logging
import re
import os
import json
import signal
import sys
from datetime import datetime, timedelta, timezone
from openai import OpenAI
from urllib.parse import quote
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from duckduckgo_search import DDGS
from foodie_config import (
    AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
    PERSONA_CONFIGS, CATEGORIES, CTAS, get_clean_source_name, X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, select_best_persona, determine_paragraph_count,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    smart_image_and_filter, insert_link_naturally, get_flickr_image_via_ddg
)
from foodie_hooks import get_dynamic_hook, select_best_cta
from dotenv import load_dotenv

load_dotenv()

# Flag to indicate if we're in the middle of posting
is_posting = False

def signal_handler(sig, frame):
    logging.info("Received termination signal, checking if safe to exit...")
    if is_posting:
        logging.info("Currently posting, will exit after completion.")
    else:
        logging.info("Safe to exit immediately.")
        sys.exit(0)

signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
file_handler = logging.FileHandler('/home/shane/foodie_automator/foodie_automator_google.log', mode='a')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(console_handler)
logging.info("Logging initialized for foodie_automator_google.py")

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7

posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data)
used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)

def parse_search_volume(volume_text):
    try:
        volume_part = volume_text.split('\n')[0].lower().strip().replace('+', '')
        if 'k' in volume_part:
            volume = float(volume_part.replace('k', '')) * 1000
        elif 'm' in volume_part:
            volume = float(volume_part.replace('m', '')) * 1000000
        else:
            volume = float(volume_part)
        return volume
    except (ValueError, AttributeError) as e:
        logging.warning(f"Could not parse search volume from '{volume_text}': {e}")
        return 0

def scrape_google_trends(geo='US'):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(options=chrome_options)
    try:
        for attempt in range(3):
            try:
                time.sleep(random.uniform(2, 5))
                url = f"https://trends.google.com/trending?geo={geo}&hours=24&sort=search-volume&category=5"
                logging.info(f"Navigating to {url} (attempt {attempt + 1})")
                driver.get(url)

                logging.info("Waiting for page to load...")
                WebDriverWait(driver, 60).until(
                    EC.presence_of_element_located((By.TAG_NAME, "tbody"))
                )
                break
            except TimeoutException:
                logging.warning(f"Timeout on attempt {attempt + 1} for geo={geo}")
                if attempt == 2:
                    logging.error(f"Failed after 3 attempts for geo={geo}")
                    return []
                time.sleep(5)

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        trends = []
        rows = driver.find_elements(By.XPATH, "//tbody/tr")
        logging.info(f"Found {len(rows)} rows in tbody for geo={geo}")

        cutoff_date = datetime.now(timezone.utc) - timedelta(hours=24)
        for row in rows:
            try:
                columns = row.find_elements(By.TAG_NAME, "td")
                if len(columns) >= 3:
                    title = columns[1].text.strip()
                    search_volume_text = columns[2].text.strip()
                    search_volume = parse_search_volume(search_volume_text)
                    logging.info(f"Parsed trend: {title} with search volume: {search_volume}")
                    if title and search_volume >= 20000:
                        link = f"https://trends.google.com/trends/explore?q={quote(title)}&geo={geo}"
                        trends.append({
                            "title": title,
                            "link": link,
                            "search_volume": search_volume
                        })
                        logging.info(f"Added trend: {title} with search volume: {search_volume}")
                    else:
                        logging.info(f"Skipping trend: {title} (volume: {search_volume} < 20K or no title)")
                else:
                    logging.info(f"Skipping row with insufficient columns: {len(columns)}")
            except Exception as e:
                logging.warning(f"Row processing error: {e}")
                continue

        if trends:
            trends.sort(key=lambda x: x["search_volume"], reverse=True)
            logging.info(f"Extracted {len(trends)} trends for geo={geo}: {[t['title'] for t in trends]}")
            print(f"Raw trends fetched for geo={geo}: {[t['title'] for t in trends]}")
        else:
            logging.warning(f"No valid trends found with search volume >= 20K for geo={geo}")
        return trends
    finally:
        driver.quit()
        logging.info(f"Chrome driver closed for geo={geo}")

def fetch_duckduckgo_news_context(trend_title, hours=24):
    try:
        with DDGS() as ddgs:
            results = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5)
            titles = []
            for r in results:
                try:
                    date_str = r["date"]
                    if '+00:00' in date_str:
                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
                    else:
                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
                    if dt > (datetime.now(timezone.utc) - timedelta(hours=24)):
                        titles.append(r["title"].lower())
                except ValueError as e:
                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
                    continue
            context = " ".join(titles) if titles else "No recent news found within 24 hours"
            logging.info(f"DuckDuckGo News context for '{trend_title}': {context}")
            return context
    except Exception as e:
        logging.warning(f"DuckDuckGo News context fetch failed for '{trend_title}': {e}")
        return trend_title

def curate_from_google_trends(geo_list=['US']):
    # Fetch Google Trends data for each geo
    all_trends = []
    for geo in geo_list:
        trends = scrape_google_trends(geo=geo)
        if trends:
            all_trends.extend(trends)

    if not all_trends:
        print("No Google Trends data available")
        logging.info("No Google Trends data available")
        return None, None, None

    attempts = 0
    max_attempts = 10
    while attempts < max_attempts and all_trends:
        trend = all_trends.pop(0)
        title = trend["title"]
        link = trend.get("link", "https://trends.google.com/")
        summary = trend.get("summary", "")
        source_name = "Google Trends"
        original_source = f'<a href="{link}">{source_name}</a>'

        if title in posted_titles:
            print(f"Skipping already posted trend: {title}")
            logging.info(f"Skipping already posted trend: {title}")
            attempts += 1
            continue

        print(f"Trying Google Trend: {title} from {source_name}")
        logging.info(f"Trying Google Trend: {title} from {source_name}")

        # Check if the trend should be filtered out
        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
        if skip:
            print(f"Skipping filtered Google Trend: {title}")
            logging.info(f"Skipping filtered Google Trend: {title}")
            attempts += 1
            continue

        # Calculate interest score
        scoring_content = f"{title}\n\n{summary}"
        interest_score = is_interesting(scoring_content)
        logging.info(f"Interest score for '{title}': {interest_score}")
        if interest_score < 6:
            print(f"Google Trends Interest Too Low: {interest_score}")
            logging.info(f"Google Trends Interest Too Low: {interest_score}")
            attempts += 1
            continue

        # Summarize the trend
        num_paragraphs = determine_paragraph_count(interest_score)
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs.\n"
            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
            f"Do NOT introduce unrelated concepts.\n"
            f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
            f"Do not include emojis in the summary."
        )
        content_to_summarize = scoring_content
        final_summary = summarize_with_gpt4o(
            content_to_summarize,
            source_name,
            link,
            interest_score=interest_score,
            extra_prompt=extra_prompt
        )
        if not final_summary:
            logging.info(f"Summary failed for '{title}'")
            attempts += 1
            continue

        final_summary = insert_link_naturally(final_summary, source_name, link)

        # Prepare post data
        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
        if not post_data:
            attempts += 1
            continue

        # Fetch image
        image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
        if not image_url:
            image_url, image_source, uploader, page_url = get_image(image_query)

        # Generate hooks and initial CTA
        hook = get_dynamic_hook(post_data["title"]).strip()
        cta = select_best_cta(post_data["title"], final_summary, post_url=None)

        post_data["content"] = f"{final_summary}\n\n{cta}"

        # Post to WordPress and tweet
        global is_posting
        is_posting = True
        try:
            post_id, post_url = post_to_wp(
                post_data=post_data,
                category=category,
                link=link,
                author=author,
                image_url=image_url,
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
                pixabay_url=pixabay_url,
                interest_score=interest_score,
                should_post_tweet=True  # Post the X tweet on the first call
            )
        finally:
            is_posting = False

        if post_id:
            cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
            post_data["content"] = f"{final_summary}\n\n{cta}"
            is_posting = True
            try:
                post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    pixabay_url=pixabay_url,
                    interest_score=interest_score,
                    post_id=post_id,
                    should_post_tweet=False  # Skip X tweet on the update call
                )
            finally:
                is_posting = False

            timestamp = datetime.now(timezone.utc).isoformat()
            save_json_file(POSTED_TITLES_FILE, title, timestamp)
            posted_titles.add(title)
            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")

            if image_url:
                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                used_images.add(image_url)
                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")

            print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Google Trends *****")
            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Google Trends *****")
            return post_data, category, random.randint(0, 1800)

        attempts += 1
        logging.info(f"WP posting failed for '{post_data['title']}'")

    print("No interesting Google Trend found after attempts")
    logging.info("No interesting Google Trend found after attempts")
    return None, None, random.randint(600, 1800)

def run_google_trends_automator():
    logging.info("***** Google Trends Automator Launched *****")
    geo_list = ['US', 'GB', 'AU']
    post_data, category, sleep_time = curate_from_google_trends(geo_list=geo_list)
    print(f"Sleeping for {sleep_time}s")
    logging.info(f"Completed run with sleep time: {sleep_time} seconds")
    time.sleep(sleep_time)
    return post_data, category, sleep_time

if __name__ == "__main__":
    run_google_trends_automator()