# foodie-automator/foodie_automator_google.py

# foodie_automator_google.py
import requests
import random
import time
import logging
import re
import os
import json
import signal
import sys
from datetime import datetime, timedelta, timezone
from openai import OpenAI
from urllib.parse import quote
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from duckduckgo_search import DDGS
from foodie_config import (
    AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
    PERSONA_CONFIGS, CATEGORIES, get_clean_source_name, X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, select_best_persona, determine_paragraph_count,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    smart_image_and_filter, insert_link_naturally, get_flickr_image
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
from dotenv import load_dotenv
import fcntl

load_dotenv()

# Set to True while a WordPress post is in flight; signal_handler reads it to
# decide whether the process may exit immediately.
is_posting = False
# File used with fcntl.flock in acquire_lock() so only one instance runs at a time.
LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_google.lock"

def signal_handler(sig, frame):
    """Handle SIGTERM/SIGINT: exit right away unless a post is in progress.

    Args:
        sig: Signal number delivered by the interpreter (unused).
        frame: Current stack frame at delivery time (unused).

    Raises:
        SystemExit: With status 0 when ``is_posting`` is False.
    """
    logging.info("Received termination signal, checking if safe to exit...")
    if not is_posting:
        logging.info("Safe to exit immediately.")
        sys.exit(0)
    # A post is in flight: only log, and let the current work run to completion.
    logging.info("Currently posting, will exit after completion.")

# Route both termination requests and Ctrl+C through signal_handler.
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_google.log"
# Log entries older than this many days are dropped by setup_logging().
LOG_PRUNE_DAYS = 30
# Attempt count and base delay (seconds, doubled per attempt) for the scraping
# and news-lookup retry loops.
MAX_RETRIES = 3
RETRY_BACKOFF = 2

# The posted-title / used-image history is loaded further down, after
# POSTED_TITLES_FILE, USED_IMAGES_FILE and their expiration constants are
# defined. Loading it here referenced those names before assignment and raised
# NameError at import time.

def setup_logging():
    """Prune stale entries from LOG_FILE and attach handlers to the root logger.

    The existing file is split into entries: a line starting with a
    ``YYYY-MM-DD HH:MM:SS`` timestamp plus any continuation lines that follow
    it (e.g. tracebacks). Entries older than LOG_PRUNE_DAYS, and any leading
    fragment without a timestamp, are dropped and the file is rewritten.
    The root logger is then set to INFO and given a file handler (append mode)
    and a console handler that share one format.
    """
    # FileHandler raises if the target directory does not exist yet.
    log_dir = os.path.dirname(LOG_FILE)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Previews of entries that could not be dated. They are reported only after
    # the handlers exist: logging before that makes the logging module install
    # an implicit default handler, which would duplicate console output.
    malformed_previews = []

    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, 'r') as f:
            lines = f.readlines()

        log_entries = []
        current_entry = []
        timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')

        for line in lines:
            if timestamp_pattern.match(line):
                # A new timestamp closes the entry collected so far.
                if current_entry:
                    log_entries.append(''.join(current_entry))
                current_entry = [line]
            else:
                current_entry.append(line)

        if current_entry:
            log_entries.append(''.join(current_entry))

        # The formatter below renders %(asctime)s in local time (the logging
        # default), so compare against a naive local cutoff rather than UTC.
        cutoff = datetime.now() - timedelta(days=LOG_PRUNE_DAYS)
        pruned_entries = []
        for entry in log_entries:
            try:
                timestamp = datetime.strptime(entry[:19], '%Y-%m-%d %H:%M:%S')
            except ValueError:
                malformed_previews.append(entry[:50])
                continue
            if timestamp > cutoff:
                pruned_entries.append(entry)

        with open(LOG_FILE, 'w') as f:
            f.writelines(pruned_entries)

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler(LOG_FILE, mode='a')
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(file_handler)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(console_handler)

    for preview in malformed_previews:
        logging.warning(f"Skipping malformed log entry (no timestamp): {preview}...")
    logging.info("Logging initialized for foodie_automator_google.py")

# OpenAI client; the API key comes from the environment (populated by load_dotenv()).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# History files and the retention windows passed to load_json_file for each.
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7

# In-memory sets of the "title" values from each history file. They are used
# to skip trends that were already posted and to record images already used.
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = {entry["title"] for entry in posted_titles_data}
used_images = {
    entry["title"]
    for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
    if "title" in entry
}

def acquire_lock():
    """Take the single-instance lock, or exit if another instance holds it.

    Returns:
        The open lock-file object. The exclusive flock is held for as long as
        this object stays open, so the caller must keep it and release it.

    Raises:
        SystemExit: With status 0 when the lock is already held elsewhere.
        OSError: If the PID cannot be written after the lock was obtained.
    """
    os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
    # Open without truncating: mode 'w' would wipe the PID recorded by a
    # running instance before we even know whether we can take the lock.
    lock_fd = open(LOCK_FILE, 'a+')
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:
        lock_fd.close()
        logging.info("Another instance of foodie_automator_google.py is running")
        sys.exit(0)

    try:
        # We own the lock now, so it is safe to replace the recorded PID.
        lock_fd.truncate(0)
        lock_fd.write(str(os.getpid()))
        lock_fd.flush()
    except OSError:
        # Closing the file also drops the flock, so nothing is left held.
        lock_fd.close()
        raise
    return lock_fd

def parse_search_volume(volume_text):
    """Convert a search-volume label such as "200K+" into a number.

    Only the first line of ``volume_text`` is considered. A trailing "+" and
    thousands separators are ignored, and a trailing k / m / b suffix
    (case-insensitive) scales the value by 1e3 / 1e6 / 1e9.

    Args:
        volume_text: Raw cell text, e.g. "200K+", "1M+", "20,000+" or "500".

    Returns:
        The volume as a float, or 0 when the text cannot be parsed (including
        when ``volume_text`` is not a string).
    """
    multipliers = {'k': 1000, 'm': 1000000, 'b': 1000000000}
    try:
        volume_part = volume_text.split('\n')[0].lower().strip()
        # "20,000+" style labels would otherwise fail float() and count as 0.
        volume_part = volume_part.replace('+', '').replace(',', '').strip()
        suffix = volume_part[-1:]
        if suffix in multipliers:
            return float(volume_part[:-1]) * multipliers[suffix]
        return float(volume_part)
    except (ValueError, AttributeError) as e:
        logging.warning(f"Could not parse search volume from '{volume_text}': {e}")
        return 0

def scrape_google_trends(geo='US'):
    """Scrape the Google Trends "trending" table for one region with headless Chrome.

    The page is loaded up to MAX_RETRIES times (exponential backoff between
    attempts) until a ``<tbody>`` appears. Each table row is then read as
    column 1 = title, column 2 = search-volume label.

    Args:
        geo: Region code placed in the ``geo=`` query parameter.

    Returns:
        A list of dicts with "title", "link" and "search_volume" keys for rows
        whose parsed volume is at least 20000, sorted by volume descending.
        Returns [] if the page never loads or any unexpected error occurs.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36")

    driver = None
    try:
        for attempt in range(MAX_RETRIES):
            try:
                driver = webdriver.Chrome(options=chrome_options)
                time.sleep(random.uniform(2, 5))
                url = f"https://trends.google.com/trending?geo={geo}&hours=24&sort=search-volume&category=5"
                logging.info(f"Navigating to {url} (attempt {attempt + 1})")
                driver.get(url)

                logging.info("Waiting for page to load...")
                WebDriverWait(driver, 60).until(
                    EC.presence_of_element_located((By.TAG_NAME, "tbody"))
                )
                break
            except TimeoutException:
                logging.warning(f"Timeout on attempt {attempt + 1} for geo={geo}")
                if attempt == MAX_RETRIES - 1:
                    logging.error(f"Failed after {MAX_RETRIES} attempts for geo={geo}")
                    return []
                # Shut the failed browser down before backing off so it is not
                # kept alive during the sleep, and clear the handle so the
                # finally-block never quits the same driver twice.
                if driver:
                    driver.quit()
                    driver = None
                time.sleep(RETRY_BACKOFF * (2 ** attempt))

        # Scroll to the bottom and pause briefly before reading the table
        # (presumably to let lazily rendered rows appear).
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        trends = []
        rows = driver.find_elements(By.XPATH, "//tbody/tr")
        logging.info(f"Found {len(rows)} rows in tbody for geo={geo}")

        for row in rows:
            try:
                columns = row.find_elements(By.TAG_NAME, "td")
                if len(columns) < 3:
                    logging.info(f"Skipping row with insufficient columns: {len(columns)}")
                    continue

                title = columns[1].text.strip()
                search_volume_text = columns[2].text.strip()
                search_volume = parse_search_volume(search_volume_text)
                logging.info(f"Parsed trend: {title} with search volume: {search_volume}")
                if not title or search_volume < 20000:
                    logging.info(f"Skipping trend: {title} (volume: {search_volume} < 20K or no title)")
                    continue

                link = f"https://trends.google.com/trends/explore?q={quote(title)}&geo={geo}"
                trends.append({
                    "title": title,
                    "link": link,
                    "search_volume": search_volume
                })
                logging.info(f"Added trend: {title} with search volume: {search_volume}")
            except Exception as e:
                # One unreadable row (e.g. a stale element) must not abort the scrape.
                logging.warning(f"Row processing error: {e}")
                continue

        if trends:
            trends.sort(key=lambda x: x["search_volume"], reverse=True)
            logging.info(f"Extracted {len(trends)} trends for geo={geo}: {[t['title'] for t in trends]}")
        else:
            logging.warning(f"No valid trends found with search volume >= 20K for geo={geo}")
        return trends
    except Exception as e:
        logging.error(f"Unexpected error in scrape_google_trends: {e}", exc_info=True)
        return []
    finally:
        if driver:
            driver.quit()
            logging.info(f"Chrome driver closed for geo={geo}")

def fetch_duckduckgo_news_context(trend_title, hours=24):
    """Collect recent news headlines about a trend to use as extra context.

    Queries DuckDuckGo News for "<trend_title> news" and keeps the lower-cased
    titles of results published within the last ``hours`` hours. The search is
    retried up to MAX_RETRIES times with exponential backoff.

    Args:
        trend_title: The trend to look up.
        hours: Maximum age of a result, in hours. The search itself is issued
            with ``timelimit="d"``, so values above 24 are unlikely to widen
            the result set.

    Returns:
        The matching headlines joined by spaces; a "No recent news found..."
        message when none qualify; or ``trend_title`` itself when every
        attempt failed.
    """
    for attempt in range(MAX_RETRIES):
        try:
            with DDGS() as ddgs:
                results = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5)
                cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
                titles = []
                for r in results:
                    date_str = None
                    try:
                        date_str = r["date"]
                        # fromisoformat accepts any "+HH:MM" offset; a trailing
                        # "Z" is normalised first because older Python versions
                        # do not parse it.
                        dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
                    except (KeyError, AttributeError, TypeError, ValueError) as e:
                        # A single undated/odd result is skipped instead of
                        # failing the whole attempt.
                        logging.warning(f"Date parsing failed for '{date_str}': {e}")
                        continue
                    if dt.tzinfo is None:
                        # Treat offset-less timestamps as UTC so the comparison
                        # below never mixes naive and aware datetimes.
                        dt = dt.replace(tzinfo=timezone.utc)
                    if dt > cutoff:
                        titles.append(r["title"].lower())
                context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
                logging.info(f"DuckDuckGo News context for '{trend_title}': {context}")
                return context
        except Exception as e:
            logging.warning(f"DuckDuckGo News context fetch failed for '{trend_title}' (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
            continue
    logging.error(f"Failed to fetch DuckDuckGo News context for '{trend_title}' after {MAX_RETRIES} attempts")
    return trend_title

def curate_from_google_trends(geo_list=None):
    """Pick one Google Trend, write it up and publish it to WordPress.

    Trends are scraped for every region in ``geo_list`` and tried in order
    (at most 10 candidates). A candidate is skipped when it was already
    posted, is filtered out by smart_image_and_filter, scores below 6 with
    is_interesting, cannot be summarised, or fails to post. The first
    candidate that posts successfully is recorded in the posted-titles and
    used-images history and ends the run.

    Args:
        geo_list: Region codes to scrape; defaults to ``['US']``.

    Returns:
        ``(post_data, category, True)`` for a successful post, otherwise
        ``(None, None, False)`` (including on any unexpected error).
    """
    global is_posting

    if geo_list is None:
        geo_list = ['US']

    try:
        # NOTE(review): this helper is called below but was never imported in
        # this file, so every run that reached author selection failed with
        # NameError. It is presumably defined in foodie_utils alongside the
        # other helpers - confirm. Imported here so that a missing name is
        # still reported through this function's own error handling.
        from foodie_utils import get_next_author_round_robin

        all_trends = []
        for geo in geo_list:
            trends = scrape_google_trends(geo=geo)
            if trends:
                all_trends.extend(trends)

        if not all_trends:
            logging.info("No Google Trends data available")
            return None, None, False

        attempts = 0
        max_attempts = 10
        while attempts < max_attempts and all_trends:
            trend = all_trends.pop(0)
            title = trend["title"]
            link = trend.get("link", "https://trends.google.com/")
            summary = trend.get("summary", "")
            source_name = "Google Trends"
            original_source = f'<a href="{link}">{source_name}</a>'

            if title in posted_titles:
                logging.info(f"Skipping already posted trend: {title}")
                attempts += 1
                continue

            logging.info(f"Trying Google Trend: {title} from {source_name}")

            image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
            if skip:
                logging.info(f"Skipping filtered Google Trend: {title}")
                attempts += 1
                continue

            ddg_context = fetch_duckduckgo_news_context(title)
            scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
            interest_score = is_interesting(scoring_content)
            logging.info(f"Interest score for '{title}': {interest_score}")
            if interest_score < 6:
                logging.info(f"Google Trends Interest Too Low: {interest_score}")
                attempts += 1
                continue

            num_paragraphs = determine_paragraph_count(interest_score)
            extra_prompt = (
                f"Generate exactly {num_paragraphs} paragraphs.\n"
                f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
                f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
                f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
                f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
                f"Do not include emojis in the summary."
            )
            content_to_summarize = scoring_content
            final_summary = summarize_with_gpt4o(
                content_to_summarize,
                source_name,
                link,
                interest_score=interest_score,
                extra_prompt=extra_prompt
            )
            if not final_summary:
                logging.info(f"Summary failed for '{title}'")
                attempts += 1
                continue

            final_summary = insert_link_naturally(final_summary, source_name, link)

            # Use round-robin author selection
            author = get_next_author_round_robin()
            author_username = author["username"]
            logging.info(f"Selected author via round-robin: {author_username}")

            post_data = {
                "title": generate_title_from_summary(final_summary),
                "content": final_summary,
                "status": "publish",
                "author": author_username,
                "categories": [generate_category_from_summary(final_summary)]
            }
            category = post_data["categories"][0]

            # Prefer a Flickr image; fall back to get_image, then to no image.
            image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
            if not image_url:
                image_url, image_source, uploader, page_url = get_image(image_query)
                if not image_url:
                    logging.warning(f"All image uploads failed for '{title}' - posting without image")
                    image_source = None
                    uploader = None
                    page_url = None

            # NOTE(review): the hook is computed but never used below - confirm
            # whether it should be part of the post or the call can be dropped.
            hook = get_dynamic_hook(post_data["title"]).strip()

            # The post URL is only known after the first publish, so the post
            # first goes out with literal {post_url}/{share_text} placeholders
            # and is updated with the real share links afterwards.
            share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
            share_links_template = (
                f'<p>{share_prompt} '
                f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
                f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
            )
            post_data["content"] = f"{final_summary}\n\n{share_links_template}"

            is_posting = True
            try:
                post_id, post_url = post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    page_url=page_url,
                    interest_score=interest_score,
                    should_post_tweet=True
                )
            except Exception as e:
                logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
                attempts += 1
                continue
            finally:
                is_posting = False

            if post_id:
                share_text = f"Check out this foodie gem! {post_data['title']}"
                share_text_encoded = quote(share_text)
                post_url_encoded = quote(post_url)
                # Plain replacement instead of str.format(): the template embeds
                # generated text, and a stray "{" or "}" in it would make
                # format() raise after the post is already live.
                share_links = (
                    share_links_template
                    .replace("{post_url}", post_url_encoded)
                    .replace("{share_text}", share_text_encoded)
                )
                post_data["content"] = f"{final_summary}\n\n{share_links}"
                is_posting = True
                try:
                    post_to_wp(
                        post_data=post_data,
                        category=category,
                        link=link,
                        author=author,
                        image_url=image_url,
                        original_source=original_source,
                        image_source=image_source,
                        uploader=uploader,
                        page_url=page_url,
                        interest_score=interest_score,
                        post_id=post_id,
                        should_post_tweet=False
                    )
                except Exception as e:
                    # The post itself is live; a failed link update is logged
                    # and the trend is still recorded as posted below.
                    logging.error(f"Failed to update WordPress post '{title}' with share links: {e}", exc_info=True)
                finally:
                    is_posting = False

                timestamp = datetime.now(timezone.utc).isoformat()
                save_json_file(POSTED_TITLES_FILE, title, timestamp)
                posted_titles.add(title)
                logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")

                if image_url:
                    save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                    used_images.add(image_url)
                    logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")

                logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Google Trends *****")
                return post_data, category, True

            attempts += 1
            logging.info(f"WP posting failed for '{post_data['title']}'")

        logging.info("No interesting Google Trend found after attempts")
        return None, None, False
    except Exception as e:
        logging.error(f"Unexpected error in curate_from_google_trends: {e}", exc_info=True)
        return None, None, False

def run_google_trends_automator():
    """Run one curation pass under the single-instance lock.

    Returns:
        The ``(post_data, category, should_continue)`` tuple produced by
        curate_from_google_trends, or ``(None, None, False)`` if an unexpected
        error occurs. If another instance holds the lock, acquire_lock() exits
        the process instead of returning.
    """
    lock_fd = None
    try:
        lock_fd = acquire_lock()
        logging.info("***** Google Trends Automator Launched *****")
        geo_list = ['US', 'GB', 'AU']
        post_data, category, should_continue = curate_from_google_trends(geo_list=geo_list)
        if not post_data:
            logging.info("No postable Google Trend found")
        else:
            logging.info("Completed Google Trends run")
        return post_data, category, should_continue
    except Exception as e:
        logging.error(f"Fatal error in run_google_trends_automator: {e}", exc_info=True)
        return None, None, False
    finally:
        if lock_fd:
            # Remove the file while the lock is still held. Deleting it after
            # unlocking could remove a lock file that a newly started instance
            # has just locked, letting a third instance start alongside it.
            try:
                os.remove(LOCK_FILE)
            except FileNotFoundError:
                pass
            except OSError as e:
                logging.warning(f"Could not remove lock file {LOCK_FILE}: {e}")
            try:
                fcntl.flock(lock_fd, fcntl.LOCK_UN)
            finally:
                lock_fd.close()

if __name__ == "__main__":
    # Script entry point: configure logging first so the run itself is captured,
    # then perform a single curation pass and report its outcome.
    setup_logging()
    run_result = run_google_trends_automator()
    post_data, category, should_continue = run_result
    logging.info(f"Run completed, should_continue: {should_continue}")