# foodie-automator/foodie_automator_google.py

# foodie_automator_google.py
import requests
import random
import time
import logging
import re
import os
import json
import signal
import sys
from datetime import datetime, timedelta, timezone
from openai import OpenAI
from urllib.parse import quote
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from duckduckgo_search import DDGS
from foodie_config import (
    AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
    PERSONA_CONFIGS, CATEGORIES, get_clean_source_name, X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    select_best_author, smart_image_and_filter, get_flickr_image,
    get_next_author_round_robin, check_author_rate_limit, update_system_activity
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
from dotenv import load_dotenv
import fcntl

load_dotenv()

# Script identity, state-file locations and retention windows.
SCRIPT_NAME = "foodie_automator_google"
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7

# Read by signal_handler to decide whether an immediate exit is safe.
is_posting = False
LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_google.lock"

# Import-time snapshot of the persisted state; each raw list is kept next to a
# set of its "title" values (records lacking a "title" key are ignored).
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = {record["title"] for record in posted_titles_data if "title" in record}
used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
used_images = {record["title"] for record in used_images_data if "title" in record}

def signal_handler(sig, frame):
    """Handle a termination signal by recording the stop and exiting if idle.

    The script is always marked as "stopped" via ``update_system_activity``.
    When the module-level ``is_posting`` flag is false the process exits with
    status 0; otherwise only a message is logged and control returns to the
    interrupted code (nothing here schedules a later exit).

    Args:
        sig: Signal number supplied by the interpreter (unused).
        frame: Interrupted stack frame supplied by the interpreter (unused).
    """
    logging.info("Received termination signal, marking script as stopped...")
    update_system_activity(SCRIPT_NAME, "stopped")

    if not is_posting:
        logging.info("Safe to exit immediately.")
        sys.exit(0)

    logging.info("Currently posting, will exit after completion.")

# Route both SIGTERM and SIGINT through signal_handler.
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

# Log file written by setup_logging(); entries older than LOG_PRUNE_DAYS days
# are dropped each time logging is set up.
LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_google.log"
LOG_PRUNE_DAYS = 30
# Retry budget and base delay in seconds; retry loops in this module sleep
# RETRY_BACKOFF * 2**attempt between attempts.
MAX_RETRIES = 3
RETRY_BACKOFF = 2

def _prune_old_log_entries(log_path, max_age_days):
    """Rewrite *log_path* keeping only entries newer than *max_age_days* days.

    An entry is a line starting with a ``YYYY-MM-DD HH:MM:SS`` timestamp plus
    any following lines that do not start with one (e.g. tracebacks).  Leading
    text that precedes the first timestamped line cannot be dated and is
    dropped with a warning.
    """
    with open(log_path, 'r') as f:
        lines = f.readlines()

    timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
    entries = []
    current = []
    for line in lines:
        # A timestamped line closes the entry collected so far and opens a new one.
        if timestamp_pattern.match(line) and current:
            entries.append(''.join(current))
            current = []
        current.append(line)
    if current:
        entries.append(''.join(current))

    # logging renders %(asctime)s in local time by default, so the cutoff is
    # a naive local datetime rather than a UTC one.
    cutoff = datetime.now() - timedelta(days=max_age_days)
    kept = []
    for entry in entries:
        try:
            written_at = datetime.strptime(entry[:19], '%Y-%m-%d %H:%M:%S')
        except ValueError:
            logging.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
            continue
        if written_at > cutoff:
            kept.append(entry)

    with open(log_path, 'w') as f:
        f.writelines(kept)


def setup_logging():
    """Configure root logging to ``LOG_FILE`` plus the console.

    The log directory is created if needed, write access is verified, entries
    older than ``LOG_PRUNE_DAYS`` days are pruned from the existing file, and
    the root logger is then configured at INFO level with a file handler and
    an additional console handler.

    If any of those steps fails, the root logger is reconfigured for console
    output only and the failure is logged; no exception escapes this function.
    """
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    date_format = "%Y-%m-%d %H:%M:%S"

    try:
        # Ensure log directory exists
        log_dir = os.path.dirname(LOG_FILE)
        os.makedirs(log_dir, exist_ok=True)
        logging.debug(f"Log directory created/verified: {log_dir}")

        # Check write permissions
        if not os.access(log_dir, os.W_OK):
            raise PermissionError(f"No write permission for {log_dir}")

        # Test write to log file (append mode also creates it if missing)
        try:
            with open(LOG_FILE, 'a') as f:
                f.write("")
            logging.debug(f"Confirmed write access to {LOG_FILE}")
        except Exception as e:
            raise PermissionError(f"Cannot write to {LOG_FILE}: {e}") from e

        # Prune old logs
        if os.path.exists(LOG_FILE):
            _prune_old_log_entries(LOG_FILE, LOG_PRUNE_DAYS)
            logging.debug(f"Log file pruned: {LOG_FILE}")

        # Configure logging
        logging.basicConfig(
            filename=LOG_FILE,
            level=logging.INFO,
            format=log_format,
            datefmt=date_format,
            force=True  # Ensure this config takes precedence
        )
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logging.getLogger().addHandler(console_handler)
        logging.info("Logging initialized for foodie_automator_google.py")

    except Exception as e:
        # Fallback to console logging if file logging fails.  basicConfig()
        # without a filename already installs a stderr StreamHandler, so no
        # extra console handler is added here (a second one would print
        # every message twice).
        logging.basicConfig(
            level=logging.INFO,
            format=log_format,
            datefmt=date_format,
            force=True
        )
        logging.error(f"Failed to setup file logging for {LOG_FILE}: {e}. Using console logging.")
        logging.info("Console logging initialized as fallback for foodie_automator_google.py")

# Module-level OpenAI client; the API key is read from the OPENAI_API_KEY
# environment variable (load_dotenv() has already run above).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def acquire_lock():
    """Take the single-instance lock, exiting the process if it is already held.

    An exclusive, non-blocking ``flock`` is requested on ``LOCK_FILE`` (its
    directory is created if missing).  On success the current PID is written
    into the file.

    Returns:
        The open file object holding the lock.  The caller must keep it open
        for as long as the lock is needed and is responsible for releasing it.

    Raises:
        SystemExit: With status 0 when the lock cannot be obtained, which is
            treated as another instance already running.
    """
    os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
    # Append mode on purpose: opening with 'w' would truncate the file, and
    # with it the PID recorded by a running instance, before we know whether
    # the lock is ours.
    lock_fd = open(LOCK_FILE, 'a+')
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:
        lock_fd.close()
        logging.info("Another instance of foodie_automator_google.py is running")
        sys.exit(0)

    # The lock is ours now; replace any stale PID with our own.  The PID is
    # informational, so a failure to record it does not give up the lock.
    try:
        lock_fd.seek(0)
        lock_fd.truncate()
        lock_fd.write(str(os.getpid()))
        lock_fd.flush()
    except OSError as e:
        logging.warning(f"Could not record PID in {LOCK_FILE}: {e}")
    return lock_fd

def parse_search_volume(volume_text):
    """Convert a search-volume label such as ``"200K+"`` into a number.

    Only the first line of *volume_text* is considered.  ``+`` signs,
    thousands separators (``,``) and whitespace are ignored, and a trailing
    ``k``, ``m`` or ``b`` (case-insensitive) scales the value by a thousand,
    a million or a billion respectively.

    Args:
        volume_text: Raw cell text, e.g. ``"2M+"`` or ``"1,000+"``.

    Returns:
        The volume as a float, or ``0`` (after logging a warning) when the
        text is not a string or cannot be parsed.
    """
    multipliers = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}
    try:
        first_line = volume_text.split('\n')[0].lower()
        cleaned = first_line.replace('+', '').replace(',', '')
        # Drop every kind of whitespace, including non-breaking spaces
        # that may sit between the number and its suffix.
        cleaned = ''.join(cleaned.split())
        if cleaned and cleaned[-1] in multipliers:
            return float(cleaned[:-1]) * multipliers[cleaned[-1]]
        return float(cleaned)
    except (ValueError, AttributeError) as e:
        logging.warning(f"Could not parse search volume from '{volume_text}': {e}")
        return 0

def scrape_google_trends(geo='US'):
    """Scrape Google's trending-searches page for *geo* and return high-volume trends.

    A headless Chrome session loads the trending page (``hours=24``, sorted by
    search volume, ``category=5``) and waits up to 60 seconds for a
    ``<tbody>`` element.  A page-load timeout is retried up to
    ``MAX_RETRIES`` times with an exponentially growing pause; each retry
    starts a fresh browser.

    Args:
        geo: Region code placed in the URL's ``geo`` query parameter.

    Returns:
        A list of ``{"title", "link", "search_volume"}`` dicts for table rows
        whose parsed volume is at least 20,000, sorted by volume (highest
        first).  An empty list is returned when every attempt times out, when
        no row qualifies, or when an unexpected error occurs (it is logged).
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36")

    driver = None
    try:
        for attempt in range(MAX_RETRIES):
            try:
                driver = webdriver.Chrome(options=chrome_options)
                time.sleep(random.uniform(2, 5))
                url = f"https://trends.google.com/trending?geo={geo}&hours=24&sort=search-volume&category=5"
                logging.info(f"Navigating to {url} (attempt {attempt + 1})")
                driver.get(url)

                logging.info("Waiting for page to load...")
                WebDriverWait(driver, 60).until(
                    EC.presence_of_element_located((By.TAG_NAME, "tbody"))
                )
                break
            except TimeoutException:
                logging.warning(f"Timeout on attempt {attempt + 1} for geo={geo}")
                if attempt == MAX_RETRIES - 1:
                    logging.error(f"Failed after {MAX_RETRIES} attempts for geo={geo}")
                    return []
                # Shut the stalled browser down before backing off, and drop
                # the reference so neither a later timeout nor the `finally`
                # clause quits the same driver a second time.
                if driver:
                    driver.quit()
                    driver = None
                time.sleep(RETRY_BACKOFF * (2 ** attempt))

        # Scroll to the bottom and pause briefly before reading the table rows.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        trends = []
        rows = driver.find_elements(By.XPATH, "//tbody/tr")
        logging.info(f"Found {len(rows)} rows in tbody for geo={geo}")

        for row in rows:
            try:
                columns = row.find_elements(By.TAG_NAME, "td")
                if len(columns) < 3:
                    logging.info(f"Skipping row with insufficient columns: {len(columns)}")
                    continue

                # Second cell is read as the trend title, third as its volume.
                title = columns[1].text.strip()
                search_volume = parse_search_volume(columns[2].text.strip())
                logging.info(f"Parsed trend: {title} with search volume: {search_volume}")

                if not (title and search_volume >= 20000):
                    logging.info(f"Skipping trend: {title} (volume: {search_volume} < 20K or no title)")
                    continue

                link = f"https://trends.google.com/trends/explore?q={quote(title)}&geo={geo}"
                trends.append({
                    "title": title,
                    "link": link,
                    "search_volume": search_volume
                })
                logging.info(f"Added trend: {title} with search volume: {search_volume}")
            except Exception as e:
                # One bad row must not abort the whole scrape.
                logging.warning(f"Row processing error: {e}")
                continue

        if trends:
            trends.sort(key=lambda x: x["search_volume"], reverse=True)
            logging.info(f"Extracted {len(trends)} trends for geo={geo}: {[t['title'] for t in trends]}")
        else:
            logging.warning(f"No valid trends found with search volume >= 20K for geo={geo}")
        return trends
    except Exception as e:
        logging.error(f"Unexpected error in scrape_google_trends: {e}", exc_info=True)
        return []
    finally:
        if driver:
            driver.quit()
            logging.info(f"Chrome driver closed for geo={geo}")

def fetch_duckduckgo_news_context(trend_title, hours=24):
    """Build a short news context string for a trend from DuckDuckGo News.

    Up to five news results for ``"<trend_title> news"`` are requested with
    the one-day time limit, and the lower-cased titles of those published
    within the last *hours* hours are joined with spaces.  The whole fetch is
    retried up to ``MAX_RETRIES`` times with exponential backoff.

    Args:
        trend_title: The trending search term.
        hours: Maximum age, in hours, of a result for it to be included.
            The query itself is still limited to the past day
            (``timelimit="d"``), so values above 24 cannot widen the search.

    Returns:
        The joined titles, a "No recent news found ..." message when nothing
        recent was returned, or *trend_title* itself when every attempt failed.
    """
    for attempt in range(MAX_RETRIES):
        try:
            with DDGS() as ddgs:
                results = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5)
                cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
                titles = []
                for r in results:
                    date_str = None
                    try:
                        date_str = r["date"]
                        # fromisoformat() accepts any UTC offset; a trailing
                        # "Z" is rewritten because older Python versions
                        # reject it.
                        published = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
                    except (KeyError, AttributeError, TypeError, ValueError) as e:
                        # A single undated or oddly dated result is skipped
                        # rather than forcing a retry of the whole request.
                        logging.warning(f"Date parsing failed for '{date_str}': {e}")
                        continue
                    if published.tzinfo is None:
                        # Timestamps without an offset are treated as UTC.
                        published = published.replace(tzinfo=timezone.utc)
                    if published > cutoff:
                        titles.append(r["title"].lower())
                context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
                logging.info(f"DuckDuckGo News context for '{trend_title}': {context}")
                return context
        except Exception as e:
            logging.warning(f"DuckDuckGo News context fetch failed for '{trend_title}' (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
            continue
    logging.error(f"Failed to fetch DuckDuckGo News context for '{trend_title}' after {MAX_RETRIES} attempts")
    return trend_title

def curate_from_google(item, original_source, source_name, link, page_url):
    """Summarize one Google item and publish it to WordPress in two steps.

    The item's ``snippet`` is scored with ``is_interesting``, summarized, given
    a source link, and turned into post data.  The post is first created
    without share links (this call asks for a tweet), then updated with X and
    Facebook share links that point at the URL returned by the first call.

    Args:
        item: Mapping with at least ``snippet`` and, optionally, ``title``.
        original_source: Passed through to ``post_to_wp``.
        source_name: Source label used for the summary and the inserted link.
        link: URL of the source, used for the summary, link and post.
        page_url: NOTE(review): this argument is overwritten by the value
            unpacked from ``prepare_post_data`` before it is ever read, so the
            caller's value is currently ignored - confirm that is intended.

    Returns:
        ``(post_id, post_url)`` on success, otherwise ``(None, None)``.  Any
        exception raised while curating is logged and reported the same way.
    """
    logger = logging.getLogger(__name__)
    display_title = item.get('title', 'unknown')
    try:
        content = item.get('snippet', '')
        if not content:
            logger.info(f"No content for Google item: {display_title}")
            return None, None

        interest_score = is_interesting(content)
        if interest_score < 4:
            logger.info(f"Google item '{display_title}' not interesting enough: score {interest_score}")
            return None, None

        summary = summarize_with_gpt4o(content, source_name, link, interest_score=interest_score)
        if not summary:
            logger.warning(f"Failed to summarize Google item: {display_title}")
            return None, None

        # Remove the original title from the summary if present.  A missing,
        # empty or None title is skipped: it has nothing to strip and a None
        # value would make the membership test raise.
        raw_title = item.get('title', '')
        if raw_title and raw_title in summary:
            summary = summary.replace(raw_title, "").strip()
        # Collapse any run of blank lines left behind down to a single one.
        while "\n\n\n" in summary:
            summary = summary.replace("\n\n\n", "\n\n")

        final_summary = insert_link_naturally(summary, source_name, link)
        if not final_summary:
            logger.warning(f"Failed to insert link for Google item: {display_title}")
            return None, None

        # NOTE(review): `page_url` from the function arguments is replaced here.
        post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, display_title)
        if not post_data:
            logger.info(f"Post preparation failed for Google item: {display_title}")
            return None, None

        share_text = f"Check out this tasty find: {post_data['title']}"
        share_text_encoded = quote(share_text)
        share_links_template = (
            "Share this post: "
            '<a href="https://x.com/intent/tweet?url={post_url}&text={share_text}">X</a> | '
            '<a href="https://www.facebook.com/sharer/sharer.php?u={post_url}">Facebook</a>'
        )

        # First call: Post without share links (the public URL is not known yet)
        post_data["content"] = final_summary
        post_id, post_url = post_to_wp(
            post_data=post_data,
            category=category,
            link=link,
            author=author,
            image_url=image_url,
            original_source=original_source,
            image_source=image_source,
            uploader=uploader,
            page_url=page_url,
            interest_score=interest_score,
            should_post_tweet=True,
            summary=final_summary
        )

        if not post_id:
            logger.warning(f"Failed to post Google item to WP: {post_data['title']}")
            return None, None

        # Second call: Update the same post with share links; no image and no
        # tweet are requested this time.
        post_url_encoded = quote(post_url)
        share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
        post_data["content"] = f"{final_summary}\n\n{share_links}"
        post_id, post_url = post_to_wp(
            post_data=post_data,
            category=category,
            link=link,
            author=author,
            image_url=None,
            original_source=original_source,
            image_source=image_source,
            uploader=uploader,
            page_url=page_url,
            interest_score=interest_score,
            post_id=post_id,
            should_post_tweet=False
        )

        if post_id:
            logger.info(f"Successfully curated and posted Google item: {post_data['title']} (URL: {post_url})")
            return post_id, post_url

        logger.warning(f"Failed to update Google post with share links: {post_data['title']}")
        return None, None

    except Exception as e:
        # Keep the traceback, as the other top-level handlers in this module do.
        logger.error(f"Error curating Google item '{display_title}': {e}", exc_info=True)
        return None, None

def run_google_trends_automator():
    """Run one Google Trends curation pass under the single-instance lock.

    The lock is acquired, the script is marked "running", the posted-title
    and used-image state files are loaded, and ``curate_from_google_trends``
    is invoked.  The script is marked "stopped" on both the success and the
    error path, and the lock is always released.

    Returns:
        ``(post_data, category, sleep_time)`` as produced by the curation
        step, or ``(None, None, sleep_time)`` with a random 20-30 minute
        ``sleep_time`` (in seconds) when an exception occurred.
    """
    lock_fd = None
    try:
        lock_fd = acquire_lock()
        update_system_activity(SCRIPT_NAME, "running", os.getpid())  # Record start
        logging.info("***** Google Trends Automator Launched *****")

        # Load JSON files once.  Records without a "title" key are skipped,
        # matching how the same files are read at module import.
        posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
        posted_titles = {entry["title"] for entry in posted_titles_data if "title" in entry}
        used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
        used_images = {entry["title"] for entry in used_images_data if "title" in entry}

        # NOTE(review): curate_from_google_trends is neither defined nor
        # imported in the part of this module that was reviewed - confirm it
        # exists, otherwise this call raises NameError and every run takes
        # the error path below.
        post_data, category, sleep_time = curate_from_google_trends(posted_titles_data, posted_titles, used_images_data, used_images)
        if not post_data:
            logging.info("No postable Google Trend found")
        logging.info("Completed Google Trends run")
        update_system_activity(SCRIPT_NAME, "stopped")  # Record stop
        logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
        return post_data, category, sleep_time
    except Exception as e:
        logging.error(f"Fatal error in run_google_trends_automator: {e}", exc_info=True)
        update_system_activity(SCRIPT_NAME, "stopped")  # Record stop on error
        sleep_time = random.randint(1200, 1800)  # 20–30 minutes
        logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
        return None, None, sleep_time
    finally:
        if lock_fd:
            try:
                fcntl.flock(lock_fd, fcntl.LOCK_UN)
            finally:
                # Closing also drops the lock, so do it even if the explicit
                # unlock failed.
                lock_fd.close()
            # NOTE(review): unlinking a flock()-based lock file lets a process
            # that already opened the old file and one that creates a new
            # file both believe they hold the lock; consider leaving the file
            # in place.
            try:
                os.remove(LOCK_FILE)
            except FileNotFoundError:
                pass

if __name__ == "__main__":
    setup_logging()
    # run_google_trends_automator() logs its own "Run completed, sleep_time"
    # line on every return path, so it is not repeated here.
    run_google_trends_automator()