use cursor to optimize files

2025-05-03 16:23:06 +10:00
parent 427a5cb919
commit 2ca39915e0
5 changed files with 1411 additions and 1634 deletions
@@ -9,6 +9,7 @@ import json
 import signal
 import sys
 from datetime import datetime, timedelta, timezone
+from typing import List, Dict, Optional, Tuple
 from openai import OpenAI
 from urllib.parse import quote
 from selenium import webdriver
@@ -16,11 +17,12 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
-from selenium.common.exceptions import TimeoutException
+from selenium.common.exceptions import TimeoutException, WebDriverException
 from duckduckgo_search import DDGS
 from foodie_config import (
    AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
-    PERSONA_CONFIGS, CATEGORIES, get_clean_source_name, X_API_CREDENTIALS
+    PERSONA_CONFIGS, CATEGORIES, get_clean_source_name, X_API_CREDENTIALS,
+    FILE_PATHS, EXPIRATION_DAYS, IMAGE_EXPIRATION_DAYS
 )
 from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
@@ -29,320 +31,254 @@ from foodie_utils import (
    generate_category_from_summary, post_to_wp, prepare_post_data,
    smart_image_and_filter, insert_link_naturally, get_flickr_image
 )
-from foodie_hooks import get_dynamic_hook, get_viral_share_prompt  # Removed select_best_cta import
+from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
 from dotenv import load_dotenv

+# Load environment variables
 load_dotenv()

+# Global state
 is_posting = False
+logger = logging.getLogger(__name__)

-def signal_handler(sig, frame):
-    logging.info("Received termination signal, checking if safe to exit...")
-    if is_posting:
-        logging.info("Currently posting, will exit after completion.")
-    else:
-        logging.info("Safe to exit immediately.")
-        sys.exit(0)
+class GoogleTrendsScraper:
+    def __init__(self):
+        self.driver = None
+        self.setup_logging()
+        self.setup_signal_handlers()
+        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+        self.posted_titles = self.load_posted_titles()
+        self.used_images = self.load_used_images()

-signal.signal(signal.SIGTERM, signal_handler)
-signal.signal(signal.SIGINT, signal_handler)
+    def setup_logging(self) -> None:
+        """Configure logging for the scraper."""
+        logger.setLevel(logging.INFO)
+        file_handler = logging.FileHandler(FILE_PATHS["posted_google_titles"].with_suffix('.log'), mode='a')
+        file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+        logger.addHandler(file_handler)
+        console_handler = logging.StreamHandler()
+        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+        logger.addHandler(console_handler)
+        logger.info("Logging initialized for Google Trends scraper")

-logger = logging.getLogger()
-logger.setLevel(logging.INFO)
-file_handler = logging.FileHandler('/home/shane/foodie_automator/foodie_automator_google.log', mode='a')
-file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
-logger.addHandler(file_handler)
-console_handler = logging.StreamHandler()
-console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
-logger.addHandler(console_handler)
-logging.info("Logging initialized for foodie_automator_google.py")
+    def setup_signal_handlers(self) -> None:
+        """Set up signal handlers for graceful shutdown."""
+        def signal_handler(sig, frame):
+            logger.info("Received termination signal, checking if safe to exit...")
+            if is_posting:
+                logger.info("Currently posting, will exit after completion.")
+            else:
+                logger.info("Safe to exit immediately.")
+                sys.exit(0)

-client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+        signal.signal(signal.SIGTERM, signal_handler)
+        signal.signal(signal.SIGINT, signal_handler)

-POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json'
-USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
-EXPIRATION_HOURS = 24
-IMAGE_EXPIRATION_DAYS = 7
+    def load_posted_titles(self) -> set:
+        """Load and return the set of posted titles."""
+        try:
+            data = load_json_file(FILE_PATHS["posted_google_titles"], EXPIRATION_DAYS)
+            return {entry["title"] for entry in data}
+        except Exception as e:
+            logger.error(f"Error loading posted titles: {e}")
+            return set()

-posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
-posted_titles = set(entry["title"] for entry in posted_titles_data)
-used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)
+    def load_used_images(self) -> set:
+        """Load and return the set of used images."""
+        try:
+            data = load_json_file(FILE_PATHS["used_images"], IMAGE_EXPIRATION_DAYS)
+            return {entry["title"] for entry in data if "title" in entry}
+        except Exception as e:
+            logger.error(f"Error loading used images: {e}")
+            return set()

-def parse_search_volume(volume_text):
-    try:
-        volume_part = volume_text.split('\n')[0].lower().strip().replace('+', '')
-        if 'k' in volume_part:
-            volume = float(volume_part.replace('k', '')) * 1000
-        elif 'm' in volume_part:
-            volume = float(volume_part.replace('m', '')) * 1000000
-        else:
-            volume = float(volume_part)
-        return volume
-    except (ValueError, AttributeError) as e:
-        logging.warning(f"Could not parse search volume from '{volume_text}': {e}")
-        return 0
+    def parse_search_volume(self, volume_text: str) -> float:
+        """Parse search volume from text into a numeric value."""
+        try:
+            volume_part = volume_text.split('\n')[0].lower().strip().replace('+', '')
+            if 'k' in volume_part:
+                return float(volume_part.replace('k', '')) * 1000
+            elif 'm' in volume_part:
+                return float(volume_part.replace('m', '')) * 1000000
+            return float(volume_part)
+        except (ValueError, AttributeError) as e:
+            logger.warning(f"Could not parse search volume from '{volume_text}': {e}")
+            return 0.0

-def scrape_google_trends(geo='US'):
-    chrome_options = Options()
-    chrome_options.add_argument("--headless")
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36")
+    def setup_driver(self) -> None:
+        """Set up the Chrome WebDriver with appropriate options."""
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36")
+        self.driver = webdriver.Chrome(options=chrome_options)

-    driver = webdriver.Chrome(options=chrome_options)
-    try:
-        for attempt in range(3):
-            try:
-                time.sleep(random.uniform(2, 5))
-                url = f"https://trends.google.com/trending?geo={geo}&hours=24&sort=search-volume&category=5"
-                logging.info(f"Navigating to {url} (attempt {attempt + 1})")
-                driver.get(url)
-
-                logging.info("Waiting for page to load...")
-                WebDriverWait(driver, 60).until(
-                    EC.presence_of_element_located((By.TAG_NAME, "tbody"))
-                )
-                break
-            except TimeoutException:
-                logging.warning(f"Timeout on attempt {attempt + 1} for geo={geo}")
-                if attempt == 2:
-                    logging.error(f"Failed after 3 attempts for geo={geo}")
-                    return []
-                time.sleep(5)
-
-        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-        time.sleep(2)
+    def scrape_google_trends(self, geo: str = 'US') -> List[Dict]:
+        """Scrape Google Trends for the specified region."""
+        if not self.driver:
+            self.setup_driver()

        trends = []
-        rows = driver.find_elements(By.XPATH, "//tbody/tr")
-        logging.info(f"Found {len(rows)} rows in tbody for geo={geo}")
+        try:
+            for attempt in range(3):
+                try:
+                    time.sleep(random.uniform(2, 5))
+                    url = f"https://trends.google.com/trending?geo={geo}&hours=24&sort=search-volume&category=5"
+                    logger.info(f"Navigating to {url} (attempt {attempt + 1})")
+                    self.driver.get(url)

-        cutoff_date = datetime.now(timezone.utc) - timedelta(hours=24)
-        for row in rows:
-            try:
-                columns = row.find_elements(By.TAG_NAME, "td")
-                if len(columns) >= 3:
-                    title = columns[1].text.strip()
-                    search_volume_text = columns[2].text.strip()
-                    search_volume = parse_search_volume(search_volume_text)
-                    logging.info(f"Parsed trend: {title} with search volume: {search_volume}")
-                    if title and search_volume >= 20000:
-                        link = f"https://trends.google.com/trends/explore?q={quote(title)}&geo={geo}"
-                        trends.append({
-                            "title": title,
-                            "link": link,
-                            "search_volume": search_volume
-                        })
-                        logging.info(f"Added trend: {title} with search volume: {search_volume}")
-                    else:
-                        logging.info(f"Skipping trend: {title} (volume: {search_volume} < 20K or no title)")
-                else:
-                    logging.info(f"Skipping row with insufficient columns: {len(columns)}")
-            except Exception as e:
-                logging.warning(f"Row processing error: {e}")
+                    logger.info("Waiting for page to load...")
+                    WebDriverWait(self.driver, 60).until(
+                        EC.presence_of_element_located((By.TAG_NAME, "tbody"))
+                    )
+                    break
+                except TimeoutException:
+                    logger.warning(f"Timeout on attempt {attempt + 1} for geo={geo}")
+                    if attempt == 2:
+                        logger.error(f"Failed after 3 attempts for geo={geo}")
+                        return []
+                    time.sleep(5)
+
+            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            time.sleep(2)
+
+            rows = self.driver.find_elements(By.XPATH, "//tbody/tr")
+            logger.info(f"Found {len(rows)} rows in tbody for geo={geo}")
+
+            for row in rows:
+                try:
+                    columns = row.find_elements(By.TAG_NAME, "td")
+                    if len(columns) >= 3:
+                        title = columns[1].text.strip()
+                        search_volume = self.parse_search_volume(columns[2].text.strip())
+                        if title and search_volume >= 20000:
+                            link = f"https://trends.google.com/trends/explore?q={quote(title)}&geo={geo}"
+                            trends.append({
+                                "title": title,
+                                "link": link,
+                                "search_volume": search_volume
+                            })
+                            logger.info(f"Added trend: {title} with search volume: {search_volume}")
+                except Exception as e:
+                    logger.warning(f"Row processing error: {e}")
+                    continue
+
+            if trends:
+                trends.sort(key=lambda x: x["search_volume"], reverse=True)
+                logger.info(f"Extracted {len(trends)} trends for geo={geo}")
+            else:
+                logger.warning(f"No valid trends found with search volume >= 20K for geo={geo}")
+
+        except WebDriverException as e:
+            logger.error(f"WebDriver error: {e}")
+        finally:
+            if self.driver:
+                self.driver.quit()
+                self.driver = None
+                logger.info(f"Chrome driver closed for geo={geo}")
+
+        return trends
+
+    def fetch_duckduckgo_news_context(self, trend_title: str, hours: int = 24) -> str:
+        """Fetch news context for a trend from DuckDuckGo."""
+        try:
+            with DDGS() as ddgs:
+                results = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5)
+                titles = []
+                for r in results:
+                    try:
+                        date_str = r["date"]
+                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc) if '+00:00' in date_str else datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
+                        if dt > (datetime.now(timezone.utc) - timedelta(hours=hours)):
+                            titles.append(r["title"].lower())
+                    except ValueError as e:
+                        logger.warning(f"Date parsing failed for '{date_str}': {e}")
+                        continue
+                context = " ".join(titles) if titles else "No recent news found within 24 hours"
+                logger.info(f"DuckDuckGo News context for '{trend_title}': {context}")
+                return context
+        except Exception as e:
+            logger.warning(f"DuckDuckGo News context fetch failed for '{trend_title}': {e}")
+            return trend_title
+
+    def curate_from_google_trends(self, geo_list: List[str] = ['US']) -> Tuple[Optional[Dict], Optional[str], int]:
+        """Curate content from Google Trends for multiple regions."""
+        all_trends = []
+        for geo in geo_list:
+            trends = self.scrape_google_trends(geo=geo)
+            if trends:
+                all_trends.extend(trends)
+        
+        if not all_trends:
+            logger.info("No Google Trends data available")
+            return None, None, random.randint(600, 1800)
+
+        for trend in all_trends:
+            title = trend["title"]
+            if title in self.posted_titles:
+                logger.info(f"Skipping already posted trend: {title}")
                continue

-        if trends:
-            trends.sort(key=lambda x: x["search_volume"], reverse=True)
-            logging.info(f"Extracted {len(trends)} trends for geo={geo}: {[t['title'] for t in trends]}")
-            print(f"Raw trends fetched for geo={geo}: {[t['title'] for t in trends]}")
-        else:
-            logging.warning(f"No valid trends found with search volume >= 20K for geo={geo}")
-        return trends
-    finally:
-        driver.quit()
-        logging.info(f"Chrome driver closed for geo={geo}")
+            logger.info(f"Processing Google Trend: {title}")
+            image_query, relevance_keywords, skip = smart_image_and_filter(title, trend.get("summary", ""))
+            if skip:
+                logger.info(f"Skipping filtered Google Trend: {title}")
+                continue

-def fetch_duckduckgo_news_context(trend_title, hours=24):
-    try:
-        with DDGS() as ddgs:
-            results = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5)
-            titles = []
-            for r in results:
-                try:
-                    date_str = r["date"]
-                    if '+00:00' in date_str:
-                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
-                    else:
-                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
-                    if dt > (datetime.now(timezone.utc) - timedelta(hours=24)):
-                        titles.append(r["title"].lower())
-                except ValueError as e:
-                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
-                    continue
-            context = " ".join(titles) if titles else "No recent news found within 24 hours"
-            logging.info(f"DuckDuckGo News context for '{trend_title}': {context}")
-            return context
-    except Exception as e:
-        logging.warning(f"DuckDuckGo News context fetch failed for '{trend_title}': {e}")
-        return trend_title
+            scoring_content = f"{title}\n\n{trend.get('summary', '')}"
+            interest_score = is_interesting(scoring_content)
+            if interest_score < 6:
+                logger.info(f"Google Trends Interest Too Low: {interest_score}")
+                continue
+
+            num_paragraphs = determine_paragraph_count(interest_score)
+            extra_prompt = (
+                f"Generate exactly {num_paragraphs} paragraphs.\n"
+                f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
+                f"Do NOT introduce unrelated concepts.\n"
+                f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
+                f"Do not include emojis in the summary."
+            )
+
+            final_summary = summarize_with_gpt4o(
+                scoring_content,
+                "Google Trends",
+                trend["link"],
+                interest_score=interest_score,
+                extra_prompt=extra_prompt
+            )
+
+            if not final_summary:
+                logger.info(f"Summary failed for '{title}'")
+                continue
+
+            final_summary = insert_link_naturally(final_summary, "Google Trends", trend["link"])
+            post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
+
+            if post_data and author:
+                return post_data, author, random.randint(600, 1800)

-def curate_from_google_trends(geo_list=['US']):
-    all_trends = []
-    for geo in geo_list:
-        trends = scrape_google_trends(geo=geo)
-        if trends:
-            all_trends.extend(trends)
-    
-    if not all_trends:
-        print("No Google Trends data available")
-        logging.info("No Google Trends data available")
        return None, None, random.randint(600, 1800)

-    attempts = 0
-    max_attempts = 10
-    while attempts < max_attempts and all_trends:
-        trend = all_trends.pop(0)
-        title = trend["title"]
-        link = trend.get("link", "https://trends.google.com/")
-        summary = trend.get("summary", "")
-        source_name = "Google Trends"
-        original_source = f'<a href="{link}">{source_name}</a>'
-
-        if title in posted_titles:
-            print(f"Skipping already posted trend: {title}")
-            logging.info(f"Skipping already posted trend: {title}")
-            attempts += 1
-            continue
-
-        print(f"Trying Google Trend: {title} from {source_name}")
-        logging.info(f"Trying Google Trend: {title} from {source_name}")
-
-        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
-        if skip:
-            print(f"Skipping filtered Google Trend: {title}")
-            logging.info(f"Skipping filtered Google Trend: {title}")
-            attempts += 1
-            continue
-
-        scoring_content = f"{title}\n\n{summary}"
-        interest_score = is_interesting(scoring_content)
-        logging.info(f"Interest score for '{title}': {interest_score}")
-        if interest_score < 6:
-            print(f"Google Trends Interest Too Low: {interest_score}")
-            logging.info(f"Google Trends Interest Too Low: {interest_score}")
-            attempts += 1
-            continue
-
-        num_paragraphs = determine_paragraph_count(interest_score)
-        extra_prompt = (
-            f"Generate exactly {num_paragraphs} paragraphs.\n"
-            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
-            f"Do NOT introduce unrelated concepts.\n"
-            f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
-            f"Do not include emojis in the summary."
-        )
-        content_to_summarize = scoring_content
-        final_summary = summarize_with_gpt4o(
-            content_to_summarize,
-            source_name,
-            link,
-            interest_score=interest_score,
-            extra_prompt=extra_prompt
-        )
-        if not final_summary:
-            logging.info(f"Summary failed for '{title}'")
-            attempts += 1
-            continue
-
-        final_summary = insert_link_naturally(final_summary, source_name, link)
-
-        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
-        if not post_data:
-            attempts += 1
-            continue
-
-        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
-        if not image_url:
-            image_url, image_source, uploader, page_url = get_image(image_query)
-
-        hook = get_dynamic_hook(post_data["title"]).strip()
-
-        # Generate viral share prompt
-        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
-        share_links_template = (
-            f'<p>{share_prompt} '
-            f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
-            f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
-        )
-        post_data["content"] = f"{final_summary}\n\n{share_links_template}"
-
-        global is_posting
-        is_posting = True
-        try:
-            post_id, post_url = post_to_wp(
-                post_data=post_data,
-                category=category,
-                link=link,
-                author=author,
-                image_url=image_url,
-                original_source=original_source,
-                image_source=image_source,
-                uploader=uploader,
-                pixabay_url=pixabay_url,
-                interest_score=interest_score,
-                should_post_tweet=True
-            )
-        finally:
-            is_posting = False
-
-        if post_id:
-            share_text = f"Check out this foodie gem! {post_data['title']}"
-            share_text_encoded = quote(share_text)
-            post_url_encoded = quote(post_url)
-            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
-            # Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
-            post_data["content"] = f"{final_summary}\n\n{share_links}"  # Removed cta from content
-            is_posting = True
-            try:
-                post_to_wp(
-                    post_data=post_data,
-                    category=category,
-                    link=link,
-                    author=author,
-                    image_url=image_url,
-                    original_source=original_source,
-                    image_source=image_source,
-                    uploader=uploader,
-                    pixabay_url=pixabay_url,
-                    interest_score=interest_score,
-                    post_id=post_id,
-                    should_post_tweet=False
-                )
-            finally:
-                is_posting = False
-
-            timestamp = datetime.now(timezone.utc).isoformat()
-            save_json_file(POSTED_TITLES_FILE, title, timestamp)
-            posted_titles.add(title)
-            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
-
-            if image_url:
-                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
-                used_images.add(image_url)
-                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
-
-            print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Google Trends *****")
-            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Google Trends *****")
-            return post_data, category, random.randint(0, 1800)
-
-        attempts += 1
-        logging.info(f"WP posting failed for '{post_data['title']}'")
-
-    print("No interesting Google Trend found after attempts")
-    logging.info("No interesting Google Trend found after attempts")
-    return None, None, random.randint(600, 1800)
-
 def run_google_trends_automator():
-    logging.info("***** Google Trends Automator Launched *****")
-    geo_list = ['US', 'GB', 'AU']
-    post_data, category, sleep_time = curate_from_google_trends(geo_list=geo_list)
-    if sleep_time is None:
-        sleep_time = random.randint(600, 1800)
-    print(f"Sleeping for {sleep_time}s")
-    logging.info(f"Completed run with sleep time: {sleep_time} seconds")
-    time.sleep(sleep_time)
-    return post_data, category, sleep_time
+    """Main function to run the Google Trends automator."""
+    scraper = GoogleTrendsScraper()
+    while True:
+        try:
+            post_data, author, sleep_time = scraper.curate_from_google_trends()
+            if post_data and author:
+                global is_posting
+                is_posting = True
+                try:
+                    post_to_wp(post_data, author)
+                    logger.info(f"Successfully posted: {post_data['title']}")
+                finally:
+                    is_posting = False
+            time.sleep(sleep_time)
+        except Exception as e:
+            logger.error(f"Error in Google Trends automator: {e}")
+            time.sleep(300)  # Wait 5 minutes before retrying

 if __name__ == "__main__":
    run_google_trends_automator()