add lock files and update weekly tweet to include last tweet to follow

2025-05-06 09:40:04 +10:00
parent 331979ca9e
commit 028dfc3fc8
6 changed files with 1481 additions and 904 deletions
@@ -29,12 +29,14 @@ from foodie_utils import (
    generate_category_from_summary, post_to_wp, prepare_post_data,
    smart_image_and_filter, insert_link_naturally, get_flickr_image
 )
-from foodie_hooks import get_dynamic_hook, get_viral_share_prompt  # Removed select_best_cta import
+from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
 from dotenv import load_dotenv
+import fcntl

 load_dotenv()

 is_posting = False
+LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_google.lock"

 def signal_handler(sig, frame):
    logging.info("Received termination signal, checking if safe to exit...")
@@ -47,15 +49,58 @@ def signal_handler(sig, frame):
 signal.signal(signal.SIGTERM, signal_handler)
 signal.signal(signal.SIGINT, signal_handler)

-logger = logging.getLogger()
-logger.setLevel(logging.INFO)
-file_handler = logging.FileHandler('/home/shane/foodie_automator/foodie_automator_google.log', mode='a')
-file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
-logger.addHandler(file_handler)
-console_handler = logging.StreamHandler()
-console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
-logger.addHandler(console_handler)
-logging.info("Logging initialized for foodie_automator_google.py")
+# Path of this script's log file; setup_logging() prunes it and appends to it.
+LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_google.log"
+# Retention window, in days, applied by setup_logging() when pruning LOG_FILE.
+LOG_PRUNE_DAYS = 30
+# Attempt limit used by the retry loops in scrape_google_trends() and
+# fetch_duckduckgo_news_context().
+MAX_RETRIES = 3
+# Base delay in seconds; the retry loops sleep RETRY_BACKOFF * (2 ** attempt).
+RETRY_BACKOFF = 2
+
+# NOTE(review): these three statements also appear, unchanged, further down the
+# module (after the OpenAI client is created) — this looks like an accidental
+# duplicate that loads both JSON files twice; confirm and drop one copy.
+posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
+posted_titles = set(entry["title"] for entry in posted_titles_data)
+used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)
+
+def setup_logging():
+    if os.path.exists(LOG_FILE):
+        with open(LOG_FILE, 'r') as f:
+            lines = f.readlines()
+        
+        log_entries = []
+        current_entry = []
+        timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
+        
+        for line in lines:
+            if timestamp_pattern.match(line):
+                if current_entry:
+                    log_entries.append(''.join(current_entry))
+                current_entry = [line]
+            else:
+                current_entry.append(line)
+        
+        if current_entry:
+            log_entries.append(''.join(current_entry))
+        
+        cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
+        pruned_entries = []
+        for entry in log_entries:
+            try:
+                timestamp = datetime.strptime(entry[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
+                if timestamp > cutoff:
+                    pruned_entries.append(entry)
+            except ValueError:
+                logging.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
+                continue
+        
+        with open(LOG_FILE, 'w') as f:
+            f.writelines(pruned_entries)
+    
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    file_handler = logging.FileHandler(LOG_FILE, mode='a')
+    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+    logger.addHandler(file_handler)
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+    logger.addHandler(console_handler)
+    logging.info("Logging initialized for foodie_automator_google.py")

 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

@@ -68,6 +113,18 @@ posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
 posted_titles = set(entry["title"] for entry in posted_titles_data)
 used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)

+def acquire_lock():
+    os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
+    lock_fd = open(LOCK_FILE, 'w')
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+        lock_fd.write(str(os.getpid()))
+        lock_fd.flush()
+        return lock_fd
+    except IOError:
+        logging.info("Another instance of foodie_automator_google.py is running")
+        sys.exit(0)
+
 def parse_search_volume(volume_text):
    try:
        volume_part = volume_text.split('\n')[0].lower().strip().replace('+', '')
@@ -89,10 +146,11 @@ def scrape_google_trends(geo='US'):
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36")

-    driver = webdriver.Chrome(options=chrome_options)
+    driver = None
    try:
-        for attempt in range(3):
+        for attempt in range(MAX_RETRIES):
            try:
+                driver = webdriver.Chrome(options=chrome_options)
                time.sleep(random.uniform(2, 5))
                url = f"https://trends.google.com/trending?geo={geo}&hours=24&sort=search-volume&category=5"
                logging.info(f"Navigating to {url} (attempt {attempt + 1})")
@@ -105,10 +163,13 @@ def scrape_google_trends(geo='US'):
                break
            except TimeoutException:
                logging.warning(f"Timeout on attempt {attempt + 1} for geo={geo}")
-                if attempt == 2:
-                    logging.error(f"Failed after 3 attempts for geo={geo}")
+                if attempt == MAX_RETRIES - 1:
+                    logging.error(f"Failed after {MAX_RETRIES} attempts for geo={geo}")
                    return []
-                time.sleep(5)
+                time.sleep(RETRY_BACKOFF * (2 ** attempt))
+                if driver:
+                    driver.quit()
+                continue

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
@@ -145,157 +206,137 @@ def scrape_google_trends(geo='US'):
        if trends:
            trends.sort(key=lambda x: x["search_volume"], reverse=True)
            logging.info(f"Extracted {len(trends)} trends for geo={geo}: {[t['title'] for t in trends]}")
-            print(f"Raw trends fetched for geo={geo}: {[t['title'] for t in trends]}")
        else:
            logging.warning(f"No valid trends found with search volume >= 20K for geo={geo}")
        return trends
+    except Exception as e:
+        logging.error(f"Unexpected error in scrape_google_trends: {e}", exc_info=True)
+        return []
    finally:
-        driver.quit()
-        logging.info(f"Chrome driver closed for geo={geo}")
+        if driver:
+            driver.quit()
+            logging.info(f"Chrome driver closed for geo={geo}")

 def fetch_duckduckgo_news_context(trend_title, hours=24):
-    try:
-        with DDGS() as ddgs:
-            results = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5)
-            titles = []
-            for r in results:
-                try:
-                    date_str = r["date"]
-                    if '+00:00' in date_str:
-                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
-                    else:
-                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
-                    if dt > (datetime.now(timezone.utc) - timedelta(hours=24)):
-                        titles.append(r["title"].lower())
-                except ValueError as e:
-                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
-                    continue
-            context = " ".join(titles) if titles else "No recent news found within 24 hours"
-            logging.info(f"DuckDuckGo News context for '{trend_title}': {context}")
-            return context
-    except Exception as e:
-        logging.warning(f"DuckDuckGo News context fetch failed for '{trend_title}': {e}")
-        return trend_title
+    for attempt in range(MAX_RETRIES):
+        try:
+            with DDGS() as ddgs:
+                results = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5)
+                titles = []
+                for r in results:
+                    try:
+                        date_str = r["date"]
+                        if '+00:00' in date_str:
+                            dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
+                        else:
+                            dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
+                        if dt > (datetime.now(timezone.utc) - timedelta(hours=24)):
+                            titles.append(r["title"].lower())
+                    except ValueError as e:
+                        logging.warning(f"Date parsing failed for '{date_str}': {e}")
+                        continue
+                context = " ".join(titles) if titles else "No recent news found within 24 hours"
+                logging.info(f"DuckDuckGo News context for '{trend_title}': {context}")
+                return context
+        except Exception as e:
+            logging.warning(f"DuckDuckGo News context fetch failed for '{trend_title}' (attempt {attempt + 1}): {e}")
+            if attempt < MAX_RETRIES - 1:
+                time.sleep(RETRY_BACKOFF * (2 ** attempt))
+            continue
+    logging.error(f"Failed to fetch DuckDuckGo News context for '{trend_title}' after {MAX_RETRIES} attempts")
+    return trend_title

 def curate_from_google_trends(geo_list=['US']):
-    all_trends = []
-    for geo in geo_list:
-        trends = scrape_google_trends(geo=geo)
-        if trends:
-            all_trends.extend(trends)
-    
-    if not all_trends:
-        print("No Google Trends data available")
-        logging.info("No Google Trends data available")
-        return None, None, random.randint(600, 1800)
+    try:
+        all_trends = []
+        for geo in geo_list:
+            trends = scrape_google_trends(geo=geo)
+            if trends:
+                all_trends.extend(trends)
+        
+        if not all_trends:
+            logging.info("No Google Trends data available")
+            return None, None, False
+        
+        attempts = 0
+        max_attempts = 10
+        while attempts < max_attempts and all_trends:
+            trend = all_trends.pop(0)
+            title = trend["title"]
+            link = trend.get("link", "https://trends.google.com/")
+            summary = trend.get("summary", "")
+            source_name = "Google Trends"
+            original_source = f'<a href="{link}">{source_name}</a>'

-    attempts = 0
-    max_attempts = 10
-    while attempts < max_attempts and all_trends:
-        trend = all_trends.pop(0)
-        title = trend["title"]
-        link = trend.get("link", "https://trends.google.com/")
-        summary = trend.get("summary", "")
-        source_name = "Google Trends"
-        original_source = f'<a href="{link}">{source_name}</a>'
+            if title in posted_titles:
+                logging.info(f"Skipping already posted trend: {title}")
+                attempts += 1
+                continue

-        if title in posted_titles:
-            print(f"Skipping already posted trend: {title}")
-            logging.info(f"Skipping already posted trend: {title}")
-            attempts += 1
-            continue
+            logging.info(f"Trying Google Trend: {title} from {source_name}")

-        print(f"Trying Google Trend: {title} from {source_name}")
-        logging.info(f"Trying Google Trend: {title} from {source_name}")
+            image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
+            if skip:
+                logging.info(f"Skipping filtered Google Trend: {title}")
+                attempts += 1
+                continue

-        image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
-        if skip:
-            print(f"Skipping filtered Google Trend: {title}")
-            logging.info(f"Skipping filtered Google Trend: {title}")
-            attempts += 1
-            continue
+            ddg_context = fetch_duckduckgo_news_context(title)
+            scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
+            interest_score = is_interesting(scoring_content)
+            logging.info(f"Interest score for '{title}': {interest_score}")
+            if interest_score < 6:
+                logging.info(f"Google Trends Interest Too Low: {interest_score}")
+                attempts += 1
+                continue

-        ddg_context = fetch_duckduckgo_news_context(title)
-        scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
-        interest_score = is_interesting(scoring_content)
-        logging.info(f"Interest score for '{title}': {interest_score}")
-        if interest_score < 6:
-            print(f"Google Trends Interest Too Low: {interest_score}")
-            logging.info(f"Google Trends Interest Too Low: {interest_score}")
-            attempts += 1
-            continue
-
-        num_paragraphs = determine_paragraph_count(interest_score)
-        extra_prompt = (
-            f"Generate exactly {num_paragraphs} paragraphs.\n"
-            f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
-            f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
-            f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
-            f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
-            f"Do not include emojis in the summary."
-        )
-        content_to_summarize = scoring_content
-        final_summary = summarize_with_gpt4o(
-            content_to_summarize,
-            source_name,
-            link,
-            interest_score=interest_score,
-            extra_prompt=extra_prompt
-        )
-        if not final_summary:
-            logging.info(f"Summary failed for '{title}'")
-            attempts += 1
-            continue
-
-        final_summary = insert_link_naturally(final_summary, source_name, link)
-
-        post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
-        if not post_data:
-            attempts += 1
-            continue
-
-        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
-        if not image_url:
-            image_url, image_source, uploader, page_url = get_image(image_query)
-
-        hook = get_dynamic_hook(post_data["title"]).strip()
-
-        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
-        share_links_template = (
-            f'<p>{share_prompt} '
-            f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
-            f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
-        )
-        post_data["content"] = f"{final_summary}\n\n{share_links_template}"
-
-        global is_posting
-        is_posting = True
-        try:
-            post_id, post_url = post_to_wp(
-                post_data=post_data,
-                category=category,
-                link=link,
-                author=author,
-                image_url=image_url,
-                original_source=original_source,
-                image_source=image_source,
-                uploader=uploader,
-                page_url=page_url,
-                interest_score=interest_score,
-                should_post_tweet=True
+            num_paragraphs = determine_paragraph_count(interest_score)
+            extra_prompt = (
+                f"Generate exactly {num_paragraphs} paragraphs.\n"
+                f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
+                f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
+                f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
+                f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
+                f"Do not include emojis in the summary."
            )
-        finally:
-            is_posting = False
+            content_to_summarize = scoring_content
+            final_summary = summarize_with_gpt4o(
+                content_to_summarize,
+                source_name,
+                link,
+                interest_score=interest_score,
+                extra_prompt=extra_prompt
+            )
+            if not final_summary:
+                logging.info(f"Summary failed for '{title}'")
+                attempts += 1
+                continue

-        if post_id:
-            share_text = f"Check out this foodie gem! {post_data['title']}"
-            share_text_encoded = quote(share_text)
-            post_url_encoded = quote(post_url)
-            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
-            post_data["content"] = f"{final_summary}\n\n{share_links}"
+            final_summary = insert_link_naturally(final_summary, source_name, link)
+
+            post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
+            if not post_data:
+                attempts += 1
+                continue
+
+            image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
+            if not image_url:
+                image_url, image_source, uploader, page_url = get_image(image_query)
+
+            hook = get_dynamic_hook(post_data["title"]).strip()
+
+            share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
+            share_links_template = (
+                f'<p>{share_prompt} '
+                f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
+                f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
+            )
+            post_data["content"] = f"{final_summary}\n\n{share_links_template}"
+
+            global is_posting
            is_posting = True
            try:
-                post_to_wp(
+                post_id, post_url = post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
@@ -306,43 +347,86 @@ def curate_from_google_trends(geo_list=['US']):
                    uploader=uploader,
                    page_url=page_url,
                    interest_score=interest_score,
-                    post_id=post_id,
-                    should_post_tweet=False
+                    should_post_tweet=True
                )
+            except Exception as e:
+                logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
+                attempts += 1
+                continue
            finally:
                is_posting = False

-            timestamp = datetime.now(timezone.utc).isoformat()
-            save_json_file(POSTED_TITLES_FILE, title, timestamp)
-            posted_titles.add(title)
-            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
+            if post_id:
+                share_text = f"Check out this foodie gem! {post_data['title']}"
+                share_text_encoded = quote(share_text)
+                post_url_encoded = quote(post_url)
+                share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
+                post_data["content"] = f"{final_summary}\n\n{share_links}"
+                is_posting = True
+                try:
+                    post_to_wp(
+                        post_data=post_data,
+                        category=category,
+                        link=link,
+                        author=author,
+                        image_url=image_url,
+                        original_source=original_source,
+                        image_source=image_source,
+                        uploader=uploader,
+                        page_url=page_url,
+                        interest_score=interest_score,
+                        post_id=post_id,
+                        should_post_tweet=False
+                    )
+                except Exception as e:
+                    logging.error(f"Failed to update WordPress post '{title}' with share links: {e}", exc_info=True)
+                finally:
+                    is_posting = False

-            if image_url:
-                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
-                used_images.add(image_url)
-                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
+                timestamp = datetime.now(timezone.utc).isoformat()
+                save_json_file(POSTED_TITLES_FILE, title, timestamp)
+                posted_titles.add(title)
+                logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")

-            print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Google Trends *****")
-            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Google Trends *****")
-            return post_data, category, random.randint(0, 1800)
+                if image_url:
+                    save_json_file(USED_IMAGES_FILE, image_url, timestamp)
+                    used_images.add(image_url)
+                    logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")

-        attempts += 1
-        logging.info(f"WP posting failed for '{post_data['title']}'")
+                logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Google Trends *****")
+                return post_data, category, True

-    print("No interesting Google Trend found after attempts")
-    logging.info("No interesting Google Trend found after attempts")
-    return None, None, random.randint(600, 1800)
+            attempts += 1
+            logging.info(f"WP posting failed for '{post_data['title']}'")
+
+        logging.info("No interesting Google Trend found after attempts")
+        return None, None, False
+    except Exception as e:
+        logging.error(f"Unexpected error in curate_from_google_trends: {e}", exc_info=True)
+        return None, None, False

 def run_google_trends_automator():
-    logging.info("***** Google Trends Automator Launched *****")
-    geo_list = ['US', 'GB', 'AU']
-    post_data, category, sleep_time = curate_from_google_trends(geo_list=geo_list)
-    if sleep_time is None:
-        sleep_time = random.randint(600, 1800)
-    print(f"Sleeping for {sleep_time}s")
-    logging.info(f"Completed run with sleep time: {sleep_time} seconds")
-    time.sleep(sleep_time)
-    return post_data, category, sleep_time
+    lock_fd = None
+    try:
+        lock_fd = acquire_lock()
+        logging.info("***** Google Trends Automator Launched *****")
+        geo_list = ['US', 'GB', 'AU']
+        post_data, category, should_continue = curate_from_google_trends(geo_list=geo_list)
+        if not post_data:
+            logging.info("No postable Google Trend found")
+        else:
+            logging.info("Completed Google Trends run")
+        return post_data, category, should_continue
+    except Exception as e:
+        logging.error(f"Fatal error in run_google_trends_automator: {e}", exc_info=True)
+        return None, None, False
+    finally:
+        if lock_fd:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+            lock_fd.close()
+            os.remove(LOCK_FILE) if os.path.exists(LOCK_FILE) else None

 if __name__ == "__main__":
-    run_google_trends_automator()
+    # Script entry point: attach the log handlers before the run so its
+    # messages reach LOG_FILE, then perform a single pass and log the outcome.
+    setup_logging()
+    post_data, category, should_continue = run_google_trends_automator()
+    logging.info(f"Run completed, should_continue: {should_continue}")