414 lines
18 KiB
Python
414 lines
18 KiB
Python
# foodie_automator_google.py
|
||
import requests
|
||
import random
|
||
import time
|
||
import logging
|
||
import re
|
||
import os
|
||
import json
|
||
import signal
|
||
import sys
|
||
from datetime import datetime, timedelta, timezone
|
||
from openai import OpenAI
|
||
from urllib.parse import quote
|
||
from selenium import webdriver
|
||
from selenium.webdriver.common.by import By
|
||
from selenium.webdriver.support.ui import WebDriverWait
|
||
from selenium.webdriver.support import expected_conditions as EC
|
||
from selenium.webdriver.chrome.options import Options
|
||
from selenium.common.exceptions import TimeoutException
|
||
from duckduckgo_search import DDGS
|
||
from foodie_config import (
|
||
AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
|
||
PERSONA_CONFIGS, CATEGORIES, get_clean_source_name, X_API_CREDENTIALS
|
||
)
|
||
from foodie_utils import (
|
||
load_json_file, save_json_file, get_image, generate_image_query,
|
||
upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
|
||
is_interesting, generate_title_from_summary, summarize_with_gpt4o,
|
||
generate_category_from_summary, post_to_wp, prepare_post_data,
|
||
select_best_author, smart_image_and_filter, get_flickr_image,
|
||
get_next_author_round_robin, check_author_rate_limit, update_system_activity
|
||
)
|
||
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
|
||
from dotenv import load_dotenv
|
||
import fcntl
|
||
|
||
load_dotenv()
|
||
|
||
# --- Script identity and persistent-state locations -------------------------
SCRIPT_NAME = "foodie_automator_google"
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'

# Expiration values handed to load_json_file for the two state files.
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7

# Flag consulted by signal_handler; nothing in this file sets it to True.
is_posting = False
LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_google.lock"

# --- State loaded at import time --------------------------------------------
# Entries without a "title" key are ignored when building the lookup sets.
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = {
    record["title"] for record in posted_titles_data if "title" in record
}
used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
used_images = {
    record["title"] for record in used_images_data if "title" in record
}
|
||
|
||
def signal_handler(sig, frame):
    """Handle a termination signal.

    Reports the script as "stopped" through update_system_activity, then
    exits with status 0 unless the module-level ``is_posting`` flag is set.

    Args:
        sig: Signal number delivered to the process (unused).
        frame: Current stack frame supplied by the signal module (unused).
    """
    logging.info("Received termination signal, marking script as stopped...")
    update_system_activity(SCRIPT_NAME, "stopped")

    # NOTE(review): the source's indentation was lost; the exit is assumed to
    # belong to the "not posting" branch, matching the log messages.
    if not is_posting:
        logging.info("Safe to exit immediately.")
        sys.exit(0)

    logging.info("Currently posting, will exit after completion.")
|
||
|
||
# Route both termination signals through the same handler.
for _signum in (signal.SIGTERM, signal.SIGINT):
    signal.signal(_signum, signal_handler)

# Log file location and the age (in days) past which entries are pruned.
LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_google.log"
LOG_PRUNE_DAYS = 30

# Retry policy: attempts per operation and the base (seconds) of the
# exponential backoff, used as RETRY_BACKOFF * 2 ** attempt.
MAX_RETRIES = 3
RETRY_BACKOFF = 2
|
||
|
||
def _prune_log_file(log_path, max_age_days):
    """Rewrite ``log_path`` keeping only entries newer than ``max_age_days``.

    An entry is a line starting with a ``YYYY-MM-DD HH:MM:SS`` timestamp plus
    any following lines without one (tracebacks, multi-line messages).
    Leading lines that precede the first timestamp are dropped with a warning.
    """
    with open(log_path, 'r') as f:
        lines = f.readlines()

    timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
    log_entries = []
    current_entry = []
    for line in lines:
        if timestamp_pattern.match(line):
            if current_entry:
                log_entries.append(''.join(current_entry))
            current_entry = [line]
        else:
            current_entry.append(line)
    if current_entry:
        log_entries.append(''.join(current_entry))

    # The logging module writes %(asctime)s in local time by default, so the
    # cutoff must be a naive local datetime as well; treating the stored
    # timestamps as UTC would skew the comparison by the UTC offset.
    cutoff = datetime.now() - timedelta(days=max_age_days)
    pruned_entries = []
    for entry in log_entries:
        try:
            timestamp = datetime.strptime(entry[:19], '%Y-%m-%d %H:%M:%S')
        except ValueError:
            logging.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
            continue
        if timestamp > cutoff:
            pruned_entries.append(entry)

    with open(log_path, 'w') as f:
        f.writelines(pruned_entries)


def setup_logging():
    """Configure root logging to LOG_FILE plus the console.

    Steps: ensure the log directory exists and is writable, verify the log
    file can be opened for append, prune entries older than LOG_PRUNE_DAYS,
    then install a file handler (via ``basicConfig(force=True)``) and an
    additional console handler.

    If any step fails, logging falls back to console-only output. The
    function never raises for setup failures.
    """
    log_dir = os.path.dirname(LOG_FILE)
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    date_format = "%Y-%m-%d %H:%M:%S"

    try:
        # Ensure log directory exists
        os.makedirs(log_dir, exist_ok=True)
        logging.debug(f"Log directory created/verified: {log_dir}")

        # Check write permissions
        if not os.access(log_dir, os.W_OK):
            raise PermissionError(f"No write permission for {log_dir}")

        # Test write to log file
        try:
            with open(LOG_FILE, 'a') as f:
                f.write("")
            logging.debug(f"Confirmed write access to {LOG_FILE}")
        except Exception as e:
            raise PermissionError(f"Cannot write to {LOG_FILE}: {e}") from e

        # Prune old logs
        if os.path.exists(LOG_FILE):
            _prune_log_file(LOG_FILE, LOG_PRUNE_DAYS)
            logging.debug(f"Log file pruned: {LOG_FILE}")

        # Configure logging; force=True replaces any handlers installed
        # implicitly by the logging calls made above.
        logging.basicConfig(
            filename=LOG_FILE,
            level=logging.INFO,
            format=log_format,
            datefmt=date_format,
            force=True,
        )
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter(log_format))
        logging.getLogger().addHandler(console_handler)
        logging.info("Logging initialized for foodie_automator_google.py")

    except Exception as e:
        # Fallback to console logging if file logging fails. basicConfig
        # without a filename already installs a console StreamHandler, so no
        # second handler is added here (doing so duplicated every message).
        logging.basicConfig(
            level=logging.INFO,
            format=log_format,
            datefmt=date_format,
            force=True,
        )
        logging.error(f"Failed to setup file logging for {LOG_FILE}: {e}. Using console logging.")
        logging.info("Console logging initialized as fallback for foodie_automator_google.py")
|
||
|
||
# OpenAI client built from the OPENAI_API_KEY environment variable
# (None is passed through when the variable is unset).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
|
||
|
||
def acquire_lock():
    """Take the single-instance lock for this script.

    Creates the lock directory if needed, obtains a non-blocking exclusive
    ``flock`` on LOCK_FILE and records the current PID in it.

    Returns:
        The open file object holding the lock; the caller must keep it open
        for as long as the lock is needed and release/close it afterwards.

    Raises:
        SystemExit: with status 0 when the lock is already held elsewhere.
        OSError: if the PID cannot be written after the lock was obtained.
    """
    os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)

    # Open WITHOUT truncating: mode 'w' would erase the PID written by the
    # instance that currently holds the lock before we know we lost the race.
    lock_fd = open(LOCK_FILE, 'a+')
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:  # IOError is an alias of OSError
        lock_fd.close()
        logging.info("Another instance of foodie_automator_google.py is running")
        sys.exit(0)

    # Lock is ours: replace whatever stale PID the file contained.
    try:
        lock_fd.seek(0)
        lock_fd.truncate(0)
        lock_fd.write(str(os.getpid()))
        lock_fd.flush()
    except OSError:
        lock_fd.close()
        raise
    return lock_fd
|
||
|
||
def parse_search_volume(volume_text):
    """Convert a search-volume label such as ``"20K+"`` into a number.

    Only the first line of ``volume_text`` is considered. It is lower-cased,
    stripped, and ``+`` signs and thousands separators (``,``) are removed.
    A ``k`` scales the value by 1,000 and an ``m`` by 1,000,000 (``k`` is
    checked first).

    Args:
        volume_text: Raw cell text, e.g. ``"20K+"``, ``"1M+"``, ``"1,500+"``.

    Returns:
        The volume as a float, or 0 when the text cannot be parsed
        (including when ``volume_text`` is not a string).
    """
    try:
        volume_part = (
            volume_text.split('\n')[0]
            .lower()
            .strip()
            .replace('+', '')
            # assumes "," is a thousands separator (US-style formatting);
            # previously "1,500+" failed to parse and was reported as 0
            .replace(',', '')
        )
        for suffix, multiplier in (('k', 1000), ('m', 1000000)):
            if suffix in volume_part:
                return float(volume_part.replace(suffix, '')) * multiplier
        return float(volume_part)
    except (ValueError, AttributeError) as e:
        logging.warning(f"Could not parse search volume from '{volume_text}': {e}")
        return 0
|
||
|
||
def scrape_google_trends(geo='US'):
    """Scrape the Google Trends "trending" table for ``geo``.

    Launches headless Chrome, loads the trending page (24h window, sorted by
    search volume, category 5) and waits up to 60 seconds for a ``<tbody>``.
    Page-load timeouts are retried up to MAX_RETRIES times with exponential
    backoff. Each table row with at least three cells is read as
    (…, title, volume); rows with a title and a parsed volume of at least
    20,000 are kept.

    Args:
        geo: Region code inserted into the Trends URLs.

    Returns:
        A list of dicts with keys ``title``, ``link`` and ``search_volume``,
        sorted by ``search_volume`` descending. An empty list is returned when
        nothing qualifies, when every attempt times out, or on any
        unexpected error (which is logged with a traceback).
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36")

    driver = None
    try:
        for attempt in range(MAX_RETRIES):
            try:
                driver = webdriver.Chrome(options=chrome_options)
                time.sleep(random.uniform(2, 5))
                url = f"https://trends.google.com/trending?geo={geo}&hours=24&sort=search-volume&category=5"
                logging.info(f"Navigating to {url} (attempt {attempt + 1})")
                driver.get(url)

                logging.info("Waiting for page to load...")
                WebDriverWait(driver, 60).until(
                    EC.presence_of_element_located((By.TAG_NAME, "tbody"))
                )
                break
            except TimeoutException:
                logging.warning(f"Timeout on attempt {attempt + 1} for geo={geo}")
                if attempt == MAX_RETRIES - 1:
                    logging.error(f"Failed after {MAX_RETRIES} attempts for geo={geo}")
                    return []
                # Release the browser BEFORE backing off, and drop the
                # reference so the finally block never quits it a second time
                # if the next attempt fails before a new driver is created.
                if driver:
                    driver.quit()
                    driver = None
                time.sleep(RETRY_BACKOFF * (2 ** attempt))

        # Scroll to the bottom and give the page a moment to settle before
        # reading the rows.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        trends = []
        rows = driver.find_elements(By.XPATH, "//tbody/tr")
        logging.info(f"Found {len(rows)} rows in tbody for geo={geo}")

        for row in rows:
            try:
                columns = row.find_elements(By.TAG_NAME, "td")
                if len(columns) < 3:
                    logging.info(f"Skipping row with insufficient columns: {len(columns)}")
                    continue

                title = columns[1].text.strip()
                search_volume_text = columns[2].text.strip()
                search_volume = parse_search_volume(search_volume_text)
                logging.info(f"Parsed trend: {title} with search volume: {search_volume}")

                if title and search_volume >= 20000:
                    link = f"https://trends.google.com/trends/explore?q={quote(title)}&geo={geo}"
                    trends.append({
                        "title": title,
                        "link": link,
                        "search_volume": search_volume
                    })
                    logging.info(f"Added trend: {title} with search volume: {search_volume}")
                else:
                    logging.info(f"Skipping trend: {title} (volume: {search_volume} < 20K or no title)")
            except Exception as e:
                # One bad row must not abort the whole scrape.
                logging.warning(f"Row processing error: {e}")
                continue

        if trends:
            trends.sort(key=lambda x: x["search_volume"], reverse=True)
            logging.info(f"Extracted {len(trends)} trends for geo={geo}: {[t['title'] for t in trends]}")
        else:
            logging.warning(f"No valid trends found with search volume >= 20K for geo={geo}")
        return trends
    except Exception as e:
        logging.error(f"Unexpected error in scrape_google_trends: {e}", exc_info=True)
        return []
    finally:
        if driver:
            driver.quit()
            logging.info(f"Chrome driver closed for geo={geo}")
|
||
|
||
def fetch_duckduckgo_news_context(trend_title, hours=24):
    """Build a context string from recent DuckDuckGo news headlines.

    Searches DuckDuckGo News for ``"<trend_title> news"`` (at most 5 results,
    search restricted with ``timelimit="d"``) and joins the lower-cased titles
    of results published within the last ``hours`` hours.

    Args:
        trend_title: Trend to look up.
        hours: Maximum age of a headline, in hours. Because the search itself
            uses ``timelimit="d"``, values above 24 do not widen the search.

    Returns:
        The space-joined headlines, a "No recent news found…" message when no
        result is recent enough, or ``trend_title`` itself when every one of
        the MAX_RETRIES attempts fails.
    """
    for attempt in range(MAX_RETRIES):
        try:
            with DDGS() as ddgs:
                results = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5)
                cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
                titles = []
                for r in results:
                    date_str = r.get("date")
                    headline = r.get("title")
                    if not date_str or not headline:
                        # A single incomplete result should be skipped rather
                        # than raise KeyError and retry the whole search.
                        logging.warning(f"Skipping news result without date/title for '{trend_title}'")
                        continue
                    try:
                        # fromisoformat accepts any UTC offset; "Z" is
                        # normalised first for interpreters that reject it.
                        dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
                    except (ValueError, TypeError, AttributeError) as e:
                        logging.warning(f"Date parsing failed for '{date_str}': {e}")
                        continue
                    if dt.tzinfo is None:
                        # Naive timestamps are treated as UTC, as before.
                        dt = dt.replace(tzinfo=timezone.utc)
                    if dt > cutoff:
                        titles.append(headline.lower())
                context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
                logging.info(f"DuckDuckGo News context for '{trend_title}': {context}")
                return context
        except Exception as e:
            logging.warning(f"DuckDuckGo News context fetch failed for '{trend_title}' (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
                continue

    logging.error(f"Failed to fetch DuckDuckGo News context for '{trend_title}' after {MAX_RETRIES} attempts")
    return trend_title
|
||
|
||
def curate_from_google(item, original_source, source_name, link, page_url):
    """Turn one Google item into a published WordPress post.

    Pipeline: score the item's ``snippet`` with is_interesting (items scoring
    below 4 are rejected), summarise it, strip the original title from the
    summary, insert the source link, prepare the post data, publish the post,
    then update the same post with X/Facebook share links.

    Args:
        item: Mapping read via ``.get`` for the keys ``snippet`` and ``title``.
        original_source: Passed through to post_to_wp.
        source_name: Source label passed to the summariser and link inserter.
        link: Source URL passed to the summariser, link inserter and post_to_wp.
        page_url: NOTE(review): not used. As in the original code, the value
            returned by prepare_post_data is what gets sent to post_to_wp.
            Confirm whether the argument was meant to take precedence.

    Returns:
        ``(post_id, post_url)`` on success, otherwise ``(None, None)``.
        NOTE(review): ``(None, None)`` is also returned when the post was
        published but the share-link update failed.
    """
    logger = logging.getLogger(__name__)
    raw_title = item.get('title', '')
    item_title = item.get('title', 'unknown')
    try:
        content = item.get('snippet', '')
        if not content:
            logger.info(f"No content for Google item: {item_title}")
            return None, None

        interest_score = is_interesting(content)
        if interest_score < 4:
            logger.info(f"Google item '{item_title}' not interesting enough: score {interest_score}")
            return None, None

        summary = summarize_with_gpt4o(content, source_name, link, interest_score=interest_score)
        if not summary:
            logger.warning(f"Failed to summarize Google item: {item_title}")
            return None, None

        # Remove the original title from the summary if present, then collapse
        # the blank-line runs the removal may leave behind.
        if raw_title in summary:
            summary = summary.replace(raw_title, "").strip()
            while "\n\n\n" in summary:
                summary = summary.replace("\n\n\n", "\n\n")

        final_summary = insert_link_naturally(summary, source_name, link)
        if not final_summary:
            logger.warning(f"Failed to insert link for Google item: {item_title}")
            return None, None

        (post_data, author, category, image_url, image_source, uploader,
         prepared_page_url) = prepare_post_data(final_summary, item_title)
        if not post_data:
            logger.info(f"Post preparation failed for Google item: {item_title}")
            return None, None

        share_text = f"Check out this tasty find: {post_data['title']}"
        share_text_encoded = quote(share_text)
        share_links_template = (
            "Share this post: "
            '<a href="https://x.com/intent/tweet?url={post_url}&text={share_text}">X</a> | '
            '<a href="https://www.facebook.com/sharer/sharer.php?u={post_url}">Facebook</a>'
        )

        # First call: post without share links (the post URL is not known yet).
        post_data["content"] = final_summary
        post_id, post_url = post_to_wp(
            post_data=post_data,
            category=category,
            link=link,
            author=author,
            image_url=image_url,
            original_source=original_source,
            image_source=image_source,
            uploader=uploader,
            page_url=prepared_page_url,
            interest_score=interest_score,
            should_post_tweet=True,
            summary=final_summary
        )

        if not post_id:
            logger.warning(f"Failed to post Google item to WP: {post_data['title']}")
            return None, None

        # Second call: update the same post with share links. No image and no
        # tweet this time.
        post_url_encoded = quote(post_url)
        share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
        post_data["content"] = f"{final_summary}\n\n{share_links}"
        post_id, post_url = post_to_wp(
            post_data=post_data,
            category=category,
            link=link,
            author=author,
            image_url=None,
            original_source=original_source,
            image_source=image_source,
            uploader=uploader,
            page_url=prepared_page_url,
            interest_score=interest_score,
            post_id=post_id,
            should_post_tweet=False
        )

        if post_id:
            logger.info(f"Successfully curated and posted Google item: {post_data['title']} (URL: {post_url})")
            return post_id, post_url

        logger.warning(f"Failed to update Google post with share links: {post_data['title']}")
        return None, None

    except Exception as e:
        # Include the traceback, matching the other top-level handlers here.
        logger.error(f"Error curating Google item '{item_title}': {e}", exc_info=True)
        return None, None
|
||
|
||
def run_google_trends_automator():
    """Run one Google Trends curation pass under the single-instance lock.

    Acquires the lock, reports the script as "running", reloads the
    posted-title and used-image state, delegates to
    curate_from_google_trends, then reports the script as "stopped".

    Returns:
        ``(post_data, category, sleep_time)`` as returned by
        curate_from_google_trends, or ``(None, None, sleep_time)`` with a
        random 1200-1800 second ``sleep_time`` if anything raises.

    Raises:
        SystemExit: propagated from acquire_lock when another instance holds
            the lock.
    """
    lock_fd = None
    try:
        lock_fd = acquire_lock()
        update_system_activity(SCRIPT_NAME, "running", os.getpid())  # Record start
        logging.info("***** Google Trends Automator Launched *****")

        # Load JSON files once. Entries lacking "title" are skipped, matching
        # how the same sets are built at module level (previously a single
        # malformed entry raised KeyError here and aborted the run).
        posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
        posted_titles = {entry["title"] for entry in posted_titles_data if "title" in entry}
        used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
        used_images = {entry["title"] for entry in used_images_data if "title" in entry}

        # NOTE(review): curate_from_google_trends is neither defined nor
        # imported in the visible part of this file; if it is truly missing,
        # this call raises NameError and every run takes the error path below.
        post_data, category, sleep_time = curate_from_google_trends(
            posted_titles_data, posted_titles, used_images_data, used_images
        )
        if not post_data:
            logging.info("No postable Google Trend found")
        logging.info("Completed Google Trends run")
        update_system_activity(SCRIPT_NAME, "stopped")  # Record stop
        logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
        return post_data, category, sleep_time
    except Exception as e:
        logging.error(f"Fatal error in run_google_trends_automator: {e}", exc_info=True)
        update_system_activity(SCRIPT_NAME, "stopped")  # Record stop on error
        sleep_time = random.randint(1200, 1800)  # 20–30 minutes
        logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
        return None, None, sleep_time
    finally:
        # Only clean up a lock this process actually holds; when acquire_lock
        # exits because another instance is running, that instance's lock
        # file must be left alone.
        if lock_fd:
            try:
                # Unlink while still holding the lock: removing the file after
                # unlocking lets a process that locked the old inode and one
                # that creates a fresh file both believe they own the lock.
                os.remove(LOCK_FILE)
            except FileNotFoundError:
                pass
            finally:
                fcntl.flock(lock_fd, fcntl.LOCK_UN)
                lock_fd.close()
|
||
|
||
if __name__ == "__main__":
    # Logging must be configured before the run so its messages are captured.
    setup_logging()
    run_result = run_google_trends_automator()
    post_data, category, sleep_time = run_result
    logging.info(f"Run completed, sleep_time: {sleep_time} seconds")