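"""
foodie_automator_google.py

Scrapes trending food-related search terms from Google Trends using headless Chrome,
filters them by search volume and an interest score, enriches them with recent
DuckDuckGo News headlines, summarizes the result with GPT-4o, and publishes the post
to WordPress. JSON state files under /home/shane/foodie_automator/ track already-posted
titles and recently used images so repeated runs do not duplicate content.
"""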
import requests
import random
import time
import logging
import re
import os
import json
import signal
import sys
from datetime import datetime, timedelta, timezone
from openai import OpenAI
from urllib.parse import quote
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from duckduckgo_search import DDGS
from foodie_config import (
    AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
    SUMMARY_PERSONA_PROMPTS, CATEGORIES, CTAS, get_clean_source_name
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, select_best_persona, determine_paragraph_count, is_interesting,
    generate_title_from_summary, summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
    prepare_post_data, smart_image_and_filter, insert_link_naturally, get_flickr_image_via_ddg
)
from foodie_hooks import get_dynamic_hook, select_best_cta

# Flag to indicate if we're in the middle of posting
is_posting = False


def signal_handler(sig, frame):
    logging.info("Received termination signal, checking if safe to exit...")
    if is_posting:
        logging.info("Currently posting, will exit after completion.")
    else:
        logging.info("Safe to exit immediately.")
        sys.exit(0)


signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
file_handler = logging.FileHandler('/home/shane/foodie_automator/foodie_automator_google.log', mode='a')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(console_handler)
logging.info("Logging initialized for foodie_automator_google.py")
# Read the OpenAI API key from the environment rather than hardcoding a secret in source.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data)
used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)


def parse_search_volume(volume_text):
    """Convert a Google Trends volume string such as '200K+' or '1M+' to a number."""
    try:
        volume_part = volume_text.split('\n')[0].lower().strip().replace('+', '')
        if 'k' in volume_part:
            volume = float(volume_part.replace('k', '')) * 1000
        elif 'm' in volume_part:
            volume = float(volume_part.replace('m', '')) * 1000000
        else:
            volume = float(volume_part)
        return volume
    except (ValueError, AttributeError) as e:
        logging.warning(f"Could not parse search volume from '{volume_text}': {e}")
        return 0


def scrape_google_trends(geo='US'):
    """Scrape the Google Trends trending table (category=5) for a region via headless Chrome."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        for attempt in range(3):
            try:
                time.sleep(random.uniform(2, 5))
                url = f"https://trends.google.com/trending?geo={geo}&hours=24&sort=search-volume&category=5"
                logging.info(f"Navigating to {url} (attempt {attempt + 1})")
                driver.get(url)
                logging.info("Waiting for page to load...")
                WebDriverWait(driver, 60).until(
                    EC.presence_of_element_located((By.TAG_NAME, "tbody"))
                )
                break
            except TimeoutException:
                logging.warning(f"Timeout on attempt {attempt + 1} for geo={geo}")
                if attempt == 2:
                    logging.error(f"Failed after 3 attempts for geo={geo}")
                    return []
                time.sleep(5)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        trends = []
        rows = driver.find_elements(By.XPATH, "//tbody/tr")
        logging.info(f"Found {len(rows)} rows in tbody for geo={geo}")
        cutoff_date = datetime.now(timezone.utc) - timedelta(hours=24)
        for row in rows:
            try:
                columns = row.find_elements(By.TAG_NAME, "td")
                if len(columns) >= 3:
                    title = columns[1].text.strip()
                    search_volume_text = columns[2].text.strip()
                    search_volume = parse_search_volume(search_volume_text)
                    logging.info(f"Parsed trend: {title} with search volume: {search_volume}")
                    if title and search_volume >= 20000:
                        link = f"https://trends.google.com/trends/explore?q={quote(title)}&geo={geo}"
                        trends.append({
                            "title": title,
                            "link": link,
                            "search_volume": search_volume
                        })
                        logging.info(f"Added trend: {title} with search volume: {search_volume}")
                    else:
                        logging.info(f"Skipping trend: {title} (volume: {search_volume} < 20K or no title)")
                else:
                    logging.info(f"Skipping row with insufficient columns: {len(columns)}")
            except Exception as e:
                logging.warning(f"Row processing error: {e}")
                continue
        if trends:
            trends.sort(key=lambda x: x["search_volume"], reverse=True)
            logging.info(f"Extracted {len(trends)} trends for geo={geo}: {[t['title'] for t in trends]}")
            print(f"Raw trends fetched for geo={geo}: {[t['title'] for t in trends]}")
        else:
            logging.warning(f"No valid trends found with search volume >= 20K for geo={geo}")
        return trends
    finally:
        driver.quit()
        logging.info(f"Chrome driver closed for geo={geo}")


def fetch_duckduckgo_news_context(trend_title, hours=24):
    """Pull recent DuckDuckGo News headlines for a trend to give the summarizer context."""
    try:
        with DDGS() as ddgs:
            results = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5)
            titles = []
            for r in results:
                try:
                    date_str = r["date"]
                    if '+00:00' in date_str:
                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
                    else:
                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
                    if dt > (datetime.now(timezone.utc) - timedelta(hours=hours)):
                        titles.append(r["title"].lower())
                except ValueError as e:
                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
                    continue
            context = " ".join(titles) if titles else "No recent news found within 24 hours"
            logging.info(f"DuckDuckGo News context for '{trend_title}': {context}")
            return context
    except Exception as e:
        logging.warning(f"DuckDuckGo News context fetch failed for '{trend_title}': {e}")
        return trend_title


def curate_from_google_trends(geo_list=['US']):
    """Pick the best qualifying trend per region, summarize it, and publish it to WordPress."""
    global is_posting
    original_source = '<a href="https://trends.google.com/">Google Trends</a>'
    for geo in geo_list:
        trends = scrape_google_trends(geo=geo)
        if not trends:
            print(f"No trends available for geo={geo}")
            logging.info(f"No trends available for geo={geo}")
            continue
        attempts = 0
        max_attempts = 10
        while attempts < max_attempts and trends:
            trend = trends.pop(0)  # Take highest-volume trend
            title = trend["title"]
            link = trend["link"]
            search_volume = trend["search_volume"]
            print(f"Trying Trend: {title} with search volume: {search_volume} for geo={geo}")
            logging.info(f"Trying Trend: {title} with search volume: {search_volume} for geo={geo}")
            if title in posted_titles:
                print(f"Skipping already posted trend: {title}")
                logging.info(f"Skipping already posted trend: {title}")
                attempts += 1
                continue
            image_query, relevance_keywords, skip = smart_image_and_filter(title, "")
            if skip:
                print(f"Skipping unwanted trend: {title}")
                logging.info(f"Skipping unwanted trend: {title}")
                attempts += 1
                continue
            context = fetch_duckduckgo_news_context(title)
            scoring_content = f"{title}\n\n{context}"
            interest_score = is_interesting(scoring_content)
            logging.info(f"Interest score for '{title}' in geo={geo}: {interest_score}")
            if interest_score < 6:
                print(f"Trend Interest Too Low: {interest_score}")
                logging.info(f"Trend Interest Too Low: {interest_score}")
                attempts += 1
                continue
            num_paragraphs = determine_paragraph_count(interest_score)
            extra_prompt = (
                f"Generate exactly {num_paragraphs} paragraphs. "
                f"Do not mention Google Trends, Google, or include any links. "
                f"Summarize as a standalone food industry trend, focusing on '{title}' and its context."
            )
            final_summary = summarize_with_gpt4o(
                scoring_content,
                source_name="Google Trends",
                source_url=link,
                interest_score=interest_score,
                extra_prompt=extra_prompt
            )
            if not final_summary:
                logging.info(f"Summary failed for '{title}'")
                attempts += 1
                continue
            final_summary = insert_link_naturally(final_summary, "Google Trends", link)
            post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
            if not post_data:
                attempts += 1
                continue
            image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
            if not image_url:
                image_url, image_source, uploader, page_url = get_image(image_query)
            hook = get_dynamic_hook(post_data["title"]).strip()
            cta = select_best_cta(post_data["title"], final_summary, post_url=None)
            post_data["content"] = f"{final_summary}\n\n{cta}"
            is_posting = True
            try:
                post_id, post_url = post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    pixabay_url=pixabay_url,
                    interest_score=interest_score
                )
            finally:
                is_posting = False
            if post_id:
                # Update the post so its CTA can reference the now-live post URL.
                cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
                post_data["content"] = f"{final_summary}\n\n{cta}"
                is_posting = True
                try:
                    post_to_wp(
                        post_data=post_data,
                        category=category,
                        link=link,
                        author=author,
                        image_url=image_url,
                        original_source=original_source,
                        image_source=image_source,
                        uploader=uploader,
                        pixabay_url=pixabay_url,
                        interest_score=interest_score,
                        post_id=post_id
                    )
                finally:
                    is_posting = False
                timestamp = datetime.now(timezone.utc).isoformat()
                save_json_file(POSTED_TITLES_FILE, title, timestamp)
                posted_titles.add(title)
                logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
                if image_url:
                    save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                    logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
                print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from trend for geo={geo} *****")
                logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from trend for geo={geo} *****")
                return post_data, category, random.randint(0, 1800)
        print(f"No interesting trend found for geo={geo}")
        logging.info(f"No interesting trend found for geo={geo}")
    print(f"No interesting trend found across regions {geo_list}")
    logging.info(f"No interesting trend found across regions {geo_list}")
    return None, None, random.randint(600, 1200)


def run_google_trends_automator():
    """Run one curation pass across the configured regions, then sleep for the returned interval."""
    logging.info("***** Google Trends Automator Launched *****")
    geo_list = ['US', 'GB', 'AU']
    post_data, category, sleep_time = curate_from_google_trends(geo_list=geo_list)
    print(f"Sleeping for {sleep_time}s")
    logging.info(f"Completed run with sleep time: {sleep_time} seconds")
    time.sleep(sleep_time)
    return post_data, category, sleep_time


if __name__ == "__main__":
    run_google_trends_automator()
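
# Hypothetical usage sketch (not part of the original script): because
# run_google_trends_automator() already sleeps for the randomized interval it returns,
# an outer scheduler or wrapper could simply call it in a loop to stagger posts, e.g.:
#
#     while True:
#         run_google_trends_automator()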