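"""Scrape trending food topics from Google Trends, enrich them with DuckDuckGo
News context, summarize them with GPT-4o, and publish the result to WordPress."""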
import requests
import random
import time
import logging
import re
import os
import json
import signal
import sys
from datetime import datetime, timedelta, timezone

from openai import OpenAI
from urllib.parse import quote
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from duckduckgo_search import DDGS

from foodie_config import (
    AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
    SUMMARY_PERSONA_PROMPTS, CATEGORIES, CTAS, get_clean_source_name
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, select_best_persona, determine_paragraph_count, is_interesting,
    generate_title_from_summary, summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
    prepare_post_data, smart_image_and_filter, insert_link_naturally, get_flickr_image_via_ddg
)
from foodie_hooks import get_dynamic_hook, select_best_cta


# Flag to indicate if we're in the middle of posting
is_posting = False


def signal_handler(sig, frame):
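    """Exit immediately on SIGTERM/SIGINT when idle; if a post is mid-flight,
    just log so the in-progress WordPress update can finish."""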
    logging.info("Received termination signal, checking if safe to exit...")
    if is_posting:
        logging.info("Currently posting, will exit after completion.")
    else:
        logging.info("Safe to exit immediately.")
        sys.exit(0)


signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
file_handler = logging.FileHandler('/home/shane/foodie_automator/foodie_automator_google_trends.log', mode='a')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(console_handler)
logging.info("Logging initialized for foodie_automator_google.py")

# Read the OpenAI API key from the environment instead of hard-coding a secret.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7

# Dedupe caches, persisted as JSON, so restarts don't repeat recent titles or images.
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data)
used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)


def parse_search_volume(volume_text):
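    """Convert a display string like '200K+' or '2M+' into a numeric count.

    Returns 0 when the text cannot be parsed.
    """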
    try:
        volume_part = volume_text.split('\n')[0].lower().strip().replace('+', '')
        if 'k' in volume_part:
            volume = float(volume_part.replace('k', '')) * 1000
        elif 'm' in volume_part:
            volume = float(volume_part.replace('m', '')) * 1000000
        else:
            volume = float(volume_part)
        return volume
    except (ValueError, AttributeError) as e:
        logging.warning(f"Could not parse search volume from '{volume_text}': {e}")
        return 0


def scrape_google_trends(geo='US'):
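    """Scrape Google Trends' trending-now table for a region with headless Chrome.

    Returns a list of {title, link, search_volume} dicts for trends with at
    least 20K searches, sorted by search volume, highest first.
    """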
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Up to three attempts; the random delay keeps request timing less bot-like.
        for attempt in range(3):
            try:
                time.sleep(random.uniform(2, 5))
                url = f"https://trends.google.com/trending?geo={geo}&hours=24&sort=search-volume&category=5"
                logging.info(f"Navigating to {url} (attempt {attempt + 1})")
                driver.get(url)

                logging.info("Waiting for page to load...")
                WebDriverWait(driver, 60).until(
                    EC.presence_of_element_located((By.TAG_NAME, "tbody"))
                )
                break
            except TimeoutException:
                logging.warning(f"Timeout on attempt {attempt + 1} for geo={geo}")
                if attempt == 2:
                    logging.error(f"Failed after 3 attempts for geo={geo}")
                    return []
                time.sleep(5)

        # Scroll to the bottom so lazily rendered rows are present before parsing.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        trends = []
        rows = driver.find_elements(By.XPATH, "//tbody/tr")
        logging.info(f"Found {len(rows)} rows in tbody for geo={geo}")

        for row in rows:
            try:
                columns = row.find_elements(By.TAG_NAME, "td")
                if len(columns) >= 3:
                    title = columns[1].text.strip()
                    search_volume_text = columns[2].text.strip()
                    search_volume = parse_search_volume(search_volume_text)
                    logging.info(f"Parsed trend: {title} with search volume: {search_volume}")
                    if title and search_volume >= 20000:
                        link = f"https://trends.google.com/trends/explore?q={quote(title)}&geo={geo}"
                        trends.append({
                            "title": title,
                            "link": link,
                            "search_volume": search_volume
                        })
                        logging.info(f"Added trend: {title} with search volume: {search_volume}")
                    else:
                        logging.info(f"Skipping trend: {title} (volume: {search_volume} < 20K or no title)")
                else:
                    logging.info(f"Skipping row with insufficient columns: {len(columns)}")
            except Exception as e:
                logging.warning(f"Row processing error: {e}")
                continue

        if trends:
            trends.sort(key=lambda x: x["search_volume"], reverse=True)
            logging.info(f"Extracted {len(trends)} trends for geo={geo}: {[t['title'] for t in trends]}")
            print(f"Raw trends fetched for geo={geo}: {[t['title'] for t in trends]}")
        else:
            logging.warning(f"No valid trends found with search volume >= 20K for geo={geo}")
        return trends
    finally:
        driver.quit()
        logging.info(f"Chrome driver closed for geo={geo}")


def fetch_duckduckgo_news_context(trend_title, hours=24):
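    """Gather DuckDuckGo News headlines from the last `hours` hours for a trend.

    Returns the joined headlines as context text; falls back to the bare
    trend title if the lookup fails.
    """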
    try:
        with DDGS() as ddgs:
            results = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5)
            titles = []
            for r in results:
                try:
                    date_str = r["date"]
                    if '+00:00' in date_str:
                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
                    else:
                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
                    if dt > (datetime.now(timezone.utc) - timedelta(hours=hours)):
                        titles.append(r["title"].lower())
                except ValueError as e:
                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
                    continue
            context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
            logging.info(f"DuckDuckGo News context for '{trend_title}': {context}")
            return context
    except Exception as e:
        logging.warning(f"DuckDuckGo News context fetch failed for '{trend_title}': {e}")
        return trend_title


def curate_from_google_trends(geo_list=['US']):
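    """Try each region in geo_list until one trend survives every filter and posts.

    Returns (post_data, category, sleep_seconds), or (None, None, sleep_seconds)
    when nothing was worth posting.
    """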
    original_source = '<a href="https://trends.google.com/">Google Trends</a>'
    for geo in geo_list:
        trends = scrape_google_trends(geo=geo)
        if not trends:
            print(f"No trends available for geo={geo}")
            logging.info(f"No trends available for geo={geo}")
            continue

        attempts = 0
        max_attempts = 10
        while attempts < max_attempts and trends:
            trend = trends.pop(0)  # Take highest-volume trend
            title = trend["title"]
            link = trend["link"]
            search_volume = trend["search_volume"]
            print(f"Trying Trend: {title} with search volume: {search_volume} for geo={geo}")
            logging.info(f"Trying Trend: {title} with search volume: {search_volume} for geo={geo}")

            if title in posted_titles:
                print(f"Skipping already posted trend: {title}")
                logging.info(f"Skipping already posted trend: {title}")
                attempts += 1
                continue

            image_query, relevance_keywords, skip = smart_image_and_filter(title, "")
            if skip:
                print(f"Skipping unwanted trend: {title}")
                logging.info(f"Skipping unwanted trend: {title}")
                attempts += 1
                continue

            context = fetch_duckduckgo_news_context(title)
            scoring_content = f"{title}\n\n{context}"
            interest_score = is_interesting(scoring_content)
            logging.info(f"Interest score for '{title}' in geo={geo}: {interest_score}")
            if interest_score < 6:
                print(f"Trend Interest Too Low: {interest_score}")
                logging.info(f"Trend Interest Too Low: {interest_score}")
                attempts += 1
                continue

            num_paragraphs = determine_paragraph_count(interest_score)
            extra_prompt = (
                f"Generate exactly {num_paragraphs} paragraphs. "
                f"Do not mention Google Trends, Google, or include any links. "
                f"Summarize as a standalone food industry trend, focusing on '{title}' and its context."
            )
            final_summary = summarize_with_gpt4o(
                scoring_content,
                source_name="Google Trends",
                source_url=link,
                interest_score=interest_score,
                extra_prompt=extra_prompt
            )
            if not final_summary:
                logging.info(f"Summary failed for '{title}'")
                attempts += 1
                continue

            final_summary = insert_link_naturally(final_summary, "Google Trends", link)
            post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
            if not post_data:
                attempts += 1
                continue

            # Prefer a Flickr image found via DuckDuckGo; fall back to the generic image search.
            image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
            if not image_url:
                image_url, image_source, uploader, page_url = get_image(image_query)

            hook = get_dynamic_hook(post_data["title"]).strip()
            cta = select_best_cta(post_data["title"], final_summary, post_url=None)
            post_data["content"] = f"{final_summary}\n\n{cta}"

            # First pass: publish the post so WordPress assigns an ID and URL.
            global is_posting
            is_posting = True
            try:
                post_id, post_url = post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    pixabay_url=pixabay_url,
                    interest_score=interest_score
                )
            finally:
                is_posting = False

            if post_id:
                # Second pass: rebuild the CTA with the real permalink and update the post.
                cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
                post_data["content"] = f"{final_summary}\n\n{cta}"
                is_posting = True
                try:
                    post_to_wp(
                        post_data=post_data,
                        category=category,
                        link=link,
                        author=author,
                        image_url=image_url,
                        original_source=original_source,
                        image_source=image_source,
                        uploader=uploader,
                        pixabay_url=pixabay_url,
                        interest_score=interest_score,
                        post_id=post_id
                    )
                finally:
                    is_posting = False

                # Record the title and image so future runs skip them.
                timestamp = datetime.now(timezone.utc).isoformat()
                save_json_file(POSTED_TITLES_FILE, title, timestamp)
                posted_titles.add(title)
                logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")

                if image_url:
                    save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                    logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")

                print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from trend for geo={geo} *****")
                logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from trend for geo={geo} *****")
                return post_data, category, random.randint(0, 1800)

            attempts += 1

        print(f"No interesting trend found for geo={geo}")
        logging.info(f"No interesting trend found for geo={geo}")

    print(f"No interesting trend found across regions {geo_list}")
    logging.info(f"No interesting trend found across regions {geo_list}")
    return None, None, random.randint(600, 1200)


def run_google_trends_automator():
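    """Run one curation pass across the configured regions, then sleep before returning."""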
    logging.info("***** Google Trends Automator Launched *****")
    geo_list = ['US', 'GB', 'AU']
    post_data, category, sleep_time = curate_from_google_trends(geo_list=geo_list)
    print(f"Sleeping for {sleep_time}s")
    logging.info(f"Completed run with sleep time: {sleep_time} seconds")
    time.sleep(sleep_time)
    return post_data, category, sleep_time


if __name__ == "__main__":
    run_google_trends_automator()