use cursor to optimize files

This commit is contained in:
2025-05-03 16:23:06 +10:00
parent 427a5cb919
commit 2ca39915e0
5 changed files with 1411 additions and 1634 deletions
+228 -292
View File
@@ -9,6 +9,7 @@ import json
import signal
import sys
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Optional, Tuple
from openai import OpenAI
from urllib.parse import quote
from selenium import webdriver
@@ -16,11 +17,12 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import TimeoutException, WebDriverException
from duckduckgo_search import DDGS
from foodie_config import (
AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
PERSONA_CONFIGS, CATEGORIES, get_clean_source_name, X_API_CREDENTIALS
PERSONA_CONFIGS, CATEGORIES, get_clean_source_name, X_API_CREDENTIALS,
FILE_PATHS, EXPIRATION_DAYS, IMAGE_EXPIRATION_DAYS
)
from foodie_utils import (
load_json_file, save_json_file, get_image, generate_image_query,
@@ -29,320 +31,254 @@ from foodie_utils import (
generate_category_from_summary, post_to_wp, prepare_post_data,
smart_image_and_filter, insert_link_naturally, get_flickr_image
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt # Removed select_best_cta import
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Global state
# is_posting is read by the signal handler to decide whether exiting right now is safe.
is_posting = False
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def signal_handler(sig, frame):
    """Log a termination request and exit unless a post is in progress.

    Args:
        sig: Signal number passed by the ``signal`` module (unused).
        frame: Current stack frame passed by the ``signal`` module (unused).

    Raises:
        SystemExit: With status 0 when ``is_posting`` is false.
    """
    logging.info("Received termination signal, checking if safe to exit...")
    if not is_posting:
        logging.info("Safe to exit immediately.")
        sys.exit(0)
    # A post is in flight: only record the request and let the caller finish.
    logging.info("Currently posting, will exit after completion.")
class GoogleTrendsScraper:
# Set up the scraper: logging and signal handlers first, then the OpenAI
# client and the sets of already-posted titles and already-used images.
def __init__(self):
# No browser session yet; scrape_google_trends calls setup_driver when this is falsy.
self.driver = None
# Logging is configured before the loaders below, which log on failure.
self.setup_logging()
self.setup_signal_handlers()
# API key is read from the OPENAI_API_KEY environment variable.
self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Both loaders return a set (empty when the backing file cannot be read).
self.posted_titles = self.load_posted_titles()
self.used_images = self.load_used_images()
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
def setup_logging(self) -> None:
    """Attach a file handler and a console handler to the module logger.

    The logger level is set to INFO. The log file path is
    ``FILE_PATHS["posted_google_titles"]`` with its suffix replaced by
    ``.log``, opened in append mode. Both handlers use the same
    ``time - level - message`` format.
    """
    logger.setLevel(logging.INFO)

    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    log_path = FILE_PATHS["posted_google_titles"].with_suffix('.log')

    # File handler first, console handler second (same order as before).
    for handler in (logging.FileHandler(log_path, mode='a'), logging.StreamHandler()):
        handler.setFormatter(logging.Formatter(log_format))
        logger.addHandler(handler)

    logger.info("Logging initialized for Google Trends scraper")
logger = logging.getLogger()
logger.setLevel(logging.INFO)
file_handler = logging.FileHandler('/home/shane/foodie_automator/foodie_automator_google.log', mode='a')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(console_handler)
logging.info("Logging initialized for foodie_automator_google.py")
def setup_signal_handlers(self) -> None:
"""Set up signal handlers for graceful shutdown."""
# Handler registered below for both SIGTERM and SIGINT.
def signal_handler(sig, frame):
logger.info("Received termination signal, checking if safe to exit...")
# is_posting is the module-level flag, not an attribute of this instance.
if is_posting:
logger.info("Currently posting, will exit after completion.")
else:
logger.info("Safe to exit immediately.")
sys.exit(0)
# NOTE(review): this assignment repeats the client creation done in __init__
# (self.client) and the name is not used in this method; it looks like a
# leftover from the module-level version. Confirm and remove.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7
def load_posted_titles(self) -> set:
    """Return the set of titles recorded as already posted.

    Entries are read with ``load_json_file`` from
    ``FILE_PATHS["posted_google_titles"]`` (passing ``EXPIRATION_DAYS``) and
    the ``"title"`` of every entry is collected. Any exception is logged and
    an empty set is returned instead.
    """
    try:
        entries = load_json_file(FILE_PATHS["posted_google_titles"], EXPIRATION_DAYS)
        titles = set()
        for entry in entries:
            titles.add(entry["title"])
    except Exception as e:
        logger.error(f"Error loading posted titles: {e}")
        return set()
    return titles
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data)
used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)
def load_used_images(self) -> set:
    """Return the set of ``"title"`` values recorded for used images.

    Entries are read with ``load_json_file`` from ``FILE_PATHS["used_images"]``
    (passing ``IMAGE_EXPIRATION_DAYS``); entries without a ``"title"`` key are
    ignored. Any exception is logged and an empty set is returned instead.
    """
    try:
        records = load_json_file(FILE_PATHS["used_images"], IMAGE_EXPIRATION_DAYS)
        return set(record["title"] for record in records if "title" in record)
    except Exception as e:
        logger.error(f"Error loading used images: {e}")
        return set()
def parse_search_volume(volume_text):
    """Convert a search-volume label such as "20K+" into a number.

    Only the first line of ``volume_text`` is used. It is lower-cased,
    stripped, and has ``+`` removed; a ``k`` multiplies the remaining number
    by 1,000 and an ``m`` by 1,000,000 (``k`` is checked first).

    Returns:
        The parsed volume as a float, or 0 (with a logged warning) when the
        text is not a string or does not contain a parseable number.
    """
    try:
        first_line = volume_text.split('\n')[0]
        cleaned = first_line.lower().strip().replace('+', '')
        for suffix, multiplier in (('k', 1000), ('m', 1000000)):
            if suffix in cleaned:
                return float(cleaned.replace(suffix, '')) * multiplier
        return float(cleaned)
    except (ValueError, AttributeError) as e:
        logging.warning(f"Could not parse search volume from '{volume_text}': {e}")
        return 0
def parse_search_volume(self, volume_text: str) -> float:
    """Parse search volume from text into a numeric value.

    Only the first line of ``volume_text`` is used. It is lower-cased and
    stripped; ``+`` and thousands separators (``,``) are removed. A ``k``
    multiplies the remaining number by 1,000 and an ``m`` by 1,000,000
    (``k`` is checked first).

    Args:
        volume_text: Label text such as ``"20K+"``, ``"2M+"`` or ``"1,000+"``.

    Returns:
        The parsed volume, or ``0.0`` (with a logged warning) when the text
        is not a string or does not contain a parseable number.
    """
    try:
        volume_part = volume_text.split('\n')[0].lower().strip()
        # Drop "+" and thousands separators so "1,000+" parses as 1000
        # instead of failing and silently becoming 0.0.
        volume_part = volume_part.replace('+', '').replace(',', '')
        if 'k' in volume_part:
            return float(volume_part.replace('k', '')) * 1000
        elif 'm' in volume_part:
            return float(volume_part.replace('m', '')) * 1000000
        return float(volume_part)
    except (ValueError, AttributeError) as e:
        logger.warning(f"Could not parse search volume from '{volume_text}': {e}")
        return 0.0
def scrape_google_trends(geo='US'):
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36")
def setup_driver(self) -> None:
    """Create a Chrome WebDriver and store it in ``self.driver``.

    The browser is started with a fixed argument list: headless mode, no
    sandbox, ``/dev/shm`` usage disabled, and a custom user-agent string.
    """
    chrome_options = Options()
    for argument in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36",
    ):
        chrome_options.add_argument(argument)
    self.driver = webdriver.Chrome(options=chrome_options)
driver = webdriver.Chrome(options=chrome_options)
try:
for attempt in range(3):
try:
time.sleep(random.uniform(2, 5))
url = f"https://trends.google.com/trending?geo={geo}&hours=24&sort=search-volume&category=5"
logging.info(f"Navigating to {url} (attempt {attempt + 1})")
driver.get(url)
logging.info("Waiting for page to load...")
WebDriverWait(driver, 60).until(
EC.presence_of_element_located((By.TAG_NAME, "tbody"))
)
break
except TimeoutException:
logging.warning(f"Timeout on attempt {attempt + 1} for geo={geo}")
if attempt == 2:
logging.error(f"Failed after 3 attempts for geo={geo}")
return []
time.sleep(5)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
def scrape_google_trends(self, geo: str = 'US') -> List[Dict]:
"""Scrape Google Trends for the specified region."""
if not self.driver:
self.setup_driver()
trends = []
rows = driver.find_elements(By.XPATH, "//tbody/tr")
logging.info(f"Found {len(rows)} rows in tbody for geo={geo}")
try:
for attempt in range(3):
try:
time.sleep(random.uniform(2, 5))
url = f"https://trends.google.com/trending?geo={geo}&hours=24&sort=search-volume&category=5"
logger.info(f"Navigating to {url} (attempt {attempt + 1})")
self.driver.get(url)
cutoff_date = datetime.now(timezone.utc) - timedelta(hours=24)
for row in rows:
try:
columns = row.find_elements(By.TAG_NAME, "td")
if len(columns) >= 3:
title = columns[1].text.strip()
search_volume_text = columns[2].text.strip()
search_volume = parse_search_volume(search_volume_text)
logging.info(f"Parsed trend: {title} with search volume: {search_volume}")
if title and search_volume >= 20000:
link = f"https://trends.google.com/trends/explore?q={quote(title)}&geo={geo}"
trends.append({
"title": title,
"link": link,
"search_volume": search_volume
})
logging.info(f"Added trend: {title} with search volume: {search_volume}")
else:
logging.info(f"Skipping trend: {title} (volume: {search_volume} < 20K or no title)")
else:
logging.info(f"Skipping row with insufficient columns: {len(columns)}")
except Exception as e:
logging.warning(f"Row processing error: {e}")
logger.info("Waiting for page to load...")
WebDriverWait(self.driver, 60).until(
EC.presence_of_element_located((By.TAG_NAME, "tbody"))
)
break
except TimeoutException:
logger.warning(f"Timeout on attempt {attempt + 1} for geo={geo}")
if attempt == 2:
logger.error(f"Failed after 3 attempts for geo={geo}")
return []
time.sleep(5)
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
rows = self.driver.find_elements(By.XPATH, "//tbody/tr")
logger.info(f"Found {len(rows)} rows in tbody for geo={geo}")
for row in rows:
try:
columns = row.find_elements(By.TAG_NAME, "td")
if len(columns) >= 3:
title = columns[1].text.strip()
search_volume = self.parse_search_volume(columns[2].text.strip())
if title and search_volume >= 20000:
link = f"https://trends.google.com/trends/explore?q={quote(title)}&geo={geo}"
trends.append({
"title": title,
"link": link,
"search_volume": search_volume
})
logger.info(f"Added trend: {title} with search volume: {search_volume}")
except Exception as e:
logger.warning(f"Row processing error: {e}")
continue
if trends:
trends.sort(key=lambda x: x["search_volume"], reverse=True)
logger.info(f"Extracted {len(trends)} trends for geo={geo}")
else:
logger.warning(f"No valid trends found with search volume >= 20K for geo={geo}")
except WebDriverException as e:
logger.error(f"WebDriver error: {e}")
finally:
if self.driver:
self.driver.quit()
self.driver = None
logger.info(f"Chrome driver closed for geo={geo}")
return trends
def fetch_duckduckgo_news_context(self, trend_title: str, hours: int = 24) -> str:
    """Build a news-context string for a trend from DuckDuckGo News.

    Queries ``"<trend_title> news"`` (up to 5 results) and joins the
    lower-cased titles of results whose ``"date"`` is newer than ``hours``
    hours ago. Results whose date does not match either expected timestamp
    layout are skipped with a warning.

    Returns:
        The space-joined titles, a fixed "no recent news" message when none
        qualify, or ``trend_title`` itself when the lookup raises.
    """
    try:
        with DDGS() as ddgs:
            articles = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5)
            recent_titles = []
            for article in articles:
                try:
                    date_str = article["date"]
                    # Two timestamp layouts are handled: "+00:00" and "Z" suffixes.
                    if '+00:00' in date_str:
                        date_format = "%Y-%m-%dT%H:%M:%S+00:00"
                    else:
                        date_format = "%Y-%m-%dT%H:%M:%SZ"
                    published = datetime.strptime(date_str, date_format).replace(tzinfo=timezone.utc)
                    cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
                    if published > cutoff:
                        recent_titles.append(article["title"].lower())
                except ValueError as e:
                    logger.warning(f"Date parsing failed for '{date_str}': {e}")
            if recent_titles:
                context = " ".join(recent_titles)
            else:
                context = "No recent news found within 24 hours"
            logger.info(f"DuckDuckGo News context for '{trend_title}': {context}")
            return context
    except Exception as e:
        logger.warning(f"DuckDuckGo News context fetch failed for '{trend_title}': {e}")
        return trend_title
def curate_from_google_trends(self, geo_list: List[str] = ['US']) -> Tuple[Optional[Dict], Optional[str], int]:
"""Curate content from Google Trends for multiple regions."""
all_trends = []
for geo in geo_list:
trends = self.scrape_google_trends(geo=geo)
if trends:
all_trends.extend(trends)
if not all_trends:
logger.info("No Google Trends data available")
return None, None, random.randint(600, 1800)
for trend in all_trends:
title = trend["title"]
if title in self.posted_titles:
logger.info(f"Skipping already posted trend: {title}")
continue
if trends:
trends.sort(key=lambda x: x["search_volume"], reverse=True)
logging.info(f"Extracted {len(trends)} trends for geo={geo}: {[t['title'] for t in trends]}")
print(f"Raw trends fetched for geo={geo}: {[t['title'] for t in trends]}")
else:
logging.warning(f"No valid trends found with search volume >= 20K for geo={geo}")
return trends
finally:
driver.quit()
logging.info(f"Chrome driver closed for geo={geo}")
logger.info(f"Processing Google Trend: {title}")
image_query, relevance_keywords, skip = smart_image_and_filter(title, trend.get("summary", ""))
if skip:
logger.info(f"Skipping filtered Google Trend: {title}")
continue
def fetch_duckduckgo_news_context(trend_title, hours=24):
    """Build a news-context string for a trend from DuckDuckGo News.

    Queries ``"<trend_title> news"`` (up to 5 results) and joins the
    lower-cased titles of results published within the last ``hours`` hours.
    Results whose date does not match either expected timestamp layout are
    skipped with a warning.

    Args:
        trend_title: Trend text used to build the query.
        hours: Size of the recency window in hours. Previously this argument
            was accepted but ignored in favour of a hard-coded 24.

    Returns:
        The space-joined titles, a "no recent news" message when none
        qualify, or ``trend_title`` itself when the lookup raises.
    """
    try:
        with DDGS() as ddgs:
            # NOTE(review): the search is requested with timelimit="d", so a
            # window larger than one day presumably cannot add results;
            # confirm against the duckduckgo_search documentation.
            results = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5)
            # Honour the caller's window instead of a fixed 24 hours.
            cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
            titles = []
            for r in results:
                try:
                    date_str = r["date"]
                    if '+00:00' in date_str:
                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
                    else:
                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
                    if dt > cutoff:
                        titles.append(r["title"].lower())
                except ValueError as e:
                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
                    continue
            # With the default hours=24 this message is unchanged.
            context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
            logging.info(f"DuckDuckGo News context for '{trend_title}': {context}")
            return context
    except Exception as e:
        logging.warning(f"DuckDuckGo News context fetch failed for '{trend_title}': {e}")
        return trend_title
scoring_content = f"{title}\n\n{trend.get('summary', '')}"
interest_score = is_interesting(scoring_content)
if interest_score < 6:
logger.info(f"Google Trends Interest Too Low: {interest_score}")
continue
num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n"
f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
f"Do NOT introduce unrelated concepts.\n"
f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
f"Do not include emojis in the summary."
)
final_summary = summarize_with_gpt4o(
scoring_content,
"Google Trends",
trend["link"],
interest_score=interest_score,
extra_prompt=extra_prompt
)
if not final_summary:
logger.info(f"Summary failed for '{title}'")
continue
final_summary = insert_link_naturally(final_summary, "Google Trends", trend["link"])
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
if post_data and author:
return post_data, author, random.randint(600, 1800)
# Scrape trends for every geo in geo_list, then work through the candidates
# (at most max_attempts of them) until one is summarised and posted to WordPress.
# Returns (post_data, category, sleep_seconds) after a successful post and
# (None, None, sleep_seconds) when nothing was posted.
# NOTE(review): the list default is shared between calls; it is only iterated
# here, never mutated.
def curate_from_google_trends(geo_list=['US']):
all_trends = []
for geo in geo_list:
trends = scrape_google_trends(geo=geo)
if trends:
all_trends.extend(trends)
if not all_trends:
print("No Google Trends data available")
logging.info("No Google Trends data available")
return None, None, random.randint(600, 1800)
attempts = 0
max_attempts = 10
# Candidates are consumed from the front of the combined list.
while attempts < max_attempts and all_trends:
trend = all_trends.pop(0)
title = trend["title"]
link = trend.get("link", "https://trends.google.com/")
summary = trend.get("summary", "")
source_name = "Google Trends"
original_source = f'<a href="{link}">{source_name}</a>'
# posted_titles is the module-level set of titles already published.
if title in posted_titles:
print(f"Skipping already posted trend: {title}")
logging.info(f"Skipping already posted trend: {title}")
attempts += 1
continue
print(f"Trying Google Trend: {title} from {source_name}")
logging.info(f"Trying Google Trend: {title} from {source_name}")
image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
if skip:
print(f"Skipping filtered Google Trend: {title}")
logging.info(f"Skipping filtered Google Trend: {title}")
attempts += 1
continue
scoring_content = f"{title}\n\n{summary}"
interest_score = is_interesting(scoring_content)
logging.info(f"Interest score for '{title}': {interest_score}")
# Scores below 6 are rejected.
if interest_score < 6:
print(f"Google Trends Interest Too Low: {interest_score}")
logging.info(f"Google Trends Interest Too Low: {interest_score}")
attempts += 1
continue
num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n"
f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
f"Do NOT introduce unrelated concepts.\n"
f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
f"Do not include emojis in the summary."
)
content_to_summarize = scoring_content
final_summary = summarize_with_gpt4o(
content_to_summarize,
source_name,
link,
interest_score=interest_score,
extra_prompt=extra_prompt
)
if not final_summary:
logging.info(f"Summary failed for '{title}'")
attempts += 1
continue
final_summary = insert_link_naturally(final_summary, source_name, link)
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
if not post_data:
attempts += 1
continue
# The image fields returned by prepare_post_data are overwritten here:
# Flickr is tried first, get_image is the fallback when no URL came back.
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
if not image_url:
image_url, image_source, uploader, page_url = get_image(image_query)
hook = get_dynamic_hook(post_data["title"]).strip()
# Generate viral share prompt
share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
# The doubled braces leave literal {post_url} / {share_text} placeholders in
# the template; they are filled in with .format() once the post URL is known.
# NOTE(review): a brace inside share_prompt would break that later .format() call.
share_links_template = (
f'<p>{share_prompt} '
f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
)
post_data["content"] = f"{final_summary}\n\n{share_links_template}"
# is_posting is raised around each post_to_wp call and reset in finally.
global is_posting
is_posting = True
try:
# First call: content still carries the unformatted share-link template.
post_id, post_url = post_to_wp(
post_data=post_data,
category=category,
link=link,
author=author,
image_url=image_url,
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
interest_score=interest_score,
should_post_tweet=True
)
finally:
is_posting = False
if post_id:
share_text = f"Check out this foodie gem! {post_data['title']}"
share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
# Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
is_posting = True
try:
# Second call passes post_id and should_post_tweet=False; presumably this
# updates the post created above with the final share links.
post_to_wp(
post_data=post_data,
category=category,
link=link,
author=author,
image_url=image_url,
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
interest_score=interest_score,
post_id=post_id,
should_post_tweet=False
)
finally:
is_posting = False
# Record the title (and image, if any) both on disk and in the in-memory sets.
timestamp = datetime.now(timezone.utc).isoformat()
save_json_file(POSTED_TITLES_FILE, title, timestamp)
posted_titles.add(title)
logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
if image_url:
save_json_file(USED_IMAGES_FILE, image_url, timestamp)
used_images.add(image_url)
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Google Trends *****")
logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Google Trends *****")
return post_data, category, random.randint(0, 1800)
# Presumably reached only when post_to_wp returned a falsy post_id.
attempts += 1
logging.info(f"WP posting failed for '{post_data['title']}'")
print("No interesting Google Trend found after attempts")
logging.info("No interesting Google Trend found after attempts")
return None, None, random.randint(600, 1800)
# NOTE(review): this span appears to hold two variants of the body back to
# back: a single-pass version that ends at the first `return`, followed by a
# docstring and a `while True` loop built on GoogleTrendsScraper. Confirm
# which variant is current before editing.
def run_google_trends_automator():
logging.info("***** Google Trends Automator Launched *****")
# Regions handed to the curator in the single-pass variant.
geo_list = ['US', 'GB', 'AU']
post_data, category, sleep_time = curate_from_google_trends(geo_list=geo_list)
# Fall back to a random 600-1800 second pause when no sleep time came back.
if sleep_time is None:
sleep_time = random.randint(600, 1800)
print(f"Sleeping for {sleep_time}s")
logging.info(f"Completed run with sleep time: {sleep_time} seconds")
time.sleep(sleep_time)
return post_data, category, sleep_time
"""Main function to run the Google Trends automator."""
scraper = GoogleTrendsScraper()
while True:
try:
# No geo_list is passed, so the method's own default is used.
post_data, author, sleep_time = scraper.curate_from_google_trends()
if post_data and author:
# is_posting is raised for the duration of the post and reset in finally.
global is_posting
is_posting = True
try:
post_to_wp(post_data, author)
logger.info(f"Successfully posted: {post_data['title']}")
finally:
is_posting = False
time.sleep(sleep_time)
# Any failure in a cycle is logged and followed by a fixed pause.
except Exception as e:
logger.error(f"Error in Google Trends automator: {e}")
time.sleep(300) # Wait 5 minutes before retrying
if __name__ == "__main__":
run_google_trends_automator()