use cursor to optimize files

This commit is contained in:
2025-05-03 16:23:06 +10:00
parent 427a5cb919
commit 2ca39915e0
5 changed files with 1411 additions and 1634 deletions
+136 -200
View File
@@ -9,6 +9,7 @@ import json
import signal import signal
import sys import sys
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
from typing import List, Dict, Optional, Tuple
from openai import OpenAI from openai import OpenAI
from urllib.parse import quote from urllib.parse import quote
from selenium import webdriver from selenium import webdriver
@@ -16,11 +17,12 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException from selenium.common.exceptions import TimeoutException, WebDriverException
from duckduckgo_search import DDGS from duckduckgo_search import DDGS
from foodie_config import ( from foodie_config import (
AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
PERSONA_CONFIGS, CATEGORIES, get_clean_source_name, X_API_CREDENTIALS PERSONA_CONFIGS, CATEGORIES, get_clean_source_name, X_API_CREDENTIALS,
FILE_PATHS, EXPIRATION_DAYS, IMAGE_EXPIRATION_DAYS
) )
from foodie_utils import ( from foodie_utils import (
load_json_file, save_json_file, get_image, generate_image_query, load_json_file, save_json_file, get_image, generate_image_query,
@@ -29,103 +31,127 @@ from foodie_utils import (
generate_category_from_summary, post_to_wp, prepare_post_data, generate_category_from_summary, post_to_wp, prepare_post_data,
smart_image_and_filter, insert_link_naturally, get_flickr_image smart_image_and_filter, insert_link_naturally, get_flickr_image
) )
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt # Removed select_best_cta import from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
from dotenv import load_dotenv from dotenv import load_dotenv
# Load environment variables
load_dotenv() load_dotenv()
# Global state
is_posting = False is_posting = False
logger = logging.getLogger(__name__)
def signal_handler(sig, frame): class GoogleTrendsScraper:
logging.info("Received termination signal, checking if safe to exit...") def __init__(self):
self.driver = None
self.setup_logging()
self.setup_signal_handlers()
self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
self.posted_titles = self.load_posted_titles()
self.used_images = self.load_used_images()
def setup_logging(self) -> None:
"""Configure logging for the scraper."""
logger.setLevel(logging.INFO)
file_handler = logging.FileHandler(FILE_PATHS["posted_google_titles"].with_suffix('.log'), mode='a')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(console_handler)
logger.info("Logging initialized for Google Trends scraper")
def setup_signal_handlers(self) -> None:
"""Set up signal handlers for graceful shutdown."""
def signal_handler(sig, frame):
logger.info("Received termination signal, checking if safe to exit...")
if is_posting: if is_posting:
logging.info("Currently posting, will exit after completion.") logger.info("Currently posting, will exit after completion.")
else: else:
logging.info("Safe to exit immediately.") logger.info("Safe to exit immediately.")
sys.exit(0) sys.exit(0)
signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGINT, signal_handler)
logger = logging.getLogger() def load_posted_titles(self) -> set:
logger.setLevel(logging.INFO) """Load and return the set of posted titles."""
file_handler = logging.FileHandler('/home/shane/foodie_automator/foodie_automator_google.log', mode='a') try:
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) data = load_json_file(FILE_PATHS["posted_google_titles"], EXPIRATION_DAYS)
logger.addHandler(file_handler) return {entry["title"] for entry in data}
console_handler = logging.StreamHandler() except Exception as e:
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) logger.error(f"Error loading posted titles: {e}")
logger.addHandler(console_handler) return set()
logging.info("Logging initialized for foodie_automator_google.py")
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) def load_used_images(self) -> set:
"""Load and return the set of used images."""
try:
data = load_json_file(FILE_PATHS["used_images"], IMAGE_EXPIRATION_DAYS)
return {entry["title"] for entry in data if "title" in entry}
except Exception as e:
logger.error(f"Error loading used images: {e}")
return set()
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json' def parse_search_volume(self, volume_text: str) -> float:
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json' """Parse search volume from text into a numeric value."""
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data)
used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)
def parse_search_volume(volume_text):
try: try:
volume_part = volume_text.split('\n')[0].lower().strip().replace('+', '') volume_part = volume_text.split('\n')[0].lower().strip().replace('+', '')
if 'k' in volume_part: if 'k' in volume_part:
volume = float(volume_part.replace('k', '')) * 1000 return float(volume_part.replace('k', '')) * 1000
elif 'm' in volume_part: elif 'm' in volume_part:
volume = float(volume_part.replace('m', '')) * 1000000 return float(volume_part.replace('m', '')) * 1000000
else: return float(volume_part)
volume = float(volume_part)
return volume
except (ValueError, AttributeError) as e: except (ValueError, AttributeError) as e:
logging.warning(f"Could not parse search volume from '{volume_text}': {e}") logger.warning(f"Could not parse search volume from '{volume_text}': {e}")
return 0 return 0.0
def scrape_google_trends(geo='US'): def setup_driver(self) -> None:
"""Set up the Chrome WebDriver with appropriate options."""
chrome_options = Options() chrome_options = Options()
chrome_options.add_argument("--headless") chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36") chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36")
self.driver = webdriver.Chrome(options=chrome_options)
driver = webdriver.Chrome(options=chrome_options) def scrape_google_trends(self, geo: str = 'US') -> List[Dict]:
"""Scrape Google Trends for the specified region."""
if not self.driver:
self.setup_driver()
trends = []
try: try:
for attempt in range(3): for attempt in range(3):
try: try:
time.sleep(random.uniform(2, 5)) time.sleep(random.uniform(2, 5))
url = f"https://trends.google.com/trending?geo={geo}&hours=24&sort=search-volume&category=5" url = f"https://trends.google.com/trending?geo={geo}&hours=24&sort=search-volume&category=5"
logging.info(f"Navigating to {url} (attempt {attempt + 1})") logger.info(f"Navigating to {url} (attempt {attempt + 1})")
driver.get(url) self.driver.get(url)
logging.info("Waiting for page to load...") logger.info("Waiting for page to load...")
WebDriverWait(driver, 60).until( WebDriverWait(self.driver, 60).until(
EC.presence_of_element_located((By.TAG_NAME, "tbody")) EC.presence_of_element_located((By.TAG_NAME, "tbody"))
) )
break break
except TimeoutException: except TimeoutException:
logging.warning(f"Timeout on attempt {attempt + 1} for geo={geo}") logger.warning(f"Timeout on attempt {attempt + 1} for geo={geo}")
if attempt == 2: if attempt == 2:
logging.error(f"Failed after 3 attempts for geo={geo}") logger.error(f"Failed after 3 attempts for geo={geo}")
return [] return []
time.sleep(5) time.sleep(5)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2) time.sleep(2)
trends = [] rows = self.driver.find_elements(By.XPATH, "//tbody/tr")
rows = driver.find_elements(By.XPATH, "//tbody/tr") logger.info(f"Found {len(rows)} rows in tbody for geo={geo}")
logging.info(f"Found {len(rows)} rows in tbody for geo={geo}")
cutoff_date = datetime.now(timezone.utc) - timedelta(hours=24)
for row in rows: for row in rows:
try: try:
columns = row.find_elements(By.TAG_NAME, "td") columns = row.find_elements(By.TAG_NAME, "td")
if len(columns) >= 3: if len(columns) >= 3:
title = columns[1].text.strip() title = columns[1].text.strip()
search_volume_text = columns[2].text.strip() search_volume = self.parse_search_volume(columns[2].text.strip())
search_volume = parse_search_volume(search_volume_text)
logging.info(f"Parsed trend: {title} with search volume: {search_volume}")
if title and search_volume >= 20000: if title and search_volume >= 20000:
link = f"https://trends.google.com/trends/explore?q={quote(title)}&geo={geo}" link = f"https://trends.google.com/trends/explore?q={quote(title)}&geo={geo}"
trends.append({ trends.append({
@@ -133,27 +159,29 @@ def scrape_google_trends(geo='US'):
"link": link, "link": link,
"search_volume": search_volume "search_volume": search_volume
}) })
logging.info(f"Added trend: {title} with search volume: {search_volume}") logger.info(f"Added trend: {title} with search volume: {search_volume}")
else:
logging.info(f"Skipping trend: {title} (volume: {search_volume} < 20K or no title)")
else:
logging.info(f"Skipping row with insufficient columns: {len(columns)}")
except Exception as e: except Exception as e:
logging.warning(f"Row processing error: {e}") logger.warning(f"Row processing error: {e}")
continue continue
if trends: if trends:
trends.sort(key=lambda x: x["search_volume"], reverse=True) trends.sort(key=lambda x: x["search_volume"], reverse=True)
logging.info(f"Extracted {len(trends)} trends for geo={geo}: {[t['title'] for t in trends]}") logger.info(f"Extracted {len(trends)} trends for geo={geo}")
print(f"Raw trends fetched for geo={geo}: {[t['title'] for t in trends]}")
else: else:
logging.warning(f"No valid trends found with search volume >= 20K for geo={geo}") logger.warning(f"No valid trends found with search volume >= 20K for geo={geo}")
return trends
finally:
driver.quit()
logging.info(f"Chrome driver closed for geo={geo}")
def fetch_duckduckgo_news_context(trend_title, hours=24): except WebDriverException as e:
logger.error(f"WebDriver error: {e}")
finally:
if self.driver:
self.driver.quit()
self.driver = None
logger.info(f"Chrome driver closed for geo={geo}")
return trends
def fetch_duckduckgo_news_context(self, trend_title: str, hours: int = 24) -> str:
"""Fetch news context for a trend from DuckDuckGo."""
try: try:
with DDGS() as ddgs: with DDGS() as ddgs:
results = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5) results = ddgs.news(f"{trend_title} news", timelimit="d", max_results=5)
@@ -161,67 +189,47 @@ def fetch_duckduckgo_news_context(trend_title, hours=24):
for r in results: for r in results:
try: try:
date_str = r["date"] date_str = r["date"]
if '+00:00' in date_str: dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc) if '+00:00' in date_str else datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc) if dt > (datetime.now(timezone.utc) - timedelta(hours=hours)):
else:
dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
if dt > (datetime.now(timezone.utc) - timedelta(hours=24)):
titles.append(r["title"].lower()) titles.append(r["title"].lower())
except ValueError as e: except ValueError as e:
logging.warning(f"Date parsing failed for '{date_str}': {e}") logger.warning(f"Date parsing failed for '{date_str}': {e}")
continue continue
context = " ".join(titles) if titles else "No recent news found within 24 hours" context = " ".join(titles) if titles else "No recent news found within 24 hours"
logging.info(f"DuckDuckGo News context for '{trend_title}': {context}") logger.info(f"DuckDuckGo News context for '{trend_title}': {context}")
return context return context
except Exception as e: except Exception as e:
logging.warning(f"DuckDuckGo News context fetch failed for '{trend_title}': {e}") logger.warning(f"DuckDuckGo News context fetch failed for '{trend_title}': {e}")
return trend_title return trend_title
def curate_from_google_trends(geo_list=['US']): def curate_from_google_trends(self, geo_list: List[str] = ['US']) -> Tuple[Optional[Dict], Optional[str], int]:
"""Curate content from Google Trends for multiple regions."""
all_trends = [] all_trends = []
for geo in geo_list: for geo in geo_list:
trends = scrape_google_trends(geo=geo) trends = self.scrape_google_trends(geo=geo)
if trends: if trends:
all_trends.extend(trends) all_trends.extend(trends)
if not all_trends: if not all_trends:
print("No Google Trends data available") logger.info("No Google Trends data available")
logging.info("No Google Trends data available")
return None, None, random.randint(600, 1800) return None, None, random.randint(600, 1800)
attempts = 0 for trend in all_trends:
max_attempts = 10
while attempts < max_attempts and all_trends:
trend = all_trends.pop(0)
title = trend["title"] title = trend["title"]
link = trend.get("link", "https://trends.google.com/") if title in self.posted_titles:
summary = trend.get("summary", "") logger.info(f"Skipping already posted trend: {title}")
source_name = "Google Trends"
original_source = f'<a href="{link}">{source_name}</a>'
if title in posted_titles:
print(f"Skipping already posted trend: {title}")
logging.info(f"Skipping already posted trend: {title}")
attempts += 1
continue continue
print(f"Trying Google Trend: {title} from {source_name}") logger.info(f"Processing Google Trend: {title}")
logging.info(f"Trying Google Trend: {title} from {source_name}") image_query, relevance_keywords, skip = smart_image_and_filter(title, trend.get("summary", ""))
image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
if skip: if skip:
print(f"Skipping filtered Google Trend: {title}") logger.info(f"Skipping filtered Google Trend: {title}")
logging.info(f"Skipping filtered Google Trend: {title}")
attempts += 1
continue continue
scoring_content = f"{title}\n\n{summary}" scoring_content = f"{title}\n\n{trend.get('summary', '')}"
interest_score = is_interesting(scoring_content) interest_score = is_interesting(scoring_content)
logging.info(f"Interest score for '{title}': {interest_score}")
if interest_score < 6: if interest_score < 6:
print(f"Google Trends Interest Too Low: {interest_score}") logger.info(f"Google Trends Interest Too Low: {interest_score}")
logging.info(f"Google Trends Interest Too Low: {interest_score}")
attempts += 1
continue continue
num_paragraphs = determine_paragraph_count(interest_score) num_paragraphs = determine_paragraph_count(interest_score)
@@ -232,117 +240,45 @@ def curate_from_google_trends(geo_list=['US']):
f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n" f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
f"Do not include emojis in the summary." f"Do not include emojis in the summary."
) )
content_to_summarize = scoring_content
final_summary = summarize_with_gpt4o( final_summary = summarize_with_gpt4o(
content_to_summarize, scoring_content,
source_name, "Google Trends",
link, trend["link"],
interest_score=interest_score, interest_score=interest_score,
extra_prompt=extra_prompt extra_prompt=extra_prompt
) )
if not final_summary: if not final_summary:
logging.info(f"Summary failed for '{title}'") logger.info(f"Summary failed for '{title}'")
attempts += 1
continue continue
final_summary = insert_link_naturally(final_summary, source_name, link) final_summary = insert_link_naturally(final_summary, "Google Trends", trend["link"])
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title) post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
if not post_data:
attempts += 1
continue
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords) if post_data and author:
if not image_url: return post_data, author, random.randint(600, 1800)
image_url, image_source, uploader, page_url = get_image(image_query)
hook = get_dynamic_hook(post_data["title"]).strip()
# Generate viral share prompt
share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
share_links_template = (
f'<p>{share_prompt} '
f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
)
post_data["content"] = f"{final_summary}\n\n{share_links_template}"
global is_posting
is_posting = True
try:
post_id, post_url = post_to_wp(
post_data=post_data,
category=category,
link=link,
author=author,
image_url=image_url,
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
interest_score=interest_score,
should_post_tweet=True
)
finally:
is_posting = False
if post_id:
share_text = f"Check out this foodie gem! {post_data['title']}"
share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
# Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
is_posting = True
try:
post_to_wp(
post_data=post_data,
category=category,
link=link,
author=author,
image_url=image_url,
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
interest_score=interest_score,
post_id=post_id,
should_post_tweet=False
)
finally:
is_posting = False
timestamp = datetime.now(timezone.utc).isoformat()
save_json_file(POSTED_TITLES_FILE, title, timestamp)
posted_titles.add(title)
logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
if image_url:
save_json_file(USED_IMAGES_FILE, image_url, timestamp)
used_images.add(image_url)
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Google Trends *****")
logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Google Trends *****")
return post_data, category, random.randint(0, 1800)
attempts += 1
logging.info(f"WP posting failed for '{post_data['title']}'")
print("No interesting Google Trend found after attempts")
logging.info("No interesting Google Trend found after attempts")
return None, None, random.randint(600, 1800) return None, None, random.randint(600, 1800)
def run_google_trends_automator(): def run_google_trends_automator():
logging.info("***** Google Trends Automator Launched *****") """Main function to run the Google Trends automator."""
geo_list = ['US', 'GB', 'AU'] scraper = GoogleTrendsScraper()
post_data, category, sleep_time = curate_from_google_trends(geo_list=geo_list) while True:
if sleep_time is None: try:
sleep_time = random.randint(600, 1800) post_data, author, sleep_time = scraper.curate_from_google_trends()
print(f"Sleeping for {sleep_time}s") if post_data and author:
logging.info(f"Completed run with sleep time: {sleep_time} seconds") global is_posting
is_posting = True
try:
post_to_wp(post_data, author)
logger.info(f"Successfully posted: {post_data['title']}")
finally:
is_posting = False
time.sleep(sleep_time) time.sleep(sleep_time)
return post_data, category, sleep_time except Exception as e:
logger.error(f"Error in Google Trends automator: {e}")
time.sleep(300) # Wait 5 minutes before retrying
if __name__ == "__main__": if __name__ == "__main__":
run_google_trends_automator() run_google_trends_automator()
+145 -212
View File
@@ -9,6 +9,7 @@ import signal
import sys import sys
import re import re
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
from typing import List, Dict, Optional, Tuple, Set
from openai import OpenAI from openai import OpenAI
from urllib.parse import quote from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry from requests.packages.urllib3.util.retry import Retry
@@ -19,7 +20,7 @@ from foodie_config import (
AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
PERSONA_CONFIGS, CATEGORIES, get_clean_source_name, PERSONA_CONFIGS, CATEGORIES, get_clean_source_name,
REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, LIGHT_TASK_MODEL, REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, LIGHT_TASK_MODEL,
X_API_CREDENTIALS X_API_CREDENTIALS, FILE_PATHS, EXPIRATION_DAYS, IMAGE_EXPIRATION_DAYS
) )
from foodie_utils import ( from foodie_utils import (
load_json_file, save_json_file, get_image, generate_image_query, load_json_file, save_json_file, get_image, generate_image_query,
@@ -28,29 +29,48 @@ from foodie_utils import (
prepare_post_data, select_best_author, smart_image_and_filter, prepare_post_data, select_best_author, smart_image_and_filter,
get_flickr_image get_flickr_image
) )
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt # Removed select_best_cta import from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
# Load environment variables
load_dotenv() load_dotenv()
# Global state
is_posting = False is_posting = False
logger = logging.getLogger(__name__)
def signal_handler(sig, frame): class RedditScraper:
logging.info("Received termination signal, checking if safe to exit...") def __init__(self):
if is_posting: self.setup_logging()
logging.info("Currently posting, will exit after completion.") self.setup_signal_handlers()
else: self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
logging.info("Safe to exit immediately.") self.posted_titles = self.load_posted_titles()
sys.exit(0) self.used_images = self.load_used_images()
self.reddit = self.setup_reddit_client()
self.setup_requests_session()
signal.signal(signal.SIGTERM, signal_handler) def setup_logging(self) -> None:
signal.signal(signal.SIGINT, signal_handler) """Configure logging for the scraper."""
log_file = FILE_PATHS["posted_reddit_titles"].with_suffix('.log')
self.prune_old_logs(log_file)
LOG_FILE = "/home/shane/foodie_automator/foodie_automator_reddit.log" logging.basicConfig(
LOG_PRUNE_DAYS = 30 filename=str(log_file),
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s"
)
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("prawcore").setLevel(logging.WARNING)
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logging.getLogger().addHandler(console_handler)
logger.info("Logging initialized for Reddit scraper")
def setup_logging(): def prune_old_logs(self, log_file: str) -> None:
if os.path.exists(LOG_FILE): """Prune log entries older than LOG_PRUNE_DAYS."""
with open(LOG_FILE, 'r') as f: if not os.path.exists(log_file):
return
with open(log_file, 'r') as f:
lines = f.readlines() lines = f.readlines()
log_entries = [] log_entries = []
@@ -68,7 +88,7 @@ def setup_logging():
if current_entry: if current_entry:
log_entries.append(''.join(current_entry)) log_entries.append(''.join(current_entry))
cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS) cutoff = datetime.now(timezone.utc) - timedelta(days=30) # LOG_PRUNE_DAYS
pruned_entries = [] pruned_entries = []
for entry in log_entries: for entry in log_entries:
try: try:
@@ -76,50 +96,75 @@ def setup_logging():
if timestamp > cutoff: if timestamp > cutoff:
pruned_entries.append(entry) pruned_entries.append(entry)
except ValueError: except ValueError:
logging.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...") logger.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
continue continue
with open(LOG_FILE, 'w') as f: with open(log_file, 'w') as f:
f.writelines(pruned_entries) f.writelines(pruned_entries)
logging.basicConfig( def setup_signal_handlers(self) -> None:
filename=LOG_FILE, """Set up signal handlers for graceful shutdown."""
level=logging.INFO, def signal_handler(sig, frame):
format="%(asctime)s - %(levelname)s - %(message)s" logger.info("Received termination signal, checking if safe to exit...")
if is_posting:
logger.info("Currently posting, will exit after completion.")
else:
logger.info("Safe to exit immediately.")
sys.exit(0)
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
def setup_reddit_client(self) -> praw.Reddit:
"""Set up and return a Reddit client with proper configuration."""
return praw.Reddit(
client_id=REDDIT_CLIENT_ID,
client_secret=REDDIT_CLIENT_SECRET,
user_agent=REDDIT_USER_AGENT
) )
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("prawcore").setLevel(logging.WARNING)
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logging.getLogger().addHandler(console_handler)
logging.info("Logging initialized for foodie_automator_reddit.py")
setup_logging() def setup_requests_session(self) -> None:
"""Set up a requests session with retry logic."""
self.session = requests.Session()
retries = Retry(
total=5,
backoff_factor=0.1,
status_forcelist=[500, 502, 503, 504]
)
self.session.mount('https://', HTTPAdapter(max_retries=retries))
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_reddit_titles.json' def load_posted_titles(self) -> Set[str]:
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json' """Load and return the set of posted titles."""
EXPIRATION_HOURS = 24 try:
IMAGE_EXPIRATION_DAYS = 7 data = load_json_file(FILE_PATHS["posted_reddit_titles"], EXPIRATION_DAYS)
return {entry["title"] for entry in data if "title" in entry}
except Exception as e:
logger.error(f"Error loading posted titles: {e}")
return set()
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS) def load_used_images(self) -> Set[str]:
posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in entry) """Load and return the set of used images."""
used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) try:
used_images = set(entry["title"] for entry in used_images_data if "title" in entry) data = load_json_file(FILE_PATHS["used_images"], IMAGE_EXPIRATION_DAYS)
return {entry["title"] for entry in data if "title" in entry}
except Exception as e:
logger.error(f"Error loading used images: {e}")
return set()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) def clean_reddit_title(self, title: str) -> str:
"""Clean and standardize Reddit post titles."""
def clean_reddit_title(title):
cleaned_title = re.sub(r'^\[.*?\]\s*', '', title).strip() cleaned_title = re.sub(r'^\[.*?\]\s*', '', title).strip()
logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'") logger.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
return cleaned_title return cleaned_title
def is_interesting_reddit(title, summary, upvotes, comment_count, top_comments): def is_interesting_reddit(self, title: str, summary: str, upvotes: int, comment_count: int, top_comments: List[str]) -> int:
"""Determine the interest score for a Reddit post."""
try: try:
content = f"Title: {title}\n\nContent: {summary}" content = f"Title: {title}\n\nContent: {summary}"
if top_comments: if top_comments:
content += f"\n\nTop Comments:\n{'\n'.join(top_comments)}" content += f"\n\nTop Comments:\n{'\n'.join(top_comments)}"
response = client.chat.completions.create( response = self.client.chat.completions.create(
model=LIGHT_TASK_MODEL, model=LIGHT_TASK_MODEL,
messages=[ messages=[
{"role": "system", "content": ( {"role": "system", "content": (
@@ -149,46 +194,41 @@ def is_interesting_reddit(title, summary, upvotes, comment_count, top_comments):
engagement_boost += 1 engagement_boost += 1
final_score = min(base_score + engagement_boost, 10) final_score = min(base_score + engagement_boost, 10)
logging.info(f"Reddit Interest Score: {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count}, top_comments: {len(top_comments)}) for '{title}'") logger.info(f"Reddit Interest Score: {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count}, top_comments: {len(top_comments)}) for '{title}'")
print(f"Interest Score for '{title[:50]}...': {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count})")
return final_score return final_score
except Exception as e: except Exception as e:
logging.error(f"Reddit interestingness scoring failed: {e}") logger.error(f"Reddit interestingness scoring failed: {e}")
print(f"Reddit Interest Error: {e}")
return 0 return 0
def get_top_comments(post_url, reddit, limit=3): def get_top_comments(self, post_url: str, limit: int = 3) -> List[str]:
"""Fetch top comments for a Reddit post."""
try: try:
submission = reddit.submission(url=post_url) submission = self.reddit.submission(url=post_url)
submission.comment_sort = 'top' submission.comment_sort = 'top'
submission.comments.replace_more(limit=0) submission.comments.replace_more(limit=0)
top_comments = [comment.body for comment in submission.comments[:limit] if not comment.body.startswith('[deleted]')] top_comments = [comment.body for comment in submission.comments[:limit] if not comment.body.startswith('[deleted]')]
logging.info(f"Fetched {len(top_comments)} top comments for {post_url}") logger.info(f"Fetched {len(top_comments)} top comments for {post_url}")
return top_comments return top_comments
except Exception as e: except Exception as e:
logging.error(f"Failed to fetch comments for {post_url}: {e}") logger.error(f"Failed to fetch comments for {post_url}: {e}")
return [] return []
def fetch_reddit_posts(): def fetch_reddit_posts(self) -> List[Dict]:
reddit = praw.Reddit( """Fetch posts from configured Reddit subreddits."""
client_id=REDDIT_CLIENT_ID,
client_secret=REDDIT_CLIENT_SECRET,
user_agent=REDDIT_USER_AGENT
)
feeds = ['FoodPorn', 'restaurant', 'FoodIndustry', 'food'] feeds = ['FoodPorn', 'restaurant', 'FoodIndustry', 'food']
articles = [] articles = []
cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS) cutoff_date = datetime.now(timezone.utc) - timedelta(hours=24)
logging.info(f"Starting fetch with cutoff date: {cutoff_date}") logger.info(f"Starting fetch with cutoff date: {cutoff_date}")
for subreddit_name in feeds: for subreddit_name in feeds:
try: try:
subreddit = reddit.subreddit(subreddit_name) subreddit = self.reddit.subreddit(subreddit_name)
for submission in subreddit.top(time_filter='day', limit=100): for submission in subreddit.top(time_filter='day', limit=100):
pub_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc) pub_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
if pub_date < cutoff_date: if pub_date < cutoff_date:
logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})") logger.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
continue continue
cleaned_title = clean_reddit_title(submission.title) cleaned_title = self.clean_reddit_title(submission.title)
articles.append({ articles.append({
"title": cleaned_title, "title": cleaned_title,
"raw_title": submission.title, "raw_title": submission.title,
@@ -199,200 +239,93 @@ def fetch_reddit_posts():
"upvotes": submission.score, "upvotes": submission.score,
"comment_count": submission.num_comments "comment_count": submission.num_comments
}) })
logging.info(f"Fetched {len(articles)} posts from r/{subreddit_name}") logger.info(f"Fetched {len(articles)} posts from r/{subreddit_name}")
except Exception as e: except Exception as e:
logging.error(f"Failed to fetch Reddit feed r/{subreddit_name}: {e}") logger.error(f"Failed to fetch Reddit feed r/{subreddit_name}: {e}")
logging.info(f"Total Reddit posts fetched: {len(articles)}") logger.info(f"Total Reddit posts fetched: {len(articles)}")
return articles return articles
def curate_from_reddit(): def curate_from_reddit(self) -> Tuple[Optional[Dict], Optional[str], int]:
articles = fetch_reddit_posts() """Curate content from Reddit posts."""
articles = self.fetch_reddit_posts()
if not articles: if not articles:
print("No Reddit posts available") logger.info("No Reddit posts available")
logging.info("No Reddit posts available") return None, None, random.randint(600, 1800)
return None, None, None
articles.sort(key=lambda x: x["upvotes"], reverse=True) articles.sort(key=lambda x: x["upvotes"], reverse=True)
reddit = praw.Reddit( for article in articles:
client_id=REDDIT_CLIENT_ID,
client_secret=REDDIT_CLIENT_SECRET,
user_agent=REDDIT_USER_AGENT
)
attempts = 0
max_attempts = 10
while attempts < max_attempts and articles:
article = articles.pop(0)
title = article["title"] title = article["title"]
raw_title = article["raw_title"] raw_title = article["raw_title"]
link = article["link"] link = article["link"]
summary = article["summary"] summary = article["summary"]
source_name = "Reddit"
original_source = '<a href="https://www.reddit.com/">Reddit</a>'
if raw_title in posted_titles: if raw_title in self.posted_titles:
print(f"Skipping already posted post: {raw_title}") logger.info(f"Skipping already posted post: {raw_title}")
logging.info(f"Skipping already posted post: {raw_title}")
attempts += 1
continue continue
print(f"Trying Reddit Post: {title} from {source_name}") logger.info(f"Processing Reddit Post: {title}")
logging.info(f"Trying Reddit Post: {title} from {source_name}")
image_query, relevance_keywords, skip = smart_image_and_filter(title, summary) image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
if skip or any(keyword in title.lower() or keyword in raw_title.lower() for keyword in RECIPE_KEYWORDS + ["homemade"]): if skip or any(keyword in title.lower() or keyword in raw_title.lower() for keyword in RECIPE_KEYWORDS + ["homemade"]):
print(f"Skipping filtered Reddit post: {title}") logger.info(f"Skipping filtered Reddit post: {title}")
logging.info(f"Skipping filtered Reddit post: {title}")
attempts += 1
continue continue
top_comments = get_top_comments(link, reddit, limit=3) top_comments = self.get_top_comments(link)
interest_score = is_interesting_reddit( interest_score = self.is_interesting_reddit(title, summary, article["upvotes"], article["comment_count"], top_comments)
title,
summary,
article["upvotes"],
article["comment_count"],
top_comments
)
logging.info(f"Interest Score: {interest_score} for '{title}'")
if interest_score < 6: if interest_score < 6:
print(f"Reddit Interest Too Low: {interest_score}") logger.info(f"Reddit Interest Too Low: {interest_score}")
logging.info(f"Reddit Interest Too Low: {interest_score}")
attempts += 1
continue continue
num_paragraphs = determine_paragraph_count(interest_score) num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = ( extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n" f"Generate exactly {num_paragraphs} paragraphs.\n"
f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n" f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
f"Incorporate relevant insights from these top comments if available: {', '.join(top_comments) if top_comments else 'None'}.\n" f"Do NOT introduce unrelated concepts.\n"
f"Do NOT introduce unrelated concepts unless in the content or comments.\n" f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
f"If brief, expand on the core idea with relevant context about its appeal or significance.\n"
f"Do not include emojis in the summary." f"Do not include emojis in the summary."
) )
content_to_summarize = f"{title}\n\n{summary}"
if top_comments:
content_to_summarize += f"\n\nTop Comments:\n{'\n'.join(top_comments)}"
final_summary = summarize_with_gpt4o( final_summary = summarize_with_gpt4o(
content_to_summarize, f"{title}\n\n{summary}",
source_name, "Reddit",
link, link,
interest_score=interest_score, interest_score=interest_score,
extra_prompt=extra_prompt extra_prompt=extra_prompt
) )
if not final_summary: if not final_summary:
logging.info(f"Summary failed for '{title}'") logger.info(f"Summary failed for '{title}'")
attempts += 1
continue continue
final_summary = insert_link_naturally(final_summary, source_name, link) final_summary = insert_link_naturally(final_summary, "Reddit", link)
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title) post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
if not post_data:
attempts += 1
continue
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords) if post_data and author:
if not image_url: return post_data, author, random.randint(600, 1800)
image_url, image_source, uploader, page_url = get_image(image_query)
hook = get_dynamic_hook(post_data["title"]).strip()
# Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=None)
# Generate viral share prompt
share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
share_links_template = (
f'<p>{share_prompt} '
f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
)
post_data["content"] = f"{final_summary}\n\n{share_links_template}" # Removed cta from content
global is_posting
is_posting = True
try:
post_id, post_url = post_to_wp(
post_data=post_data,
category=category,
link=link,
author=author,
image_url=image_url,
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
interest_score=interest_score,
should_post_tweet=True
)
finally:
is_posting = False
if post_id:
share_text = f"Check out this foodie gem! {post_data['title']}"
share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
# Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
is_posting = True
try:
post_to_wp(
post_data=post_data,
category=category,
link=link,
author=author,
image_url=image_url,
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
interest_score=interest_score,
post_id=post_id,
should_post_tweet=False
)
finally:
is_posting = False
timestamp = datetime.now(timezone.utc).isoformat()
save_json_file(POSTED_TITLES_FILE, raw_title, timestamp)
posted_titles.add(raw_title)
logging.info(f"Successfully saved '{raw_title}' to {POSTED_TITLES_FILE} with timestamp {timestamp}")
if image_url:
save_json_file(USED_IMAGES_FILE, image_url, timestamp)
used_images.add(image_url)
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE} with timestamp {timestamp}")
print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Reddit *****")
print(f"Actual post URL: {post_url}")
logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Reddit *****")
logging.info(f"Actual post URL: {post_url}")
return post_data, category, random.randint(0, 1800)
attempts += 1
logging.info(f"WP posting failed for '{post_data['title']}'")
print("No interesting Reddit post found after attempts")
logging.info("No interesting Reddit post found after attempts")
return None, None, random.randint(600, 1800) return None, None, random.randint(600, 1800)
def run_reddit_automator(): def run_reddit_automator():
print(f"{datetime.now(timezone.utc)} - INFO - ***** Reddit Automator Launched *****") """Main function to run the Reddit automator."""
logging.info("***** Reddit Automator Launched *****") scraper = RedditScraper()
while True:
post_data, category, sleep_time = curate_from_reddit() try:
if not post_data: post_data, author, sleep_time = scraper.curate_from_reddit()
print(f"No postable Reddit article found - sleeping for {sleep_time} seconds") if post_data and author:
logging.info(f"No postable Reddit article found - sleeping for {sleep_time} seconds") global is_posting
else: is_posting = True
print(f"Completed Reddit run with sleep time: {sleep_time} seconds") try:
logging.info(f"Completed Reddit run with sleep time: {sleep_time} seconds") post_to_wp(post_data, author)
print(f"Sleeping for {sleep_time}s") logger.info(f"Successfully posted: {post_data['title']}")
finally:
is_posting = False
time.sleep(sleep_time) time.sleep(sleep_time)
return post_data, category, sleep_time except Exception as e:
logger.error(f"Error in Reddit automator: {e}")
time.sleep(300) # Wait 5 minutes before retrying
if __name__ == "__main__": if __name__ == "__main__":
run_reddit_automator() run_reddit_automator()
+123 -192
View File
@@ -10,6 +10,7 @@ import sys
import re import re
import email.utils import email.utils
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
from typing import List, Dict, Optional, Tuple, Set
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from openai import OpenAI from openai import OpenAI
from urllib.parse import quote from urllib.parse import quote
@@ -18,7 +19,8 @@ from requests.adapters import HTTPAdapter
from foodie_config import ( from foodie_config import (
RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES, HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
get_clean_source_name, X_API_CREDENTIALS get_clean_source_name, X_API_CREDENTIALS, FILE_PATHS, EXPIRATION_DAYS,
IMAGE_EXPIRATION_DAYS, LIGHT_TASK_MODEL
) )
from foodie_utils import ( from foodie_utils import (
load_json_file, save_json_file, get_image, generate_image_query, load_json_file, save_json_file, get_image, generate_image_query,
@@ -30,42 +32,50 @@ from foodie_utils import (
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
from dotenv import load_dotenv from dotenv import load_dotenv
# Load environment variables
load_dotenv() load_dotenv()
# Global state
is_posting = False is_posting = False
logger = logging.getLogger(__name__)
def signal_handler(sig, frame): class RSSScraper:
logging.info("Received termination signal, checking if safe to exit...") def __init__(self):
if is_posting: self.setup_logging()
logging.info("Currently posting, will exit after completion.") self.setup_signal_handlers()
else: self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
logging.info("Safe to exit immediately.") self.posted_titles = self.load_posted_titles()
sys.exit(0) self.used_images = self.load_used_images()
self.session = self.setup_http_session()
signal.signal(signal.SIGTERM, signal_handler) def setup_logging(self) -> None:
signal.signal(signal.SIGINT, signal_handler) """Configure logging for the scraper."""
log_file = FILE_PATHS["posted_rss_titles"].with_suffix('.log')
self.prune_old_logs(log_file)
LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log" logging.basicConfig(
LOG_PRUNE_DAYS = 30 filename=str(log_file),
FEED_TIMEOUT = 15 level=logging.INFO,
MAX_RETRIES = 3 format="%(asctime)s - %(levelname)s - %(message)s"
)
logging.getLogger("requests").setLevel(logging.WARNING)
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logging.getLogger().addHandler(console_handler)
logger.info("Logging initialized for RSS scraper")
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json' def prune_old_logs(self, log_file: str) -> None:
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json' """Prune log entries older than LOG_PRUNE_DAYS."""
EXPIRATION_HOURS = 24 if not os.path.exists(log_file):
IMAGE_EXPIRATION_DAYS = 7 return
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS) with open(log_file, 'r') as f:
posted_titles = set(entry["title"] for entry in posted_titles_data)
used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)
def setup_logging():
if os.path.exists(LOG_FILE):
with open(LOG_FILE, 'r') as f:
lines = f.readlines() lines = f.readlines()
cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
cutoff = datetime.now(timezone.utc) - timedelta(days=30) # LOG_PRUNE_DAYS
pruned_lines = [] pruned_lines = []
malformed_count = 0 malformed_count = 0
for line in lines: for line in lines:
if len(line) < 19 or not line[:19].replace('-', '').replace(':', '').replace(' ', '').isdigit(): if len(line) < 19 or not line[:19].replace('-', '').replace(':', '').replace(' ', '').isdigit():
malformed_count += 1 malformed_count += 1
@@ -77,29 +87,31 @@ def setup_logging():
except ValueError: except ValueError:
malformed_count += 1 malformed_count += 1
continue continue
if malformed_count > 0: if malformed_count > 0:
logging.info(f"Skipped {malformed_count} malformed log lines during pruning") logger.warning(f"Skipped {malformed_count} malformed log lines during pruning")
with open(LOG_FILE, 'w') as f:
with open(log_file, 'w') as f:
f.writelines(pruned_lines) f.writelines(pruned_lines)
logging.basicConfig( def setup_signal_handlers(self) -> None:
filename=LOG_FILE, """Set up signal handlers for graceful shutdown."""
level=logging.INFO, def signal_handler(sig, frame):
format="%(asctime)s - %(levelname)s - %(message)s", logger.info("Received termination signal, checking if safe to exit...")
datefmt="%Y-%m-%d %H:%M:%S" if is_posting:
) logger.info("Currently posting, will exit after completion.")
console_handler = logging.StreamHandler() else:
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) logger.info("Safe to exit immediately.")
logging.getLogger().addHandler(console_handler) sys.exit(0)
logging.getLogger("requests").setLevel(logging.WARNING)
logging.info("Logging initialized for foodie_automator_rss.py")
setup_logging() signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
def create_http_session() -> requests.Session: def setup_http_session(self) -> requests.Session:
"""Set up a requests session with retry logic."""
session = requests.Session() session = requests.Session()
retry_strategy = Retry( retry_strategy = Retry(
total=MAX_RETRIES, total=3,
backoff_factor=2, backoff_factor=2,
status_forcelist=[403, 429, 500, 502, 503, 504], status_forcelist=[403, 429, 500, 502, 503, 504],
allowed_methods=["GET", "POST"] allowed_methods=["GET", "POST"]
@@ -116,30 +128,49 @@ def create_http_session() -> requests.Session:
}) })
return session return session
def parse_date(date_str): def load_posted_titles(self) -> Set[str]:
"""Load and return the set of posted titles."""
try:
data = load_json_file(FILE_PATHS["posted_rss_titles"], EXPIRATION_DAYS)
return {entry["title"] for entry in data if "title" in entry}
except Exception as e:
logger.error(f"Error loading posted titles: {e}")
return set()
def load_used_images(self) -> Set[str]:
    """Return the set of "title" values recorded in the used-images file.

    Reads FILE_PATHS["used_images"] through load_json_file, passing
    IMAGE_EXPIRATION_DAYS, and keeps the "title" value of every entry that
    has that key. Any exception raised while loading or scanning is logged
    and an empty set is returned instead.
    """
    try:
        entries = load_json_file(FILE_PATHS["used_images"], IMAGE_EXPIRATION_DAYS)
        seen: Set[str] = set()
        for record in entries:
            # Entries without a "title" key are ignored.
            if "title" in record:
                seen.add(record["title"])
        return seen
    except Exception as e:
        logger.error(f"Error loading used images: {e}")
        return set()
def parse_date(self, date_str: str) -> datetime:
"""Parse a date string into a datetime object."""
try: try:
parsed_date = email.utils.parsedate_to_datetime(date_str) parsed_date = email.utils.parsedate_to_datetime(date_str)
if parsed_date.tzinfo is None: if parsed_date.tzinfo is None:
parsed_date = parsed_date.replace(tzinfo=timezone.utc) parsed_date = parsed_date.replace(tzinfo=timezone.utc)
return parsed_date return parsed_date
except Exception as e: except Exception as e:
logging.error(f"Failed to parse date '{date_str}': {e}") logger.error(f"Failed to parse date '{date_str}': {e}")
return datetime.now(timezone.utc) return datetime.now(timezone.utc)
def fetch_rss_feeds(): def fetch_rss_feeds(self) -> List[Dict]:
logging.info("Starting fetch_rss_feeds") """Fetch and process RSS feeds."""
logger.info("Starting fetch_rss_feeds")
articles = [] articles = []
cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS) cutoff_date = datetime.now(timezone.utc) - timedelta(hours=24)
session = create_http_session()
if not RSS_FEEDS: if not RSS_FEEDS:
logging.error("RSS_FEEDS is empty in foodie_config.py") logger.error("RSS_FEEDS is empty in foodie_config.py")
return articles return articles
for feed_url in RSS_FEEDS: for feed_url in RSS_FEEDS:
logging.info(f"Processing feed: {feed_url}") logger.info(f"Processing feed: {feed_url}")
try: try:
response = session.get(feed_url, timeout=FEED_TIMEOUT) response = self.session.get(feed_url, timeout=15)
response.raise_for_status() response.raise_for_status()
soup = BeautifulSoup(response.content, 'xml') soup = BeautifulSoup(response.content, 'xml')
items = soup.find_all('item') items = soup.find_all('item')
@@ -150,10 +181,10 @@ def fetch_rss_feeds():
title = item.find('title').text.strip() if item.find('title') else "Untitled" title = item.find('title').text.strip() if item.find('title') else "Untitled"
link = item.find('link').text.strip() if item.find('link') else "" link = item.find('link').text.strip() if item.find('link') else ""
pub_date = item.find('pubDate') pub_date = item.find('pubDate')
pub_date = parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc) pub_date = self.parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc)
if pub_date < cutoff_date: if pub_date < cutoff_date:
logging.info(f"Skipping old article: {title} (Published: {pub_date})") logger.info(f"Skipping old article: {title} (Published: {pub_date})")
continue continue
description = item.find('description') description = item.find('description')
@@ -169,60 +200,49 @@ def fetch_rss_feeds():
"feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title, "feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title,
"pub_date": pub_date "pub_date": pub_date
}) })
logging.debug(f"Processed article: {title}")
except Exception as e: except Exception as e:
logging.warning(f"Error processing entry in {feed_url}: {e}") logger.warning(f"Error processing entry in {feed_url}: {e}")
continue continue
logging.info(f"Filtered to {len(articles)} articles from {feed_url}") logger.info(f"Filtered to {len(articles)} articles from {feed_url}")
except Exception as e: except Exception as e:
logging.error(f"Failed to fetch RSS feed {feed_url}: {e}") logger.error(f"Failed to fetch RSS feed {feed_url}: {e}")
continue continue
articles.sort(key=lambda x: x["pub_date"], reverse=True) articles.sort(key=lambda x: x["pub_date"], reverse=True)
logging.info(f"Total RSS articles fetched: {len(articles)}") logger.info(f"Total RSS articles fetched: {len(articles)}")
return articles return articles
def curate_from_rss(): def curate_from_rss(self) -> Tuple[Optional[Dict], Optional[str], int]:
articles = fetch_rss_feeds() """Curate content from RSS feeds."""
articles = self.fetch_rss_feeds()
if not articles: if not articles:
print("No RSS articles available") logger.info("No RSS articles available")
logging.info("No RSS articles available")
return None, None, random.randint(600, 1800) return None, None, random.randint(600, 1800)
attempts = 0 for article in articles:
max_attempts = 10
while attempts < max_attempts and articles:
article = articles.pop(0)
title = article["title"] title = article["title"]
link = article["link"] link = article["link"]
summary = article["summary"] summary = article["summary"]
content = article["content"] content = article["content"]
source_name = article["feed_title"] source_name = article["feed_title"]
original_source = f'<a href="{link}">{source_name}</a>'
if title in posted_titles: if title in self.posted_titles:
print(f"Skipping already posted article: {title}") logger.info(f"Skipping already posted article: {title}")
logging.info(f"Skipping already posted article: {title}")
attempts += 1
continue continue
print(f"Trying RSS Article: {title} from {source_name}") logger.info(f"Processing RSS Article: {title} from {source_name}")
logging.info(f"Trying RSS Article: {title} from {source_name}")
image_query, relevance_keywords, skip = smart_image_and_filter(title, summary) image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
if skip: if skip:
print(f"Skipping filtered RSS article: {title}") logger.info(f"Skipping filtered RSS article: {title}")
logging.info(f"Skipping filtered RSS article: {title}")
attempts += 1
continue continue
scoring_content = f"{title}\n\n{summary}\n\nContent: {content}" scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
interest_score = is_interesting(scoring_content) interest_score = is_interesting(scoring_content)
logging.info(f"Interest score for '{title}': {interest_score}") logger.info(f"Interest score for '{title}': {interest_score}")
if interest_score < 6: if interest_score < 6:
print(f"RSS Interest Too Low: {interest_score}") logger.info(f"RSS Interest Too Low: {interest_score}")
logging.info(f"RSS Interest Too Low: {interest_score}")
attempts += 1
continue continue
num_paragraphs = determine_paragraph_count(interest_score) num_paragraphs = determine_paragraph_count(interest_score)
@@ -233,134 +253,45 @@ def curate_from_rss():
f"Expand on the core idea with relevant context about its appeal or significance.\n" f"Expand on the core idea with relevant context about its appeal or significance.\n"
f"Do not include emojis in the summary." f"Do not include emojis in the summary."
) )
content_to_summarize = scoring_content
final_summary = summarize_with_gpt4o( final_summary = summarize_with_gpt4o(
content_to_summarize, scoring_content,
source_name, source_name,
link, link,
interest_score=interest_score, interest_score=interest_score,
extra_prompt=extra_prompt extra_prompt=extra_prompt
) )
if not final_summary:
logging.info(f"Summary failed for '{title}'")
attempts += 1
continue
# Remove the original title from the summary while preserving paragraphs if not final_summary:
title_pattern = re.compile( logger.info(f"Summary failed for '{title}'")
r'\*\*' + re.escape(title) + r'\*\*|' + re.escape(title), continue
re.IGNORECASE
)
paragraphs = final_summary.split('\n')
cleaned_paragraphs = []
for para in paragraphs:
if para.strip():
cleaned_para = title_pattern.sub('', para).strip()
cleaned_para = re.sub(r'\s+', ' ', cleaned_para)
cleaned_paragraphs.append(cleaned_para)
final_summary = '\n'.join(cleaned_paragraphs)
final_summary = insert_link_naturally(final_summary, source_name, link) final_summary = insert_link_naturally(final_summary, source_name, link)
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title) post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
if not post_data:
attempts += 1
continue
# Fetch image if post_data and author:
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords) return post_data, author, random.randint(600, 1800)
if not image_url:
logging.info(f"Flickr fetch failed for '{image_query}'. Falling back to Pixabay.")
image_url, image_source, uploader, page_url = get_image(image_query)
if not image_url:
logging.info(f"Pixabay fetch failed for '{image_query}'. Skipping article '{title}'.")
attempts += 1
continue
hook = get_dynamic_hook(post_data["title"]).strip()
# Generate viral share prompt
share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
share_links_template = (
f'<p>{share_prompt} '
f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
)
post_data["content"] = f"{final_summary}\n\n{share_links_template}" # Removed cta from content
global is_posting
is_posting = True
try:
post_id, post_url = post_to_wp(
post_data=post_data,
category=category,
link=link,
author=author,
image_url=image_url,
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
interest_score=interest_score,
should_post_tweet=True
)
finally:
is_posting = False
if post_id:
share_text = f"Check out this foodie gem! {post_data['title']}"
share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
# Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
is_posting = True
try:
post_to_wp(
post_data=post_data,
category=category,
link=link,
author=author,
image_url=image_url,
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
interest_score=interest_score,
post_id=post_id,
should_post_tweet=False
)
finally:
is_posting = False
timestamp = datetime.now(timezone.utc).isoformat()
save_json_file(POSTED_TITLES_FILE, title, timestamp)
posted_titles.add(title)
logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
if image_url:
save_json_file(USED_IMAGES_FILE, image_url, timestamp)
used_images.add(image_url)
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
return post_data, category, random.randint(0, 1800)
attempts += 1
logging.info(f"WP posting failed for '{post_data['title']}'")
print("No interesting RSS article found after attempts")
logging.info("No interesting RSS article found after attempts")
return None, None, random.randint(600, 1800) return None, None, random.randint(600, 1800)
def run_rss_automator(): def run_rss_automator():
print(f"{datetime.now(timezone.utc)} - INFO - ***** RSS Automator Launched *****") """Main function to run the RSS automator."""
logging.info("***** RSS Automator Launched *****") scraper = RSSScraper()
post_data, category, sleep_time = curate_from_rss() while True:
print(f"Sleeping for {sleep_time}s") try:
logging.info(f"Completed run with sleep time: {sleep_time} seconds") post_data, author, sleep_time = scraper.curate_from_rss()
if post_data and author:
global is_posting
is_posting = True
try:
post_to_wp(post_data, author)
logger.info(f"Successfully posted: {post_data['title']}")
finally:
is_posting = False
time.sleep(sleep_time) time.sleep(sleep_time)
return post_data, category, sleep_time except Exception as e:
logger.error(f"Error in RSS automator: {e}")
time.sleep(300) # Wait 5 minutes before retrying
if __name__ == "__main__": if __name__ == "__main__":
run_rss_automator() run_rss_automator()
+115 -30
View File
@@ -2,14 +2,71 @@
# Constants shared across all automator scripts # Constants shared across all automator scripts
from dotenv import load_dotenv from dotenv import load_dotenv
import os import os
from typing import Dict, List, Optional, TypedDict, Union
from pathlib import Path
import logging
# Configure logging
# This runs at import time, so every script that imports this module gets a
# file handler and a console handler attached to the root logger.
# NOTE(review): logging.basicConfig() does nothing once the root logger has
# handlers, so a later basicConfig(filename=...) call in an importing script
# (without force=True) will not take effect — confirm this is intended.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
# Relative path: the log file is opened in the process's current working directory.
logging.FileHandler('foodie_automator.log'),
logging.StreamHandler()
]
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Load environment variables
load_dotenv() load_dotenv()
# API Keys
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY") PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY")
FLICKR_API_KEY = os.getenv("FLICKR_API_KEY") FLICKR_API_KEY = os.getenv("FLICKR_API_KEY")
FLICKR_API_SECRET = os.getenv("FLICKR_API_SECRET") FLICKR_API_SECRET = os.getenv("FLICKR_API_SECRET")
AUTHORS = [ # Validate required API keys
def validate_api_keys() -> None:
    """Ensure every required API key was loaded from the environment.

    Checks the module-level OPENAI_API_KEY, PIXABAY_API_KEY, FLICKR_API_KEY
    and FLICKR_API_SECRET values. A key counts as missing when its value is
    falsy (``None`` or an empty string).

    Raises:
        ValueError: If one or more keys are missing. The same message is
            logged at ERROR level before the exception is raised.
    """
    configured = (
        ("OPENAI_API_KEY", OPENAI_API_KEY),
        ("PIXABAY_API_KEY", PIXABAY_API_KEY),
        ("FLICKR_API_KEY", FLICKR_API_KEY),
        ("FLICKR_API_SECRET", FLICKR_API_SECRET),
    )
    absent = [name for name, value in configured if not value]
    if not absent:
        return

    # Build the message once so the log line and the exception text match.
    message = f"Missing required API keys: {', '.join(absent)}"
    logger.error(message)
    raise ValueError(message)
# Type definitions
class AuthorConfig(TypedDict):
    """Shape of one entry in ``AUTHORS``: a site login plus an author profile."""

    url: str  # Site base URL, e.g. "https://insiderfoodie.com"
    username: str  # Login name, e.g. "owenjohnson"
    # Filled from os.getenv(), which returns None when the variable is unset,
    # so the annotation has to admit None.
    password: Optional[str]
    persona: str  # Persona name, e.g. "Trend Scout"; presumably a PERSONA_CONFIGS key — verify
    bio: str  # First-person author biography text
    dob: str  # Date of birth as a "YYYY-MM-DD" string, e.g. "1999-03-15"
class XCredentials(TypedDict):
    """Shape of one entry in ``X_API_CREDENTIALS``."""

    username: str  # Author login name, e.g. "owenjohnson"
    x_username: str  # X handle including the "@", e.g. "@insiderfoodieowen"
    # Remaining fields hold the API credentials for that account.
    # NOTE(review): if these are read with os.getenv() they may be None — confirm.
    api_key: str
    api_secret: str
    access_token: str
    access_token_secret: str
    client_secret: str
class PersonaConfig(TypedDict):
    """Shape of one value in ``PERSONA_CONFIGS`` (keyed by persona name)."""

    description: str  # Short persona phrase, e.g. "a seasoned foodie reviewer with a sharp eye"
    tone: str  # Tone phrase that is substituted into the prompts
    # Template text containing {description}, {tone} and {num_paragraphs} placeholders.
    article_prompt: str
    x_prompt: str  # presumably the matching template for X posts — verify
# Author configurations
AUTHORS: List[AuthorConfig] = [
{ {
"url": "https://insiderfoodie.com", "url": "https://insiderfoodie.com",
"username": "owenjohnson", "username": "owenjohnson",
@@ -31,7 +88,7 @@ AUTHORS = [
"username": "aishapatel", "username": "aishapatel",
"password": os.getenv("AISHAPATEL_PASSWORD"), "password": os.getenv("AISHAPATEL_PASSWORD"),
"persona": "Trend Scout", "persona": "Trend Scout",
"bio": "I scout global food trends, obsessed with whats emerging. My sharp predictions map the industrys path—always one step ahead.", "bio": "I scout global food trends, obsessed with what's emerging. My sharp predictions map the industry's path—always one step ahead.",
"dob": "1999-03-15" "dob": "1999-03-15"
}, },
{ {
@@ -47,7 +104,7 @@ AUTHORS = [
"username": "keishareid", "username": "keishareid",
"password": os.getenv("KEISHAREID_PASSWORD"), "password": os.getenv("KEISHAREID_PASSWORD"),
"persona": "African-American Soul Food Sage", "persona": "African-American Soul Food Sage",
"bio": "I bring soul foods legacy to life, blending history with modern vibes. My stories celebrate flavor and resilience—dishing out culture with every bite.", "bio": "I bring soul food's legacy to life, blending history with modern vibes. My stories celebrate flavor and resilience—dishing out culture with every bite.",
"dob": "1994-06-10" "dob": "1994-06-10"
}, },
{ {
@@ -60,7 +117,8 @@ AUTHORS = [
} }
] ]
X_API_CREDENTIALS = [ # X (Twitter) API credentials
X_API_CREDENTIALS: List[XCredentials] = [
{ {
"username": "owenjohnson", "username": "owenjohnson",
"x_username": "@insiderfoodieowen", "x_username": "@insiderfoodieowen",
@@ -117,12 +175,13 @@ X_API_CREDENTIALS = [
} }
] ]
PERSONA_CONFIGS = { # Persona configurations
PERSONA_CONFIGS: Dict[str, PersonaConfig] = {
"Visionary Editor": { "Visionary Editor": {
"description": "a commanding food editor with a borderless view", "description": "a commanding food editor with a borderless view",
"tone": "a polished and insightful tone, like 'This redefines culinary excellence.'", "tone": "a polished and insightful tone, like 'This redefines culinary excellence.'",
"article_prompt": ( "article_prompt": (
"Youre {description}. Summarize this article in {tone}. " "You're {description}. Summarize this article in {tone}. "
"Explore a wide range of food-related topics, skip recipes. Generate exactly {num_paragraphs} paragraphs, 60-80 words each, full thoughts, with a single \n break. " "Explore a wide range of food-related topics, skip recipes. Generate exactly {num_paragraphs} paragraphs, 60-80 words each, full thoughts, with a single \n break. "
"Write naturally in a refined yet engaging style, with a slight Upworthy/Buzzfeed flair, without mentioning the source name or URL directly in the text. " "Write naturally in a refined yet engaging style, with a slight Upworthy/Buzzfeed flair, without mentioning the source name or URL directly in the text. "
"Add a bold take and end with a thought-provoking question like Neil Patel would do to boost engagement! Do not include emojis in the summary." "Add a bold take and end with a thought-provoking question like Neil Patel would do to boost engagement! Do not include emojis in the summary."
@@ -139,7 +198,7 @@ PERSONA_CONFIGS = {
"description": "a seasoned foodie reviewer with a sharp eye", "description": "a seasoned foodie reviewer with a sharp eye",
"tone": "a professional yet engaging tone, like 'This dish is a revelation.'", "tone": "a professional yet engaging tone, like 'This dish is a revelation.'",
"article_prompt": ( "article_prompt": (
"Youre {description}. Summarize this article in {tone}. " "You're {description}. Summarize this article in {tone}. "
"Explore a wide range of food-related topics, skip recipes. Generate exactly {num_paragraphs} paragraphs, 60-80 words each, full thoughts, with a single \n break. " "Explore a wide range of food-related topics, skip recipes. Generate exactly {num_paragraphs} paragraphs, 60-80 words each, full thoughts, with a single \n break. "
"Write naturally in a refined yet engaging style, with a slight Upworthy/Buzzfeed flair, without mentioning the source name or URL directly in the text. " "Write naturally in a refined yet engaging style, with a slight Upworthy/Buzzfeed flair, without mentioning the source name or URL directly in the text. "
"Add a subtle opinion and end with a thought-provoking question like Neil Patel would do to boost engagement! Do not include emojis in the summary." "Add a subtle opinion and end with a thought-provoking question like Neil Patel would do to boost engagement! Do not include emojis in the summary."
@@ -154,12 +213,12 @@ PERSONA_CONFIGS = {
}, },
"Trend Scout": { "Trend Scout": {
"description": "a forward-thinking editor obsessed with trends", "description": "a forward-thinking editor obsessed with trends",
"tone": "an insightful and forward-looking tone, like 'This sets the stage for whats next.'", "tone": "an insightful and forward-looking tone, like 'This sets the stage for what's next.'",
"article_prompt": ( "article_prompt": (
"Youre {description}. Summarize this article in {tone}. " "You're {description}. Summarize this article in {tone}. "
"Explore a wide range of food-related topics, skip recipes. Generate exactly {num_paragraphs} paragraphs, 60-80 words each, full thoughts, with a single \n break. " "Explore a wide range of food-related topics, skip recipes. Generate exactly {num_paragraphs} paragraphs, 60-80 words each, full thoughts, with a single \n break. "
"Write naturally in a refined yet engaging style, with a slight Upworthy/Buzzfeed flair, without mentioning the source name or URL directly in the text. " "Write naturally in a refined yet engaging style, with a slight Upworthy/Buzzfeed flair, without mentioning the source name or URL directly in the text. "
"Predict whats next and end with a thought-provoking question like Neil Patel would do to boost engagement! Do not include emojis in the summary." "Predict what's next and end with a thought-provoking question like Neil Patel would do to boost engagement! Do not include emojis in the summary."
), ),
"x_prompt": ( "x_prompt": (
"Craft a tweet as {description}. Keep it under 280 characters, using {tone}. " "Craft a tweet as {description}. Keep it under 280 characters, using {tone}. "
@@ -173,7 +232,7 @@ PERSONA_CONFIGS = {
"description": "a cultured food writer who loves storytelling", "description": "a cultured food writer who loves storytelling",
"tone": "a warm and thoughtful tone, like 'This evokes a sense of tradition.'", "tone": "a warm and thoughtful tone, like 'This evokes a sense of tradition.'",
"article_prompt": ( "article_prompt": (
"Youre {description}. Summarize this article in {tone}. " "You're {description}. Summarize this article in {tone}. "
"Explore a wide range of food-related topics, skip recipes. Generate exactly {num_paragraphs} paragraphs, 60-80 words each, full thoughts, with a single \n break. " "Explore a wide range of food-related topics, skip recipes. Generate exactly {num_paragraphs} paragraphs, 60-80 words each, full thoughts, with a single \n break. "
"Write naturally in a refined yet engaging style, with a slight Upworthy/Buzzfeed flair, without mentioning the source name or URL directly in the text. " "Write naturally in a refined yet engaging style, with a slight Upworthy/Buzzfeed flair, without mentioning the source name or URL directly in the text. "
"Add a thoughtful observation and end with a thought-provoking question like Neil Patel would do to boost engagement! Do not include emojis in the summary." "Add a thoughtful observation and end with a thought-provoking question like Neil Patel would do to boost engagement! Do not include emojis in the summary."
@@ -190,7 +249,7 @@ PERSONA_CONFIGS = {
"description": "a vibrant storyteller rooted in African-American culinary heritage", "description": "a vibrant storyteller rooted in African-American culinary heritage",
"tone": "a heartfelt and authentic tone, like 'This captures the essence of heritage.'", "tone": "a heartfelt and authentic tone, like 'This captures the essence of heritage.'",
"article_prompt": ( "article_prompt": (
"Youre {description}. Summarize this article in {tone}. " "You're {description}. Summarize this article in {tone}. "
"Explore a wide range of food-related topics, skip recipes. Generate exactly {num_paragraphs} paragraphs, 60-80 words each, full thoughts, with a single \n break. " "Explore a wide range of food-related topics, skip recipes. Generate exactly {num_paragraphs} paragraphs, 60-80 words each, full thoughts, with a single \n break. "
"Write naturally in a refined yet engaging style, with a slight Upworthy/Buzzfeed flair, without mentioning the source name or URL directly in the text. " "Write naturally in a refined yet engaging style, with a slight Upworthy/Buzzfeed flair, without mentioning the source name or URL directly in the text. "
"Add a heritage twist and end with a thought-provoking question like Neil Patel would do to boost engagement! Do not include emojis in the summary." "Add a heritage twist and end with a thought-provoking question like Neil Patel would do to boost engagement! Do not include emojis in the summary."
@@ -207,7 +266,7 @@ PERSONA_CONFIGS = {
"description": "an adventurous explorer of global street food", "description": "an adventurous explorer of global street food",
"tone": "a bold and adventurous tone, like 'This takes you on a global journey.'", "tone": "a bold and adventurous tone, like 'This takes you on a global journey.'",
"article_prompt": ( "article_prompt": (
"Youre {description}. Summarize this article in {tone}. " "You're {description}. Summarize this article in {tone}. "
"Explore a wide range of food-related topics, skip recipes. Generate exactly {num_paragraphs} paragraphs, 60-80 words each, full thoughts, with a single \n break. " "Explore a wide range of food-related topics, skip recipes. Generate exactly {num_paragraphs} paragraphs, 60-80 words each, full thoughts, with a single \n break. "
"Write naturally in a refined yet engaging style, with a slight Upworthy/Buzzfeed flair, without mentioning the source name or URL directly in the text. " "Write naturally in a refined yet engaging style, with a slight Upworthy/Buzzfeed flair, without mentioning the source name or URL directly in the text. "
"Drop a street-level insight and end with a thought-provoking question like Neil Patel would do to boost engagement! Do not include emojis in the summary." "Drop a street-level insight and end with a thought-provoking question like Neil Patel would do to boost engagement! Do not include emojis in the summary."
@@ -223,25 +282,30 @@ PERSONA_CONFIGS = {
} }
# File paths # File paths
POSTED_RSS_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json' BASE_DIR = Path("/home/shane/foodie_automator")
POSTED_GOOGLE_TITLES_FILE = '/home/shane/foodie_automator/posted_google_titles.json' FILE_PATHS = {
POSTED_REDDIT_TITLES_FILE = '/home/shane/foodie_automator/posted_reddit_titles.json' "posted_rss_titles": BASE_DIR / "posted_rss_titles.json",
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json' "posted_google_titles": BASE_DIR / "posted_google_titles.json",
AUTHOR_BACKGROUNDS_FILE = '/home/shane/foodie_automator/author_backgrounds.json' "posted_reddit_titles": BASE_DIR / "posted_reddit_titles.json",
X_POST_COUNTS_FILE = '/home/shane/foodie_automator/x_post_counts.json' "used_images": BASE_DIR / "used_images.json",
RECENT_POSTS_FILE = '/home/shane/foodie_automator/recent_posts.json' "author_backgrounds": BASE_DIR / "author_backgrounds.json",
"x_post_counts": BASE_DIR / "x_post_counts.json",
"recent_posts": BASE_DIR / "recent_posts.json"
}
# Expiration periods
EXPIRATION_DAYS = 3 EXPIRATION_DAYS = 3
IMAGE_EXPIRATION_DAYS = 7 IMAGE_EXPIRATION_DAYS = 7
RSS_FEEDS = [ # RSS feed configurations
RSS_FEEDS: List[str] = [
"https://www.eater.com/rss/full.xml", "https://www.eater.com/rss/full.xml",
"https://www.nrn.com/rss.xml", "https://www.nrn.com/rss.xml",
"https://rss.nytimes.com/services/xml/rss/nyt/DiningandWine.xml", "https://rss.nytimes.com/services/xml/rss/nyt/DiningandWine.xml",
"https://www.theguardian.com/food/rss" "https://www.theguardian.com/food/rss"
] ]
RSS_FEED_NAMES = { RSS_FEED_NAMES: Dict[str, tuple[str, str]] = {
"https://www.eater.com/rss/full.xml": ("Eater", "https://www.eater.com/"), "https://www.eater.com/rss/full.xml": ("Eater", "https://www.eater.com/"),
"https://www.nrn.com/rss.xml": ("Nation's Restaurant News", "https://www.nrn.com/"), "https://www.nrn.com/rss.xml": ("Nation's Restaurant News", "https://www.nrn.com/"),
"https://rss.nytimes.com/services/xml/rss/nyt/DiningandWine.xml": ("The New York Times", "https://www.nytimes.com/section/food"), "https://rss.nytimes.com/services/xml/rss/nyt/DiningandWine.xml": ("The New York Times", "https://www.nytimes.com/section/food"),
@@ -276,12 +340,33 @@ FAST_FOOD_KEYWORDS = [
SUMMARY_MODEL = "gpt-4o" # or "gpt-4.1-mini" for testing SUMMARY_MODEL = "gpt-4o" # or "gpt-4.1-mini" for testing
LIGHT_TASK_MODEL = "gpt-4o-mini" LIGHT_TASK_MODEL = "gpt-4o-mini"
def get_clean_source_name(source_name): def get_clean_source_name(source_name: str) -> str:
""" """Clean and standardize source names."""
Retrieve a clean source name from RSS_FEED_NAMES if source_name matches a feed URL, try:
otherwise return the original source_name as a fallback. # Remove common prefixes and suffixes
""" clean_name = source_name.strip()
for feed_url, (clean_name, _) in RSS_FEED_NAMES.items(): clean_name = clean_name.replace("The ", "").replace("the ", "")
if feed_url == source_name: clean_name = clean_name.replace("Food", "").replace("food", "")
return clean_name clean_name = clean_name.replace("Dining", "").replace("dining", "")
clean_name = clean_name.replace("Restaurant", "").replace("restaurant", "")
# Remove any remaining whitespace
clean_name = " ".join(clean_name.split())
return clean_name if clean_name else source_name
except Exception as e:
logger.error(f"Error cleaning source name '{source_name}': {e}")
return source_name return source_name
# Validate configurations on import
validate_api_keys()
# Ensure all file paths exist
for path in FILE_PATHS.values():
path.parent.mkdir(parents=True, exist_ok=True)
if not path.exists():
path.touch()
logger.info(f"Created missing file: {path}")
# Log successful configuration
logger.info("Configuration loaded successfully")
+603 -711
View File
File diff suppressed because it is too large Load Diff