use cursor to optomize files

This commit is contained in:
2025-05-03 16:23:06 +10:00
parent 427a5cb919
commit 2ca39915e0
5 changed files with 1411 additions and 1634 deletions
+213 -282
View File
@@ -10,6 +10,7 @@ import sys
import re
import email.utils
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Optional, Tuple, Set
from bs4 import BeautifulSoup
from openai import OpenAI
from urllib.parse import quote
@@ -18,7 +19,8 @@ from requests.adapters import HTTPAdapter
from foodie_config import (
RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
get_clean_source_name, X_API_CREDENTIALS
get_clean_source_name, X_API_CREDENTIALS, FILE_PATHS, EXPIRATION_DAYS,
IMAGE_EXPIRATION_DAYS, LIGHT_TASK_MODEL
)
from foodie_utils import (
load_json_file, save_json_file, get_image, generate_image_query,
@@ -30,42 +32,50 @@ from foodie_utils import (
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Global state
is_posting = False
logger = logging.getLogger(__name__)
def signal_handler(sig, frame):
logging.info("Received termination signal, checking if safe to exit...")
if is_posting:
logging.info("Currently posting, will exit after completion.")
else:
logging.info("Safe to exit immediately.")
sys.exit(0)
class RSSScraper:
def __init__(self):
self.setup_logging()
self.setup_signal_handlers()
self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
self.posted_titles = self.load_posted_titles()
self.used_images = self.load_used_images()
self.session = self.setup_http_session()
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
def setup_logging(self) -> None:
"""Configure logging for the scraper."""
log_file = FILE_PATHS["posted_rss_titles"].with_suffix('.log')
self.prune_old_logs(log_file)
logging.basicConfig(
filename=str(log_file),
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s"
)
logging.getLogger("requests").setLevel(logging.WARNING)
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logging.getLogger().addHandler(console_handler)
logger.info("Logging initialized for RSS scraper")
LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log"
LOG_PRUNE_DAYS = 30
FEED_TIMEOUT = 15
MAX_RETRIES = 3
def prune_old_logs(self, log_file: str) -> None:
"""Prune log entries older than LOG_PRUNE_DAYS."""
if not os.path.exists(log_file):
return
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data)
used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)
def setup_logging():
if os.path.exists(LOG_FILE):
with open(LOG_FILE, 'r') as f:
with open(log_file, 'r') as f:
lines = f.readlines()
cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
cutoff = datetime.now(timezone.utc) - timedelta(days=30) # LOG_PRUNE_DAYS
pruned_lines = []
malformed_count = 0
for line in lines:
if len(line) < 19 or not line[:19].replace('-', '').replace(':', '').replace(' ', '').isdigit():
malformed_count += 1
@@ -77,290 +87,211 @@ def setup_logging():
except ValueError:
malformed_count += 1
continue
if malformed_count > 0:
logging.info(f"Skipped {malformed_count} malformed log lines during pruning")
with open(LOG_FILE, 'w') as f:
logger.warning(f"Skipped {malformed_count} malformed log lines during pruning")
with open(log_file, 'w') as f:
f.writelines(pruned_lines)
logging.basicConfig(
filename=LOG_FILE,
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S"
)
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logging.getLogger().addHandler(console_handler)
logging.getLogger("requests").setLevel(logging.WARNING)
logging.info("Logging initialized for foodie_automator_rss.py")
setup_logging()
def setup_signal_handlers(self) -> None:
"""Set up signal handlers for graceful shutdown."""
def signal_handler(sig, frame):
logger.info("Received termination signal, checking if safe to exit...")
if is_posting:
logger.info("Currently posting, will exit after completion.")
else:
logger.info("Safe to exit immediately.")
sys.exit(0)
def create_http_session() -> requests.Session:
session = requests.Session()
retry_strategy = Retry(
total=MAX_RETRIES,
backoff_factor=2,
status_forcelist=[403, 429, 500, 502, 503, 504],
allowed_methods=["GET", "POST"]
)
adapter = HTTPAdapter(
max_retries=retry_strategy,
pool_connections=10,
pool_maxsize=10
)
session.mount("http://", adapter)
session.mount("https://", adapter)
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
})
return session
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
def parse_date(date_str):
try:
parsed_date = email.utils.parsedate_to_datetime(date_str)
if parsed_date.tzinfo is None:
parsed_date = parsed_date.replace(tzinfo=timezone.utc)
return parsed_date
except Exception as e:
logging.error(f"Failed to parse date '{date_str}': {e}")
return datetime.now(timezone.utc)
def setup_http_session(self) -> requests.Session:
"""Set up a requests session with retry logic."""
session = requests.Session()
retry_strategy = Retry(
total=3,
backoff_factor=2,
status_forcelist=[403, 429, 500, 502, 503, 504],
allowed_methods=["GET", "POST"]
)
adapter = HTTPAdapter(
max_retries=retry_strategy,
pool_connections=10,
pool_maxsize=10
)
session.mount("http://", adapter)
session.mount("https://", adapter)
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
})
return session
def fetch_rss_feeds():
logging.info("Starting fetch_rss_feeds")
articles = []
cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)
session = create_http_session()
if not RSS_FEEDS:
logging.error("RSS_FEEDS is empty in foodie_config.py")
return articles
for feed_url in RSS_FEEDS:
logging.info(f"Processing feed: {feed_url}")
def load_posted_titles(self) -> Set[str]:
"""Load and return the set of posted titles."""
try:
response = session.get(feed_url, timeout=FEED_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'xml')
items = soup.find_all('item')
feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
for item in items:
try:
title = item.find('title').text.strip() if item.find('title') else "Untitled"
link = item.find('link').text.strip() if item.find('link') else ""
pub_date = item.find('pubDate')
pub_date = parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc)
if pub_date < cutoff_date:
logging.info(f"Skipping old article: {title} (Published: {pub_date})")
continue
description = item.find('description')
summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description else ""
content = item.find('content:encoded')
content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content else summary
articles.append({
"title": title,
"link": link,
"summary": summary,
"content": content_text,
"feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title,
"pub_date": pub_date
})
logging.debug(f"Processed article: {title}")
except Exception as e:
logging.warning(f"Error processing entry in {feed_url}: {e}")
continue
logging.info(f"Filtered to {len(articles)} articles from {feed_url}")
data = load_json_file(FILE_PATHS["posted_rss_titles"], EXPIRATION_DAYS)
return {entry["title"] for entry in data if "title" in entry}
except Exception as e:
logging.error(f"Failed to fetch RSS feed {feed_url}: {e}")
continue
logger.error(f"Error loading posted titles: {e}")
return set()
articles.sort(key=lambda x: x["pub_date"], reverse=True)
logging.info(f"Total RSS articles fetched: {len(articles)}")
return articles
def load_used_images(self) -> Set[str]:
"""Load and return the set of used images."""
try:
data = load_json_file(FILE_PATHS["used_images"], IMAGE_EXPIRATION_DAYS)
return {entry["title"] for entry in data if "title" in entry}
except Exception as e:
logger.error(f"Error loading used images: {e}")
return set()
def curate_from_rss():
articles = fetch_rss_feeds()
if not articles:
print("No RSS articles available")
logging.info("No RSS articles available")
return None, None, random.randint(600, 1800)
def parse_date(self, date_str: str) -> datetime:
"""Parse a date string into a datetime object."""
try:
parsed_date = email.utils.parsedate_to_datetime(date_str)
if parsed_date.tzinfo is None:
parsed_date = parsed_date.replace(tzinfo=timezone.utc)
return parsed_date
except Exception as e:
logger.error(f"Failed to parse date '{date_str}': {e}")
return datetime.now(timezone.utc)
attempts = 0
max_attempts = 10
while attempts < max_attempts and articles:
article = articles.pop(0)
title = article["title"]
link = article["link"]
summary = article["summary"]
content = article["content"]
source_name = article["feed_title"]
original_source = f'<a href="{link}">{source_name}</a>'
def fetch_rss_feeds(self) -> List[Dict]:
"""Fetch and process RSS feeds."""
logger.info("Starting fetch_rss_feeds")
articles = []
cutoff_date = datetime.now(timezone.utc) - timedelta(hours=24)
if title in posted_titles:
print(f"Skipping already posted article: {title}")
logging.info(f"Skipping already posted article: {title}")
attempts += 1
continue
if not RSS_FEEDS:
logger.error("RSS_FEEDS is empty in foodie_config.py")
return articles
print(f"Trying RSS Article: {title} from {source_name}")
logging.info(f"Trying RSS Article: {title} from {source_name}")
for feed_url in RSS_FEEDS:
logger.info(f"Processing feed: {feed_url}")
try:
response = self.session.get(feed_url, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'xml')
items = soup.find_all('item')
image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
if skip:
print(f"Skipping filtered RSS article: {title}")
logging.info(f"Skipping filtered RSS article: {title}")
attempts += 1
continue
feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
for item in items:
try:
title = item.find('title').text.strip() if item.find('title') else "Untitled"
link = item.find('link').text.strip() if item.find('link') else ""
pub_date = item.find('pubDate')
pub_date = self.parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc)
scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
interest_score = is_interesting(scoring_content)
logging.info(f"Interest score for '{title}': {interest_score}")
if interest_score < 6:
print(f"RSS Interest Too Low: {interest_score}")
logging.info(f"RSS Interest Too Low: {interest_score}")
attempts += 1
continue
if pub_date < cutoff_date:
logger.info(f"Skipping old article: {title} (Published: {pub_date})")
continue
num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n"
f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
f"Do NOT introduce unrelated concepts.\n"
f"Expand on the core idea with relevant context about its appeal or significance.\n"
f"Do not include emojis in the summary."
)
content_to_summarize = scoring_content
final_summary = summarize_with_gpt4o(
content_to_summarize,
source_name,
link,
interest_score=interest_score,
extra_prompt=extra_prompt
)
if not final_summary:
logging.info(f"Summary failed for '{title}'")
attempts += 1
continue
description = item.find('description')
summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description else ""
content = item.find('content:encoded')
content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content else summary
# Remove the original title from the summary while preserving paragraphs
title_pattern = re.compile(
r'\*\*' + re.escape(title) + r'\*\*|' + re.escape(title),
re.IGNORECASE
)
paragraphs = final_summary.split('\n')
cleaned_paragraphs = []
for para in paragraphs:
if para.strip():
cleaned_para = title_pattern.sub('', para).strip()
cleaned_para = re.sub(r'\s+', ' ', cleaned_para)
cleaned_paragraphs.append(cleaned_para)
final_summary = '\n'.join(cleaned_paragraphs)
final_summary = insert_link_naturally(final_summary, source_name, link)
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
if not post_data:
attempts += 1
continue
# Fetch image
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
if not image_url:
logging.info(f"Flickr fetch failed for '{image_query}'. Falling back to Pixabay.")
image_url, image_source, uploader, page_url = get_image(image_query)
if not image_url:
logging.info(f"Pixabay fetch failed for '{image_query}'. Skipping article '{title}'.")
attempts += 1
articles.append({
"title": title,
"link": link,
"summary": summary,
"content": content_text,
"feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title,
"pub_date": pub_date
})
except Exception as e:
logger.warning(f"Error processing entry in {feed_url}: {e}")
continue
logger.info(f"Filtered to {len(articles)} articles from {feed_url}")
except Exception as e:
logger.error(f"Failed to fetch RSS feed {feed_url}: {e}")
continue
hook = get_dynamic_hook(post_data["title"]).strip()
articles.sort(key=lambda x: x["pub_date"], reverse=True)
logger.info(f"Total RSS articles fetched: {len(articles)}")
return articles
# Generate viral share prompt
share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
share_links_template = (
f'<p>{share_prompt} '
f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
)
post_data["content"] = f"{final_summary}\n\n{share_links_template}" # Removed cta from content
def curate_from_rss(self) -> Tuple[Optional[Dict], Optional[str], int]:
"""Curate content from RSS feeds."""
articles = self.fetch_rss_feeds()
if not articles:
logger.info("No RSS articles available")
return None, None, random.randint(600, 1800)
global is_posting
is_posting = True
try:
post_id, post_url = post_to_wp(
post_data=post_data,
category=category,
link=link,
author=author,
image_url=image_url,
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
interest_score=interest_score,
should_post_tweet=True
for article in articles:
title = article["title"]
link = article["link"]
summary = article["summary"]
content = article["content"]
source_name = article["feed_title"]
if title in self.posted_titles:
logger.info(f"Skipping already posted article: {title}")
continue
logger.info(f"Processing RSS Article: {title} from {source_name}")
image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
if skip:
logger.info(f"Skipping filtered RSS article: {title}")
continue
scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
interest_score = is_interesting(scoring_content)
logger.info(f"Interest score for '{title}': {interest_score}")
if interest_score < 6:
logger.info(f"RSS Interest Too Low: {interest_score}")
continue
num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n"
f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
f"Do NOT introduce unrelated concepts.\n"
f"Expand on the core idea with relevant context about its appeal or significance.\n"
f"Do not include emojis in the summary."
)
finally:
is_posting = False
if post_id:
share_text = f"Check out this foodie gem! {post_data['title']}"
share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
# Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
is_posting = True
try:
post_to_wp(
post_data=post_data,
category=category,
link=link,
author=author,
image_url=image_url,
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
interest_score=interest_score,
post_id=post_id,
should_post_tweet=False
)
finally:
is_posting = False
final_summary = summarize_with_gpt4o(
scoring_content,
source_name,
link,
interest_score=interest_score,
extra_prompt=extra_prompt
)
timestamp = datetime.now(timezone.utc).isoformat()
save_json_file(POSTED_TITLES_FILE, title, timestamp)
posted_titles.add(title)
logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
if not final_summary:
logger.info(f"Summary failed for '{title}'")
continue
if image_url:
save_json_file(USED_IMAGES_FILE, image_url, timestamp)
used_images.add(image_url)
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
final_summary = insert_link_naturally(final_summary, source_name, link)
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
return post_data, category, random.randint(0, 1800)
if post_data and author:
return post_data, author, random.randint(600, 1800)
attempts += 1
logging.info(f"WP posting failed for '{post_data['title']}'")
print("No interesting RSS article found after attempts")
logging.info("No interesting RSS article found after attempts")
return None, None, random.randint(600, 1800)
return None, None, random.randint(600, 1800)
def run_rss_automator():
print(f"{datetime.now(timezone.utc)} - INFO - ***** RSS Automator Launched *****")
logging.info("***** RSS Automator Launched *****")
post_data, category, sleep_time = curate_from_rss()
print(f"Sleeping for {sleep_time}s")
logging.info(f"Completed run with sleep time: {sleep_time} seconds")
time.sleep(sleep_time)
return post_data, category, sleep_time
"""Main function to run the RSS automator."""
scraper = RSSScraper()
while True:
try:
post_data, author, sleep_time = scraper.curate_from_rss()
if post_data and author:
global is_posting
is_posting = True
try:
post_to_wp(post_data, author)
logger.info(f"Successfully posted: {post_data['title']}")
finally:
is_posting = False
time.sleep(sleep_time)
except Exception as e:
logger.error(f"Error in RSS automator: {e}")
time.sleep(300) # Wait 5 minutes before retrying
if __name__ == "__main__":
run_rss_automator()