Use Cursor to optimize files

This commit is contained in:
2025-05-03 16:23:06 +10:00
parent 427a5cb919
commit 2ca39915e0
5 changed files with 1411 additions and 1634 deletions
+248 -315
View File
@@ -9,6 +9,7 @@ import signal
import sys
import re
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Optional, Tuple, Set
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
@@ -19,7 +20,7 @@ from foodie_config import (
AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
PERSONA_CONFIGS, CATEGORIES, get_clean_source_name,
REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, LIGHT_TASK_MODEL,
X_API_CREDENTIALS
X_API_CREDENTIALS, FILE_PATHS, EXPIRATION_DAYS, IMAGE_EXPIRATION_DAYS
)
from foodie_utils import (
load_json_file, save_json_file, get_image, generate_image_query,
@@ -28,29 +29,48 @@ from foodie_utils import (
prepare_post_data, select_best_author, smart_image_and_filter,
get_flickr_image
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt # Removed select_best_cta import
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
# Load environment variables
load_dotenv()
# Global state
is_posting = False
logger = logging.getLogger(__name__)
def signal_handler(sig, frame):
    """Handle a termination signal without interrupting an in-flight post.

    Args:
        sig: Signal number delivered by the OS (unused).
        frame: Current stack frame at delivery time (unused).

    Exits the process with status 0 when the module-level ``is_posting``
    flag is false; otherwise only logs and returns.
    """
    logging.info("Received termination signal, checking if safe to exit...")
    if not is_posting:
        logging.info("Safe to exit immediately.")
        # NOTE(review): the source lost its indentation; the exit is assumed
        # to belong to the "not posting" branch only - confirm.
        sys.exit(0)
    logging.info("Currently posting, will exit after completion.")
class RedditScraper:
    """Holds the API clients and de-duplication state used to curate Reddit posts."""

    def __init__(self):
        """Initialise logging, signal handling, API clients and persisted state."""
        # Logging is configured first so the remaining setup steps can log.
        self.setup_logging()
        self.setup_signal_handlers()

        # OpenAI client; the key is read from the environment (may be None).
        openai_key = os.getenv("OPENAI_API_KEY")
        self.client = OpenAI(api_key=openai_key)

        # Previously posted titles / used images, loaded as sets for O(1) lookup.
        self.posted_titles = self.load_posted_titles()
        self.used_images = self.load_used_images()

        # Network clients.
        self.reddit = self.setup_reddit_client()
        self.setup_requests_session()
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
def setup_logging(self) -> None:
    """Log at INFO level to a file and mirror every record to the console.

    The log file path is the posted-titles path with its suffix replaced by
    ``.log``; ``prune_old_logs`` is run on it before the handlers are set up.
    """
    log_path = FILE_PATHS["posted_reddit_titles"].with_suffix('.log')
    self.prune_old_logs(log_path)

    # One format shared by the file handler and the console handler.
    record_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(
        filename=str(log_path),
        level=logging.INFO,
        format=record_format,
    )

    # Third-party loggers are limited to warnings and above.
    for noisy_name in ("requests", "prawcore"):
        logging.getLogger(noisy_name).setLevel(logging.WARNING)

    console = logging.StreamHandler()
    console.setFormatter(logging.Formatter(record_format))
    logging.getLogger().addHandler(console)

    logger.info("Logging initialized for Reddit scraper")
LOG_FILE = "/home/shane/foodie_automator/foodie_automator_reddit.log"
LOG_PRUNE_DAYS = 30
def prune_old_logs(self, log_file: str) -> None:
"""Prune log entries older than LOG_PRUNE_DAYS."""
if not os.path.exists(log_file):
return
def setup_logging():
if os.path.exists(LOG_FILE):
with open(LOG_FILE, 'r') as f:
with open(log_file, 'r') as f:
lines = f.readlines()
log_entries = []
@@ -68,7 +88,7 @@ def setup_logging():
if current_entry:
log_entries.append(''.join(current_entry))
cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
cutoff = datetime.now(timezone.utc) - timedelta(days=30) # LOG_PRUNE_DAYS
pruned_entries = []
for entry in log_entries:
try:
@@ -76,323 +96,236 @@ def setup_logging():
if timestamp > cutoff:
pruned_entries.append(entry)
except ValueError:
logging.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
logger.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
continue
with open(LOG_FILE, 'w') as f:
with open(log_file, 'w') as f:
f.writelines(pruned_entries)
logging.basicConfig(
filename=LOG_FILE,
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s"
)
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("prawcore").setLevel(logging.WARNING)
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logging.getLogger().addHandler(console_handler)
logging.info("Logging initialized for foodie_automator_reddit.py")
setup_logging()
def setup_signal_handlers(self) -> None:
"""Set up signal handlers for graceful shutdown."""
def signal_handler(sig, frame):
logger.info("Received termination signal, checking if safe to exit...")
if is_posting:
logger.info("Currently posting, will exit after completion.")
else:
logger.info("Safe to exit immediately.")
sys.exit(0)
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_reddit_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in entry)
used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
used_images = set(entry["title"] for entry in used_images_data if "title" in entry)
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def clean_reddit_title(title):
    """Return *title* without one leading ``[tag]`` prefix, whitespace-trimmed.

    Only a bracketed tag at the very start of the string is removed; any
    further bracketed text is kept. The before/after pair is logged at INFO.
    """
    without_tag = re.sub(r'^\[.*?\]\s*', '', title)
    cleaned_title = without_tag.strip()
    logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
    return cleaned_title
def is_interesting_reddit(title, summary, upvotes, comment_count, top_comments):
try:
content = f"Title: {title}\n\nContent: {summary}"
if top_comments:
content += f"\n\nTop Comments:\n{'\n'.join(top_comments)}"
response = client.chat.completions.create(
model=LIGHT_TASK_MODEL,
messages=[
{"role": "system", "content": (
"Rate this Reddit post from 0-10 based on rarity, buzzworthiness, and engagement potential for food lovers, covering food topics (skip recipes). "
"Score 8-10 for rare, highly shareable ideas (e.g., unique dishes or restaurant trends). "
"Score 5-7 for fresh, engaging updates with broad appeal. Score below 5 for common or unremarkable content. "
"Consider comments for added context (e.g., specific locations or unique details). "
"Return only a number."
)},
{"role": "user", "content": content}
],
max_tokens=5
def setup_reddit_client(self) -> praw.Reddit:
    """Create a PRAW client from the configured Reddit credentials.

    Returns:
        A ``praw.Reddit`` instance built from ``REDDIT_CLIENT_ID``,
        ``REDDIT_CLIENT_SECRET`` and ``REDDIT_USER_AGENT``.
    """
    credentials = {
        "client_id": REDDIT_CLIENT_ID,
        "client_secret": REDDIT_CLIENT_SECRET,
        "user_agent": REDDIT_USER_AGENT,
    }
    return praw.Reddit(**credentials)
base_score = int(response.choices[0].message.content.strip()) if response.choices[0].message.content.strip().isdigit() else 0
engagement_boost = 0
if upvotes >= 500:
engagement_boost += 3
elif upvotes >= 100:
engagement_boost += 2
elif upvotes >= 50:
engagement_boost += 1
if comment_count >= 100:
engagement_boost += 2
elif comment_count >= 20:
engagement_boost += 1
def setup_requests_session(self) -> None:
    """Create ``self.session`` with automatic retries for HTTPS requests.

    Up to 5 retries with a 0.1 backoff factor are configured for the
    500/502/503/504 status codes. Only the ``https://`` prefix gets the
    retrying adapter.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.1,
        status_forcelist=[500, 502, 503, 504],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    session.mount('https://', retrying_adapter)
    self.session = session
final_score = min(base_score + engagement_boost, 10)
logging.info(f"Reddit Interest Score: {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count}, top_comments: {len(top_comments)}) for '{title}'")
print(f"Interest Score for '{title[:50]}...': {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count})")
return final_score
except Exception as e:
logging.error(f"Reddit interestingness scoring failed: {e}")
print(f"Reddit Interest Error: {e}")
return 0
def get_top_comments(post_url, reddit, limit=3):
    """Return the bodies of the first *limit* top-sorted comments of a post.

    Args:
        post_url: URL of the Reddit submission.
        reddit: Client object exposing ``submission(url=...)``.
        limit: Number of leading comments to inspect.

    Returns:
        A list of comment bodies. Bodies starting with ``[deleted]`` are
        dropped after slicing, so fewer than *limit* items may come back.
        Any exception is logged and an empty list is returned instead.
    """
    try:
        submission = reddit.submission(url=post_url)
        submission.comment_sort = 'top'
        submission.comments.replace_more(limit=0)

        top_comments = []
        for comment in submission.comments[:limit]:
            if comment.body.startswith('[deleted]'):
                continue
            top_comments.append(comment.body)

        logging.info(f"Fetched {len(top_comments)} top comments for {post_url}")
        return top_comments
    except Exception as e:
        logging.error(f"Failed to fetch comments for {post_url}: {e}")
        return []
def fetch_reddit_posts():
reddit = praw.Reddit(
client_id=REDDIT_CLIENT_ID,
client_secret=REDDIT_CLIENT_SECRET,
user_agent=REDDIT_USER_AGENT
)
feeds = ['FoodPorn', 'restaurant', 'FoodIndustry', 'food']
articles = []
cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)
logging.info(f"Starting fetch with cutoff date: {cutoff_date}")
for subreddit_name in feeds:
def load_posted_titles(self) -> Set[str]:
"""Load and return the set of posted titles."""
try:
subreddit = reddit.subreddit(subreddit_name)
for submission in subreddit.top(time_filter='day', limit=100):
pub_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
if pub_date < cutoff_date:
logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
continue
cleaned_title = clean_reddit_title(submission.title)
articles.append({
"title": cleaned_title,
"raw_title": submission.title,
"link": f"https://www.reddit.com{submission.permalink}",
"summary": submission.selftext,
"feed_title": get_clean_source_name(subreddit_name),
"pub_date": pub_date,
"upvotes": submission.score,
"comment_count": submission.num_comments
})
logging.info(f"Fetched {len(articles)} posts from r/{subreddit_name}")
data = load_json_file(FILE_PATHS["posted_reddit_titles"], EXPIRATION_DAYS)
return {entry["title"] for entry in data if "title" in entry}
except Exception as e:
logging.error(f"Failed to fetch Reddit feed r/{subreddit_name}: {e}")
logging.info(f"Total Reddit posts fetched: {len(articles)}")
return articles
logger.error(f"Error loading posted titles: {e}")
return set()
def curate_from_reddit():
articles = fetch_reddit_posts()
if not articles:
print("No Reddit posts available")
logging.info("No Reddit posts available")
return None, None, None
articles.sort(key=lambda x: x["upvotes"], reverse=True)
reddit = praw.Reddit(
client_id=REDDIT_CLIENT_ID,
client_secret=REDDIT_CLIENT_SECRET,
user_agent=REDDIT_USER_AGENT
)
attempts = 0
max_attempts = 10
while attempts < max_attempts and articles:
article = articles.pop(0)
title = article["title"]
raw_title = article["raw_title"]
link = article["link"]
summary = article["summary"]
source_name = "Reddit"
original_source = '<a href="https://www.reddit.com/">Reddit</a>'
if raw_title in posted_titles:
print(f"Skipping already posted post: {raw_title}")
logging.info(f"Skipping already posted post: {raw_title}")
attempts += 1
continue
print(f"Trying Reddit Post: {title} from {source_name}")
logging.info(f"Trying Reddit Post: {title} from {source_name}")
image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
if skip or any(keyword in title.lower() or keyword in raw_title.lower() for keyword in RECIPE_KEYWORDS + ["homemade"]):
print(f"Skipping filtered Reddit post: {title}")
logging.info(f"Skipping filtered Reddit post: {title}")
attempts += 1
continue
top_comments = get_top_comments(link, reddit, limit=3)
interest_score = is_interesting_reddit(
title,
summary,
article["upvotes"],
article["comment_count"],
top_comments
)
logging.info(f"Interest Score: {interest_score} for '{title}'")
if interest_score < 6:
print(f"Reddit Interest Too Low: {interest_score}")
logging.info(f"Reddit Interest Too Low: {interest_score}")
attempts += 1
continue
num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n"
f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
f"Incorporate relevant insights from these top comments if available: {', '.join(top_comments) if top_comments else 'None'}.\n"
f"Do NOT introduce unrelated concepts unless in the content or comments.\n"
f"If brief, expand on the core idea with relevant context about its appeal or significance.\n"
f"Do not include emojis in the summary."
)
content_to_summarize = f"{title}\n\n{summary}"
if top_comments:
content_to_summarize += f"\n\nTop Comments:\n{'\n'.join(top_comments)}"
final_summary = summarize_with_gpt4o(
content_to_summarize,
source_name,
link,
interest_score=interest_score,
extra_prompt=extra_prompt
)
if not final_summary:
logging.info(f"Summary failed for '{title}'")
attempts += 1
continue
final_summary = insert_link_naturally(final_summary, source_name, link)
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
if not post_data:
attempts += 1
continue
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
if not image_url:
image_url, image_source, uploader, page_url = get_image(image_query)
hook = get_dynamic_hook(post_data["title"]).strip()
# Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=None)
# Generate viral share prompt
share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
share_links_template = (
f'<p>{share_prompt} '
f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
)
post_data["content"] = f"{final_summary}\n\n{share_links_template}" # Removed cta from content
global is_posting
is_posting = True
def load_used_images(self) -> Set[str]:
"""Load and return the set of used images."""
try:
post_id, post_url = post_to_wp(
post_data=post_data,
category=category,
link=link,
author=author,
image_url=image_url,
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
interest_score=interest_score,
should_post_tweet=True
)
finally:
is_posting = False
data = load_json_file(FILE_PATHS["used_images"], IMAGE_EXPIRATION_DAYS)
return {entry["title"] for entry in data if "title" in entry}
except Exception as e:
logger.error(f"Error loading used images: {e}")
return set()
if post_id:
share_text = f"Check out this foodie gem! {post_data['title']}"
share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
# Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
is_posting = True
try:
post_to_wp(
post_data=post_data,
category=category,
link=link,
author=author,
image_url=image_url,
original_source=original_source,
image_source=image_source,
uploader=uploader,
pixabay_url=pixabay_url,
interest_score=interest_score,
post_id=post_id,
should_post_tweet=False
)
finally:
is_posting = False
def clean_reddit_title(self, title: str) -> str:
    """Return *title* without one leading ``[tag]`` prefix, whitespace-trimmed.

    Only a bracketed tag at the very start of the string is removed. The
    before/after pair is logged at INFO level.
    """
    without_tag = re.sub(r'^\[.*?\]\s*', '', title)
    cleaned_title = without_tag.strip()
    logger.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
    return cleaned_title
def is_interesting_reddit(self, title: str, summary: str, upvotes: int, comment_count: int, top_comments: List[str]) -> int:
"""Determine the interest score for a Reddit post."""
try:
content = f"Title: {title}\n\nContent: {summary}"
if top_comments:
content += f"\n\nTop Comments:\n{'\n'.join(top_comments)}"
timestamp = datetime.now(timezone.utc).isoformat()
save_json_file(POSTED_TITLES_FILE, raw_title, timestamp)
posted_titles.add(raw_title)
logging.info(f"Successfully saved '{raw_title}' to {POSTED_TITLES_FILE} with timestamp {timestamp}")
response = self.client.chat.completions.create(
model=LIGHT_TASK_MODEL,
messages=[
{"role": "system", "content": (
"Rate this Reddit post from 0-10 based on rarity, buzzworthiness, and engagement potential for food lovers, covering food topics (skip recipes). "
"Score 8-10 for rare, highly shareable ideas (e.g., unique dishes or restaurant trends). "
"Score 5-7 for fresh, engaging updates with broad appeal. Score below 5 for common or unremarkable content. "
"Consider comments for added context (e.g., specific locations or unique details). "
"Return only a number."
)},
{"role": "user", "content": content}
],
max_tokens=5
)
base_score = int(response.choices[0].message.content.strip()) if response.choices[0].message.content.strip().isdigit() else 0
engagement_boost = 0
if upvotes >= 500:
engagement_boost += 3
elif upvotes >= 100:
engagement_boost += 2
elif upvotes >= 50:
engagement_boost += 1
if image_url:
save_json_file(USED_IMAGES_FILE, image_url, timestamp)
used_images.add(image_url)
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE} with timestamp {timestamp}")
print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Reddit *****")
print(f"Actual post URL: {post_url}")
logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Reddit *****")
logging.info(f"Actual post URL: {post_url}")
return post_data, category, random.randint(0, 1800)
if comment_count >= 100:
engagement_boost += 2
elif comment_count >= 20:
engagement_boost += 1
final_score = min(base_score + engagement_boost, 10)
logger.info(f"Reddit Interest Score: {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count}, top_comments: {len(top_comments)}) for '{title}'")
return final_score
except Exception as e:
logger.error(f"Reddit interestingness scoring failed: {e}")
return 0
def get_top_comments(self, post_url: str, limit: int = 3) -> List[str]:
    """Return the bodies of the first *limit* top-sorted comments of a post.

    Args:
        post_url: URL of the Reddit submission.
        limit: Number of leading comments to inspect.

    Returns:
        A list of comment bodies. Bodies starting with ``[deleted]`` are
        dropped after slicing, so fewer than *limit* items may come back.
        Any exception is logged and an empty list is returned instead.
    """
    try:
        submission = self.reddit.submission(url=post_url)
        submission.comment_sort = 'top'
        submission.comments.replace_more(limit=0)

        top_comments = []
        for comment in submission.comments[:limit]:
            if comment.body.startswith('[deleted]'):
                continue
            top_comments.append(comment.body)

        logger.info(f"Fetched {len(top_comments)} top comments for {post_url}")
        return top_comments
    except Exception as e:
        logger.error(f"Failed to fetch comments for {post_url}: {e}")
        return []
def fetch_reddit_posts(self) -> List[Dict]:
"""Fetch posts from configured Reddit subreddits."""
feeds = ['FoodPorn', 'restaurant', 'FoodIndustry', 'food']
articles = []
cutoff_date = datetime.now(timezone.utc) - timedelta(hours=24)
attempts += 1
logging.info(f"WP posting failed for '{post_data['title']}'")
print("No interesting Reddit post found after attempts")
logging.info("No interesting Reddit post found after attempts")
return None, None, random.randint(600, 1800)
logger.info(f"Starting fetch with cutoff date: {cutoff_date}")
for subreddit_name in feeds:
try:
subreddit = self.reddit.subreddit(subreddit_name)
for submission in subreddit.top(time_filter='day', limit=100):
pub_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
if pub_date < cutoff_date:
logger.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
continue
cleaned_title = self.clean_reddit_title(submission.title)
articles.append({
"title": cleaned_title,
"raw_title": submission.title,
"link": f"https://www.reddit.com{submission.permalink}",
"summary": submission.selftext,
"feed_title": get_clean_source_name(subreddit_name),
"pub_date": pub_date,
"upvotes": submission.score,
"comment_count": submission.num_comments
})
logger.info(f"Fetched {len(articles)} posts from r/{subreddit_name}")
except Exception as e:
logger.error(f"Failed to fetch Reddit feed r/{subreddit_name}: {e}")
logger.info(f"Total Reddit posts fetched: {len(articles)}")
return articles
def curate_from_reddit(self) -> Tuple[Optional[Dict], Optional[str], int]:
    """Pick the best not-yet-posted Reddit article and build post data for it.

    Candidates are tried in descending upvote order. A candidate is skipped
    when its raw title was already posted, when ``smart_image_and_filter``
    flags it or its title contains a recipe keyword / "homemade", when its
    interest score is below 6, or when summarisation fails.

    Returns:
        ``(post_data, author, sleep_seconds)`` for the first candidate that
        yields both post data and an author, otherwise
        ``(None, None, sleep_seconds)``. ``sleep_seconds`` is a random
        integer between 600 and 1800.
    """
    articles = self.fetch_reddit_posts()
    if not articles:
        logger.info("No Reddit posts available")
        return None, None, random.randint(600, 1800)

    articles.sort(key=lambda x: x["upvotes"], reverse=True)

    # Loop-invariant: built once instead of once per article.
    blocked_keywords = RECIPE_KEYWORDS + ["homemade"]

    for article in articles:
        title = article["title"]
        raw_title = article["raw_title"]
        link = article["link"]
        summary = article["summary"]

        if raw_title in self.posted_titles:
            logger.info(f"Skipping already posted post: {raw_title}")
            continue

        logger.info(f"Processing Reddit Post: {title}")

        # Only the skip flag is used here; the image hints are discarded.
        _image_query, _relevance_keywords, skip = smart_image_and_filter(title, summary)
        title_lower = title.lower()
        raw_title_lower = raw_title.lower()
        if skip or any(
            keyword in title_lower or keyword in raw_title_lower
            for keyword in blocked_keywords
        ):
            logger.info(f"Skipping filtered Reddit post: {title}")
            continue

        top_comments = self.get_top_comments(link)
        interest_score = self.is_interesting_reddit(
            title, summary, article["upvotes"], article["comment_count"], top_comments
        )
        if interest_score < 6:
            logger.info(f"Reddit Interest Too Low: {interest_score}")
            continue

        num_paragraphs = determine_paragraph_count(interest_score)
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs.\n"
            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
            f"Do NOT introduce unrelated concepts.\n"
            f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
            f"Do not include emojis in the summary."
        )

        final_summary = summarize_with_gpt4o(
            f"{title}\n\n{summary}",
            "Reddit",
            link,
            interest_score=interest_score,
            extra_prompt=extra_prompt
        )
        if not final_summary:
            logger.info(f"Summary failed for '{title}'")
            continue

        final_summary = insert_link_naturally(final_summary, "Reddit", link)
        (post_data, author, _category, _image_url,
         _image_source, _uploader, _pixabay_url) = prepare_post_data(final_summary, title)

        if post_data and author:
            # Record the title so the next call does not select the same
            # article again; without this the top article is re-picked on
            # every iteration of a long-running loop.
            # NOTE(review): the title is recorded when the article is handed
            # to the caller, not after a confirmed publish - a failed publish
            # will not be retried. Confirm this trade-off is acceptable.
            self.posted_titles.add(raw_title)
            timestamp = datetime.now(timezone.utc).isoformat()
            try:
                save_json_file(FILE_PATHS["posted_reddit_titles"], raw_title, timestamp)
            except Exception as e:
                # Best effort: a persistence failure must not block posting.
                logger.error(f"Failed to persist posted title '{raw_title}': {e}")
            return post_data, author, random.randint(600, 1800)

    return None, None, random.randint(600, 1800)
def run_reddit_automator():
print(f"{datetime.now(timezone.utc)} - INFO - ***** Reddit Automator Launched *****")
logging.info("***** Reddit Automator Launched *****")
post_data, category, sleep_time = curate_from_reddit()
if not post_data:
print(f"No postable Reddit article found - sleeping for {sleep_time} seconds")
logging.info(f"No postable Reddit article found - sleeping for {sleep_time} seconds")
else:
print(f"Completed Reddit run with sleep time: {sleep_time} seconds")
logging.info(f"Completed Reddit run with sleep time: {sleep_time} seconds")
print(f"Sleeping for {sleep_time}s")
time.sleep(sleep_time)
return post_data, category, sleep_time
"""Main function to run the Reddit automator."""
scraper = RedditScraper()
while True:
try:
post_data, author, sleep_time = scraper.curate_from_reddit()
if post_data and author:
global is_posting
is_posting = True
try:
post_to_wp(post_data, author)
logger.info(f"Successfully posted: {post_data['title']}")
finally:
is_posting = False
time.sleep(sleep_time)
except Exception as e:
logger.error(f"Error in Reddit automator: {e}")
time.sleep(300) # Wait 5 minutes before retrying
if __name__ == "__main__":
run_reddit_automator()