331 lines
14 KiB
Python
331 lines
14 KiB
Python
# foodie_automator_reddit.py
|
|
import requests
|
|
import random
|
|
import time
|
|
import logging
|
|
import os
|
|
import json
|
|
import signal
|
|
import sys
|
|
import re
|
|
from datetime import datetime, timedelta, timezone
|
|
from typing import List, Dict, Optional, Tuple, Set
|
|
from openai import OpenAI
|
|
from urllib.parse import quote
|
|
from requests.packages.urllib3.util.retry import Retry
|
|
from requests.adapters import HTTPAdapter
|
|
import praw
|
|
from dotenv import load_dotenv
|
|
from foodie_config import (
|
|
AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
|
|
PERSONA_CONFIGS, CATEGORIES, get_clean_source_name,
|
|
REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, LIGHT_TASK_MODEL,
|
|
X_API_CREDENTIALS, FILE_PATHS, EXPIRATION_DAYS, IMAGE_EXPIRATION_DAYS
|
|
)
|
|
from foodie_utils import (
|
|
load_json_file, save_json_file, get_image, generate_image_query,
|
|
upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
|
|
summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
|
|
prepare_post_data, select_best_author, smart_image_and_filter,
|
|
get_flickr_image
|
|
)
|
|
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
|
|
|
|
# Load environment variables
load_dotenv()

# Global state
# `is_posting` is set to True by run_reddit_automator() while post_to_wp() is
# running and reset to False afterwards; the signal handler installed by
# RedditScraper.setup_signal_handlers() reads it to decide whether to exit.
is_posting = False
# Module-level logger used by every function and method in this file.
logger = logging.getLogger(__name__)
|
|
|
|
class RedditScraper:
|
|
def __init__(self):
|
|
self.setup_logging()
|
|
self.setup_signal_handlers()
|
|
self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
|
self.posted_titles = self.load_posted_titles()
|
|
self.used_images = self.load_used_images()
|
|
self.reddit = self.setup_reddit_client()
|
|
self.setup_requests_session()
|
|
|
|
def setup_logging(self) -> None:
|
|
"""Configure logging for the scraper."""
|
|
log_file = FILE_PATHS["posted_reddit_titles"].with_suffix('.log')
|
|
self.prune_old_logs(log_file)
|
|
|
|
logging.basicConfig(
|
|
filename=str(log_file),
|
|
level=logging.INFO,
|
|
format="%(asctime)s - %(levelname)s - %(message)s"
|
|
)
|
|
logging.getLogger("requests").setLevel(logging.WARNING)
|
|
logging.getLogger("prawcore").setLevel(logging.WARNING)
|
|
console_handler = logging.StreamHandler()
|
|
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
|
logging.getLogger().addHandler(console_handler)
|
|
logger.info("Logging initialized for Reddit scraper")
|
|
|
|
def prune_old_logs(self, log_file: str) -> None:
|
|
"""Prune log entries older than LOG_PRUNE_DAYS."""
|
|
if not os.path.exists(log_file):
|
|
return
|
|
|
|
with open(log_file, 'r') as f:
|
|
lines = f.readlines()
|
|
|
|
log_entries = []
|
|
current_entry = []
|
|
timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}')
|
|
|
|
for line in lines:
|
|
if timestamp_pattern.match(line):
|
|
if current_entry:
|
|
log_entries.append(''.join(current_entry))
|
|
current_entry = [line]
|
|
else:
|
|
current_entry.append(line)
|
|
|
|
if current_entry:
|
|
log_entries.append(''.join(current_entry))
|
|
|
|
cutoff = datetime.now(timezone.utc) - timedelta(days=30) # LOG_PRUNE_DAYS
|
|
pruned_entries = []
|
|
for entry in log_entries:
|
|
try:
|
|
timestamp = datetime.strptime(entry[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
|
|
if timestamp > cutoff:
|
|
pruned_entries.append(entry)
|
|
except ValueError:
|
|
logger.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
|
|
continue
|
|
|
|
with open(log_file, 'w') as f:
|
|
f.writelines(pruned_entries)
|
|
|
|
def setup_signal_handlers(self) -> None:
|
|
"""Set up signal handlers for graceful shutdown."""
|
|
def signal_handler(sig, frame):
|
|
logger.info("Received termination signal, checking if safe to exit...")
|
|
if is_posting:
|
|
logger.info("Currently posting, will exit after completion.")
|
|
else:
|
|
logger.info("Safe to exit immediately.")
|
|
sys.exit(0)
|
|
|
|
signal.signal(signal.SIGTERM, signal_handler)
|
|
signal.signal(signal.SIGINT, signal_handler)
|
|
|
|
def setup_reddit_client(self) -> praw.Reddit:
|
|
"""Set up and return a Reddit client with proper configuration."""
|
|
return praw.Reddit(
|
|
client_id=REDDIT_CLIENT_ID,
|
|
client_secret=REDDIT_CLIENT_SECRET,
|
|
user_agent=REDDIT_USER_AGENT
|
|
)
|
|
|
|
def setup_requests_session(self) -> None:
|
|
"""Set up a requests session with retry logic."""
|
|
self.session = requests.Session()
|
|
retries = Retry(
|
|
total=5,
|
|
backoff_factor=0.1,
|
|
status_forcelist=[500, 502, 503, 504]
|
|
)
|
|
self.session.mount('https://', HTTPAdapter(max_retries=retries))
|
|
|
|
def load_posted_titles(self) -> Set[str]:
|
|
"""Load and return the set of posted titles."""
|
|
try:
|
|
data = load_json_file(FILE_PATHS["posted_reddit_titles"], EXPIRATION_DAYS)
|
|
return {entry["title"] for entry in data if "title" in entry}
|
|
except Exception as e:
|
|
logger.error(f"Error loading posted titles: {e}")
|
|
return set()
|
|
|
|
def load_used_images(self) -> Set[str]:
|
|
"""Load and return the set of used images."""
|
|
try:
|
|
data = load_json_file(FILE_PATHS["used_images"], IMAGE_EXPIRATION_DAYS)
|
|
return {entry["url"] for entry in data if "url" in entry}
|
|
except Exception as e:
|
|
logger.error(f"Error loading used images: {e}")
|
|
return set()
|
|
|
|
def clean_reddit_title(self, title: str) -> str:
|
|
"""Clean and standardize Reddit post titles."""
|
|
cleaned_title = re.sub(r'^\[.*?\]\s*', '', title).strip()
|
|
logger.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
|
|
return cleaned_title
|
|
|
|
def is_interesting_reddit(self, title: str, summary: str, upvotes: int, comment_count: int, top_comments: List[str]) -> int:
|
|
"""Determine the interest score for a Reddit post."""
|
|
try:
|
|
content = f"Title: {title}\n\nContent: {summary}"
|
|
if top_comments:
|
|
content += f"\n\nTop Comments:\n{'\n'.join(top_comments)}"
|
|
|
|
response = self.client.chat.completions.create(
|
|
model=LIGHT_TASK_MODEL,
|
|
messages=[
|
|
{"role": "system", "content": (
|
|
"Rate this Reddit post from 0-10 based on rarity, buzzworthiness, and engagement potential for food lovers, covering food topics (skip recipes). "
|
|
"Score 8-10 for rare, highly shareable ideas (e.g., unique dishes or restaurant trends). "
|
|
"Score 5-7 for fresh, engaging updates with broad appeal. Score below 5 for common or unremarkable content. "
|
|
"Consider comments for added context (e.g., specific locations or unique details). "
|
|
"Return only a number."
|
|
)},
|
|
{"role": "user", "content": content}
|
|
],
|
|
max_tokens=5
|
|
)
|
|
base_score = int(response.choices[0].message.content.strip()) if response.choices[0].message.content.strip().isdigit() else 0
|
|
|
|
engagement_boost = 0
|
|
if upvotes >= 500:
|
|
engagement_boost += 3
|
|
elif upvotes >= 100:
|
|
engagement_boost += 2
|
|
elif upvotes >= 50:
|
|
engagement_boost += 1
|
|
|
|
if comment_count >= 100:
|
|
engagement_boost += 2
|
|
elif comment_count >= 20:
|
|
engagement_boost += 1
|
|
|
|
final_score = min(base_score + engagement_boost, 10)
|
|
logger.info(f"Reddit Interest Score: {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count}, top_comments: {len(top_comments)}) for '{title}'")
|
|
return final_score
|
|
except Exception as e:
|
|
logger.error(f"Reddit interestingness scoring failed: {e}")
|
|
return 0
|
|
|
|
def get_top_comments(self, post_url: str, limit: int = 3) -> List[str]:
|
|
"""Fetch top comments for a Reddit post."""
|
|
try:
|
|
submission = self.reddit.submission(url=post_url)
|
|
submission.comment_sort = 'top'
|
|
submission.comments.replace_more(limit=0)
|
|
top_comments = [comment.body for comment in submission.comments[:limit] if not comment.body.startswith('[deleted]')]
|
|
logger.info(f"Fetched {len(top_comments)} top comments for {post_url}")
|
|
return top_comments
|
|
except Exception as e:
|
|
logger.error(f"Failed to fetch comments for {post_url}: {e}")
|
|
return []
|
|
|
|
def fetch_reddit_posts(self) -> List[Dict]:
|
|
"""Fetch posts from configured Reddit subreddits."""
|
|
feeds = ['FoodPorn', 'restaurant', 'FoodIndustry', 'food']
|
|
articles = []
|
|
cutoff_date = datetime.now(timezone.utc) - timedelta(hours=24)
|
|
|
|
logger.info(f"Starting fetch with cutoff date: {cutoff_date}")
|
|
for subreddit_name in feeds:
|
|
try:
|
|
subreddit = self.reddit.subreddit(subreddit_name)
|
|
for submission in subreddit.top(time_filter='day', limit=100):
|
|
pub_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
|
|
if pub_date < cutoff_date:
|
|
logger.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
|
|
continue
|
|
cleaned_title = self.clean_reddit_title(submission.title)
|
|
articles.append({
|
|
"title": cleaned_title,
|
|
"raw_title": submission.title,
|
|
"link": f"https://www.reddit.com{submission.permalink}",
|
|
"summary": submission.selftext,
|
|
"feed_title": get_clean_source_name(subreddit_name),
|
|
"pub_date": pub_date,
|
|
"upvotes": submission.score,
|
|
"comment_count": submission.num_comments
|
|
})
|
|
logger.info(f"Fetched {len(articles)} posts from r/{subreddit_name}")
|
|
except Exception as e:
|
|
logger.error(f"Failed to fetch Reddit feed r/{subreddit_name}: {e}")
|
|
|
|
logger.info(f"Total Reddit posts fetched: {len(articles)}")
|
|
return articles
|
|
|
|
def curate_from_reddit(self) -> Tuple[Optional[Dict], Optional[str], int]:
|
|
"""Curate content from Reddit posts."""
|
|
articles = self.fetch_reddit_posts()
|
|
if not articles:
|
|
logger.info("No Reddit posts available")
|
|
return None, None, random.randint(600, 1800)
|
|
|
|
articles.sort(key=lambda x: x["upvotes"], reverse=True)
|
|
|
|
for article in articles:
|
|
title = article["title"]
|
|
raw_title = article["raw_title"]
|
|
link = article["link"]
|
|
summary = article["summary"]
|
|
|
|
if raw_title in self.posted_titles:
|
|
logger.info(f"Skipping already posted post: {raw_title}")
|
|
continue
|
|
|
|
logger.info(f"Processing Reddit Post: {title}")
|
|
|
|
image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
|
|
if skip or any(keyword in title.lower() or keyword in raw_title.lower() for keyword in RECIPE_KEYWORDS + ["homemade"]):
|
|
logger.info(f"Skipping filtered Reddit post: {title}")
|
|
continue
|
|
|
|
top_comments = self.get_top_comments(link)
|
|
interest_score = self.is_interesting_reddit(title, summary, article["upvotes"], article["comment_count"], top_comments)
|
|
|
|
if interest_score < 6:
|
|
logger.info(f"Reddit Interest Too Low: {interest_score}")
|
|
continue
|
|
|
|
num_paragraphs = determine_paragraph_count(interest_score)
|
|
extra_prompt = (
|
|
f"Generate exactly {num_paragraphs} paragraphs.\n"
|
|
f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
|
|
f"Do NOT introduce unrelated concepts.\n"
|
|
f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
|
|
f"Do not include emojis in the summary."
|
|
)
|
|
|
|
final_summary = summarize_with_gpt4o(
|
|
f"{title}\n\n{summary}",
|
|
"Reddit",
|
|
link,
|
|
interest_score=interest_score,
|
|
extra_prompt=extra_prompt
|
|
)
|
|
|
|
if not final_summary:
|
|
logger.info(f"Summary failed for '{title}'")
|
|
continue
|
|
|
|
final_summary = insert_link_naturally(final_summary, "Reddit", link)
|
|
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
|
|
|
|
if post_data and author:
|
|
return post_data, author, random.randint(600, 1800)
|
|
|
|
return None, None, random.randint(600, 1800)
|
|
|
|
def run_reddit_automator():
    """Run the curate-and-publish loop forever.

    Each cycle asks the scraper for a post, publishes it to WordPress when one
    is returned, then sleeps for the interval the scraper suggested. The
    module-level ``is_posting`` flag is raised for the duration of the publish
    call so the signal handler does not exit mid-post. Any exception in a
    cycle is logged and followed by a five-minute pause.
    """
    global is_posting

    bot = RedditScraper()
    while True:
        try:
            payload, writer, pause_seconds = bot.curate_from_reddit()
            have_post = bool(payload and writer)
            if have_post:
                is_posting = True
                try:
                    post_to_wp(payload, writer)
                    logger.info(f"Successfully posted: {payload['title']}")
                finally:
                    # Always lower the flag, even if publishing raised.
                    is_posting = False
            time.sleep(pause_seconds)
        except Exception as e:
            logger.error(f"Error in Reddit automator: {e}")
            time.sleep(300)  # Wait 5 minutes before retrying
|
|
|
|
# Start the automator loop only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    run_reddit_automator()