# foodie_automator_reddit.py
import contextlib
import fcntl
import json
import logging
import os
import random
import re
import signal
import sys
import time
from datetime import datetime, timedelta, timezone
from urllib.parse import quote

import praw
import requests
from dotenv import load_dotenv
from duckduckgo_search import DDGS
from openai import OpenAI
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from foodie_config import (
    AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
    PERSONA_CONFIGS, CATEGORIES, get_clean_source_name,
    REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, LIGHT_TASK_MODEL,
    X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    select_best_author, smart_image_and_filter, get_flickr_image,
    get_next_author_round_robin, check_author_rate_limit, update_system_activity
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt

# Load variables from a .env file into the process environment
# (OPENAI_API_KEY is read from the environment further down).
load_dotenv()

# Name under which this script reports its state to update_system_activity().
SCRIPT_NAME = "foodie_automator_reddit"

# Read by signal_handler() to decide whether it may exit immediately.
# NOTE(review): nothing in this file ever sets this to True - confirm whether
# the posting path is supposed to toggle it.
is_posting = False

# Lock file used by acquire_lock() to keep a second instance from running.
LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_reddit.lock"


def signal_handler(sig, frame):
    """Handle SIGTERM/SIGINT: record the script as stopped and exit if idle.

    Args:
        sig: Signal number passed by the signal machinery (unused).
        frame: Stack frame active when the signal arrived (unused).
    """
    logging.info("Received termination signal, marking script as stopped...")
    update_system_activity(SCRIPT_NAME, "stopped")
    if is_posting:
        # NOTE(review): the source lost its indentation; sys.exit(0) is placed
        # in the else branch because this message promises the process keeps
        # running until the current post completes. Confirm against history.
        logging.info("Currently posting, will exit after completion.")
    else:
        logging.info("Safe to exit immediately.")
        sys.exit(0)


# Route both termination signals through the same handler so the activity
# record is updated however the process is asked to stop.
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

# Logging: target file and the age (in days) after which setup_logging()
# drops old entries.
LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_reddit.log"
LOG_PRUNE_DAYS = 30

# Retry policy shared by the network helpers below: each failed attempt sleeps
# RETRY_BACKOFF * 2**attempt seconds before the next one.
MAX_RETRIES = 3
RETRY_BACKOFF = 2

# Persistent state files and their expiry settings.
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_reddit_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7

# NOTE(review): load_json_file receives an *hours* value for titles and a
# *days* value for images through the same positional argument - confirm which
# unit the helper expects.
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in entry)
used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
used_images = set(entry["title"] for entry in used_images_data if "title" in entry)

# OpenAI client used by is_interesting_reddit().
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def setup_logging():
    """Configure root logging to LOG_FILE plus the console, pruning old entries first.

    Steps: make sure the log directory exists and is writable, drop log
    entries older than LOG_PRUNE_DAYS, then install a file handler and a
    console handler on the root logger. If any of that fails, fall back to
    console-only logging so the script can still run.
    """
    log_dir = os.path.dirname(LOG_FILE)
    try:
        os.makedirs(log_dir, exist_ok=True)
        # These debug calls run before basicConfig below, so they are only
        # visible if the root logger was already configured for DEBUG.
        logging.debug(f"Log directory created/verified: {log_dir}")

        if not os.access(log_dir, os.W_OK):
            raise PermissionError(f"No write permission for {log_dir}")

        # Opening in append mode proves the file itself is writable without
        # altering its content.
        try:
            with open(LOG_FILE, 'a') as f:
                f.write("")
            logging.debug(f"Confirmed write access to {LOG_FILE}")
        except Exception as e:
            raise PermissionError(f"Cannot write to {LOG_FILE}: {e}") from e

        if os.path.exists(LOG_FILE):
            _prune_log_file(LOG_FILE, LOG_PRUNE_DAYS)
            logging.debug(f"Log file pruned: {LOG_FILE}")

        logging.basicConfig(
            filename=LOG_FILE,
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
            force=True  # Ensure this config takes precedence
        )
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("prawcore").setLevel(logging.WARNING)
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logging.getLogger().addHandler(console_handler)
        logging.info("Logging initialized for foodie_automator_reddit.py")

    except Exception as e:
        # Fallback to console logging if file logging fails. basicConfig
        # without a filename already installs a console StreamHandler, so no
        # second handler is added here (adding one printed every record twice).
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
            force=True
        )
        logging.error(f"Failed to setup file logging for {LOG_FILE}: {e}. Using console logging.")
        logging.info("Console logging initialized as fallback for foodie_automator_reddit.py")


# Start of a log record: "YYYY-MM-DD HH:MM:SS", optionally followed by ",mmm".
# The file handler installed by setup_logging() uses datefmt
# "%Y-%m-%d %H:%M:%S", which writes no milliseconds, so the ",mmm" suffix must
# be optional or no line in the file is ever recognised as a record start.
_LOG_TIMESTAMP_RE = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(?:,\d{3})?')


def _split_log_entries(lines):
    """Group raw log lines into entries; continuation lines stay with their record."""
    entries = []
    current = []
    for line in lines:
        if _LOG_TIMESTAMP_RE.match(line) and current:
            entries.append(''.join(current))
            current = []
        current.append(line)
    if current:
        entries.append(''.join(current))
    return entries


def _prune_log_file(log_path, max_age_days):
    """Rewrite log_path keeping only entries newer than max_age_days."""
    with open(log_path, 'r') as f:
        lines = f.readlines()

    # NOTE(review): record timestamps are parsed as UTC although the logging
    # module writes local time by default; the skew is at most a few hours on
    # a multi-day window - confirm that is acceptable.
    cutoff = datetime.now(timezone.utc) - timedelta(days=max_age_days)
    kept = []
    for entry in _split_log_entries(lines):
        try:
            stamp = datetime.strptime(entry[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
        except ValueError:
            logging.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
            continue
        if stamp > cutoff:
            kept.append(entry)

    with open(log_path, 'w') as f:
        f.writelines(kept)


def acquire_lock():
    """Take an exclusive, non-blocking lock on LOCK_FILE or exit the process.

    Returns:
        The open lock file object. The caller must keep it open for as long
        as the lock is needed and unlock/close it afterwards.

    Exits with status 0 (via sys.exit) when the lock is already held.
    """
    os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
    # Open in append mode: mode 'w' would truncate the file - erasing the PID
    # written by the instance that holds the lock - before we even know
    # whether we can take the lock.
    lock_fd = open(LOCK_FILE, 'a+')
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:  # IOError is an alias of OSError
        lock_fd.close()
        logging.info("Another instance of foodie_automator_reddit.py is running")
        sys.exit(0)

    # We own the lock now, so it is safe to replace the recorded PID.
    try:
        lock_fd.truncate(0)
        lock_fd.write(str(os.getpid()))
        lock_fd.flush()
    except OSError:
        lock_fd.close()
        raise
    return lock_fd


def clean_reddit_title(title):
    """Clean Reddit post title by removing prefixes, newlines, and special characters.

    Args:
        title: Raw submission title.

    Returns:
        The cleaned title, or "" when title is empty or not a string.
    """
    if not title or not isinstance(title, str):
        logging.warning(f"Invalid title received: {title}")
        return ""
    # Remove a leading [prefix] tag such as "[I ate]".
    cleaned_title = re.sub(r'^\[.*?\]\s*', '', title)
    # Remove special characters (keep alphanumeric, whitespace, and basic punctuation).
    cleaned_title = re.sub(r'[^\w\s.,!?-]', '', cleaned_title)
    # Normalize whitespace (newlines included) last: stripping characters above
    # can leave double or trailing spaces behind ("Pizza & Pasta").
    cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()
    logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
    return cleaned_title


def is_interesting_reddit(title, summary, upvotes, comment_count, top_comments):
    """Score a Reddit post from 0 to 10 by combining a model rating with engagement.

    Args:
        title: Post title.
        summary: Post body text.
        upvotes: Post score; adds up to 3 points.
        comment_count: Number of comments; adds up to 2 points.
        top_comments: Comment bodies appended to the prompt; may be empty or None.

    Returns:
        The model's base score plus the engagement boost, capped at 10.
        Returns 0 when every attempt (MAX_RETRIES) fails.
    """
    # Normalise once so a None value cannot break the len() in the log line
    # below (which used to fail every attempt and burn all retries).
    top_comments = top_comments or []

    content = f"Title: {title}\n\nContent: {summary}"
    if top_comments:
        # Joined outside an f-string: a backslash inside an f-string expression
        # is a SyntaxError before Python 3.12.
        content += "\n\nTop Comments:\n" + "\n".join(str(comment) for comment in top_comments)

    for attempt in range(MAX_RETRIES):
        try:
            response = client.chat.completions.create(
                model=LIGHT_TASK_MODEL,
                messages=[
                    {"role": "system", "content": (
                        "Rate this Reddit post from 0-10 based on rarity, buzzworthiness, and engagement potential for food lovers, covering food topics (skip recipes). "
                        "Score 8-10 for rare, highly shareable ideas (e.g., unique dishes or restaurant trends). "
                        "Score 5-7 for fresh, engaging updates with broad appeal. Score below 5 for common or unremarkable content. "
                        "Consider comments for added context (e.g., specific locations or unique details). "
                        "Return only a number"
                    )},
                    {"role": "user", "content": content}
                ],
                max_tokens=5
            )
            # Anything that is not a plain non-negative integer counts as 0.
            raw_score = response.choices[0].message.content.strip()
            base_score = int(raw_score) if raw_score.isdigit() else 0

            engagement_boost = 0
            if upvotes >= 500:
                engagement_boost += 3
            elif upvotes >= 100:
                engagement_boost += 2
            elif upvotes >= 50:
                engagement_boost += 1

            if comment_count >= 100:
                engagement_boost += 2
            elif comment_count >= 20:
                engagement_boost += 1

            final_score = min(base_score + engagement_boost, 10)
            logging.info(f"Reddit Interest Score: {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count}, top_comments: {len(top_comments)}) for '{title}'")
            return final_score
        except Exception as e:
            logging.warning(f"Reddit interestingness scoring failed (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                # Exponential backoff before the next attempt.
                time.sleep(RETRY_BACKOFF * (2 ** attempt))

    logging.error(f"Failed to score Reddit post '{title}' after {MAX_RETRIES} attempts")
    return 0


def get_top_comments(post_url, reddit, limit=3):
    """Return the bodies of up to `limit` top-sorted, top-level comments of a post.

    Args:
        post_url: Full URL of the Reddit submission.
        reddit: Reddit client exposing submission(url=...).
        limit: Maximum number of comment bodies to return.

    Returns:
        A list of comment body strings; [] when every attempt (MAX_RETRIES) fails.
    """
    for attempt in range(MAX_RETRIES):
        try:
            submission = reddit.submission(url=post_url)
            submission.comment_sort = 'top'
            # limit=0 drops the "load more comments" placeholders instead of
            # resolving them.
            submission.comments.replace_more(limit=0)

            # Filter first, then count: slicing before filtering returned fewer
            # than `limit` comments whenever a top comment had been deleted.
            top_comments = []
            for comment in submission.comments:
                if len(top_comments) >= limit:
                    break
                body = comment.body
                if body.startswith(('[deleted]', '[removed]')):
                    continue
                top_comments.append(body)

            logging.info(f"Fetched {len(top_comments)} top comments for {post_url}")
            return top_comments
        except Exception as e:
            logging.warning(f"Failed to fetch comments for {post_url} (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                # Exponential backoff before the next attempt.
                time.sleep(RETRY_BACKOFF * (2 ** attempt))

    logging.error(f"Failed to fetch comments for {post_url} after {MAX_RETRIES} attempts")
    return []


def fetch_duckduckgo_news_context(title, hours=24):
    """Build a context string from recent DuckDuckGo News headlines about `title`.

    Args:
        title: Topic to search for; " news" is appended to form the query.
        hours: Only headlines dated within this many hours are kept. The search
            itself is requested with timelimit="d", so values above 24 do not
            widen what the search returns.

    Returns:
        The lower-cased matching headlines joined by spaces, a "No recent news
        found" message when none match, or `title` itself when every attempt
        (MAX_RETRIES) fails.
    """
    for attempt in range(MAX_RETRIES):
        try:
            with DDGS() as ddgs:
                results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
                cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
                titles = []
                for r in results:
                    # A result without a date is skipped like an unparsable
                    # one instead of aborting the whole attempt.
                    date_str = r.get("date") or ""
                    try:
                        # fromisoformat only accepts a trailing "Z" from
                        # Python 3.11 on, so spell the UTC offset out.
                        dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
                    except ValueError as e:
                        logging.warning(f"Date parsing failed for '{date_str}': {e}")
                        continue
                    if dt.tzinfo is None:
                        # Treat offset-less timestamps as UTC so they can be
                        # compared with the timezone-aware cutoff.
                        dt = dt.replace(tzinfo=timezone.utc)
                    if dt > cutoff:
                        titles.append(r["title"].lower())
            context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
            logging.info(f"DuckDuckGo News context for '{title}': {context}")
            return context
        except Exception as e:
            logging.warning(f"DuckDuckGo News context fetch failed for '{title}' (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                # Exponential backoff before the next attempt.
                time.sleep(RETRY_BACKOFF * (2 ** attempt))

    logging.error(f"Failed to fetch DuckDuckGo News context for '{title}' after {MAX_RETRIES} attempts")
    return title


def fetch_reddit_posts():
    """Fetch Reddit posts from specified subreddits, filtering low-quality and [homemade] posts.

    Returns:
        A list of article dicts with the keys title, raw_title, link, summary,
        feed_title, pub_date, upvotes, comment_count and top_comments.
        Returns [] on an unexpected error.
    """
    try:
        reddit = praw.Reddit(
            client_id=REDDIT_CLIENT_ID,
            client_secret=REDDIT_CLIENT_SECRET,
            user_agent=REDDIT_USER_AGENT
        )
        feeds = [
            "food",
            "FoodPorn",
            "spicy",
            "KoreanFood",
            "JapaneseFood",
            "DessertPorn",
            "ChineseFood",
            "IndianFood"
        ]
        articles = []
        # Links already processed. A retry after a mid-listing failure walks
        # the same listing again; without this the posts collected before the
        # failure would be appended a second time.
        seen_links = set()
        cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)

        logging.info(f"Starting fetch with cutoff date: {cutoff_date}")
        for subreddit_name in feeds:
            count_before = len(articles)
            for attempt in range(MAX_RETRIES):
                try:
                    subreddit = reddit.subreddit(subreddit_name)
                    for submission in subreddit.top(time_filter='day', limit=100):
                        link = f"https://www.reddit.com{submission.permalink}"
                        if link in seen_links:
                            continue
                        article = _submission_to_article(submission, subreddit_name, link, reddit, cutoff_date)
                        seen_links.add(link)
                        if article is not None:
                            articles.append(article)
                    # Report this subreddit's own contribution, not the running total.
                    logging.info(f"Fetched {len(articles) - count_before} posts from r/{subreddit_name}")
                    break
                except Exception as e:
                    logging.error(f"Failed to fetch Reddit feed r/{subreddit_name} (attempt {attempt + 1}): {e}")
                    if attempt < MAX_RETRIES - 1:
                        # Exponential backoff before the next attempt.
                        time.sleep(RETRY_BACKOFF * (2 ** attempt))
        logging.info(f"Total Reddit posts fetched: {len(articles)}")
        return articles
    except Exception as e:
        logging.error(f"Unexpected error in fetch_reddit_posts: {e}", exc_info=True)
        return []


def _submission_to_article(submission, subreddit_name, link, reddit, cutoff_date):
    """Turn one submission into an article dict, or return None if it is filtered out."""
    pub_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
    if pub_date < cutoff_date:
        logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
        return None
    if "[homemade]" in submission.title.lower():
        logging.info(f"Skipping homemade post: {submission.title}")
        return None
    cleaned_title = clean_reddit_title(submission.title)
    if not cleaned_title or len(cleaned_title) < 5:
        logging.info(f"Skipping post with invalid or short title: {submission.title}")
        return None
    # Filter out posts with empty or very short summaries, unless the post
    # links directly to an image (extension compared case-insensitively).
    summary = submission.selftext.strip() if submission.selftext else ""
    is_image_post = submission.url.lower().endswith(('.jpg', '.jpeg', '.png', '.gif'))
    if len(summary) < 20 and not is_image_post:
        logging.info(f"Skipping post with insufficient summary: {cleaned_title}")
        return None
    return {
        "title": cleaned_title,
        "raw_title": submission.title,
        "link": link,
        "summary": summary,
        "feed_title": get_clean_source_name(subreddit_name),
        "pub_date": pub_date,
        "upvotes": submission.score,
        "comment_count": submission.num_comments,
        # Fetch top comments for additional context
        "top_comments": get_top_comments(link, reddit)
    }


def curate_from_reddit(post, original_source, source_name, link, page_url):
    """Score, summarize and publish one Reddit post to WordPress.

    The post is published first without share links, then updated with the
    share links once the published URL is known.

    Args:
        post: Object exposing .title, .selftext and .url attributes.
        original_source: Passed through to post_to_wp.
        source_name: Source label used when summarizing and inserting the link.
        link: URL of the source post.
        page_url: Overwritten by the value prepare_post_data returns; the
            argument itself is never read.

    Returns:
        (post_id, post_url) on success, (None, None) if any step rejects the
        post or fails.
    """
    logger = logging.getLogger(__name__)
    try:
        content = post.selftext if post.selftext else post.url
        if not content:
            logger.info(f"No content for Reddit post: {post.title}")
            return None, None

        interest_score = is_interesting(content)
        if interest_score < 4:
            logger.info(f"Reddit post '{post.title}' not interesting enough: score {interest_score}")
            return None, None

        summary = summarize_with_gpt4o(content, source_name, link, interest_score=interest_score)
        if not summary:
            logger.warning(f"Failed to summarize Reddit post: {post.title}")
            return None, None

        if post.title in summary:
            summary = summary.replace(post.title, "").strip()
            # Collapse the blank-line runs left behind by removing the title.
            # NOTE(review): the source lost its indentation; this loop may
            # originally have run unconditionally - confirm.
            while "\n\n\n" in summary:
                summary = summary.replace("\n\n\n", "\n\n")

        final_summary = insert_link_naturally(summary, source_name, link)
        if not final_summary:
            logger.warning(f"Failed to insert link for Reddit post: {post.title}")
            return None, None

        result = prepare_post_data(final_summary, post.title)
        if not result:
            logger.info(f"Post preparation failed for Reddit post: {post.title}")
            return None, None

        logger.debug(f"prepare_post_data returned {len(result)} values: {result}")
        if len(result) < 7:
            logger.error(f"prepare_post_data returned too few values: {result}")
            return None, None

        # Only the first seven positions are used; extra values are ignored.
        post_data, author, category, image_url, image_source, uploader, page_url = result[:7]

        share_text = f"Check out this tasty find: {post_data['title']}"
        share_text_encoded = quote(share_text)
        share_links_template = (
            "Share this post: "
            '<a href="https://x.com/intent/tweet?url={post_url}&text={share_text}">X</a> | '
            '<a href="https://www.facebook.com/sharer/sharer.php?u={post_url}">Facebook</a>'
        )

        # First pass: publish without share links to obtain the post URL.
        post_data["content"] = final_summary
        post_id, post_url = post_to_wp(
            post_data=post_data,
            category=category,
            link=link,
            author=author,
            image_url=image_url,
            original_source=original_source,
            image_source=image_source,
            uploader=uploader,
            page_url=page_url,
            interest_score=interest_score,
            should_post_tweet=True,
            summary=final_summary
        )

        if not post_id:
            logger.warning(f"Failed to post Reddit post to WP: {post_data['title']}")
            return None, None

        # Second pass: update the same post (post_id) with share links that
        # point at its URL; no image and no tweet this time.
        post_url_encoded = quote(post_url)
        share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
        post_data["content"] = f"{final_summary}\n\n{share_links}"
        post_id, post_url = post_to_wp(
            post_data=post_data,
            category=category,
            link=link,
            author=author,
            image_url=None,
            original_source=original_source,
            image_source=image_source,
            uploader=uploader,
            page_url=page_url,
            interest_score=interest_score,
            post_id=post_id,
            should_post_tweet=False,
            summary=final_summary
        )

        if post_id:
            logger.info(f"Successfully curated and posted Reddit post: {post_data['title']} (URL: {post_url})")
            return post_id, post_url
        else:
            logger.warning(f"Failed to update Reddit post with share links: {post_data['title']}")
            return None, None

    except Exception as e:
        # getattr, not post.get(): post is accessed by attribute everywhere
        # above, and calling .get() on such an object raised from inside this
        # handler and masked the original error.
        logger.error(f"Error curating Reddit post '{getattr(post, 'title', 'unknown')}': {e}")
        return None, None


def run_reddit_automator():
    """Run one curation pass under the single-instance lock.

    Returns:
        (post_data, category, sleep_time). On any error the first two are
        None and sleep_time is a random 20-30 minute delay in seconds.
    """
    lock_fd = None
    try:
        lock_fd = acquire_lock()
        update_system_activity(SCRIPT_NAME, "running", os.getpid())  # Record start
        logging.info("***** Reddit Automator Launched *****")
        # Load JSON files once. Entries without a "title" key are skipped, the
        # same way the module-level load does.
        posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
        posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in entry)
        used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
        used_images = set(entry["title"] for entry in used_images_data if "title" in entry)
        # NOTE(review): curate_from_reddit as defined in this file takes
        # (post, original_source, source_name, link, page_url) and returns a
        # 2-tuple. This call passes four collections and unpacks three values,
        # so it raises TypeError and every run ends in the except branch
        # below. The two sides need to be reconciled; left as-is here because
        # fixing it would start publishing posts.
        post_data, category, sleep_time = curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used_images)
        if not post_data:
            logging.info("No postable Reddit article found")
        logging.info("Completed Reddit run")
        update_system_activity(SCRIPT_NAME, "stopped")  # Record stop
        logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
        return post_data, category, sleep_time
    except Exception as e:
        logging.error(f"Fatal error in run_reddit_automator: {e}", exc_info=True)
        update_system_activity(SCRIPT_NAME, "stopped")  # Record stop on error
        sleep_time = random.randint(1200, 1800)  # 20–30 minutes
        logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
        return None, None, sleep_time
    finally:
        if lock_fd:
            try:
                # Remove the file while the lock is still held, and tolerate
                # it being gone already instead of checking first.
                with contextlib.suppress(FileNotFoundError):
                    os.remove(LOCK_FILE)
            finally:
                fcntl.flock(lock_fd, fcntl.LOCK_UN)
                lock_fd.close()


# Script entry point: configure logging, then perform a single run.
if __name__ == "__main__":
    setup_logging()
    post_data, category, sleep_time = run_reddit_automator()
    logging.info(f"Run completed, sleep_time: {sleep_time} seconds")