# foodie_automator_reddit.py
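"""
Reddit-to-WordPress automation for the foodie site.

Pipeline (as implemented below): fetch the day's top posts from a set of food
subreddits via PRAW, filter and score them (upvotes, comments, and an OpenAI
interestingness rating), summarize the winner with GPT, attach a Flickr (or
fallback) image, publish to WordPress, and record posted titles and used images
so runs stay deduplicated for 24 hours and 7 days respectively.
"""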
import requests
import random
import time
import logging
import os
import json
import signal
import sys
import re
from duckduckgo_search import DDGS
from datetime import datetime, timedelta, timezone
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import praw
from dotenv import load_dotenv
from foodie_config import (
    AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
    PERSONA_CONFIGS, CATEGORIES, get_clean_source_name,
    REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, LIGHT_TASK_MODEL,
    X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    select_best_author, smart_image_and_filter, get_flickr_image,
    get_next_author_round_robin, check_author_rate_limit, update_system_activity
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
import fcntl
load_dotenv()
SCRIPT_NAME = "foodie_automator_reddit"
is_posting = False
LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_reddit.lock"

def signal_handler(sig, frame):
    logging.info("Received termination signal, marking script as stopped...")
    update_system_activity(SCRIPT_NAME, "stopped")
    if is_posting:
        logging.info("Currently posting, will exit after completion.")
    else:
        logging.info("Safe to exit immediately.")
    sys.exit(0)

signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_reddit.log"
LOG_PRUNE_DAYS = 30
MAX_RETRIES = 3
RETRY_BACKOFF = 2
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_reddit_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in entry)
used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
used_images = set(entry["title"] for entry in used_images_data if "title" in entry)
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def setup_logging():
    try:
        # Ensure log directory exists
        os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
        logging.debug(f"Log directory created/verified: {os.path.dirname(LOG_FILE)}")
        # Check write permissions
        if not os.access(os.path.dirname(LOG_FILE), os.W_OK):
            raise PermissionError(f"No write permission for {os.path.dirname(LOG_FILE)}")
        # Test write to log file
        try:
            with open(LOG_FILE, 'a') as f:
                f.write("")
            logging.debug(f"Confirmed write access to {LOG_FILE}")
        except Exception as e:
            raise PermissionError(f"Cannot write to {LOG_FILE}: {e}")
        # Prune old log entries
        if os.path.exists(LOG_FILE):
            with open(LOG_FILE, 'r') as f:
                lines = f.readlines()
            log_entries = []
            current_entry = []
            timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}')
            for line in lines:
                if timestamp_pattern.match(line):
                    if current_entry:
                        log_entries.append(''.join(current_entry))
                    current_entry = [line]
                else:
                    current_entry.append(line)
            if current_entry:
                log_entries.append(''.join(current_entry))
            cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
            pruned_entries = []
            for entry in log_entries:
                try:
                    timestamp = datetime.strptime(entry[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
                    if timestamp > cutoff:
                        pruned_entries.append(entry)
                except ValueError:
                    logging.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
                    continue
            with open(LOG_FILE, 'w') as f:
                f.writelines(pruned_entries)
            logging.debug(f"Log file pruned: {LOG_FILE}")
        # Configure file logging
        logging.basicConfig(
            filename=LOG_FILE,
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
            force=True  # Ensure this config takes precedence
        )
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("prawcore").setLevel(logging.WARNING)
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logging.getLogger().addHandler(console_handler)
        logging.info("Logging initialized for foodie_automator_reddit.py")
    except Exception as e:
        # Fall back to console logging if file logging fails
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
            force=True
        )
        logging.error(f"Failed to setup file logging for {LOG_FILE}: {e}. Using console logging.")
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logging.getLogger().addHandler(console_handler)
        logging.info("Console logging initialized as fallback for foodie_automator_reddit.py")

def acquire_lock():
    os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
    lock_fd = open(LOCK_FILE, 'w')
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        lock_fd.write(str(os.getpid()))
        lock_fd.flush()
        return lock_fd
    except IOError:
        logging.info("Another instance of foodie_automator_reddit.py is running")
        sys.exit(0)

def clean_reddit_title(title):
    """Clean a Reddit post title by removing prefixes, newlines, and special characters."""
    if not title or not isinstance(title, str):
        logging.warning(f"Invalid title received: {title}")
        return ""
    # Remove [prefixes], newlines, and excessive whitespace
    cleaned_title = re.sub(r'^\[.*?\]\s*', '', title)           # Remove [prefix]
    cleaned_title = re.sub(r'\n+', ' ', cleaned_title)          # Replace newlines with spaces
    cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()  # Normalize spaces
    # Remove special characters (keep alphanumerics, spaces, and basic punctuation)
    cleaned_title = re.sub(r'[^\w\s.,!?-]', '', cleaned_title)
    logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
    return cleaned_title

def is_interesting_reddit(title, summary, upvotes, comment_count, top_comments):
    """Score a Reddit post 0-10 using an LLM rating plus an upvote/comment engagement boost."""
    for attempt in range(MAX_RETRIES):
        try:
            content = f"Title: {title}\n\nContent: {summary}"
            if top_comments:
                # Join outside the f-string so this also runs on Python < 3.12
                # (backslashes are not allowed inside f-string expressions there).
                comments_text = "\n".join(top_comments)
                content += f"\n\nTop Comments:\n{comments_text}"
            response = client.chat.completions.create(
                model=LIGHT_TASK_MODEL,
                messages=[
                    {"role": "system", "content": (
                        "Rate this Reddit post from 0-10 based on rarity, buzzworthiness, and engagement potential for food lovers, covering food topics (skip recipes). "
                        "Score 8-10 for rare, highly shareable ideas (e.g., unique dishes or restaurant trends). "
                        "Score 5-7 for fresh, engaging updates with broad appeal. Score below 5 for common or unremarkable content. "
                        "Consider comments for added context (e.g., specific locations or unique details). "
                        "Return only a number"
                    )},
                    {"role": "user", "content": content}
                ],
                max_tokens=5
            )
            base_score = int(response.choices[0].message.content.strip()) if response.choices[0].message.content.strip().isdigit() else 0
            engagement_boost = 0
            if upvotes >= 500:
                engagement_boost += 3
            elif upvotes >= 100:
                engagement_boost += 2
            elif upvotes >= 50:
                engagement_boost += 1
            if comment_count >= 100:
                engagement_boost += 2
            elif comment_count >= 20:
                engagement_boost += 1
            final_score = min(base_score + engagement_boost, 10)
            logging.info(f"Reddit Interest Score: {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count}, top_comments: {len(top_comments)}) for '{title}'")
            return final_score
        except Exception as e:
            logging.warning(f"Reddit interestingness scoring failed (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
                continue
            logging.error(f"Failed to score Reddit post '{title}' after {MAX_RETRIES} attempts")
            return 0

def get_top_comments(post_url, reddit, limit=3):
    for attempt in range(MAX_RETRIES):
        try:
            submission = reddit.submission(url=post_url)
            submission.comment_sort = 'top'
            submission.comments.replace_more(limit=0)
            top_comments = [comment.body for comment in submission.comments[:limit] if not comment.body.startswith('[deleted]')]
            logging.info(f"Fetched {len(top_comments)} top comments for {post_url}")
            return top_comments
        except Exception as e:
            logging.warning(f"Failed to fetch comments for {post_url} (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
                continue
            logging.error(f"Failed to fetch comments for {post_url} after {MAX_RETRIES} attempts")
            return []

def fetch_duckduckgo_news_context(title, hours=24):
    for attempt in range(MAX_RETRIES):
        try:
            with DDGS() as ddgs:
                results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
                titles = []
                for r in results:
                    try:
                        date_str = r["date"]
                        if '+00:00' in date_str:
                            dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
                        else:
                            dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
                        # Use the `hours` parameter rather than a hard-coded 24-hour window
                        if dt > (datetime.now(timezone.utc) - timedelta(hours=hours)):
                            titles.append(r["title"].lower())
                    except ValueError as e:
                        logging.warning(f"Date parsing failed for '{date_str}': {e}")
                        continue
                context = " ".join(titles) if titles else "No recent news found within 24 hours"
                logging.info(f"DuckDuckGo News context for '{title}': {context}")
                return context
        except Exception as e:
            logging.warning(f"DuckDuckGo News context fetch failed for '{title}' (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
                continue
            logging.error(f"Failed to fetch DuckDuckGo News context for '{title}' after {MAX_RETRIES} attempts")
            return title

def fetch_reddit_posts():
    """Fetch Reddit posts from specified subreddits, filtering low-quality and [homemade] posts."""
    try:
        reddit = praw.Reddit(
            client_id=REDDIT_CLIENT_ID,
            client_secret=REDDIT_CLIENT_SECRET,
            user_agent=REDDIT_USER_AGENT
        )
        feeds = [
            "food",
            "FoodPorn",
            "spicy",
            "KoreanFood",
            "JapaneseFood",
            "DessertPorn",
            "ChineseFood",
            "IndianFood"
        ]
        articles = []
        cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)
        logging.info(f"Starting fetch with cutoff date: {cutoff_date}")
        for subreddit_name in feeds:
            for attempt in range(MAX_RETRIES):
                try:
                    subreddit = reddit.subreddit(subreddit_name)
                    for submission in subreddit.top(time_filter='day', limit=100):
                        pub_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
                        if pub_date < cutoff_date:
                            logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
                            continue
                        if "[homemade]" in submission.title.lower():
                            logging.info(f"Skipping homemade post: {submission.title}")
                            continue
                        cleaned_title = clean_reddit_title(submission.title)
                        if not cleaned_title or len(cleaned_title) < 5:
                            logging.info(f"Skipping post with invalid or short title: {submission.title}")
                            continue
                        # Filter out posts with empty or very short summaries
                        summary = submission.selftext.strip() if submission.selftext else ""
                        if len(summary) < 20 and not submission.url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
                            logging.info(f"Skipping post with insufficient summary: {cleaned_title}")
                            continue
                        # Fetch top comments for additional context
                        top_comments = get_top_comments(f"https://www.reddit.com{submission.permalink}", reddit)
                        articles.append({
                            "title": cleaned_title,
                            "raw_title": submission.title,
                            "link": f"https://www.reddit.com{submission.permalink}",
                            "summary": summary,
                            "feed_title": get_clean_source_name(subreddit_name),
                            "pub_date": pub_date,
                            "upvotes": submission.score,
                            "comment_count": submission.num_comments,
                            "top_comments": top_comments
                        })
                    logging.info(f"Fetched {len(articles)} posts from r/{subreddit_name}")
                    break
                except Exception as e:
                    logging.error(f"Failed to fetch Reddit feed r/{subreddit_name} (attempt {attempt + 1}): {e}")
                    if attempt < MAX_RETRIES - 1:
                        time.sleep(RETRY_BACKOFF * (2 ** attempt))
                        continue
        logging.info(f"Total Reddit posts fetched: {len(articles)}")
        return articles
    except Exception as e:
        logging.error(f"Unexpected error in fetch_reddit_posts: {e}", exc_info=True)
        return []

def curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used_images):
    """Pick the best recent Reddit post, summarize it, and publish it to WordPress."""
    try:
        logging.debug(f"Using {len(posted_titles)} posted titles and {len(used_images)} used images")
        posts = fetch_reddit_posts()
        if not posts:
            logging.info("No Reddit posts available")
            sleep_time = random.randint(1200, 1800)
            return None, None, sleep_time
        attempts = 0
        max_attempts = 10
        while attempts < max_attempts and posts:
            post = posts.pop(0)
            title = post["title"]
            link = post.get("link", "")
            summary = post.get("summary", "")
            source_name = "Reddit"
            original_source = f'<a href="{link}">{source_name}</a>'
            original_url = link
            upvotes = post.get("upvotes", 0)
            comment_count = post.get("comment_count", 0)
            top_comments = post.get("top_comments", [])
            if title in posted_titles:
                logging.info(f"Skipping already posted Reddit post: {title}")
                attempts += 1
                continue
            if upvotes < 300:
                logging.info(f"Skipping post '{title}' due to insufficient upvotes ({upvotes} < 300)")
                attempts += 1
                continue
            author = get_next_author_round_robin()
            if not author:
                logging.info(f"Skipping post '{title}' due to tweet rate limits for all authors")
                attempts += 1
                continue
            author_username = author["username"]
            logging.info(f"Selected author via round-robin: {author_username}")
            logging.info(f"Trying Reddit Post: {title} from {source_name}")
            # Combine summary and top comments for smart_image_and_filter
            enhanced_summary = summary
            if top_comments:
                enhanced_summary += "\n\nTop Comments:\n" + "\n".join(top_comments)
            try:
                image_query, relevance_keywords, main_topic, skip, specific_term = smart_image_and_filter(title, enhanced_summary)
            except Exception as e:
                logging.warning(f"Failed to process smart_image_and_filter for '{title}': {e}")
                attempts += 1
                continue
            if skip:
                logging.info(f"Skipping filtered Reddit post: {title}")
                attempts += 1
                continue
            ddg_context = fetch_duckduckgo_news_context(title)
            scoring_content = f"Title: {title}\n\nContent: {summary}\n\nTop Comments: {top_comments}\n\nAdditional Context: {ddg_context}"
            logging.debug(f"Scoring content for '{title}': {scoring_content}")
            interest_score = is_interesting_reddit(title, summary, upvotes, comment_count, top_comments)
            logging.info(f"Interest score for '{title}': {interest_score}")
            if interest_score < 6:
                logging.info(f"Reddit Interest Too Low: {interest_score}")
                attempts += 1
                continue
            num_paragraphs = determine_paragraph_count(interest_score)
            extra_prompt = (
                f"Generate exactly {num_paragraphs} paragraphs.\n"
                f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
                f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
                f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
                f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
                f"Do not include emojis in the summary."
            )
            content_to_summarize = scoring_content
            final_summary = summarize_with_gpt4o(
                content_to_summarize,
                source_name,
                link,
                interest_score=interest_score,
                extra_prompt=extra_prompt
            )
            if not final_summary:
                logging.info(f"Summary failed for '{title}'")
                attempts += 1
                continue
            final_summary = insert_link_naturally(final_summary, source_name, link)
            post_data = {
                "title": generate_title_from_summary(final_summary),
                "content": final_summary,
                "status": "publish",
                "author": author_username,
                "categories": [generate_category_from_summary(final_summary)]
            }
            category = post_data["categories"][0]
            image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic, specific_term)
            if not image_url:
                logging.warning(f"Flickr image fetch failed for '{image_query}', trying fallback")
                image_url, image_source, uploader, page_url = get_image(image_query, specific_term)
            if not image_url:
                logging.warning(f"All image fetches failed for '{title}' - posting without image")
                image_source = None
                uploader = None
                page_url = None
            hook = get_dynamic_hook(post_data["title"]).strip()
            share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
            share_text = f"Check out this foodie gem! {post_data['title']}"
            share_text_encoded = quote(share_text)
            share_links_template = (
                f'<p>{share_prompt} '
                f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={share_text_encoded}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
                f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
            )
            post_data["content"] = f"{final_summary}\n\n{share_links_template}"
            global is_posting
            is_posting = True
            post_id = None  # Ensure defined even if post_to_wp raises before assignment
            try:
                post_id, post_url = post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    page_url=page_url,
                    interest_score=interest_score,
                    should_post_tweet=True,
                    summary=final_summary
                )
                if not post_id:
                    logging.warning(f"Failed to post to WordPress for '{title}', using original URL: {original_url}")
                    post_url = original_url
                else:
                    logging.info(f"Posted to WordPress for {author_username}: {post_url}")
                post_url_encoded = quote(post_url)
                share_links = share_links_template.format(post_url=post_url_encoded)
                post_data["content"] = f"{final_summary}\n\n{share_links}"
                post_data["post_id"] = post_id
                if post_id:
                    # Second pass updates the published post with the final share links
                    post_to_wp(
                        post_data=post_data,
                        category=category,
                        link=link,
                        author=author,
                        image_url=None,
                        original_source=original_source,
                        image_source=image_source,
                        uploader=uploader,
                        page_url=page_url,
                        interest_score=interest_score,
                        post_id=post_id,
                        should_post_tweet=False
                    )
            except Exception as e:
                logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
                post_url = original_url
            finally:
                is_posting = False
            timestamp = datetime.now(timezone.utc).isoformat()
            save_json_file(POSTED_TITLES_FILE, title, timestamp)
            posted_titles.add(title)
            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
            if image_url:
                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                used_images.add(image_url)
                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id or 'N/A'}) from Reddit *****")
            sleep_time = random.randint(1200, 1800)
            return post_data, category, sleep_time
        logging.info("No interesting Reddit post found after attempts")
        sleep_time = random.randint(1200, 1800)
        return None, None, sleep_time
    except Exception as e:
        logging.error(f"Unexpected error in curate_from_reddit: {e}", exc_info=True)
        sleep_time = random.randint(1200, 1800)
        return None, None, sleep_time

def run_reddit_automator():
    lock_fd = None
    try:
        lock_fd = acquire_lock()
        update_system_activity(SCRIPT_NAME, "running", os.getpid())  # Record start
        logging.info("***** Reddit Automator Launched *****")
        # Load JSON files once
        posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
        posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in entry)
        used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
        used_images = set(entry["title"] for entry in used_images_data if "title" in entry)
        post_data, category, sleep_time = curate_from_reddit(posted_titles_data, posted_titles, used_images_data, used_images)
        if not post_data:
            logging.info("No postable Reddit article found")
        logging.info("Completed Reddit run")
        logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
        # Sleep while still marked as running
        time.sleep(sleep_time)
        update_system_activity(SCRIPT_NAME, "stopped")  # Record stop after sleep
        return post_data, category, sleep_time
    except Exception as e:
        logging.error(f"Fatal error in run_reddit_automator: {e}", exc_info=True)
        update_system_activity(SCRIPT_NAME, "stopped")  # Record stop on error
        sleep_time = random.randint(1200, 1800)  # 20-30 minutes
        logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
        return None, None, sleep_time
    finally:
        if lock_fd:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
            lock_fd.close()
            if os.path.exists(LOCK_FILE):
                os.remove(LOCK_FILE)

if __name__ == "__main__":
    setup_logging()
    post_data, category, sleep_time = run_reddit_automator()
    logging.info(f"Run completed, sleep_time: {sleep_time} seconds")