# foodie_automator_reddit.py
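# Pulls recent top posts from a handful of food subreddits, scores them for
# interest with an OpenAI model plus Reddit engagement signals, summarizes the
# best candidate, and publishes it to WordPress with social share links.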
|
import requests
import random
import time
import logging
import os
import json
import signal
import sys
import re
from duckduckgo_search import DDGS
from datetime import datetime, timedelta, timezone
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import praw
from dotenv import load_dotenv
from foodie_config import (
    AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
    PERSONA_CONFIGS, CATEGORIES, get_clean_source_name,
    REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, LIGHT_TASK_MODEL,
    X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
    prepare_post_data, select_best_author, smart_image_and_filter,
    get_flickr_image
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt  # Removed select_best_cta import

load_dotenv()

is_posting = False

def signal_handler(sig, frame):
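    """Handle SIGTERM/SIGINT: exit immediately when idle; if a post is in flight, log and let it finish."""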
|
    logging.info("Received termination signal, checking if safe to exit...")
    if is_posting:
        logging.info("Currently posting, will exit after completion.")
    else:
        logging.info("Safe to exit immediately.")
        sys.exit(0)

signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

LOG_FILE = "/home/shane/foodie_automator/foodie_automator_reddit.log"
LOG_PRUNE_DAYS = 30

def setup_logging():
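    """Prune log entries older than LOG_PRUNE_DAYS from the log file, then configure file and console logging."""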
|
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, 'r') as f:
            lines = f.readlines()

        log_entries = []
        current_entry = []
        timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}')

        for line in lines:
            if timestamp_pattern.match(line):
                if current_entry:
                    log_entries.append(''.join(current_entry))
                current_entry = [line]
            else:
                current_entry.append(line)

        if current_entry:
            log_entries.append(''.join(current_entry))

        cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
        pruned_entries = []
        for entry in log_entries:
            try:
                timestamp = datetime.strptime(entry[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
                if timestamp > cutoff:
                    pruned_entries.append(entry)
            except ValueError:
                logging.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
                continue

        with open(LOG_FILE, 'w') as f:
            f.writelines(pruned_entries)

    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s"
    )
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("prawcore").setLevel(logging.WARNING)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logging.getLogger().addHandler(console_handler)
    logging.info("Logging initialized for foodie_automator_reddit.py")

setup_logging()

POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_reddit_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7

posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in entry)
used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
used_images = set(entry["title"] for entry in used_images_data if "title" in entry)

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def clean_reddit_title(title):
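    """Strip a leading bracketed tag (e.g. "[Homemade]") from a Reddit post title."""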
|
    cleaned_title = re.sub(r'^\[.*?\]\s*', '', title).strip()
    logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
    return cleaned_title

def is_interesting_reddit(title, summary, upvotes, comment_count, top_comments):
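    """Score a post 0-10 for buzzworthiness: a GPT base score plus an engagement boost from upvotes and comment count, capped at 10."""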
|
    try:
        content = f"Title: {title}\n\nContent: {summary}"
        if top_comments:
            comments_text = "\n".join(top_comments)
            content += f"\n\nTop Comments:\n{comments_text}"

        response = client.chat.completions.create(
            model=LIGHT_TASK_MODEL,
            messages=[
                {"role": "system", "content": (
                    "Rate this Reddit post from 0-10 based on rarity, buzzworthiness, and engagement potential for food lovers, covering food topics (skip recipes). "
                    "Score 8-10 for rare, highly shareable ideas (e.g., unique dishes or restaurant trends). "
                    "Score 5-7 for fresh, engaging updates with broad appeal. Score below 5 for common or unremarkable content. "
                    "Consider comments for added context (e.g., specific locations or unique details). "
                    "Return only a number."
                )},
                {"role": "user", "content": content}
            ],
            max_tokens=5
        )
        base_score = int(response.choices[0].message.content.strip()) if response.choices[0].message.content.strip().isdigit() else 0

        engagement_boost = 0
        if upvotes >= 500:
            engagement_boost += 3
        elif upvotes >= 100:
            engagement_boost += 2
        elif upvotes >= 50:
            engagement_boost += 1

        if comment_count >= 100:
            engagement_boost += 2
        elif comment_count >= 20:
            engagement_boost += 1

        final_score = min(base_score + engagement_boost, 10)
        logging.info(f"Reddit Interest Score: {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count}, top_comments: {len(top_comments)}) for '{title}'")
        print(f"Interest Score for '{title[:50]}...': {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count})")
        return final_score
    except Exception as e:
        logging.error(f"Reddit interestingness scoring failed: {e}")
        print(f"Reddit Interest Error: {e}")
        return 0

def get_top_comments(post_url, reddit, limit=3):
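    """Return up to `limit` top-sorted comment bodies for a post, skipping [deleted] comments."""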
|
    try:
        submission = reddit.submission(url=post_url)
        submission.comment_sort = 'top'
        submission.comments.replace_more(limit=0)
        top_comments = [comment.body for comment in submission.comments[:limit] if not comment.body.startswith('[deleted]')]
        logging.info(f"Fetched {len(top_comments)} top comments for {post_url}")
        return top_comments
    except Exception as e:
        logging.error(f"Failed to fetch comments for {post_url}: {e}")
        return []

def fetch_duckduckgo_news_context(title, hours=24):
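    """Join recent (last `hours`) DuckDuckGo News headlines about the title into a context string; fall back to the title on failure."""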
|
    try:
        with DDGS() as ddgs:
            results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
            titles = []
            for r in results:
                try:
                    date_str = r["date"]
                    if '+00:00' in date_str:
                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
                    else:
                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
                    if dt > (datetime.now(timezone.utc) - timedelta(hours=hours)):
                        titles.append(r["title"].lower())
                except ValueError as e:
                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
                    continue
            context = " ".join(titles) if titles else "No recent news found within 24 hours"
            logging.info(f"DuckDuckGo News context for '{title}': {context}")
            return context
    except Exception as e:
        logging.warning(f"DuckDuckGo News context fetch failed for '{title}': {e}")
        return title

def fetch_reddit_posts():
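    """Collect today's top posts from the configured subreddits, skipping anything older than EXPIRATION_HOURS."""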
|
    reddit = praw.Reddit(
        client_id=REDDIT_CLIENT_ID,
        client_secret=REDDIT_CLIENT_SECRET,
        user_agent=REDDIT_USER_AGENT
    )
    feeds = ['FoodPorn', 'restaurant', 'FoodIndustry', 'food']
    articles = []
    cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)

    logging.info(f"Starting fetch with cutoff date: {cutoff_date}")
    for subreddit_name in feeds:
        try:
            subreddit = reddit.subreddit(subreddit_name)
            for submission in subreddit.top(time_filter='day', limit=100):
                pub_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
                if pub_date < cutoff_date:
                    logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
                    continue
                cleaned_title = clean_reddit_title(submission.title)
                articles.append({
                    "title": cleaned_title,
                    "raw_title": submission.title,
                    "link": f"https://www.reddit.com{submission.permalink}",
                    "summary": submission.selftext,
                    "feed_title": get_clean_source_name(subreddit_name),
                    "pub_date": pub_date,
                    "upvotes": submission.score,
                    "comment_count": submission.num_comments
                })
            logging.info(f"Fetched {len(articles)} posts from r/{subreddit_name}")
        except Exception as e:
            logging.error(f"Failed to fetch Reddit feed r/{subreddit_name}: {e}")

    logging.info(f"Total Reddit posts fetched: {len(articles)}")
    return articles

def curate_from_reddit():
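    """Pick the most interesting fresh Reddit post, summarize it, and publish it to WordPress.

    Returns (post_data, category, sleep_seconds); post_data and category are None when nothing was posted.
    """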
|
    articles = fetch_reddit_posts()
    if not articles:
        print("No Reddit posts available")
        logging.info("No Reddit posts available")
        return None, None, random.randint(600, 1800)

    articles.sort(key=lambda x: x["upvotes"], reverse=True)

    reddit = praw.Reddit(
        client_id=REDDIT_CLIENT_ID,
        client_secret=REDDIT_CLIENT_SECRET,
        user_agent=REDDIT_USER_AGENT
    )

    attempts = 0
    max_attempts = 10
    while attempts < max_attempts and articles:
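        # Work through candidates in descending-upvote order until one posts successfully or attempts run out.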
|
        article = articles.pop(0)
        title = article["title"]
        raw_title = article["raw_title"]
        link = article["link"]
        summary = article["summary"]
        source_name = "Reddit"
        original_source = '<a href="https://www.reddit.com/">Reddit</a>'

        if raw_title in posted_titles:
            print(f"Skipping already posted post: {raw_title}")
            logging.info(f"Skipping already posted post: {raw_title}")
            attempts += 1
            continue

        print(f"Trying Reddit Post: {title} from {source_name}")
        logging.info(f"Trying Reddit Post: {title} from {source_name}")

        image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
        if skip or any(keyword in title.lower() or keyword in raw_title.lower() for keyword in RECIPE_KEYWORDS + ["homemade"]):
            print(f"Skipping filtered Reddit post: {title}")
            logging.info(f"Skipping filtered Reddit post: {title}")
            attempts += 1
            continue

        top_comments = get_top_comments(link, reddit, limit=3)
        # Fetch additional context via DDG
        ddg_context = fetch_duckduckgo_news_context(title)
        comments_text = "\n".join(top_comments) if top_comments else "None"
        content_to_summarize = f"{title}\n\n{summary}\n\nTop Comments:\n{comments_text}\n\nAdditional Context: {ddg_context}"
        interest_score = is_interesting_reddit(
            title,
            summary,
            article["upvotes"],
            article["comment_count"],
            top_comments
        )
        logging.info(f"Interest Score: {interest_score} for '{title}'")
        if interest_score < 6:
            print(f"Reddit Interest Too Low: {interest_score}")
            logging.info(f"Reddit Interest Too Low: {interest_score}")
            attempts += 1
            continue

        num_paragraphs = determine_paragraph_count(interest_score)
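        # The summarization prompt below pins the article to exactly num_paragraphs paragraphs.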
|
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs.\n"
            f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
            f"Incorporate relevant insights from these top comments if available: {', '.join(top_comments) if top_comments else 'None'}.\n"
            f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
            f"Do NOT introduce unrelated concepts unless in the content, comments, or additional context.\n"
            f"If brief, expand on the core idea with relevant context about its appeal or significance.\n"
            f"Do not include emojis in the summary."
        )

        final_summary = summarize_with_gpt4o(
            content_to_summarize,
            source_name,
            link,
            interest_score=interest_score,
            extra_prompt=extra_prompt
        )
        if not final_summary:
            logging.info(f"Summary failed for '{title}'")
            attempts += 1
            continue

        final_summary = insert_link_naturally(final_summary, source_name, link)

        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
        if not post_data:
            attempts += 1
            continue

        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
        if not image_url:
            image_url, image_source, uploader, page_url = get_image(image_query)

        hook = get_dynamic_hook(post_data["title"]).strip()

        # Generate viral share prompt
        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
        share_links_template = (
            f'<p>{share_prompt} '
            f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
            f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
        )
        post_data["content"] = f"{final_summary}\n\n{share_links_template}"

        global is_posting
        is_posting = True
        try:
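            # First pass: publish the post (should_post_tweet=True) to get back its WordPress ID and URL.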
|
            post_id, post_url = post_to_wp(
                post_data=post_data,
                category=category,
                link=link,
                author=author,
                image_url=image_url,
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
                pixabay_url=pixabay_url,
                interest_score=interest_score,
                should_post_tweet=True
            )
        finally:
            is_posting = False

        if post_id:
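            # Second pass: rebuild the share links with the real post URL and update the post without tweeting again.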
|
            share_text = f"Check out this foodie gem! {post_data['title']}"
            share_text_encoded = quote(share_text)
            post_url_encoded = quote(post_url)
            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
            post_data["content"] = f"{final_summary}\n\n{share_links}"
            is_posting = True
            try:
                post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    pixabay_url=pixabay_url,
                    interest_score=interest_score,
                    post_id=post_id,
                    should_post_tweet=False
                )
            finally:
                is_posting = False

            timestamp = datetime.now(timezone.utc).isoformat()
            save_json_file(POSTED_TITLES_FILE, raw_title, timestamp)
            posted_titles.add(raw_title)
            logging.info(f"Successfully saved '{raw_title}' to {POSTED_TITLES_FILE} with timestamp {timestamp}")

            if image_url:
                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                used_images.add(image_url)
                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE} with timestamp {timestamp}")

            print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Reddit *****")
            print(f"Actual post URL: {post_url}")
            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Reddit *****")
            logging.info(f"Actual post URL: {post_url}")
            return post_data, category, random.randint(0, 1800)

        attempts += 1
        logging.info(f"WP posting failed for '{post_data['title']}'")

    print("No interesting Reddit post found after attempts")
    logging.info("No interesting Reddit post found after attempts")
    return None, None, random.randint(600, 1800)

def run_reddit_automator():
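    """Run one curation cycle, sleep for the returned interval, and pass the result through."""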
|
    print(f"{datetime.now(timezone.utc)} - INFO - ***** Reddit Automator Launched *****")
    logging.info("***** Reddit Automator Launched *****")

    post_data, category, sleep_time = curate_from_reddit()
    if not post_data:
        print(f"No postable Reddit article found - sleeping for {sleep_time} seconds")
        logging.info(f"No postable Reddit article found - sleeping for {sleep_time} seconds")
    else:
        print(f"Completed Reddit run with sleep time: {sleep_time} seconds")
        logging.info(f"Completed Reddit run with sleep time: {sleep_time} seconds")
    print(f"Sleeping for {sleep_time}s")
    time.sleep(sleep_time)
    return post_data, category, sleep_time

if __name__ == "__main__":
    run_reddit_automator()