# foodie_automator_reddit.py
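# Curates food content from Reddit: fetches top posts from food-related
# subreddits, scores them for interest with an LLM plus an engagement
# heuristic, summarizes the winner via summarize_with_gpt4o, and publishes
# it to WordPress with social share links.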
import requests
import random
import time
import logging
import os
import json
import signal
import sys
import re
from duckduckgo_search import DDGS
from datetime import datetime, timedelta, timezone
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import praw
from dotenv import load_dotenv
from foodie_config import (
    AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
    PERSONA_CONFIGS, CATEGORIES, get_clean_source_name,
    REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, LIGHT_TASK_MODEL,
    X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
    prepare_post_data, select_best_author, smart_image_and_filter,
    get_flickr_image
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt # Removed select_best_cta import
load_dotenv()
is_posting = False
def signal_handler(sig, frame):
    logging.info("Received termination signal, checking if safe to exit...")
    if is_posting:
        logging.info("Currently posting, will exit after completion.")
    else:
        logging.info("Safe to exit immediately.")
        sys.exit(0)

signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
LOG_FILE = "/home/shane/foodie_automator/foodie_automator_reddit.log"
LOG_PRUNE_DAYS = 30
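
# Log pruning: lines are grouped into multi-line entries by their leading
# timestamp, and entries older than LOG_PRUNE_DAYS are rewritten out of the
# file before handlers are attached.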
def setup_logging():
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, 'r') as f:
            lines = f.readlines()
        log_entries = []
        current_entry = []
        timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}')
        for line in lines:
            if timestamp_pattern.match(line):
                if current_entry:
                    log_entries.append(''.join(current_entry))
                current_entry = [line]
            else:
                current_entry.append(line)
        if current_entry:
            log_entries.append(''.join(current_entry))
        cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
        pruned_entries = []
        for entry in log_entries:
            try:
                timestamp = datetime.strptime(entry[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
                if timestamp > cutoff:
                    pruned_entries.append(entry)
            except ValueError:
                logging.warning(f"Skipping malformed log entry (no parseable timestamp): {entry[:50]}...")
                continue
        with open(LOG_FILE, 'w') as f:
            f.writelines(pruned_entries)
    # force=True clears any handler implicitly installed by the warning call
    # above, so the file handler below is always attached.
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        force=True
    )
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("prawcore").setLevel(logging.WARNING)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logging.getLogger().addHandler(console_handler)
    logging.info("Logging initialized for foodie_automator_reddit.py")

setup_logging()
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_reddit_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in entry)
used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
used_images = set(entry["title"] for entry in used_images_data if "title" in entry)
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def clean_reddit_title(title):
    cleaned_title = re.sub(r'^\[.*?\]\s*', '', title).strip()
    logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
    return cleaned_title
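
# Interest scoring: the model returns a 0-10 base score; engagement adds up
# to +3 for upvotes and +2 for comment volume, with the total capped at 10.
# The curation loop rejects anything that scores below 6.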
def is_interesting_reddit(title, summary, upvotes, comment_count, top_comments):
    try:
        content = f"Title: {title}\n\nContent: {summary}"
        if top_comments:
            # Join outside the f-string: backslashes are not allowed inside
            # f-string expressions before Python 3.12.
            joined_comments = "\n".join(top_comments)
            content += f"\n\nTop Comments:\n{joined_comments}"
        response = client.chat.completions.create(
            model=LIGHT_TASK_MODEL,
            messages=[
                {"role": "system", "content": (
                    "Rate this Reddit post from 0-10 based on rarity, buzzworthiness, and engagement potential for food lovers, covering food topics (skip recipes). "
                    "Score 8-10 for rare, highly shareable ideas (e.g., unique dishes or restaurant trends). "
                    "Score 5-7 for fresh, engaging updates with broad appeal. Score below 5 for common or unremarkable content. "
                    "Consider comments for added context (e.g., specific locations or unique details). "
                    "Return only a number."
                )},
                {"role": "user", "content": content}
            ],
            max_tokens=5
        )
        raw_score = response.choices[0].message.content.strip()
        base_score = int(raw_score) if raw_score.isdigit() else 0
        engagement_boost = 0
        if upvotes >= 500:
            engagement_boost += 3
        elif upvotes >= 100:
            engagement_boost += 2
        elif upvotes >= 50:
            engagement_boost += 1
        if comment_count >= 100:
            engagement_boost += 2
        elif comment_count >= 20:
            engagement_boost += 1
        final_score = min(base_score + engagement_boost, 10)
        logging.info(f"Reddit Interest Score: {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count}, top_comments: {len(top_comments)}) for '{title}'")
        print(f"Interest Score for '{title[:50]}...': {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count})")
        return final_score
    except Exception as e:
        logging.error(f"Reddit interestingness scoring failed: {e}")
        print(f"Reddit Interest Error: {e}")
        return 0
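
# replace_more(limit=0) drops unresolved "load more comments" stubs, so only
# the already-fetched top-level comments are scanned.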
def get_top_comments(post_url, reddit, limit=3):
    try:
        submission = reddit.submission(url=post_url)
        submission.comment_sort = 'top'
        submission.comments.replace_more(limit=0)
        top_comments = [comment.body for comment in submission.comments[:limit] if not comment.body.startswith('[deleted]')]
        logging.info(f"Fetched {len(top_comments)} top comments for {post_url}")
        return top_comments
    except Exception as e:
        logging.error(f"Failed to fetch comments for {post_url}: {e}")
        return []
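
# Pulls recent DuckDuckGo News headlines as extra context for the
# summarizer; headlines older than the window are discarded, and the bare
# title is returned if the lookup fails entirely.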
def fetch_duckduckgo_news_context(title, hours=24):
    try:
        with DDGS() as ddgs:
            results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
        titles = []
        for r in results:
            try:
                date_str = r["date"]
                if '+00:00' in date_str:
                    dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
                else:
                    dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
                # Use the hours parameter rather than a hardcoded 24.
                if dt > (datetime.now(timezone.utc) - timedelta(hours=hours)):
                    titles.append(r["title"].lower())
            except ValueError as e:
                logging.warning(f"Date parsing failed for '{date_str}': {e}")
                continue
        context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
        logging.info(f"DuckDuckGo News context for '{title}': {context}")
        return context
    except Exception as e:
        logging.warning(f"DuckDuckGo News context fetch failed for '{title}': {e}")
        return title
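
# Collects up to 100 top-of-day submissions per subreddit, skipping anything
# older than EXPIRATION_HOURS.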
def fetch_reddit_posts():
    reddit = praw.Reddit(
        client_id=REDDIT_CLIENT_ID,
        client_secret=REDDIT_CLIENT_SECRET,
        user_agent=REDDIT_USER_AGENT
    )
    feeds = ['FoodPorn', 'restaurant', 'FoodIndustry', 'food']
    articles = []
    cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)
    logging.info(f"Starting fetch with cutoff date: {cutoff_date}")
    for subreddit_name in feeds:
        try:
            subreddit = reddit.subreddit(subreddit_name)
            for submission in subreddit.top(time_filter='day', limit=100):
                pub_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
                if pub_date < cutoff_date:
                    logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
                    continue
                cleaned_title = clean_reddit_title(submission.title)
                articles.append({
                    "title": cleaned_title,
                    "raw_title": submission.title,
                    "link": f"https://www.reddit.com{submission.permalink}",
                    "summary": submission.selftext,
                    "feed_title": get_clean_source_name(subreddit_name),
                    "pub_date": pub_date,
                    "upvotes": submission.score,
                    "comment_count": submission.num_comments
                })
            # len(articles) is cumulative across subreddits, so log it as such.
            logging.info(f"Running total of {len(articles)} posts after r/{subreddit_name}")
        except Exception as e:
            logging.error(f"Failed to fetch Reddit feed r/{subreddit_name}: {e}")
    logging.info(f"Total Reddit posts fetched: {len(articles)}")
    return articles
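
# Curation loop: candidates are tried in upvote order. Each survivor of the
# dedupe and recipe filters is scored, summarized, and posted. WordPress
# posting is two-pass: the first call creates the post (and tweets) to
# obtain the final URL; the second updates the body with share links built
# from that URL.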
def curate_from_reddit():
    articles = fetch_reddit_posts()
    if not articles:
        print("No Reddit posts available")
        logging.info("No Reddit posts available")
        return None, None, random.randint(600, 1800)
    articles.sort(key=lambda x: x["upvotes"], reverse=True)
    reddit = praw.Reddit(
        client_id=REDDIT_CLIENT_ID,
        client_secret=REDDIT_CLIENT_SECRET,
        user_agent=REDDIT_USER_AGENT
    )
    attempts = 0
    max_attempts = 10
    while attempts < max_attempts and articles:
        article = articles.pop(0)
        title = article["title"]
        raw_title = article["raw_title"]
        link = article["link"]
        summary = article["summary"]
        source_name = "Reddit"
        original_source = '<a href="https://www.reddit.com/">Reddit</a>'
        if raw_title in posted_titles:
            print(f"Skipping already posted post: {raw_title}")
            logging.info(f"Skipping already posted post: {raw_title}")
            attempts += 1
            continue
        print(f"Trying Reddit Post: {title} from {source_name}")
        logging.info(f"Trying Reddit Post: {title} from {source_name}")
        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
        if skip or any(keyword in title.lower() or keyword in raw_title.lower() for keyword in RECIPE_KEYWORDS + ["homemade"]):
            print(f"Skipping filtered Reddit post: {title}")
            logging.info(f"Skipping filtered Reddit post: {title}")
            attempts += 1
            continue
        top_comments = get_top_comments(link, reddit, limit=3)
        # Fetch additional context via DuckDuckGo News
        ddg_context = fetch_duckduckgo_news_context(title)
        # Join outside the f-string (backslash in f-string expressions is a
        # SyntaxError before Python 3.12).
        comments_block = "\n".join(top_comments) if top_comments else "None"
        content_to_summarize = (
            f"{title}\n\n{summary}\n\nTop Comments:\n{comments_block}\n\n"
            f"Additional Context: {ddg_context}"
        )
        interest_score = is_interesting_reddit(
            title,
            summary,
            article["upvotes"],
            article["comment_count"],
            top_comments
        )
        logging.info(f"Interest Score: {interest_score} for '{title}'")
        if interest_score < 6:
            print(f"Reddit Interest Too Low: {interest_score}")
            logging.info(f"Reddit Interest Too Low: {interest_score}")
            attempts += 1
            continue
        num_paragraphs = determine_paragraph_count(interest_score)
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs.\n"
            f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
            f"Incorporate relevant insights from these top comments if available: {', '.join(top_comments) if top_comments else 'None'}.\n"
            f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
            f"Do NOT introduce unrelated concepts unless in the content, comments, or additional context.\n"
            f"If brief, expand on the core idea with relevant context about its appeal or significance.\n"
            f"Do not include emojis in the summary."
        )
        final_summary = summarize_with_gpt4o(
            content_to_summarize,
            source_name,
            link,
            interest_score=interest_score,
            extra_prompt=extra_prompt
        )
        if not final_summary:
            logging.info(f"Summary failed for '{title}'")
            attempts += 1
            continue
        final_summary = insert_link_naturally(final_summary, source_name, link)
        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
        if not post_data:
            attempts += 1
            continue
        # Prefer a Flickr image; fall back to the generic image search.
        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
        if not image_url:
            image_url, image_source, uploader, page_url = get_image(image_query)
        hook = get_dynamic_hook(post_data["title"]).strip()
        # Generate viral share prompt; {post_url}/{share_text} stay as literal
        # placeholders until the post URL is known.
        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
        share_links_template = (
            f'<p>{share_prompt} '
            f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
            f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
        )
        post_data["content"] = f"{final_summary}\n\n{share_links_template}"
        global is_posting
        is_posting = True
        try:
            # First pass: create the post (and tweet) to obtain the final URL.
            post_id, post_url = post_to_wp(
                post_data=post_data,
                category=category,
                link=link,
                author=author,
                image_url=image_url,
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
                pixabay_url=pixabay_url,
                interest_score=interest_score,
                should_post_tweet=True
            )
        finally:
            is_posting = False
        if post_id:
            share_text = f"Check out this foodie gem! {post_data['title']}"
            share_text_encoded = quote(share_text)
            post_url_encoded = quote(post_url)
            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
            post_data["content"] = f"{final_summary}\n\n{share_links}"
            is_posting = True
            try:
                # Second pass: update the post body with the filled-in share links.
                post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    pixabay_url=pixabay_url,
                    interest_score=interest_score,
                    post_id=post_id,
                    should_post_tweet=False
                )
            finally:
                is_posting = False
            timestamp = datetime.now(timezone.utc).isoformat()
            save_json_file(POSTED_TITLES_FILE, raw_title, timestamp)
            posted_titles.add(raw_title)
            logging.info(f"Successfully saved '{raw_title}' to {POSTED_TITLES_FILE} with timestamp {timestamp}")
            if image_url:
                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                used_images.add(image_url)
                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE} with timestamp {timestamp}")
            print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Reddit *****")
            print(f"Actual post URL: {post_url}")
            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Reddit *****")
            logging.info(f"Actual post URL: {post_url}")
            return post_data, category, random.randint(0, 1800)
        attempts += 1
        logging.info(f"WP posting failed for '{post_data['title']}'")
    print("No interesting Reddit post found after attempts")
    logging.info("No interesting Reddit post found after attempts")
    return None, None, random.randint(600, 1800)
def run_reddit_automator():
    print(f"{datetime.now(timezone.utc)} - INFO - ***** Reddit Automator Launched *****")
    logging.info("***** Reddit Automator Launched *****")
    post_data, category, sleep_time = curate_from_reddit()
    if not post_data:
        print(f"No postable Reddit article found - sleeping for {sleep_time} seconds")
        logging.info(f"No postable Reddit article found - sleeping for {sleep_time} seconds")
    else:
        print(f"Completed Reddit run with sleep time: {sleep_time} seconds")
        logging.info(f"Completed Reddit run with sleep time: {sleep_time} seconds")
    print(f"Sleeping for {sleep_time}s")
    time.sleep(sleep_time)
    return post_data, category, sleep_time

if __name__ == "__main__":
    run_reddit_automator()