# foodie-automator/foodie_automator_reddit.py

import requests
import random
import time
import logging
import os
import json
from datetime import datetime, timedelta, timezone
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import praw
from foodie_config import (
    AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
    SUMMARY_PERSONA_PROMPTS, CATEGORIES, CTAS, get_clean_source_name,
    REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, LIGHT_TASK_MODEL
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
    prepare_post_data, select_best_author, smart_image_and_filter, get_flickr_image_via_ddg
)
from foodie_hooks import get_dynamic_hook, select_best_cta

# Path of the log file that setup_logging() prunes and then passes to logging.basicConfig().
LOG_FILE = "/home/shane/foodie_automator/foodie_automator_reddit.log"
# Age limit in days: setup_logging() drops log lines whose timestamp is older than this.
LOG_PRUNE_DAYS = 30

def setup_logging():
    """Prune old entries from LOG_FILE, then configure file and console logging.

    If LOG_FILE exists, it is rewritten keeping only the lines whose first 19
    characters parse as a ``YYYY-MM-DD HH:MM:SS`` timestamp that is newer than
    LOG_PRUNE_DAYS days ago (the parsed value is treated as UTC). Lines that do
    not start with such a timestamp are dropped.

    The root logger is then configured to write INFO and above to LOG_FILE, the
    "requests" and "prawcore" loggers are raised to WARNING, and a console
    stream handler using the same message format is attached.
    """
    malformed_count = 0
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, 'r') as f:
            lines = f.readlines()
        cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
        pruned_lines = []
        for line in lines:
            try:
                timestamp = datetime.strptime(line[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
            except ValueError:
                # Do not log from here: calling logging.warning() before
                # basicConfig() installs a default stderr handler on the root
                # logger, which turns the basicConfig(filename=...) call below
                # into a no-op and silently disables file logging.
                malformed_count += 1
                continue
            if timestamp > cutoff:
                pruned_lines.append(line)
        with open(LOG_FILE, 'w') as f:
            f.writelines(pruned_lines)

    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s"
    )
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("prawcore").setLevel(logging.WARNING)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logging.getLogger().addHandler(console_handler)
    logging.info("Logging initialized for foodie_automator_reddit.py")
    if malformed_count:
        # Reported once, after handlers exist, instead of once per line.
        logging.warning(f"Skipped {malformed_count} malformed log line(s) while pruning {LOG_FILE}")

# Configure logging at import time, before any of the module-level work below runs.
setup_logging()

# JSON files passed to load_json_file()/save_json_file() to remember what was already used.
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_reddit_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
# Passed to load_json_file() for posted titles and also used by
# fetch_reddit_posts() as the maximum post age, in hours.
EXPIRATION_HOURS = 24
# Passed to load_json_file() for used images.
# NOTE(review): presumably an expiry window in days, while the titles file
# receives hours through the same parameter - confirm against load_json_file.
IMAGE_EXPIRATION_DAYS = 7

# Only entries carrying a "title" key contribute to the in-memory sets.
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in entry)
used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
used_images = set(entry["title"] for entry in used_images_data if "title" in entry)

# SECURITY: the API key is read from the OPENAI_API_KEY environment variable
# instead of being hard-coded. The key that was previously committed in this
# file must be treated as leaked and revoked.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def is_interesting_reddit(title, summary, upvotes, comment_count, top_comments):
    """Score a Reddit post from 0 to 10 for how interesting it is.

    The title, body text and any top comments are sent to the chat model named
    by LIGHT_TASK_MODEL, which is asked to reply with a single number. That
    base score is then raised by an engagement boost derived from the upvote
    and comment counts, and the sum is capped at 10.

    Args:
        title: Post title.
        summary: Post body text.
        upvotes: Post score; 50/100/500 or more add 1/2/3 points.
        comment_count: Number of comments; 20/100 or more add 1/2 points.
        top_comments: List of comment bodies to include as context; may be
            empty or None.

    Returns:
        The final integer score, or 0 if anything fails while scoring.
    """
    try:
        # Tolerate None so len() in the log line below cannot raise.
        top_comments = top_comments or []

        content = f"Title: {title}\n\nContent: {summary}"
        if top_comments:
            # Joined outside the f-string: a backslash inside an f-string
            # expression is a SyntaxError before Python 3.12.
            joined_comments = "\n".join(top_comments)
            content += f"\n\nTop Comments:\n{joined_comments}"

        response = client.chat.completions.create(
            model=LIGHT_TASK_MODEL,
            messages=[
                {"role": "system", "content": (
                    "Rate this Reddit post from 0-10 based on rarity, buzzworthiness, and engagement potential for food lovers, covering food topics (skip recipes). "
                    "Score 8-10 for rare, highly shareable ideas (e.g., unique dishes or restaurant trends). "
                    "Score 5-7 for fresh, engaging updates with broad appeal. Score below 5 for common or unremarkable content. "
                    "Consider comments for added context (e.g., specific locations or unique details). "
                    "Return only a number."
                )},
                {"role": "user", "content": content}
            ],
            max_tokens=5
        )
        # The message content may be None; treat that like a non-numeric reply
        # (base score 0) so the engagement boost still applies.
        raw_score = (response.choices[0].message.content or "").strip()
        base_score = int(raw_score) if raw_score.isdigit() else 0

        engagement_boost = 0
        if upvotes >= 500:
            engagement_boost += 3
        elif upvotes >= 100:
            engagement_boost += 2
        elif upvotes >= 50:
            engagement_boost += 1

        if comment_count >= 100:
            engagement_boost += 2
        elif comment_count >= 20:
            engagement_boost += 1

        final_score = min(base_score + engagement_boost, 10)
        logging.info(f"Reddit Interest Score: {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count}, top_comments: {len(top_comments)}) for '{title}'")
        print(f"Interest Score for '{title[:50]}...': {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count})")
        return final_score
    except Exception as e:
        # Best-effort scoring: any failure means "not interesting".
        logging.error(f"Reddit interestingness scoring failed: {e}")
        print(f"Reddit Interest Error: {e}")
        return 0

def get_top_comments(post_url, reddit, limit=3):
    """Return the bodies of up to ``limit`` top-sorted comments of a post.

    Args:
        post_url: URL of the Reddit submission.
        reddit: Object whose ``submission(url=...)`` returns the submission
            (a ``praw.Reddit`` instance in this file).
        limit: Maximum number of comment bodies to return.

    Returns:
        A list of comment body strings, skipping empty bodies and those
        starting with ``[deleted]`` or ``[removed]``. Returns an empty list if
        fetching fails.
    """
    try:
        submission = reddit.submission(url=post_url)
        # The sort order must be chosen before the comments are first accessed;
        # setting it after the fetch has no effect on the returned order.
        submission.comment_sort = 'top'
        submission.comments.replace_more(limit=0)

        top_comments = []
        for comment in submission.comments:
            if len(top_comments) >= limit:
                break
            body = comment.body
            # Filter while collecting so skipped placeholders do not use up
            # the limit.
            if not body or body.startswith(('[deleted]', '[removed]')):
                continue
            top_comments.append(body)

        logging.info(f"Fetched {len(top_comments)} top comments for {post_url}")
        return top_comments
    except Exception as e:
        # Comments are optional context, so failures degrade to "no comments".
        logging.error(f"Failed to fetch comments for {post_url}: {e}")
        return []

def fetch_reddit_posts():
    """Collect recent top posts from a fixed list of food subreddits.

    For each subreddit, up to 100 of the day's top submissions are read and
    those created within the last EXPIRATION_HOURS hours are kept. A failure
    on one subreddit is logged and does not stop the others.

    Returns:
        A list of dicts with the keys "title", "link", "summary",
        "feed_title", "pub_date", "upvotes" and "comment_count".
    """
    reddit = praw.Reddit(
        client_id=REDDIT_CLIENT_ID,
        client_secret=REDDIT_CLIENT_SECRET,
        user_agent=REDDIT_USER_AGENT
    )
    feeds = ['FoodPorn', 'restaurant', 'FoodIndustry', 'food']
    articles = []
    cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)

    logging.info(f"Starting fetch with cutoff date: {cutoff_date}")
    for subreddit_name in feeds:
        # Remember the running total so the log line below reports this
        # subreddit's own count rather than the cumulative one.
        count_before = len(articles)
        try:
            subreddit = reddit.subreddit(subreddit_name)
            for submission in subreddit.top(time_filter='day', limit=100):
                pub_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
                if pub_date < cutoff_date:
                    logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
                    continue
                articles.append({
                    "title": submission.title,
                    "link": f"https://www.reddit.com{submission.permalink}",
                    "summary": submission.selftext,
                    "feed_title": get_clean_source_name(subreddit_name),
                    "pub_date": pub_date,
                    "upvotes": submission.score,
                    "comment_count": submission.num_comments
                })
            logging.info(f"Fetched {len(articles) - count_before} posts from r/{subreddit_name}")
        except Exception as e:
            logging.error(f"Failed to fetch Reddit feed r/{subreddit_name}: {e}")

    logging.info(f"Total Reddit posts fetched: {len(articles)}")
    return articles

def curate_from_reddit():
    """Pick the best fresh Reddit post, summarize it and publish it to WordPress.

    Posts from fetch_reddit_posts() are tried in descending upvote order, for
    at most 10 attempts. A post is skipped when its title was already posted,
    when smart_image_and_filter() or the recipe keyword list rejects it, when
    its interest score is below 6, or when summarizing, preparing or posting
    fails. The first post that is published successfully is recorded in
    POSTED_TITLES_FILE (and its image in USED_IMAGES_FILE) and ends the run.

    Returns:
        A tuple ``(post_data, category, sleep_seconds)``. On success
        ``sleep_seconds`` is a random value from 0 to 1800; when nothing was
        posted, ``post_data`` and ``category`` are None and ``sleep_seconds``
        is a random value from 600 to 1800.
    """
    articles = fetch_reddit_posts()
    if not articles:
        print("No Reddit posts available")
        logging.info("No Reddit posts available")
        # Return a real delay: the caller passes this value to time.sleep(),
        # which raises TypeError on None.
        return None, None, random.randint(600, 1800)

    # Sort by upvotes descending
    articles.sort(key=lambda x: x["upvotes"], reverse=True)

    reddit = praw.Reddit(
        client_id=REDDIT_CLIENT_ID,
        client_secret=REDDIT_CLIENT_SECRET,
        user_agent=REDDIT_USER_AGENT
    )

    # Built once instead of once per keyword test inside the loop.
    filter_keywords = RECIPE_KEYWORDS + ["homemade"]

    attempts = 0
    max_attempts = 10
    while attempts < max_attempts and articles:
        article = articles.pop(0)  # Take highest-upvote post
        title = article["title"]
        link = article["link"]
        summary = article["summary"]
        source_name = "Reddit"
        original_source = '<a href="https://www.reddit.com/">Reddit</a>'

        if title in posted_titles:
            print(f"Skipping already posted post: {title}")
            logging.info(f"Skipping already posted post: {title}")
            attempts += 1
            continue

        print(f"Trying Reddit Post: {title} from {source_name}")
        logging.info(f"Trying Reddit Post: {title} from {source_name}")

        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
        title_lower = title.lower()
        summary_lower = summary.lower()
        if skip or any(keyword in title_lower or keyword in summary_lower for keyword in filter_keywords):
            print(f"Skipping filtered Reddit post: {title}")
            logging.info(f"Skipping filtered Reddit post: {title}")
            attempts += 1
            continue

        top_comments = get_top_comments(link, reddit, limit=3)
        interest_score = is_interesting_reddit(
            title,
            summary,
            article["upvotes"],
            article["comment_count"],
            top_comments
        )
        logging.info(f"Interest Score: {interest_score} for '{title}'")
        if interest_score < 6:
            print(f"Reddit Interest Too Low: {interest_score}")
            logging.info(f"Reddit Interest Too Low: {interest_score}")
            attempts += 1
            continue

        num_paragraphs = determine_paragraph_count(interest_score)
        # Rendered up front so the prompt contains the actual comments; the
        # original line lacked the f prefix and sent the raw "{...}" template
        # text to the model.
        comments_for_prompt = ', '.join(top_comments) if top_comments else 'None'
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs. "
            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details. "
            f"Incorporate relevant insights from these top comments if available: {comments_for_prompt}. "
            "Do NOT introduce unrelated concepts unless in the content or comments. "
            "If brief, expand on the core idea with relevant context about its appeal or significance."
        )
        content_to_summarize = f"{title}\n\n{summary}"
        if top_comments:
            # Joined outside the f-string: a backslash inside an f-string
            # expression is a SyntaxError before Python 3.12.
            joined_comments = "\n".join(top_comments)
            content_to_summarize += f"\n\nTop Comments:\n{joined_comments}"

        final_summary = summarize_with_gpt4o(
            content_to_summarize,
            source_name,
            link,
            interest_score=interest_score,
            extra_prompt=extra_prompt
        )
        if not final_summary:
            logging.info(f"Summary failed for '{title}'")
            attempts += 1
            continue

        final_summary = insert_link_naturally(final_summary, source_name, link)

        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
        if not post_data:
            attempts += 1
            continue

        # The image chosen by prepare_post_data() is replaced here: first a
        # Flickr lookup, then get_image() as the fallback.
        # NOTE(review): pixabay_url still comes from prepare_post_data() and
        # page_url is never used - confirm this pairing is intended.
        image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
        if not image_url:
            image_url, image_source, uploader, page_url = get_image(image_query)

        # NOTE(review): the hook is computed but never used below; the call is
        # kept in case get_dynamic_hook() has side effects - confirm and remove.
        hook = get_dynamic_hook(post_data["title"]).strip()
        cta = select_best_cta(post_data["title"], final_summary, post_url=None)

        post_data["content"] = f"{final_summary}\n\n{cta}"

        post_id, post_url = post_to_wp(
            post_data=post_data,
            category=category,
            link=link,
            author=author,
            image_url=image_url,
            original_source=original_source,
            image_source=image_source,
            uploader=uploader,
            pixabay_url=pixabay_url,
            interest_score=interest_score
        )

        if post_id:
            # Second pass: now that the post URL is known, rebuild the CTA
            # with it and update the same post via post_id.
            cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
            post_data["content"] = f"{final_summary}\n\n{cta}"

            post_to_wp(
                post_data=post_data,
                category=category,
                link=link,
                author=author,
                image_url=image_url,
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
                pixabay_url=pixabay_url,
                interest_score=interest_score,
                post_id=post_id
            )

            timestamp = datetime.now(timezone.utc).isoformat()
            save_json_file(POSTED_TITLES_FILE, title, timestamp)
            posted_titles.add(title)
            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE} with timestamp {timestamp}")

            if image_url:
                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                used_images.add(image_url)
                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE} with timestamp {timestamp}")

            print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Reddit *****")
            print(f"Actual post URL: {post_url}")
            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Reddit *****")
            logging.info(f"Actual post URL: {post_url}")
            return post_data, category, random.randint(0, 1800)

        attempts += 1
        logging.info(f"WP posting failed for '{post_data['title']}'")

    print("No interesting Reddit post found after attempts")
    logging.info("No interesting Reddit post found after attempts")
    return None, None, random.randint(600, 1800)

def run_reddit_automator():
    """Run one curation cycle and then sleep for the delay it suggests.

    Calls curate_from_reddit(), reports the outcome on stdout and in the log,
    and blocks in time.sleep() for the returned number of seconds.

    Returns:
        The tuple ``(post_data, category, sleep_time)``, where ``sleep_time``
        is the number of seconds actually slept.
    """
    print(f"{datetime.now(timezone.utc)} - INFO - ***** Reddit Automator Launched *****")
    logging.info("***** Reddit Automator Launched *****")

    post_data, category, sleep_time = curate_from_reddit()
    if sleep_time is None:
        # time.sleep(None) raises TypeError, so if the curator supplies no
        # delay fall back to the 10-30 minute back-off used for empty runs.
        sleep_time = random.randint(600, 1800)

    if not post_data:
        print(f"No postable Reddit article found - sleeping for {sleep_time} seconds")
        logging.info(f"No postable Reddit article found - sleeping for {sleep_time} seconds")
    else:
        print(f"Completed Reddit run with sleep time: {sleep_time} seconds")
        logging.info(f"Completed Reddit run with sleep time: {sleep_time} seconds")
    print(f"Sleeping for {sleep_time}s")
    time.sleep(sleep_time)
    return post_data, category, sleep_time

# Run a single automator cycle when this file is executed as a script.
if __name__ == "__main__":
    run_reddit_automator()