# foodie_automator_reddit.py
import requests
import random
import time
import logging
import os
import json
import signal
import sys
import re
from duckduckgo_search import DDGS
from datetime import datetime, timedelta, timezone
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import praw
from dotenv import load_dotenv
from foodie_config import (
    AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS,
    PERSONA_CONFIGS, CATEGORIES, get_clean_source_name,
    REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, LIGHT_TASK_MODEL,
    X_API_CREDENTIALS
)
# NOTE: get_next_author_round_robin and generate_title_from_summary are called
# below but were missing from this import list; they are assumed to be exported
# by foodie_utils alongside the other helpers.
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
    prepare_post_data, select_best_author, smart_image_and_filter,
    get_flickr_image, get_next_author_round_robin, generate_title_from_summary
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
import fcntl
load_dotenv()
is_posting = False
LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_reddit.lock"
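
# Graceful-shutdown flag: is_posting is set to True around each WordPress write
# so that a SIGTERM/SIGINT arriving mid-publish is only logged; the handler
# exits immediately only when no post is in flight.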
def signal_handler(sig, frame):
    logging.info("Received termination signal, checking if safe to exit...")
    if is_posting:
        logging.info("Currently posting, will exit after completion.")
    else:
        logging.info("Safe to exit immediately.")
        sys.exit(0)

signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_reddit.log"
LOG_PRUNE_DAYS = 30
MAX_RETRIES = 3
RETRY_BACKOFF = 2
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_reddit_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data if "title" in entry)
used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
used_images = set(entry["title"] for entry in used_images_data if "title" in entry)
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
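
# setup_logging prunes the log before attaching handlers: lines are grouped
# into entries by their leading "YYYY-MM-DD HH:MM:SS,mmm" timestamp (so
# multi-line tracebacks stay attached to their entry), and entries older than
# LOG_PRUNE_DAYS are dropped on each startup.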
def setup_logging():
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, 'r') as f:
            lines = f.readlines()
        log_entries = []
        current_entry = []
        timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}')
        for line in lines:
            if timestamp_pattern.match(line):
                if current_entry:
                    log_entries.append(''.join(current_entry))
                current_entry = [line]
            else:
                current_entry.append(line)
        if current_entry:
            log_entries.append(''.join(current_entry))
        cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
        pruned_entries = []
        for entry in log_entries:
            try:
                timestamp = datetime.strptime(entry[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
                if timestamp > cutoff:
                    pruned_entries.append(entry)
            except ValueError:
                logging.warning(f"Skipping malformed log entry (no timestamp): {entry[:50]}...")
                continue
        with open(LOG_FILE, 'w') as f:
            f.writelines(pruned_entries)
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s"
    )
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("prawcore").setLevel(logging.WARNING)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logging.getLogger().addHandler(console_handler)
    logging.info("Logging initialized for foodie_automator_reddit.py")
def acquire_lock():
    os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
    lock_fd = open(LOCK_FILE, 'w')
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        lock_fd.write(str(os.getpid()))
        lock_fd.flush()
        return lock_fd
    except IOError:
        logging.info("Another instance of foodie_automator_reddit.py is running")
        sys.exit(0)
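
# Reddit titles often carry a leading bracketed tag; the regex strips one such
# tag from the front, e.g. clean_reddit_title("[Homemade] Shakshuka") returns
# "Shakshuka" (example input is illustrative).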
def clean_reddit_title(title):
    cleaned_title = re.sub(r'^\[.*?\]\s*', '', title).strip()
    logging.info(f"Cleaned Reddit title from '{title}' to '{cleaned_title}'")
    return cleaned_title
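
# Scoring model: the LLM returns a 0-10 base score, an engagement boost is
# layered on top (up to +3 for upvotes, up to +2 for comment count), and the
# total is capped at 10. Example: base 6 + 3 (520 upvotes) + 1 (30 comments)
# = 10 after the cap. A final score >= 6 is required downstream to publish.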
def is_interesting_reddit(title, summary, upvotes, comment_count, top_comments):
    for attempt in range(MAX_RETRIES):
        try:
            content = f"Title: {title}\n\nContent: {summary}"
            if top_comments:
                # Join outside the f-string: backslashes inside f-string
                # expressions are a SyntaxError before Python 3.12.
                comments_block = '\n'.join(top_comments)
                content += f"\n\nTop Comments:\n{comments_block}"
            response = client.chat.completions.create(
                model=LIGHT_TASK_MODEL,
                messages=[
                    {"role": "system", "content": (
                        "Rate this Reddit post from 0-10 based on rarity, buzzworthiness, and engagement potential for food lovers, covering food topics (skip recipes). "
                        "Score 8-10 for rare, highly shareable ideas (e.g., unique dishes or restaurant trends). "
                        "Score 5-7 for fresh, engaging updates with broad appeal. Score below 5 for common or unremarkable content. "
                        "Consider comments for added context (e.g., specific locations or unique details). "
                        "Return only a number"
                    )},
                    {"role": "user", "content": content}
                ],
                max_tokens=5
            )
            base_score = int(response.choices[0].message.content.strip()) if response.choices[0].message.content.strip().isdigit() else 0
            engagement_boost = 0
            if upvotes >= 500:
                engagement_boost += 3
            elif upvotes >= 100:
                engagement_boost += 2
            elif upvotes >= 50:
                engagement_boost += 1
            if comment_count >= 100:
                engagement_boost += 2
            elif comment_count >= 20:
                engagement_boost += 1
            final_score = min(base_score + engagement_boost, 10)
            logging.info(f"Reddit Interest Score: {final_score} (base: {base_score}, upvotes: {upvotes}, comments: {comment_count}, top_comments: {len(top_comments)}) for '{title}'")
            return final_score
        except Exception as e:
            logging.warning(f"Reddit interestingness scoring failed (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
                continue
    logging.error(f"Failed to score Reddit post '{title}' after {MAX_RETRIES} attempts")
    return 0
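
# replace_more(limit=0) removes "load more comments" placeholders so only
# fully loaded top-level comments are iterated; deleted comments are filtered
# out by the startswith('[deleted]') check.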
def get_top_comments(post_url, reddit, limit=3):
    for attempt in range(MAX_RETRIES):
        try:
            submission = reddit.submission(url=post_url)
            submission.comment_sort = 'top'
            submission.comments.replace_more(limit=0)
            top_comments = [comment.body for comment in submission.comments[:limit] if not comment.body.startswith('[deleted]')]
            logging.info(f"Fetched {len(top_comments)} top comments for {post_url}")
            return top_comments
        except Exception as e:
            logging.warning(f"Failed to fetch comments for {post_url} (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
                continue
    logging.error(f"Failed to fetch comments for {post_url} after {MAX_RETRIES} attempts")
    return []
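
# Pulls up to five recent DuckDuckGo News headlines for the post title and
# joins those published inside the freshness window into one lowercase context
# string for the summarizer; if every attempt fails, the bare title is
# returned so downstream prompts still receive something.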
def fetch_duckduckgo_news_context(title, hours=24):
    for attempt in range(MAX_RETRIES):
        try:
            with DDGS() as ddgs:
                results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
                titles = []
                for r in results:
                    try:
                        date_str = r["date"]
                        if '+00:00' in date_str:
                            dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
                        else:
                            dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
                        # Use the hours parameter (this was hardcoded to 24)
                        if dt > (datetime.now(timezone.utc) - timedelta(hours=hours)):
                            titles.append(r["title"].lower())
                    except ValueError as e:
                        logging.warning(f"Date parsing failed for '{date_str}': {e}")
                        continue
                context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
                logging.info(f"DuckDuckGo News context for '{title}': {context}")
                return context
        except Exception as e:
            logging.warning(f"DuckDuckGo News context fetch failed for '{title}' (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
                continue
    logging.error(f"Failed to fetch DuckDuckGo News context for '{title}' after {MAX_RETRIES} attempts")
    return title
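
# Fetches the day's top posts from each source subreddit, skips anything older
# than EXPIRATION_HOURS, and normalizes each submission into a plain dict so
# the curation loop never has to touch praw objects directly.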
def fetch_reddit_posts():
    try:
        reddit = praw.Reddit(
            client_id=REDDIT_CLIENT_ID,
            client_secret=REDDIT_CLIENT_SECRET,
            user_agent=REDDIT_USER_AGENT
        )
        feeds = ['FoodPorn', 'restaurant', 'FoodIndustry', 'food']
        articles = []
        cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)
        logging.info(f"Starting fetch with cutoff date: {cutoff_date}")
        for subreddit_name in feeds:
            for attempt in range(MAX_RETRIES):
                try:
                    subreddit = reddit.subreddit(subreddit_name)
                    for submission in subreddit.top(time_filter='day', limit=100):
                        pub_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
                        if pub_date < cutoff_date:
                            logging.info(f"Skipping old post: {submission.title} (Published: {pub_date})")
                            continue
                        cleaned_title = clean_reddit_title(submission.title)
                        articles.append({
                            "title": cleaned_title,
                            "raw_title": submission.title,
                            "link": f"https://www.reddit.com{submission.permalink}",
                            "summary": submission.selftext,
                            "feed_title": get_clean_source_name(subreddit_name),
                            "pub_date": pub_date,
                            "upvotes": submission.score,
                            "comment_count": submission.num_comments
                        })
                    logging.info(f"Fetched {len(articles)} posts from r/{subreddit_name}")
                    break
                except Exception as e:
                    logging.error(f"Failed to fetch Reddit feed r/{subreddit_name} (attempt {attempt + 1}): {e}")
                    if attempt < MAX_RETRIES - 1:
                        time.sleep(RETRY_BACKOFF * (2 ** attempt))
                        continue
        logging.info(f"Total Reddit posts fetched: {len(articles)}")
        return articles
    except Exception as e:
        logging.error(f"Unexpected error in fetch_reddit_posts: {e}", exc_info=True)
        return []
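
# Main curation loop. Publishing is a two-pass operation: the first post_to_wp
# call publishes the article and returns its real URL, then the share links
# (which need that URL) are substituted into the content and the same post is
# updated via post_id with should_post_tweet=False so the tweet fires once.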
def curate_from_reddit():
    # Declared once up front; the assignments below toggle the module-level flag
    global is_posting
    try:
        articles = fetch_reddit_posts()
        if not articles:
            logging.info("No Reddit posts available")
            return None, None, False
        articles.sort(key=lambda x: x["upvotes"], reverse=True)
        reddit = praw.Reddit(
            client_id=REDDIT_CLIENT_ID,
            client_secret=REDDIT_CLIENT_SECRET,
            user_agent=REDDIT_USER_AGENT
        )
        attempts = 0
        max_attempts = 10
        while attempts < max_attempts and articles:
            article = articles.pop(0)
            title = article["title"]
            raw_title = article["raw_title"]
            link = article["link"]
            summary = article["summary"]
            source_name = "Reddit"
            original_source = '<a href="https://www.reddit.com/">Reddit</a>'
            if raw_title in posted_titles:
                logging.info(f"Skipping already posted post: {raw_title}")
                attempts += 1
                continue
            logging.info(f"Trying Reddit Post: {title} from {source_name}")
            image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
            if skip or any(keyword in title.lower() or keyword in raw_title.lower() for keyword in RECIPE_KEYWORDS + ["homemade"]):
                logging.info(f"Skipping filtered Reddit post: {title}")
                attempts += 1
                continue
            top_comments = get_top_comments(link, reddit, limit=3)
            ddg_context = fetch_duckduckgo_news_context(title)
            # Join outside the f-string for pre-3.12 compatibility
            comments_block = '\n'.join(top_comments) if top_comments else 'None'
            content_to_summarize = f"{title}\n\n{summary}\n\nTop Comments:\n{comments_block}\n\nAdditional Context: {ddg_context}"
            interest_score = is_interesting_reddit(
                title,
                summary,
                article["upvotes"],
                article["comment_count"],
                top_comments
            )
            logging.info(f"Interest Score: {interest_score} for '{title}'")
            if interest_score < 6:
                logging.info(f"Reddit Interest Too Low: {interest_score}")
                attempts += 1
                continue
            num_paragraphs = determine_paragraph_count(interest_score)
            extra_prompt = (
                f"Generate exactly {num_paragraphs} paragraphs.\n"
                f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
                f"Incorporate relevant insights from these top comments if available: {', '.join(top_comments) if top_comments else 'None'}.\n"
                f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
                f"Do NOT introduce unrelated concepts unless in the content, comments, or additional context.\n"
                f"If brief, expand on the core idea with relevant context about its appeal or significance.\n"
                f"Do not include emojis in the summary."
            )
            final_summary = summarize_with_gpt4o(
                content_to_summarize,
                source_name,
                link,
                interest_score=interest_score,
                extra_prompt=extra_prompt
            )
            if not final_summary:
                logging.info(f"Summary failed for '{title}'")
                attempts += 1
                continue
            final_summary = insert_link_naturally(final_summary, source_name, link)
            # Use round-robin author selection
            author = get_next_author_round_robin()
            author_username = author["username"]
            logging.info(f"Selected author via round-robin: {author_username}")
            post_data = {
                "title": generate_title_from_summary(final_summary),
                "content": final_summary,
                "status": "publish",
                "author": author_username,
                "categories": [generate_category_from_summary(final_summary)]
            }
            category = post_data["categories"][0]
            image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
            if not image_url:
                image_url, image_source, uploader, page_url = get_image(image_query)
            if not image_url:
                logging.warning(f"All image uploads failed for '{title}' - posting without image")
                image_source = None
                uploader = None
                page_url = None
            hook = get_dynamic_hook(post_data["title"]).strip()
            share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
            share_links_template = (
                f'<p>{share_prompt} '
                f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
                f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
            )
            post_data["content"] = f"{final_summary}\n\n{share_links_template}"
            is_posting = True
            try:
                post_id, post_url = post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    page_url=page_url,
                    interest_score=interest_score,
                    should_post_tweet=True
                )
            except Exception as e:
                logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
                attempts += 1
                continue
            finally:
                is_posting = False
            if post_id:
                share_text = f"Check out this foodie gem! {post_data['title']}"
                share_text_encoded = quote(share_text)
                post_url_encoded = quote(post_url)
                share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
                post_data["content"] = f"{final_summary}\n\n{share_links}"
                is_posting = True
                try:
                    post_to_wp(
                        post_data=post_data,
                        category=category,
                        link=link,
                        author=author,
                        image_url=image_url,
                        original_source=original_source,
                        image_source=image_source,
                        uploader=uploader,
                        page_url=page_url,
                        interest_score=interest_score,
                        post_id=post_id,
                        should_post_tweet=False
                    )
                except Exception as e:
                    logging.error(f"Failed to update WordPress post '{title}' with share links: {e}", exc_info=True)
                finally:
                    is_posting = False
                timestamp = datetime.now(timezone.utc).isoformat()
                save_json_file(POSTED_TITLES_FILE, raw_title, timestamp)
                posted_titles.add(raw_title)
                logging.info(f"Successfully saved '{raw_title}' to {POSTED_TITLES_FILE}")
                if image_url:
                    save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                    used_images.add(image_url)
                    logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
                logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from Reddit *****")
                return post_data, category, True
            attempts += 1
            logging.info(f"WP posting failed for '{post_data['title']}'")
        logging.info("No interesting Reddit post found after attempts")
        return None, None, False
    except Exception as e:
        logging.error(f"Unexpected error in curate_from_reddit: {e}", exc_info=True)
        return None, None, False
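
# Entry-point wrapper: takes the single-instance lock, runs one curation pass,
# and always releases and removes the lock in the finally block, even after a
# fatal error.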
def run_reddit_automator():
    lock_fd = None
    try:
        lock_fd = acquire_lock()
        logging.info("***** Reddit Automator Launched *****")
        post_data, category, should_continue = curate_from_reddit()
        if not post_data:
            logging.info("No postable Reddit article found")
        else:
            logging.info("Completed Reddit run")
        return post_data, category, should_continue
    except Exception as e:
        logging.error(f"Fatal error in run_reddit_automator: {e}", exc_info=True)
        return None, None, False
    finally:
        if lock_fd:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
            lock_fd.close()
            if os.path.exists(LOCK_FILE):
                os.remove(LOCK_FILE)

if __name__ == "__main__":
    setup_logging()
    post_data, category, should_continue = run_reddit_automator()
    logging.info(f"Run completed, should_continue: {should_continue}")