450 lines
19 KiB
Python
450 lines
19 KiB
Python
# foodie_automator_rss.py
|
||
import requests
|
||
import random
|
||
import time
|
||
import logging
|
||
import os
|
||
import json
|
||
import signal
|
||
import sys
|
||
import re
|
||
import email.utils
|
||
import feedparser
|
||
from duckduckgo_search import DDGS
|
||
from datetime import datetime, timedelta, timezone
|
||
from bs4 import BeautifulSoup
|
||
from openai import OpenAI
|
||
from urllib.parse import quote
|
||
from requests.packages.urllib3.util.retry import Retry
|
||
from requests.adapters import HTTPAdapter
|
||
from foodie_config import (
|
||
RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
|
||
HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
|
||
get_clean_source_name, X_API_CREDENTIALS
|
||
)
|
||
from foodie_utils import (
|
||
load_json_file, save_json_file, get_image, generate_image_query,
|
||
upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
|
||
is_interesting, generate_title_from_summary, summarize_with_gpt4o,
|
||
generate_category_from_summary, post_to_wp, prepare_post_data,
|
||
select_best_author, smart_image_and_filter, get_flickr_image,
|
||
get_next_author_round_robin, check_author_rate_limit, update_system_activity
|
||
)
|
||
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
|
||
from dotenv import load_dotenv
|
||
import fcntl
|
||
|
||
load_dotenv()
|
||
|
||
# --- Script identity / run state --------------------------------------------
# Name under which this script reports its status via update_system_activity().
SCRIPT_NAME = "foodie_automator_rss"
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably read or toggled by other modules - confirm before removing.
is_posting = False

# --- Filesystem locations ----------------------------------------------------
LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_rss.lock"
LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_rss.log"
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'

# --- Retention windows -------------------------------------------------------
# Log records older than this many days are dropped when logging is set up.
LOG_PRUNE_DAYS = 30
# Feed items older than this many hours are skipped; the same value is passed
# to load_json_file() when reading the posted-titles file.
EXPIRATION_HOURS = 24
# Passed to load_json_file() when reading the used-images file.
IMAGE_EXPIRATION_DAYS = 7

# --- Network behaviour -------------------------------------------------------
# Per-request timeout (seconds) for feed downloads.
FEED_TIMEOUT = 15
# Attempt count for feed/news fetch loops and for the HTTP adapter's Retry.
MAX_RETRIES = 3
# Base factor for the exponential back-off between attempts.
RETRY_BACKOFF = 2
|
||
|
||
def _prune_log_lines(lines, cutoff):
    """Return ``(kept_lines, orphan_count)`` for the given raw log lines.

    A line that starts with a ``YYYY-MM-DD HH:MM:SS`` timestamp begins a new
    log record; any following line without such a timestamp (for example a
    traceback) is treated as a continuation of that record and shares its
    fate. Records whose timestamp is not newer than ``cutoff`` are dropped.
    Lines that appear before the first timestamped line cannot be dated and
    are dropped and counted in ``orphan_count``.
    """
    kept = []
    orphan_count = 0
    keep_record = None  # None until the first timestamped line has been seen
    for line in lines:
        try:
            stamp = datetime.strptime(line[:19], '%Y-%m-%d %H:%M:%S')
        except ValueError:
            if keep_record is None:
                orphan_count += 1
            elif keep_record:
                kept.append(line)
            continue
        keep_record = stamp > cutoff
        if keep_record:
            kept.append(line)
    return kept, orphan_count


def setup_logging():
    """Initialize logging with pruning of old logs.

    Ensures the log directory exists and is writable, removes log records
    older than ``LOG_PRUNE_DAYS`` from ``LOG_FILE``, then configures the root
    logger to write to the file (INFO level) and to the console.

    Prints a message and exits the process with status 1 if any step fails.
    """
    try:
        logging.debug("Attempting to set up logging")
        log_dir = os.path.dirname(LOG_FILE)
        os.makedirs(log_dir, exist_ok=True)
        logging.debug(f"Log directory created/verified: {log_dir}")
        if not os.access(log_dir, os.W_OK):
            raise PermissionError(f"No write permission for {log_dir}")

        # Opening in append mode creates the file if needed and proves that
        # it is writable without touching existing content.
        try:
            with open(LOG_FILE, 'a') as f:
                f.write("")
            logging.debug(f"Confirmed write access to {LOG_FILE}")
        except Exception as e:
            raise PermissionError(f"Cannot write to {LOG_FILE}: {e}") from e

        with open(LOG_FILE, 'r') as f:
            lines = f.readlines()
        # The handlers configured below use logging's default local-time
        # timestamps, so the cutoff must be a naive local time as well.
        cutoff = datetime.now() - timedelta(days=LOG_PRUNE_DAYS)
        pruned_lines, malformed_count = _prune_log_lines(lines, cutoff)
        if malformed_count > 0:
            logging.info(f"Skipped {malformed_count} malformed log lines during pruning")
        # Only rewrite the file when something was actually removed.
        if len(pruned_lines) != len(lines):
            with open(LOG_FILE, 'w') as f:
                f.writelines(pruned_lines)
        logging.debug(f"Log file pruned: {LOG_FILE}")

        logging.basicConfig(
            filename=LOG_FILE,
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
            force=True
        )
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logging.getLogger().addHandler(console_handler)
        # Keep chatty third-party loggers out of the log file.
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("openai").setLevel(logging.WARNING)
        logging.info("Logging initialized for foodie_automator_rss.py")
    except Exception as e:
        print(f"Failed to setup logging: {e}")
        sys.exit(1)
|
||
|
||
# Configure logging first so that everything below is captured in the log.
setup_logging()

# Tag this process with a run id (epoch seconds) on check_author_rate_limit.
check_author_rate_limit.script_run_id = int(time.time())
logging.info(f"Set script_run_id to {check_author_rate_limit.script_run_id}")

# Module-level view of previously posted titles and used images. Entries
# without a "title" key are ignored in both sets so that a malformed record
# cannot raise KeyError at import time.
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = {entry["title"] for entry in posted_titles_data if "title" in entry}
# NOTE(review): used images are keyed by "title" here, exactly as in the
# original code - confirm that this is the key the used-images file stores.
used_images = {
    entry["title"]
    for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
    if "title" in entry
}
|
||
|
||
def acquire_lock():
    """Take the single-instance lock for this script.

    Creates the lock directory if needed, opens ``LOCK_FILE`` and places a
    non-blocking exclusive ``flock`` on it, then writes the current PID into
    the file.

    Returns:
        The open file object holding the lock. The caller must keep it open
        for as long as the lock is needed and unlock/close it afterwards.

    Exits:
        Status 0 if another process already holds the lock; status 1 if the
        lock file cannot be created or locked for any other reason.
    """
    logging.debug("Attempting to acquire lock")
    lock_fd = None
    try:
        os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
        # Append mode creates the file without truncating it, so a losing
        # contender does not wipe the PID written by the running instance.
        lock_fd = open(LOCK_FILE, 'a')
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except BlockingIOError:
        if lock_fd is not None:
            lock_fd.close()
        logging.info("Another instance of foodie_automator_rss.py is running")
        sys.exit(0)
    except OSError as e:
        # Directory/permission problems are not "already running".
        if lock_fd is not None:
            lock_fd.close()
        logging.error(f"Could not acquire lock {LOCK_FILE}: {e}")
        sys.exit(1)

    # We own the lock now: replace any stale PID with ours.
    lock_fd.truncate(0)
    lock_fd.write(str(os.getpid()))
    lock_fd.flush()
    logging.debug(f"Lock acquired: {LOCK_FILE}")
    return lock_fd
|
||
|
||
def signal_handler(sig, frame):
    """Signal callback: record the script as stopped, then exit with status 0.

    ``sig`` and ``frame`` are supplied by the ``signal`` module and are not
    used.
    """
    logging.info("Received termination signal, marking script as stopped...")
    update_system_activity(SCRIPT_NAME, "stopped")
    raise SystemExit(0)


# Route both termination requests (SIGTERM) and Ctrl-C (SIGINT) through the
# same handler.
for _signum in (signal.SIGTERM, signal.SIGINT):
    signal.signal(_signum, signal_handler)
|
||
|
||
def create_http_session() -> requests.Session:
    """Build a ``requests`` session with automatic retries and a browser UA.

    GET and POST requests answered with 403, 429, 500, 502, 503 or 504 are
    retried up to ``MAX_RETRIES`` times with a back-off factor of
    ``RETRY_BACKOFF``, for both ``http://`` and ``https://`` URLs.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=MAX_RETRIES,
            backoff_factor=RETRY_BACKOFF,
            status_forcelist=[403, 429, 500, 502, 503, 504],
            allowed_methods=["GET", "POST"],
        )
    )

    http = requests.Session()
    for scheme in ("http://", "https://"):
        http.mount(scheme, retrying_adapter)

    # Present a desktop Chrome user agent on every request.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    }
    http.headers.update(browser_headers)
    return http
|
||
|
||
def parse_date(date_str):
    """Parse a feed date string into a timezone-aware ``datetime``.

    RFC 2822 dates (the RSS ``pubDate`` format, e.g.
    ``Mon, 01 Jan 2024 12:00:00 GMT``) are tried first, then ISO 8601 dates
    (e.g. ``2024-01-01T12:00:00Z``). A parsed value without timezone
    information is assumed to be UTC.

    Args:
        date_str: The raw date text.

    Returns:
        An aware ``datetime``. If the text cannot be parsed in either format
        the failure is logged and the current UTC time is returned.
    """
    text = date_str.strip() if isinstance(date_str, str) else date_str
    try:
        parsed_date = email.utils.parsedate_to_datetime(text)
    except Exception as rfc_error:
        try:
            # datetime.fromisoformat() only accepts a trailing "Z" from
            # Python 3.11 on, so spell the UTC offset out explicitly.
            iso_text = text[:-1] + "+00:00" if text.endswith(("Z", "z")) else text
            parsed_date = datetime.fromisoformat(iso_text)
        except Exception:
            logging.error(f"Failed to parse date '{date_str}': {rfc_error}")
            return datetime.now(timezone.utc)

    if parsed_date.tzinfo is None:
        parsed_date = parsed_date.replace(tzinfo=timezone.utc)
    return parsed_date
|
||
|
||
def _parse_rss_item(item, source_name, cutoff_date):
    """Turn one RSS ``<item>`` element into an article dict.

    Returns ``None`` (after logging) when the item was published before
    ``cutoff_date``. Missing ``title``/``link`` elements fall back to
    ``"Untitled"``/``""``; a missing ``pubDate`` counts as "now".
    """
    title_tag = item.find('title')
    title = title_tag.text.strip() if title_tag is not None else "Untitled"
    link_tag = item.find('link')
    link = link_tag.text.strip() if link_tag is not None else ""
    date_tag = item.find('pubDate')
    pub_date = parse_date(date_tag.text) if date_tag is not None else datetime.now(timezone.utc)

    if pub_date < cutoff_date:
        logging.info(f"Skipping old article: {title} (Published: {pub_date})")
        return None

    # Both description and content may carry HTML; reduce them to plain text.
    description = item.find('description')
    summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description is not None else ""
    content = item.find('content:encoded')
    content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content is not None else summary

    return {
        "title": title,
        "link": link,
        "summary": summary,
        "content": content_text,
        "feed_title": source_name,
        "pub_date": pub_date
    }


def fetch_rss_feeds():
    """Download every feed in ``RSS_FEEDS`` and collect its recent items.

    Each feed is fetched with up to ``MAX_RETRIES`` attempts (exponential
    back-off between attempts). Items published more than
    ``EXPIRATION_HOURS`` hours ago are skipped; an item that fails to parse
    is logged and skipped without affecting the rest of the feed.

    Returns:
        A list of article dicts (``title``, ``link``, ``summary``,
        ``content``, ``feed_title``, ``pub_date``) sorted newest first.
        Empty when ``RSS_FEEDS`` is empty or nothing could be fetched.
    """
    logging.info("Starting fetch_rss_feeds")
    articles = []
    cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)
    session = create_http_session()

    if not RSS_FEEDS:
        logging.error("RSS_FEEDS is empty in foodie_config.py")
        return articles

    logging.info(f"Processing feeds: {RSS_FEEDS}")
    for feed_url in RSS_FEEDS:
        for attempt in range(MAX_RETRIES):
            logging.info(f"Processing feed: {feed_url} (attempt {attempt + 1})")
            try:
                response = session.get(feed_url, timeout=FEED_TIMEOUT)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'xml')
                items = soup.find_all('item')

                # RSS_FEED_NAMES may map a URL to a plain name or to a
                # (name, url) tuple; only the name is kept on the article.
                feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
                source_name = feed_title[0] if isinstance(feed_title, tuple) else feed_title

                # Collect per feed so the log line below reports this feed's
                # own count rather than the running total.
                feed_articles = []
                for item in items:
                    try:
                        article = _parse_rss_item(item, source_name, cutoff_date)
                    except Exception as e:
                        logging.warning(f"Error processing entry in {feed_url}: {e}")
                        continue
                    if article is not None:
                        feed_articles.append(article)
                        logging.debug(f"Processed article: {article['title']}")

                articles.extend(feed_articles)
                logging.info(f"Filtered to {len(feed_articles)} articles from {feed_url}")
                break
            except Exception as e:
                logging.error(f"Failed to fetch RSS feed {feed_url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    time.sleep(RETRY_BACKOFF * (2 ** attempt))

    articles.sort(key=lambda x: x["pub_date"], reverse=True)
    logging.info(f"Total RSS articles fetched: {len(articles)}")
    return articles
|
||
|
||
def fetch_duckduckgo_news_context(title, hours=24):
    """Collect recent DuckDuckGo News headlines related to ``title``.

    Args:
        title: Text to search for (``" news"`` is appended to the query).
        hours: Only headlines dated within this many hours are used.
            Defaults to 24.

    Returns:
        The lower-cased matching headlines joined by spaces, or a
        "No recent news found ..." message when none qualify. If every one
        of the ``MAX_RETRIES`` attempts fails, ``title`` itself is returned.
    """
    # Pick the narrowest DuckDuckGo time limit that still covers the window
    # (day / week / month); the exact cutoff is applied per result below.
    if hours <= 24:
        timelimit = "d"
    elif hours <= 24 * 7:
        timelimit = "w"
    else:
        timelimit = "m"

    for attempt in range(MAX_RETRIES):
        try:
            with DDGS() as ddgs:
                results = ddgs.news(f"{title} news", timelimit=timelimit, max_results=5)
                cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
                titles = []
                for r in results:
                    try:
                        date_str = r["date"]
                        # Result dates are expected as UTC, written either
                        # with a "+00:00" offset or with a trailing "Z".
                        if '+00:00' in date_str:
                            dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
                        else:
                            dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
                        if dt > cutoff:
                            titles.append(r["title"].lower())
                    except ValueError as e:
                        logging.warning(f"Date parsing failed for '{date_str}': {e}")
                        continue
                context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
                logging.info(f"DuckDuckGo News context for '{title}': {context}")
                return context
        except Exception as e:
            logging.warning(f"DuckDuckGo News context fetch failed for '{title}' (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
    logging.error(f"Failed to fetch DuckDuckGo News context for '{title}' after {MAX_RETRIES} attempts")
    return title
|
||
|
||
def curate_from_rss(entry, original_source, source_name, link, page_url):
    """Curate one RSS entry and publish it to WordPress.

    Steps: score the entry summary with ``is_interesting`` (scores below 4
    are rejected), summarize it, insert the source link, build the post data
    with ``prepare_post_data``, publish the post, then update the same post
    with "share this" links that need the published URL.

    Args:
        entry: Object exposing ``title`` and ``summary`` attributes.
        original_source: Source label forwarded to ``post_to_wp``.
        source_name: Source name used for summarizing and link insertion.
        link: URL of the original article.
        page_url: Accepted for interface compatibility; replaced by the
            value returned from ``prepare_post_data`` before posting.

    Returns:
        ``(post_id, post_url)`` once the post has been published, otherwise
        ``(None, None)``. A failure while adding the share links does not
        turn a published post into a failure, so the caller can still record
        the title and avoid publishing the same article again.
    """
    logger = logging.getLogger(__name__)
    try:
        content = entry.summary
        if not content:
            logger.info(f"No content for RSS entry: {entry.title}")
            return None, None

        interest_score = is_interesting(content)
        if interest_score < 4:
            logger.info(f"RSS entry '{entry.title}' not interesting enough: score {interest_score}")
            return None, None

        summary = summarize_with_gpt4o(content, source_name, link, interest_score=interest_score)
        if not summary:
            logger.warning(f"Failed to summarize RSS entry: {entry.title}")
            return None, None

        # Remove the original title from the summary if present, then
        # collapse the blank-line runs the removal may leave behind.
        if entry.title in summary:
            summary = summary.replace(entry.title, "").strip()
            while "\n\n\n" in summary:
                summary = summary.replace("\n\n\n", "\n\n")

        final_summary = insert_link_naturally(summary, source_name, link)
        if not final_summary:
            logger.warning(f"Failed to insert link for RSS entry: {entry.title}")
            return None, None

        result = prepare_post_data(final_summary, entry.title)
        if not result:
            logger.info(f"Post preparation failed for RSS entry: {entry.title}")
            return None, None

        logger.debug(f"prepare_post_data returned {len(result)} values: {result}")

        # Expect at least 7 values; any additional values are ignored.
        if len(result) < 7:
            logger.error(f"prepare_post_data returned too few values: {result}")
            return None, None
        post_data, author, category, image_url, image_source, uploader, page_url = result[:7]

        share_text = f"Check out this tasty find: {post_data['title']}"
        share_text_encoded = quote(share_text)
        share_links_template = (
            "Share this post: "
            '<a href="https://x.com/intent/tweet?url={post_url}&text={share_text}">X</a> | '
            '<a href="https://www.facebook.com/sharer/sharer.php?u={post_url}">Facebook</a>'
        )

        # First call: publish without share links (the URL is not known yet).
        post_data["content"] = final_summary
        post_id, post_url = post_to_wp(
            post_data=post_data,
            category=category,
            link=link,
            author=author,
            image_url=image_url,
            original_source=original_source,
            image_source=image_source,
            uploader=uploader,
            page_url=page_url,
            interest_score=interest_score,
            should_post_tweet=True,
            summary=final_summary
        )
        if not post_id:
            logger.warning(f"Failed to post RSS entry to WP: {post_data['title']}")
            return None, None

        # Second call: update the published post with share links. From here
        # on the post is live, so any failure is reported but the published
        # id/URL are still returned.
        updated_id = updated_url = None
        try:
            post_url_encoded = quote(post_url)
            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
            post_data["content"] = f"{final_summary}\n\n{share_links}"
            updated_id, updated_url = post_to_wp(
                post_data=post_data,
                category=category,
                link=link,
                author=author,
                image_url=None,
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
                page_url=page_url,
                interest_score=interest_score,
                post_id=post_id,
                should_post_tweet=False,
                summary=final_summary
            )
        except Exception as e:
            logger.error(f"Error adding share links to RSS post '{post_data['title']}': {e}")

        if updated_id:
            logger.info(f"Successfully curated and posted RSS entry: {post_data['title']} (URL: {updated_url})")
            return updated_id, updated_url

        logger.warning(f"Failed to update RSS post with share links: {post_data['title']}")
        return post_id, post_url

    except Exception as e:
        logger.error(f"Error curating RSS entry '{getattr(entry, 'title', 'unknown')}': {e}")
        return None, None
|
||
|
||
def run_rss_automator():
    """Run one pass of the RSS automator.

    Takes the single-instance lock, fetches the feeds and walks the articles
    newest first until one is successfully curated and posted. Titles already
    present in the posted-titles file are skipped. The script is marked
    "running" at the start and "stopped" on every return path.

    Returns:
        ``(post_data, category, sleep_time)``. ``post_data`` is a dict with
        ``title``, ``url`` and ``id`` for the article that was posted and
        ``category`` its generated category; both are ``None`` when nothing
        was posted or an error occurred. ``sleep_time`` is a random delay of
        1200-1800 seconds.
    """
    lock_fd = None
    try:
        lock_fd = acquire_lock()
        update_system_activity(SCRIPT_NAME, "running", os.getpid())  # Record start
        logging.info("***** RSS Automator Launched *****")

        # Entries without a "title" key are ignored instead of raising.
        posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
        posted_titles = {item["title"] for item in posted_titles_data if "title" in item}
        # NOTE(review): the used-images data is not consulted in this
        # function; the load is kept in case load_json_file() also prunes
        # expired entries on disk - confirm, then drop it if it does not.
        load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)

        articles = fetch_rss_feeds()
        if not articles:
            logging.info("No new RSS articles found")
            update_system_activity(SCRIPT_NAME, "stopped")  # Record stop
            sleep_time = random.randint(1200, 1800)  # 20-30 minutes
            logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
            return None, None, sleep_time

        for article in articles:
            title = article["title"]
            if title in posted_titles:
                logging.info(f"Skipping already posted article: {title}")
                continue

            # curate_from_rss() reads the entry through attribute access, so
            # wrap the article fields in a minimal ad-hoc object.
            entry = type('Entry', (), {
                'title': title,
                'summary': article["summary"],
                'link': article["link"]
            })()
            original_source = article["feed_title"]
            source_name = get_clean_source_name(original_source)
            link = article["link"]
            page_url = link  # Use the article link as the page_url

            post_id, post_url = curate_from_rss(entry, original_source, source_name, link, page_url)
            if not (post_id and post_url):
                continue

            post_data = {
                "title": title,
                "url": post_url,
                "id": post_id
            }
            # Remember the title so later runs do not post it again.
            timestamp = datetime.now(timezone.utc).isoformat()
            save_json_file(POSTED_TITLES_FILE, title, timestamp)
            posted_titles.add(title)

            category = generate_category_from_summary(article["summary"])

            logging.info("Completed RSS run")
            update_system_activity(SCRIPT_NAME, "stopped")  # Record stop
            sleep_time = random.randint(1200, 1800)  # 20-30 minutes
            logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
            return post_data, category, sleep_time

        # Every article was either already posted or could not be curated.
        logging.info("No postable RSS article found")
        update_system_activity(SCRIPT_NAME, "stopped")  # Record stop
        sleep_time = random.randint(1200, 1800)  # 20-30 minutes
        logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
        return None, None, sleep_time

    except Exception as e:
        logging.error(f"Fatal error in run_rss_automator: {e}", exc_info=True)
        update_system_activity(SCRIPT_NAME, "stopped")  # Record stop on error
        sleep_time = random.randint(1200, 1800)  # 20-30 minutes
        logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
        return None, None, sleep_time
    finally:
        # Only release and remove the lock if this process actually took it.
        if lock_fd:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
            lock_fd.close()
            try:
                os.remove(LOCK_FILE)
            except FileNotFoundError:
                pass
|
||
|
||
if __name__ == "__main__":
    # Script entry point: perform a single automation pass and log the delay
    # that the pass recommends before the next one.
    (post_data, category, sleep_time) = run_rss_automator()
    logging.info(f"Run completed, sleep_time: {sleep_time} seconds")