# foodie_automator_rss.py
import requests
import random
import time
import logging
import os
import json
import signal
import sys
import re
import email.utils
import feedparser
from duckduckgo_search import DDGS
from datetime import datetime, timedelta, timezone
from bs4 import BeautifulSoup
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from foodie_config import (
    RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
    get_clean_source_name, X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    select_best_author, smart_image_and_filter, get_flickr_image,
    get_next_author_round_robin, check_author_rate_limit, update_system_activity
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
from dotenv import load_dotenv
import fcntl

load_dotenv()

is_posting = False
SCRIPT_NAME = "foodie_automator_rss"
LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_rss.lock"
LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_rss.log"
LOG_PRUNE_DAYS = 30
FEED_TIMEOUT = 15
MAX_RETRIES = 3
RETRY_BACKOFF = 2

POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7
|
def setup_logging():
    """Initialize logging with pruning of old logs."""
    try:
        logging.debug("Attempting to set up logging")
        os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
        logging.debug(f"Log directory created/verified: {os.path.dirname(LOG_FILE)}")
        if not os.access(os.path.dirname(LOG_FILE), os.W_OK):
            raise PermissionError(f"No write permission for {os.path.dirname(LOG_FILE)}")

        # Test write to log file
        try:
            with open(LOG_FILE, 'a') as f:
                f.write("")
            logging.debug(f"Confirmed write access to {LOG_FILE}")
        except Exception as e:
            raise PermissionError(f"Cannot write to {LOG_FILE}: {e}")

        if os.path.exists(LOG_FILE):
            with open(LOG_FILE, 'r') as f:
                lines = f.readlines()
            cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
            pruned_lines = []
            malformed_count = 0
            for line in lines:
                if len(line) < 19 or not line[:19].replace('-', '').replace(':', '').replace(' ', '').isdigit():
                    malformed_count += 1
                    continue
                try:
                    timestamp = datetime.strptime(line[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
                    if timestamp > cutoff:
                        pruned_lines.append(line)
                except ValueError:
                    malformed_count += 1
                    continue
            if malformed_count > 0:
                logging.info(f"Skipped {malformed_count} malformed log lines during pruning")
            with open(LOG_FILE, 'w') as f:
                f.writelines(pruned_lines)
            logging.debug(f"Log file pruned: {LOG_FILE}")

        logging.basicConfig(
            filename=LOG_FILE,
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
            force=True
        )
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logging.getLogger().addHandler(console_handler)
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("openai").setLevel(logging.WARNING)
        logging.info("Logging initialized for foodie_automator_rss.py")
    except Exception as e:
        print(f"Failed to setup logging: {e}")
        sys.exit(1)

# Call setup_logging immediately
setup_logging()
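# Note: the pruning logic above assumes every log line begins with a
# "YYYY-MM-DD HH:MM:SS" timestamp, matching the datefmt passed to
# logging.basicConfig. A hypothetical example of a line it would keep:
#   2025-01-15 08:30:00 - INFO - Logging initialized for foodie_automator_rss.py
# Lines that do not start with that 19-character timestamp are counted as
# malformed and dropped during pruning.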
|
|
|
check_author_rate_limit.script_run_id = int(time.time())
logging.info(f"Set script_run_id to {check_author_rate_limit.script_run_id}")

posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data)
used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)
|
def acquire_lock():
    try:
        logging.debug("Attempting to acquire lock")
        os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
        lock_fd = open(LOCK_FILE, 'w')
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        lock_fd.write(str(os.getpid()))
        lock_fd.flush()
        logging.debug(f"Lock acquired: {LOCK_FILE}")
        return lock_fd
    except IOError:
        logging.info("Another instance of foodie_automator_rss.py is running")
        sys.exit(0)

def signal_handler(sig, frame):
    logging.info("Received termination signal, marking script as stopped...")
    update_system_activity(SCRIPT_NAME, "stopped")
    sys.exit(0)

signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
|
def create_http_session() -> requests.Session:
    session = requests.Session()
    retry_strategy = Retry(
        total=MAX_RETRIES,
        backoff_factor=RETRY_BACKOFF,
        status_forcelist=[403, 429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    })
    return session
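# Usage sketch (illustrative only): the session retries GET/POST requests that
# return 403/429/5xx with exponential backoff, so a feed fetch looks like:
#   session = create_http_session()
#   response = session.get(feed_url, timeout=FEED_TIMEOUT)
# With MAX_RETRIES = 3 and RETRY_BACKOFF = 2, urllib3 waits progressively
# longer between attempts (roughly 2s, 4s, 8s; exact timing depends on the
# installed urllib3 version) before giving up.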
|
|
|
def parse_date(date_str):
    try:
        parsed_date = email.utils.parsedate_to_datetime(date_str)
        if parsed_date.tzinfo is None:
            parsed_date = parsed_date.replace(tzinfo=timezone.utc)
        return parsed_date
    except Exception as e:
        logging.error(f"Failed to parse date '{date_str}': {e}")
        return datetime.now(timezone.utc)
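# Illustrative example (not executed): parse_date handles the RFC 2822 dates
# found in RSS <pubDate> elements and falls back to "now" on failure, e.g.
#   parse_date("Tue, 14 Jan 2025 09:30:00 GMT")
#   -> datetime(2025, 1, 14, 9, 30, tzinfo=timezone.utc)
# A naive timestamp (no zone information) is assumed to be UTC.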
|
|
|
def fetch_rss_feeds():
    logging.info("Starting fetch_rss_feeds")
    articles = []
    cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)
    session = create_http_session()

    if not RSS_FEEDS:
        logging.error("RSS_FEEDS is empty in foodie_config.py")
        return articles

    logging.info(f"Processing feeds: {RSS_FEEDS}")
    for feed_url in RSS_FEEDS:
        for attempt in range(MAX_RETRIES):
            logging.info(f"Processing feed: {feed_url} (attempt {attempt + 1})")
            try:
                response = session.get(feed_url, timeout=FEED_TIMEOUT)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'xml')
                items = soup.find_all('item')

                feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
                for item in items:
                    try:
                        title = item.find('title').text.strip() if item.find('title') else "Untitled"
                        link = item.find('link').text.strip() if item.find('link') else ""
                        pub_date = item.find('pubDate')
                        pub_date = parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc)

                        if pub_date < cutoff_date:
                            logging.info(f"Skipping old article: {title} (Published: {pub_date})")
                            continue

                        description = item.find('description')
                        summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description else ""
                        content = item.find('content:encoded')
                        content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content else summary

                        articles.append({
                            "title": title,
                            "link": link,
                            "summary": summary,
                            "content": content_text,
                            "feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title,
                            "pub_date": pub_date
                        })
                        logging.debug(f"Processed article: {title}")
                    except Exception as e:
                        logging.warning(f"Error processing entry in {feed_url}: {e}")
                        continue
                logging.info(f"Filtered to {len(articles)} articles from {feed_url}")
                break
            except Exception as e:
                logging.error(f"Failed to fetch RSS feed {feed_url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    time.sleep(RETRY_BACKOFF * (2 ** attempt))
                continue
    articles.sort(key=lambda x: x["pub_date"], reverse=True)
    logging.info(f"Total RSS articles fetched: {len(articles)}")
    return articles
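# Shape of each returned article (illustrative example, values invented):
#   {
#       "title": "New Ramen Pop-Up Opens Downtown",
#       "link": "https://example.com/ramen-pop-up",
#       "summary": "Plain text extracted from <description>",
#       "content": "Plain text from <content:encoded>, or the summary",
#       "feed_title": "Example Food Blog",
#       "pub_date": datetime(2025, 1, 14, 9, 30, tzinfo=timezone.utc),
#   }
# Articles older than EXPIRATION_HOURS are dropped, and the list is sorted
# newest-first before being returned.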
|
|
|
def fetch_duckduckgo_news_context(title, hours=24):
    for attempt in range(MAX_RETRIES):
        try:
            with DDGS() as ddgs:
                results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
                titles = []
                for r in results:
                    try:
                        date_str = r["date"]
                        if '+00:00' in date_str:
                            dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
                        else:
                            dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
                        if dt > (datetime.now(timezone.utc) - timedelta(hours=hours)):
                            titles.append(r["title"].lower())
                    except ValueError as e:
                        logging.warning(f"Date parsing failed for '{date_str}': {e}")
                        continue
                context = " ".join(titles) if titles else "No recent news found within 24 hours"
                logging.info(f"DuckDuckGo News context for '{title}': {context}")
                return context
        except Exception as e:
            logging.warning(f"DuckDuckGo News context fetch failed for '{title}' (attempt {attempt + 1}): {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
            continue
    logging.error(f"Failed to fetch DuckDuckGo News context for '{title}' after {MAX_RETRIES} attempts")
    return title
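# Illustrative behavior (hypothetical headlines): on success this returns a
# space-joined string of recent lowercased headline text, e.g.
#   "sourdough trend hits grocery aisles bakers embrace heritage grains"
# If every attempt fails, the original article title is returned so the
# downstream scoring and summarization prompts still have usable text.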
|
|
|
def curate_from_rss(posted_titles_data, posted_titles, used_images_data, used_images):
    try:
        logging.debug(f"Using {len(posted_titles)} posted titles and {len(used_images)} used images")

        articles = fetch_rss_feeds()
        if not articles:
            logging.info("No RSS articles available")
            sleep_time = random.randint(1200, 1800)  # 20–30 minutes
            return None, None, sleep_time

        attempts = 0
        max_attempts = 10
        while attempts < max_attempts and articles:
            article = articles.pop(0)
            title = article["title"]
            link = article["link"]
            summary = article.get("summary", "")
            source_name = article.get("feed_title", "Unknown Source")
            original_source = f'<a href="{link}">{source_name}</a>'
            original_url = link  # Store for fallback

            if title in posted_titles:
                logging.info(f"Skipping already posted article: {title}")
                attempts += 1
                continue

            # Select author
            author = get_next_author_round_robin()
            if not author:
                logging.info(f"Skipping article '{title}' due to tweet rate limits for all authors")
                attempts += 1
                continue
            author_username = author["username"]
            logging.info(f"Selected author via round-robin: {author_username}")

            logging.info(f"Trying RSS Article: {title} from {source_name}")

            try:
                image_query, relevance_keywords, main_topic, skip, specific_term = smart_image_and_filter(title, summary)
            except Exception as e:
                logging.warning(f"Failed to process smart_image_and_filter for '{title}': {e}")
                attempts += 1
                continue

            if skip:
                logging.info(f"Skipping filtered RSS article: {title}")
                attempts += 1
                continue

            ddg_context = fetch_duckduckgo_news_context(title)
            scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
            interest_score = is_interesting(scoring_content)
            logging.info(f"Interest score for '{title}': {interest_score}")
            if interest_score < 6:
                logging.info(f"RSS Interest Too Low: {interest_score}")
                attempts += 1
                continue

            num_paragraphs = determine_paragraph_count(interest_score)
            extra_prompt = (
                f"Generate exactly {num_paragraphs} paragraphs.\n"
                f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
                f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
                f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
                f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
                f"Do not include emojis in the summary."
            )
            content_to_summarize = scoring_content
            final_summary = summarize_with_gpt4o(
                content_to_summarize,
                source_name,
                link,
                interest_score=interest_score,
                extra_prompt=extra_prompt
            )
            if not final_summary:
                logging.info(f"Summary failed for '{title}'")
                attempts += 1
                continue

            final_summary = insert_link_naturally(final_summary, source_name, link)

            post_data = {
                "title": generate_title_from_summary(final_summary),
                "content": final_summary,
                "status": "publish",
                "author": author_username,
                "categories": [generate_category_from_summary(final_summary)]
            }
            category = post_data["categories"][0]
            image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic, specific_term)
            if not image_url:
                logging.warning(f"Flickr image fetch failed for '{image_query}', trying fallback")
                image_url, image_source, uploader, page_url = get_image(image_query, specific_term)
                if not image_url:
                    logging.warning(f"All image uploads failed for '{title}' - posting without image")
                    image_source = None
                    uploader = None
                    page_url = None

            hook = get_dynamic_hook(post_data["title"]).strip()
            share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
            share_text = f"Check out this foodie gem! {post_data['title']}"
            share_text_encoded = quote(share_text)
            share_links_template = (
                f'<p>{share_prompt} '
                f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={share_text_encoded}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
                f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
            )

            post_data["content"] = f"{final_summary}\n\n{share_links_template.format(post_url='{post_url}', share_text=share_text_encoded)}"

            global is_posting
            is_posting = True
            try:
                post_id, post_url = post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    page_url=page_url,
                    interest_score=interest_score,
                    should_post_tweet=True,
                    summary=final_summary
                )
                if not post_id:
                    logging.warning(f"Failed to post to WordPress for '{title}', using original URL: {original_url}")
                    post_url = original_url
                else:
                    logging.info(f"Posted to WordPress for {author_username}: {post_url}")

                post_url_encoded = quote(post_url)
                post_data["content"] = f"{final_summary}\n\n{share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)}"
                if post_id:
                    post_to_wp(
                        post_data=post_data,
                        category=category,
                        link=link,
                        author=author,
                        image_url=None,  # Skip image re-upload
                        original_source=original_source,
                        image_source=image_source,
                        uploader=uploader,
                        page_url=page_url,
                        interest_score=interest_score,
                        post_id=post_id,
                        should_post_tweet=False
                    )

                timestamp = datetime.now(timezone.utc).isoformat()
                save_json_file(POSTED_TITLES_FILE, title, timestamp)
                posted_titles.add(title)
                logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")

                if image_url:
                    save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                    used_images.add(image_url)
                    logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")

                logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id or 'N/A'}) from RSS *****")
                sleep_time = random.randint(1200, 1800)  # 20–30 minutes
                return post_data, category, sleep_time

            except Exception as e:
                logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
                post_url = original_url
                timestamp = datetime.now(timezone.utc).isoformat()
                save_json_file(POSTED_TITLES_FILE, title, timestamp)
                posted_titles.add(title)
                logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
                if image_url:
                    save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                    used_images.add(image_url)
                    logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
                attempts += 1
            finally:
                is_posting = False

        logging.info("No interesting RSS article found after attempts")
        sleep_time = random.randint(1200, 1800)  # 20–30 minutes
        return None, None, sleep_time
    except Exception as e:
        logging.error(f"Unexpected error in curate_from_rss: {e}", exc_info=True)
        sleep_time = random.randint(1200, 1800)  # 20–30 minutes
        return None, None, sleep_time
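# Return contract (as consumed by run_rss_automator below): a tuple of
# (post_data, category, sleep_time). post_data and category are None when
# nothing was posted; sleep_time is always a randomized 20-30 minute delay
# in seconds that the caller sleeps before the next run.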
|
|
|
def run_rss_automator():
    lock_fd = None
    try:
        lock_fd = acquire_lock()
        update_system_activity(SCRIPT_NAME, "running", os.getpid())  # Record start
        logging.info("***** RSS Automator Launched *****")
        posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
        posted_titles = set(entry["title"] for entry in posted_titles_data)
        used_images_data = load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
        used_images = set(entry["title"] for entry in used_images_data if "title" in entry)
        post_data, category, sleep_time = curate_from_rss(posted_titles_data, posted_titles, used_images_data, used_images)
        if not post_data:
            logging.info("No postable RSS article found")
        logging.info("Completed RSS run")
        logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
        # Sleep while still marked as running
        time.sleep(sleep_time)
        update_system_activity(SCRIPT_NAME, "stopped")  # Record stop after sleep
        return post_data, category, sleep_time
    except Exception as e:
        logging.error(f"Fatal error in run_rss_automator: {e}", exc_info=True)
        update_system_activity(SCRIPT_NAME, "stopped")  # Record stop on error
        sleep_time = random.randint(1200, 1800)  # 20–30 minutes
        logging.info(f"Run completed, sleep_time: {sleep_time} seconds")
        return None, None, sleep_time
    finally:
        if lock_fd:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
            lock_fd.close()
            if os.path.exists(LOCK_FILE):
                os.remove(LOCK_FILE)
|
if __name__ == "__main__":
    post_data, category, sleep_time = run_rss_automator()
    # logging.info(f"Run completed, sleep_time: {sleep_time} seconds")