# foodie_automator_rss.py
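"""
Curate articles from the RSS feeds configured in foodie_config and republish
them to WordPress.

Flow: fetch every feed in RSS_FEEDS concurrently, keep entries that pass a
keyword-based interest filter, summarize the chosen article with GPT-4o (via
foodie_utils), attach an image and call-to-action, and post it to WordPress.
Posted titles and used images are tracked in JSON files so repeated runs do
not duplicate content.
"""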

import requests
import random
import time
import logging
import os
import json
import signal
import sys
import email.utils
from datetime import datetime, timedelta, timezone
from bs4 import BeautifulSoup
from openai import OpenAI
from urllib.parse import quote
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from foodie_config import (
    RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
    get_clean_source_name, X_API_CREDENTIALS
)
from foodie_utils import (
    load_json_file, save_json_file, get_image, generate_image_query,
    upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
    is_interesting, generate_title_from_summary, summarize_with_gpt4o,
    generate_category_from_summary, post_to_wp, prepare_post_data,
    select_best_author, smart_image_and_filter
)
from foodie_hooks import get_dynamic_hook, select_best_cta
import feedparser
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Any, Optional
from dotenv import load_dotenv

load_dotenv()

# Flag to indicate if we're in the middle of posting
is_posting = False


def signal_handler(sig, frame):
    logging.info("Received termination signal, checking if safe to exit...")
    if is_posting:
        logging.info("Currently posting, will exit after completion.")
    else:
        logging.info("Safe to exit immediately.")
        sys.exit(0)


signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log"
LOG_PRUNE_DAYS = 30
MAX_WORKERS = 5
RATE_LIMIT_DELAY = 1
FEED_TIMEOUT = 30
MAX_RETRIES = 3

POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
EXPIRATION_HOURS = 24
IMAGE_EXPIRATION_DAYS = 7

posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
posted_titles = set(entry["title"] for entry in posted_titles_data)
used_images = set(
    entry["title"]
    for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS)
    if "title" in entry
)
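
# Dedupe state: posted_rss_titles.json and used_images.json act as rolling
# caches keyed by title/image URL. load_json_file and save_json_file live in
# foodie_utils; they are assumed to store {"title": ..., "timestamp": ...}
# entries and to drop anything older than the expiration window passed in.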


def setup_logging():
    """Prune log lines older than LOG_PRUNE_DAYS, then log to both file and console."""
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, 'r') as f:
            lines = f.readlines()
        cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
        pruned_lines = []
        for line in lines:
            try:
                timestamp = datetime.strptime(line[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
                if timestamp > cutoff:
                    pruned_lines.append(line)
            except ValueError:
                logging.warning(f"Skipping malformed log line: {line.strip()[:50]}...")
                continue
        with open(LOG_FILE, 'w') as f:
            f.writelines(pruned_lines)

    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logging.getLogger().addHandler(console_handler)
    logging.info("Logging initialized for foodie_automator_rss.py")


setup_logging()
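
# The session factory below mounts a retrying adapter on both http:// and
# https://: GET/POST requests are retried on connection errors and on
# 429/500/502/503/504 responses with exponential backoff. Note that
# Retry(allowed_methods=...) requires urllib3 >= 1.26; older releases used
# method_whitelist instead.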


def create_http_session() -> requests.Session:
    session = requests.Session()
    retry_strategy = Retry(
        total=MAX_RETRIES,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST"]
    )
    adapter = HTTPAdapter(
        max_retries=retry_strategy,
        pool_connections=10,
        pool_maxsize=10
    )
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.FeedParserDict]:
    try:
        response = session.get(feed_url, timeout=FEED_TIMEOUT)
        response.raise_for_status()
        feed = feedparser.parse(response.content)

        if feed.bozo:
            logging.warning(f"Feed parsing error for {feed_url}: {feed.bozo_exception}")
            return None

        return feed
    except Exception as e:
        logging.error(f"Error fetching feed {feed_url}: {str(e)}")
        return None
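
# Worked example for the scorer below (the actual keyword lists come from
# foodie_config, so the specific terms are assumptions): a title matching a
# RECIPE_KEYWORD (+3) and a PROMO_KEYWORD (+2) passes the >= 4 cutoff when the
# summary is at least 100 words; if the summary is shorter (-2) it also needs
# a PRODUCT_KEYWORD hit (+1) to reach 4. A single HOME_KEYWORD (+1) can never
# pass on its own.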


def is_interesting_rss(title: str, summary: str, pub_date: datetime) -> bool:
    """Cheap keyword-based pre-filter: score the entry and require score >= 4.
    Entries older than seven days are rejected outright."""
    try:
        if not title or not summary:
            return False

        if datetime.now(timezone.utc) - pub_date > timedelta(days=7):
            return False

        score = 0
        title_lower = title.lower()
        if any(keyword in title_lower for keyword in RECIPE_KEYWORDS):
            score += 3
        if any(keyword in title_lower for keyword in PROMO_KEYWORDS):
            score += 2
        if any(keyword in title_lower for keyword in HOME_KEYWORDS):
            score += 1

        summary_lower = summary.lower()
        if len(summary.split()) < 100:
            score -= 2
        if any(keyword in summary_lower for keyword in PRODUCT_KEYWORDS):
            score += 1

        return score >= 4
    except Exception as e:
        logging.error(f"Error in is_interesting_rss: {str(e)}")
        return False


def fetch_rss_feeds() -> List[Dict[str, Any]]:
    session = create_http_session()
    articles = []

    try:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = []
            for feed_url in RSS_FEEDS:
                future = executor.submit(process_feed, feed_url, session)
                futures.append(future)

            for future in as_completed(futures):
                try:
                    feed_articles = future.result()
                    articles.extend(feed_articles)
                except Exception as e:
                    logging.error(f"Error processing feed: {str(e)}")
                    continue

        return articles
    except Exception as e:
        logging.error(f"Error in fetch_rss_feeds: {str(e)}")
        return []


def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any]]:
    try:
        feed = fetch_feed(feed_url, session)
        if not feed:
            return []

        articles = []
        for entry in feed.entries:
            try:
                # Entries without a parsed publish date raise here and are
                # skipped by the except below.
                pub_date = datetime.fromtimestamp(time.mktime(entry.published_parsed), tz=timezone.utc)

                article = {
                    "title": entry.title,
                    "link": entry.link,
                    "summary": entry.summary if hasattr(entry, 'summary') else entry.description,
                    "content": getattr(entry, 'content', [{'value': ''}])[0].value,
                    "feed_title": get_clean_source_name(feed_url),
                    "pub_date": pub_date
                }

                if is_interesting_rss(article["title"], article["summary"], pub_date):
                    articles.append(article)

                time.sleep(RATE_LIMIT_DELAY)
            except Exception as e:
                logging.warning(f"Error processing entry: {str(e)}")
                continue

        return articles
    except Exception as e:
        logging.error(f"Error processing feed {feed_url}: {str(e)}")
        return []


def parse_date(date_str):
    try:
        parsed_date = email.utils.parsedate_to_datetime(date_str)
        if parsed_date.tzinfo is None:
            parsed_date = parsed_date.replace(tzinfo=timezone.utc)
        return parsed_date
    except Exception as e:
        logging.error(f"Failed to parse date '{date_str}': {e}")
        return datetime.now(timezone.utc)


def curate_from_rss():
    """Try fetched articles until one passes all filters, then summarize and
    post it to WordPress. Returns (post_data, category, sleep_seconds)."""
    articles = fetch_rss_feeds()
    if not articles:
        print("No RSS articles available")
        logging.info("No RSS articles available")
        # Return a fallback sleep interval so run_rss_automator() can sleep safely.
        return None, None, random.randint(600, 1800)

    attempts = 0
    max_attempts = 10
    while attempts < max_attempts and articles:
        article = articles.pop(0)
        title = article["title"]
        link = article["link"]
        summary = article["summary"]
        content = article["content"]
        source_name = article["feed_title"]
        original_source = f'<a href="{link}">{source_name}</a>'

        if title in posted_titles:
            print(f"Skipping already posted article: {title}")
            logging.info(f"Skipping already posted article: {title}")
            attempts += 1
            continue

        print(f"Trying RSS Article: {title} from {source_name}")
        logging.info(f"Trying RSS Article: {title} from {source_name}")

        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
        if skip:
            print(f"Skipping filtered RSS article: {title}")
            logging.info(f"Skipping filtered RSS article: {title}")
            attempts += 1
            continue

        scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
        interest_score = is_interesting(scoring_content)
        logging.info(f"Interest score for '{title}': {interest_score}")
        if interest_score < 6:
            print(f"RSS Interest Too Low: {interest_score}")
            logging.info(f"RSS Interest Too Low: {interest_score}")
            attempts += 1
            continue

        num_paragraphs = determine_paragraph_count(interest_score)
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs. "
            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details. "
            f"Do NOT introduce unrelated concepts. Expand on the core idea with relevant context about its appeal or significance. "
            "Do not include emojis in the summary."
        )
        content_to_summarize = scoring_content
        final_summary = summarize_with_gpt4o(
            content_to_summarize,
            source_name,
            link,
            interest_score=interest_score,
            extra_prompt=extra_prompt
        )
        if not final_summary:
            logging.info(f"Summary failed for '{title}'")
            attempts += 1
            continue

        final_summary = insert_link_naturally(final_summary, source_name, link)
        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
        if not post_data:
            attempts += 1
            continue

        hook = get_dynamic_hook(post_data["title"]).strip()
        cta = select_best_cta(post_data["title"], final_summary, post_url=None)

        post_data["content"] = f"{final_summary}\n\n{cta}"
        global is_posting
        is_posting = True
        try:
            post_id, post_url = post_to_wp(
                post_data=post_data,
                category=category,
                link=link,
                author=author,
                image_url=image_url,
                original_source=original_source,
                image_source=image_source,
                uploader=uploader,
                pixabay_url=pixabay_url,
                interest_score=interest_score
            )
        finally:
            is_posting = False

        if post_id:
            # Update the published post with a CTA that now knows its own URL.
            cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
            post_data["content"] = f"{final_summary}\n\n{cta}"
            is_posting = True
            try:
                post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
                    author=author,
                    image_url=image_url,
                    original_source=original_source,
                    image_source=image_source,
                    uploader=uploader,
                    pixabay_url=pixabay_url,
                    interest_score=interest_score,
                    post_id=post_id
                )
            finally:
                is_posting = False

            timestamp = datetime.now(timezone.utc).isoformat()
            save_json_file(POSTED_TITLES_FILE, title, timestamp)
            posted_titles.add(title)
            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")

            if image_url:
                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                used_images.add(image_url)
                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")

            print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
            return post_data, category, random.randint(0, 1800)

        attempts += 1
        logging.info(f"WP posting failed for '{post_data['title']}'")

    print("No interesting RSS article found after attempts")
    logging.info("No interesting RSS article found after attempts")
    return None, None, random.randint(600, 1800)


def run_rss_automator():
    print(f"{datetime.now(timezone.utc)} - INFO - ***** RSS Automator Launched *****")
    logging.info("***** RSS Automator Launched *****")
    post_data, category, sleep_time = curate_from_rss()
    print(f"Sleeping for {sleep_time}s")
    logging.info(f"Completed run with sleep time: {sleep_time} seconds")
    time.sleep(sleep_time)
    return post_data, category, sleep_time


if __name__ == "__main__":
    run_rss_automator()
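
# A hypothetical wrapper for repeated runs (not defined here): since
# run_rss_automator() sleeps before returning, a bare loop in a caller module
# is enough, e.g.
#
#     from foodie_automator_rss import run_rss_automator
#
#     while True:
#         run_rss_automator()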