Initial commit of foodie automator scripts
This commit is contained in:
@@ -0,0 +1,330 @@
|
||||
import requests
|
||||
import random
|
||||
import time
|
||||
import logging
|
||||
import os
|
||||
import json
|
||||
import email.utils
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from bs4 import BeautifulSoup
|
||||
from openai import OpenAI
|
||||
from urllib.parse import quote
|
||||
from requests.packages.urllib3.util.retry import Retry
|
||||
from requests.adapters import HTTPAdapter
|
||||
from foodie_config import RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS, HOME_KEYWORDS, PRODUCT_KEYWORDS, SUMMARY_PERSONA_PROMPTS, CATEGORIES, get_clean_source_name
|
||||
from foodie_utils import (
|
||||
load_json_file, save_json_file, get_image, generate_image_query,
|
||||
upload_image_to_wp, determine_paragraph_count, insert_link_naturally, is_interesting,
|
||||
generate_title_from_summary, summarize_with_gpt4o, generate_category_from_summary, post_to_wp,
|
||||
prepare_post_data, select_best_author, smart_image_and_filter
|
||||
)
|
||||
from foodie_hooks import get_dynamic_hook, select_best_cta
|
||||
import feedparser
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
# --- Logging -----------------------------------------------------------------
# Destination of the log file and the age (in days) past which setup_logging()
# prunes old entries.
LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log"
LOG_PRUNE_DAYS = 30

# --- Feed retrieval ----------------------------------------------------------
# Size of the thread pool used to process feeds in parallel.
MAX_WORKERS = 5
# Pause, in seconds, used to throttle work inside process_feed().
RATE_LIMIT_DELAY = 1
# Per-request timeout, in seconds, when downloading a feed.
FEED_TIMEOUT = 30
# Upper bound on automatic retries for a failed HTTP request.
MAX_RETRIES = 3
|
||||
|
||||
def setup_logging():
    """Prune old entries from LOG_FILE, then configure root logging to append to it.

    Entries whose leading ``YYYY-MM-DD HH:MM:SS`` timestamp is older than
    LOG_PRUNE_DAYS days are removed. Lines that do not start with a timestamp
    (for example traceback continuation lines) share the fate of the
    timestamped entry that precedes them.

    No ``logging`` call is made before ``logging.basicConfig``: emitting a
    record through the unconfigured root logger installs a default stderr
    handler, after which ``basicConfig(filename=...)`` silently does nothing.
    """
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, 'r') as f:
            lines = f.readlines()

        # Timestamps are parsed as UTC, matching the cutoff. NOTE(review): the
        # logging default writes local time, so the retention window may be
        # off by the local UTC offset - negligible against a multi-day window.
        cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
        pruned_lines = []
        keep_current_entry = False  # fate of the most recent timestamped line
        for line in lines:
            try:
                timestamp = datetime.strptime(
                    line[:19], '%Y-%m-%d %H:%M:%S'
                ).replace(tzinfo=timezone.utc)
            except ValueError:
                # Continuation line: keep it only with the entry it belongs to.
                if keep_current_entry:
                    pruned_lines.append(line)
                continue
            keep_current_entry = timestamp > cutoff
            if keep_current_entry:
                pruned_lines.append(line)

        with open(LOG_FILE, 'w') as f:
            f.writelines(pruned_lines)

    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )
|
||||
|
||||
def create_http_session() -> requests.Session:
    """Return a ``requests`` session whose adapters retry failed requests.

    The same adapter is mounted for both ``http://`` and ``https://``. It
    retries GET and POST requests up to MAX_RETRIES times (backoff factor 1)
    when the response status is 429, 500, 502, 503 or 504, and uses a
    connection pool of 10.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=MAX_RETRIES,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "POST"],
        ),
        pool_connections=10,
        pool_maxsize=10,
    )

    http = requests.Session()
    for scheme in ("http://", "https://"):
        http.mount(scheme, retrying_adapter)
    return http
|
||||
|
||||
def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.FeedParserDict]:
    """Download and parse one RSS/Atom feed.

    Args:
        feed_url: URL of the feed to download.
        session: HTTP session used for the request.

    Returns:
        The parsed feed, or ``None`` when the download fails, the server
        answers with an error status, or nothing usable could be parsed.
        This function never raises; failures are logged.
    """
    try:
        response = session.get(feed_url, timeout=FEED_TIMEOUT)
        response.raise_for_status()
        feed = feedparser.parse(response.content)

        if feed.bozo:
            # ``bozo`` only means the document is not well-formed; feedparser
            # frequently still recovers the entries. Discard the feed only
            # when nothing was recovered.
            if not feed.entries:
                logging.warning(f"Feed parsing error for {feed_url}: {feed.get('bozo_exception')}")
                return None
            logging.warning(
                f"Feed {feed_url} is not well-formed ({feed.get('bozo_exception')}); "
                f"using the {len(feed.entries)} entries that parsed"
            )

        return feed
    except Exception as e:
        # Boundary for worker threads: report the failure and keep going.
        logging.error(f"Error fetching feed {feed_url}: {str(e)}")
        return None
|
||||
|
||||
def is_interesting_rss(title: str, summary: str, pub_date: datetime) -> bool:
    """Decide whether a feed entry is worth passing on to the curation step.

    An entry is rejected outright when its title or summary is empty or when
    it was published more than 7 days ago. Otherwise a keyword score is
    computed and the entry is accepted when the score reaches 4:

    * +3 if the title contains a RECIPE_KEYWORDS term
    * +2 if the title contains a PROMO_KEYWORDS term
    * +1 if the title contains a HOME_KEYWORDS term
    * -2 if the summary has fewer than 100 whitespace-separated words
    * +1 if the summary contains a PRODUCT_KEYWORDS term

    Args:
        title: Entry title.
        summary: Entry summary text.
        pub_date: Publication time. A naive value is interpreted as UTC so it
            can be compared with the timezone-aware current time.

    Returns:
        ``True`` when the entry passes the filter. Any unexpected error is
        logged and reported as ``False``.
    """
    try:
        # Basic validation
        if not title or not summary:
            return False

        # Subtracting a naive datetime from an aware one raises TypeError;
        # normalise first instead of failing the entry.
        if pub_date.tzinfo is None:
            pub_date = pub_date.replace(tzinfo=timezone.utc)

        # Check if content is too old
        if datetime.now(timezone.utc) - pub_date > timedelta(days=7):
            return False

        score = 0

        # Title analysis
        title_lower = title.lower()
        if any(keyword in title_lower for keyword in RECIPE_KEYWORDS):
            score += 3
        if any(keyword in title_lower for keyword in PROMO_KEYWORDS):
            score += 2
        if any(keyword in title_lower for keyword in HOME_KEYWORDS):
            score += 1

        # Content analysis
        summary_lower = summary.lower()
        if len(summary.split()) < 100:
            score -= 2
        if any(keyword in summary_lower for keyword in PRODUCT_KEYWORDS):
            score += 1

        return score >= 4
    except Exception as e:
        logging.error(f"Error in is_interesting_rss: {str(e)}")
        return False
|
||||
|
||||
def fetch_rss_feeds() -> List[Dict[str, Any]]:
    """Collect candidate articles from every URL in RSS_FEEDS.

    Feeds are processed concurrently by up to MAX_WORKERS threads sharing one
    HTTP session, which is closed once all workers have finished. A feed that
    fails is logged with its URL and skipped.

    Returns:
        The article dicts produced by process_feed, newest ``pub_date``
        first. An empty list when nothing could be fetched or when the fetch
        as a whole failed.
    """
    articles: List[Dict[str, Any]] = []

    try:
        # Context managers exit right-to-left: the executor is drained before
        # the session it lent to the workers is closed.
        with create_http_session() as session, \
                ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            future_to_url = {
                executor.submit(process_feed, feed_url, session): feed_url
                for feed_url in RSS_FEEDS
            }

            for future in as_completed(future_to_url):
                feed_url = future_to_url[future]
                try:
                    articles.extend(future.result())
                except Exception as e:
                    logging.error(f"Error processing feed {feed_url}: {str(e)}")
    except Exception as e:
        logging.error(f"Error in fetch_rss_feeds: {str(e)}")
        return []

    # Completion order is arbitrary; the consumer takes articles from the
    # front of the list expecting the newest one there.
    try:
        articles.sort(key=lambda article: article["pub_date"], reverse=True)
    except (KeyError, TypeError) as e:
        logging.warning(f"Could not sort articles by publication date: {str(e)}")

    return articles
|
||||
|
||||
def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any]]:
    """Fetch one feed and return the entries that pass is_interesting_rss.

    Args:
        feed_url: URL of the feed to process.
        session: HTTP session used to download the feed.

    Returns:
        A list of article dicts with the keys ``title``, ``link``,
        ``summary``, ``content``, ``feed_title`` and ``pub_date`` (a
        timezone-aware UTC datetime). Empty when the feed cannot be fetched
        or processed. Entries that cannot be processed are logged and skipped.
    """
    try:
        feed = fetch_feed(feed_url, session)
        # Throttle once per network request. Walking the parsed entries below
        # makes no requests, so no per-entry delay is needed.
        time.sleep(RATE_LIMIT_DELAY)
        if not feed:
            return []

        # Resolved once per feed; fall back to the URL for untitled feeds.
        feed_title = get_clean_source_name(feed.feed.get("title", feed_url))

        articles = []
        for entry in feed.entries:
            try:
                parsed_time = entry.get("published_parsed") or entry.get("updated_parsed")
                if parsed_time is None:
                    raise ValueError("entry has no parsable publication date")
                # feedparser normalises *_parsed to UTC, so the fields are used
                # as UTC directly; time.mktime would read them as local time.
                pub_date = datetime(*parsed_time[:6], tzinfo=timezone.utc)

                # feedparser exposes full content as a list of dicts carrying
                # the text under "value"; not every feed provides it.
                content_parts = entry.get("content") or []
                content = content_parts[0].get("value", "") if content_parts else ""

                article = {
                    "title": entry.title,
                    "link": entry.link,
                    "summary": entry.get("summary") or entry.get("description", ""),
                    "content": content,
                    "feed_title": feed_title,
                    "pub_date": pub_date
                }

                if is_interesting_rss(article["title"], article["summary"], pub_date):
                    articles.append(article)
            except Exception as e:
                logging.warning(f"Error processing entry: {str(e)}")
                continue

        return articles
    except Exception as e:
        logging.error(f"Error processing feed {feed_url}: {str(e)}")
        return []
|
||||
|
||||
def parse_date(date_str):
    """Parse a feed date string into a timezone-aware datetime.

    RFC 2822 dates (the RSS convention) are tried first, then ISO 8601 dates
    (the Atom convention, including a trailing ``Z``). A parsed value without
    timezone information is taken to be UTC.

    Args:
        date_str: The date text. ``None``, empty or non-string values are
            treated as unparsable.

    Returns:
        The parsed datetime, or the current UTC time when the input cannot be
        parsed (the failure is logged). This function never raises.
    """
    error = "empty or non-string value"
    if isinstance(date_str, str) and date_str.strip():
        text = date_str.strip()
        parsed_date = None

        try:
            parsed_date = email.utils.parsedate_to_datetime(text)
        except (TypeError, ValueError, IndexError, OverflowError) as e:
            # Older Python versions raise TypeError, newer ones ValueError.
            error = e

        if parsed_date is None:
            # fromisoformat only accepts a literal "Z" suffix on recent
            # Python versions; spell it as an explicit UTC offset instead.
            iso_text = text[:-1] + "+00:00" if text.endswith(("Z", "z")) else text
            try:
                parsed_date = datetime.fromisoformat(iso_text)
            except ValueError as e:
                error = e

        if parsed_date is not None:
            if parsed_date.tzinfo is None:
                parsed_date = parsed_date.replace(tzinfo=timezone.utc)
            return parsed_date

    logging.error(f"Failed to parse date '{date_str}': {error}")
    return datetime.now(timezone.utc)
|
||||
|
||||
def curate_from_rss():
    """Pick one RSS article, summarise it and publish it to WordPress.

    Candidates from fetch_rss_feeds are tried from the front of the list
    until one is published or 10 candidates have been rejected. A candidate
    is rejected when its title was already posted, smart_image_and_filter
    flags it, its interest score is below 6, summarisation fails, post data
    cannot be prepared, or WordPress posting fails.

    A published post is written twice: once to obtain its URL, then again
    (same ``post_id``) so the call-to-action can reference that URL. The
    title and the image URL are then recorded as used.

    Returns:
        ``(post_data, category, sleep_seconds)``. On success ``post_data``
        and ``category`` describe the published post and ``sleep_seconds`` is
        a random value in 0-1800. When nothing was published the first two
        items are ``None`` and ``sleep_seconds`` is a random value in
        600-1800, so callers can always pass it to ``time.sleep``.

    NOTE(review): ``posted_titles``, ``used_images``, ``POSTED_TITLES_FILE``
    and ``USED_IMAGES_FILE`` are read here but are not defined or imported in
    the visible part of this module - confirm where they are meant to come
    from, otherwise the first duplicate check raises NameError.
    """
    # Function-scope import: only this block needs it.
    from html import escape

    articles = fetch_rss_feeds()
    if not articles:
        print("No RSS articles available")
        logging.info("No RSS articles available")
        # Same back-off as the "nothing suitable" path below; returning None
        # here made the caller's time.sleep() raise TypeError.
        return None, None, random.randint(600, 1800)

    attempts = 0
    max_attempts = 10
    while attempts < max_attempts and articles:
        article = articles.pop(0)  # Next candidate from the front of the list
        title = article["title"]
        link = article["link"]
        summary = article["summary"]
        # Not every producer of article dicts supplies full content.
        content = article.get("content", "")
        feed_title = article["feed_title"]
        source_name = feed_title[0] if isinstance(feed_title, tuple) and len(feed_title) > 0 else feed_title
        # Link and source name come from the remote feed: escape them before
        # embedding them in markup.
        original_source = f'<a href="{escape(str(link), quote=True)}">{escape(str(source_name))}</a>'

        if title in posted_titles:
            print(f"Skipping already posted article: {title}")
            logging.info(f"Skipping already posted article: {title}")
            attempts += 1
            continue

        print(f"Trying RSS Article: {title} from {source_name}")
        logging.info(f"Trying RSS Article: {title} from {source_name}")

        image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
        if skip:
            print(f"Skipping filtered RSS article: {title}")
            logging.info(f"Skipping filtered RSS article: {title}")
            attempts += 1
            continue

        # Score using title, summary, and content
        scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
        interest_score = is_interesting(scoring_content)
        logging.info(f"Interest score for '{title}': {interest_score}")
        if interest_score < 6:
            print(f"RSS Interest Too Low: {interest_score}")
            logging.info(f"RSS Interest Too Low: {interest_score}")
            attempts += 1
            continue

        num_paragraphs = determine_paragraph_count(interest_score)
        extra_prompt = (
            f"Generate exactly {num_paragraphs} paragraphs. "
            f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details. "
            f"Do NOT introduce unrelated concepts. Expand on the core idea with relevant context about its appeal or significance."
        )
        final_summary = summarize_with_gpt4o(
            scoring_content,
            source_name,
            link,
            interest_score=interest_score,
            extra_prompt=extra_prompt
        )
        if not final_summary:
            logging.info(f"Summary failed for '{title}'")
            attempts += 1
            continue

        final_summary = insert_link_naturally(final_summary, source_name, link)
        post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
        if not post_data:
            attempts += 1
            continue

        # Arguments shared by the initial post and the follow-up update.
        wp_kwargs = dict(
            post_data=post_data,
            category=category,
            link=link,
            author=author,
            image_url=image_url,
            original_source=original_source,
            image_source=image_source,
            uploader=uploader,
            pixabay_url=pixabay_url,
            interest_score=interest_score
        )

        # First pass: the post URL is not known yet.
        cta = select_best_cta(post_data["title"], final_summary, post_url=None)
        post_data["content"] = f"{final_summary}\n\n{cta}"
        post_id, post_url = post_to_wp(**wp_kwargs)

        if post_id:
            # Second pass: rebuild the call-to-action with the real URL and
            # update the post that was just created.
            cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
            post_data["content"] = f"{final_summary}\n\n{cta}"
            post_to_wp(post_id=post_id, **wp_kwargs)

            timestamp = datetime.now(timezone.utc).isoformat()
            save_json_file(POSTED_TITLES_FILE, title, timestamp)
            posted_titles.add(title)
            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")

            if image_url:
                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
                used_images.add(image_url)
                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")

            print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
            return post_data, category, random.randint(0, 1800)

        attempts += 1
        logging.info(f"WP posting failed for '{post_data['title']}'")

    print("No interesting RSS article found after attempts")
    logging.info("No interesting RSS article found after attempts")
    return None, None, random.randint(600, 1800)
|
||||
|
||||
def run_rss_automator():
    """Run one curation cycle, then sleep for the interval it requested.

    Returns:
        ``(post_data, category, sleep_time)`` as produced by curate_from_rss,
        with ``sleep_time`` always a number of seconds.
    """
    print(f"{datetime.now(timezone.utc)} - INFO - ***** RSS Automator Launched *****")
    logging.info("***** RSS Automator Launched *****")
    post_data, category, sleep_time = curate_from_rss()
    if sleep_time is None:
        # time.sleep(None) raises TypeError; fall back to the same back-off
        # range the curator uses when it finds nothing to post.
        sleep_time = random.randint(600, 1800)
    print(f"Sleeping for {sleep_time}s")
    logging.info(f"Completed run with sleep time: {sleep_time} seconds")
    time.sleep(sleep_time)
    return post_data, category, sleep_time
|
||||
|
||||
if __name__ == "__main__":
    # Nothing else in this module calls setup_logging(); without it the root
    # logger stays at its default WARNING level with no file handler, so the
    # INFO records emitted during the run never reach LOG_FILE.
    setup_logging()
    run_rss_automator()
|
||||
Reference in New Issue
Block a user