@@ -16,7 +16,7 @@ from requests.packages.urllib3.util.retry import Retry
 from requests.adapters import HTTPAdapter
 from foodie_config import (
     RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
-    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
+    HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES, CTAS,
     get_clean_source_name, X_API_CREDENTIALS
 )
 from foodie_utils import (
@@ -24,12 +24,9 @@ from foodie_utils import (
     upload_image_to_wp, determine_paragraph_count, insert_link_naturally,
     is_interesting, generate_title_from_summary, summarize_with_gpt4o,
     generate_category_from_summary, post_to_wp, prepare_post_data,
-    select_best_author, smart_image_and_filter
+    select_best_author, smart_image_and_filter, get_flickr_image_via_ddg
 )
 from foodie_hooks import get_dynamic_hook, select_best_cta
-import feedparser
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import List, Dict, Any, Optional
 from dotenv import load_dotenv
 
 load_dotenv()
@@ -50,9 +47,7 @@ signal.signal(signal.SIGINT, signal_handler)
 
 LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log"
 LOG_PRUNE_DAYS = 30
-MAX_WORKERS = 5
-FEED_TIMEOUT = 15
-RATE_LIMIT_DELAY = 1
+FEED_TIMEOUT = 30
 MAX_RETRIES = 3
 
 POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
@@ -90,6 +85,7 @@ def setup_logging():
     console_handler = logging.StreamHandler()
     console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
     logging.getLogger().addHandler(console_handler)
+    logging.getLogger("requests").setLevel(logging.WARNING)
     logging.info("Logging initialized for foodie_automator_rss.py")
 
 setup_logging()
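
A side note on the added logging.getLogger("requests") line: it raises the threshold for the third-party requests logger only, so per-request chatter stays out of the DEBUG-level log file while the script's own debug logging is unaffected. A minimal standalone sketch of that effect (illustrative only, not part of the patch):

    import logging

    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("requests").setLevel(logging.WARNING)

    logging.debug("application debug line")              # still emitted
    logging.getLogger("requests").debug("wire chatter")  # suppressed: below WARNING
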
@@ -98,8 +94,8 @@ def create_http_session() -> requests.Session:
     session = requests.Session()
     retry_strategy = Retry(
         total=MAX_RETRIES,
-        backoff_factor=2,  # Increased backoff factor for better retry handling
-        status_forcelist=[429, 500, 502, 503, 504, 403],  # Added 403 to retry list
+        backoff_factor=2,
+        status_forcelist=[403, 429, 500, 502, 503, 504],
         allowed_methods=["GET", "POST"]
     )
     adapter = HTTPAdapter(
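
For context on the Retry change above: 403 responses are now retried along with 429/5xx, and backoff_factor=2 yields exponentially growing sleeps between attempts (whether the first retry sleeps at all depends on the installed urllib3 version). A standalone sketch that mirrors the patched settings against a placeholder URL, rather than importing the project code:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry  # 'allowed_methods' needs urllib3 >= 1.26

    retry = Retry(
        total=3,           # matches MAX_RETRIES in the patch
        backoff_factor=2,  # sleeps roughly 2s, 4s, 8s (older urllib3 skips the first sleep)
        status_forcelist=[403, 429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST"],
    )
    session = requests.Session()
    session.mount("http://", HTTPAdapter(max_retries=retry))
    session.mount("https://", HTTPAdapter(max_retries=retry))

    try:
        resp = session.get("https://example.com/feed.xml", timeout=30)  # placeholder URL
        resp.raise_for_status()
    except requests.exceptions.RetryError:
        print("retries exhausted on a retried status code")
    except requests.RequestException as exc:
        print(f"request failed: {exc}")
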
@@ -109,153 +105,83 @@ def create_http_session() -> requests.Session:
     )
     session.mount("http://", adapter)
     session.mount("https://", adapter)
-    # Add a realistic User-Agent header
     session.headers.update({
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
     })
     return session
 
-def fetch_feed(feed_url: str, session: requests.Session) -> Optional[feedparser.FeedParserDict]:
-    logging.debug(f"Fetching feed: {feed_url}")
-    try:
-        response = session.get(feed_url, timeout=15)  # Reduced timeout to 15 seconds
-        response.raise_for_status()
-        feed = feedparser.parse(response.content)
-        if feed.bozo:
-            logging.warning(f"Feed parsing error for {feed_url}: {feed.bozo_exception}")
-            return None
-        logging.debug(f"Successfully fetched feed: {feed_url}")
-        return feed
-    except Exception as e:
-        logging.error(f"Error fetching feed {feed_url}: {str(e)}")
-        return None
-
-def is_interesting_rss(title: str, summary: str, pub_date: datetime) -> bool:
-    try:
-        if not title or not summary:
-            return False
-        if datetime.now(timezone.utc) - pub_date > timedelta(days=7):
-            return False
-        score = 0
-        title_lower = title.lower()
-        if any(keyword in title_lower for keyword in RECIPE_KEYWORDS):
-            score += 3
-        if any(keyword in title_lower for keyword in PROMO_KEYWORDS):
-            score += 2
-        if any(keyword in title_lower for keyword in HOME_KEYWORDS):
-            score += 1
-        summary_lower = summary.lower()
-        if len(summary.split()) < 100:
-            score -= 2
-        if any(keyword in summary_lower for keyword in PRODUCT_KEYWORDS):
-            score += 1
-        return score >= 4
-    except Exception as e:
-        logging.error(f"Error in is_interesting_rss: {str(e)}")
-        return False
-
-def fetch_rss_feeds() -> List[Dict[str, Any]]:
-    logging.info("Starting fetch_rss_feeds")
-    session = create_http_session()
-    articles = []
-    try:
-        logging.info(f"Processing {len(RSS_FEEDS)} feeds: {RSS_FEEDS}")
-        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-            futures = []
-            for feed_url in RSS_FEEDS:
-                logging.debug(f"Scheduling feed: {feed_url}")
-                future = executor.submit(process_feed, feed_url, session)
-                futures.append(future)
-            for future in as_completed(futures):
-                try:
-                    feed_articles = future.result()
-                    logging.info(f"Completed feed processing, got {len(feed_articles)} articles")
-                    articles.extend(feed_articles)
-                except Exception as e:
-                    logging.error(f"Error processing feed in future: {str(e)}")
-                    continue
-        logging.info(f"Finished fetch_rss_feeds, total articles: {len(articles)}")
-        return articles
-    except Exception as e:
-        logging.error(f"Error in fetch_rss_feeds: {str(e)}")
-        return []
-
-def process_feed(feed_url: str, session: requests.Session) -> List[Dict[str, Any]]:
-    logging.info(f"Processing feed: {feed_url}")
-    try:
-        feed = fetch_feed(feed_url, session)
-        if not feed:
-            logging.warning(f"No feed data for {feed_url}")
-            return []
-        articles = []
-        logging.debug(f"Feed entries count: {len(feed.entries)}")
-        for entry in feed.entries:
-            try:
-                logging.debug(f"Processing entry: {entry.get('title', 'No title')}")
-                pub_date = datetime.fromtimestamp(time.mktime(entry.published_parsed), tz=timezone.utc)
-                # Safely extract content
-                content = ""
-                if hasattr(entry, 'content') and isinstance(entry.content, list) and len(entry.content) > 0:
-                    content_item = entry.content[0]
-                    if isinstance(content_item, dict) and 'value' in content_item:
-                        content = content_item['value']
-                    elif hasattr(content_item, 'value'):
-                        content = content_item.value
-                elif hasattr(entry, 'description'):
-                    content = entry.description
-                elif hasattr(entry, 'summary'):
-                    content = entry.summary
-                article = {
-                    "title": entry.title,
-                    "link": entry.link,
-                    "summary": entry.summary if hasattr(entry, 'summary') else entry.description if hasattr(entry, 'description') else "",
-                    "content": content,
-                    "feed_title": get_clean_source_name(feed_url),
-                    "pub_date": pub_date
-                }
-                if is_interesting_rss(article["title"], article["summary"], pub_date):
-                    logging.info(f"Interesting article found: {article['title']}")
-                    articles.append(article)
-                time.sleep(RATE_LIMIT_DELAY)
-            except Exception as e:
-                logging.warning(f"Error processing entry in {feed_url}: {str(e)}")
-                continue
-        logging.info(f"Filtered to {len(articles)} articles from {feed_url}")
-        return articles
-    except Exception as e:
-        logging.error(f"Error processing feed {feed_url}: {str(e)}")
-        return []
-
-def parse_date(date_str):
-    try:
-        parsed_date = email.utils.parsedate_to_datetime(date_str)
-        if parsed_date.tzinfo is None:
-            parsed_date = parsed_date.replace(tzinfo=timezone.utc)
-        return parsed_date
-    except Exception as e:
-        logging.error(f"Failed to parse date '{date_str}': {e}")
-        return datetime.now(timezone.utc)
+def parse_date(date_str):
+    try:
+        parsed_date = email.utils.parsedate_to_datetime(date_str)
+        if parsed_date.tzinfo is None:
+            parsed_date = parsed_date.replace(tzinfo=timezone.utc)
+        return parsed_date
+    except Exception as e:
+        logging.error(f"Failed to parse date '{date_str}': {e}")
+        return datetime.now(timezone.utc)
+
+def fetch_rss_feeds():
+    logging.info("Starting fetch_rss_feeds")
+    articles = []
+    cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)
+    session = create_http_session()
+    if not RSS_FEEDS:
+        logging.error("RSS_FEEDS is empty in foodie_config.py")
+        return articles
+    for feed_url in RSS_FEEDS:
+        logging.info(f"Processing feed: {feed_url}")
+        try:
+            response = session.get(feed_url, timeout=FEED_TIMEOUT)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'xml')
+            items = soup.find_all('item')
+            feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
+            for item in items:
+                try:
+                    title = item.find('title').text.strip() if item.find('title') else "Untitled"
+                    link = item.find('link').text.strip() if item.find('link') else ""
+                    pub_date = item.find('pubDate')
+                    pub_date = parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc)
+                    if pub_date < cutoff_date:
+                        logging.info(f"Skipping old article: {title} (Published: {pub_date})")
+                        continue
+                    description = item.find('description')
+                    summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description else ""
+                    content = item.find('content:encoded')
+                    content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content else summary
+                    articles.append({
+                        "title": title,
+                        "link": link,
+                        "summary": summary,
+                        "content": content_text,
+                        "feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title,
+                        "pub_date": pub_date
+                    })
+                    logging.debug(f"Processed article: {title}")
+                except Exception as e:
+                    logging.warning(f"Error processing entry in {feed_url}: {e}")
+                    continue
+            logging.info(f"Finished processing {feed_url}, found {len(articles)} articles")
+        except Exception as e:
+            logging.error(f"Failed to fetch RSS feed {feed_url}: {e}")
+            continue
+    articles.sort(key=lambda x: x["pub_date"], reverse=True)
+    logging.info(f"Total RSS articles fetched: {len(articles)}")
+    return articles
 
 def curate_from_rss():
     articles = fetch_rss_feeds()
     if not articles:
         print("No RSS articles available")
         logging.info("No RSS articles available")
-        return None, None, None
+        return None, None, random.randint(600, 1800)
 
     attempts = 0
     max_attempts = 10
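
The new parse_date() leans on email.utils.parsedate_to_datetime, which covers the RFC 822/2822 pubDate strings most feeds emit; naive results are coerced to UTC and unparseable strings fall back to the current time. A quick standalone check of that behavior (the helper is copied here without logging, purely for illustration):

    import email.utils
    from datetime import datetime, timezone

    def parse_date(date_str):
        # Same logic as the patched helper, minus the logging call.
        try:
            parsed_date = email.utils.parsedate_to_datetime(date_str)
            if parsed_date.tzinfo is None:
                parsed_date = parsed_date.replace(tzinfo=timezone.utc)
            return parsed_date
        except Exception:
            return datetime.now(timezone.utc)

    print(parse_date("Tue, 10 Jun 2025 14:30:00 GMT"))    # timezone-aware UTC datetime
    print(parse_date("Tue, 10 Jun 2025 14:30:00 -0400"))  # keeps the -04:00 offset
    print(parse_date("not a date"))                       # falls back to "now" in UTC

One related assumption: BeautifulSoup(response.content, 'xml') in the new fetch_rss_feeds uses bs4's XML mode, which is backed by lxml, so lxml needs to be installed for this code path to work.
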
@@ -320,6 +246,11 @@ def curate_from_rss():
             attempts += 1
             continue
 
+        # Fetch image
+        image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(image_query, relevance_keywords)
+        if not image_url:
+            image_url, image_source, uploader, page_url = get_image(image_query)
+
         hook = get_dynamic_hook(post_data["title"]).strip()
         cta = select_best_cta(post_data["title"], final_summary, post_url=None)
 
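
The block added above is a two-step image lookup: get_flickr_image_via_ddg runs first, and get_image is only consulted when no URL comes back. The 4-tuple shape (image_url, image_source, uploader, page_url) is inferred from the diff; the helpers below are stand-ins, not the real foodie_utils implementations:

    from typing import List, Optional, Tuple

    ImageResult = Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]

    def get_flickr_image_via_ddg(query: str, keywords: List[str]) -> ImageResult:
        # Stand-in: the real helper searches for Flickr images via DuckDuckGo.
        return None, None, None, None

    def get_image(query: str) -> ImageResult:
        # Stand-in for the pre-existing fallback image source.
        return "https://example.com/fallback.jpg", "Example", "uploader", "https://example.com/page"

    def fetch_image(query: str, keywords: List[str]) -> ImageResult:
        # Same shape as the patched block: primary source first, generic fallback second.
        image_url, image_source, uploader, page_url = get_flickr_image_via_ddg(query, keywords)
        if not image_url:
            image_url, image_source, uploader, page_url = get_image(query)
        return image_url, image_source, uploader, page_url

    print(fetch_image("sourdough bread", ["baking", "bread"]))
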
@@ -338,7 +269,7 @@ def curate_from_rss():
             uploader=uploader,
             pixabay_url=pixabay_url,
             interest_score=interest_score,
-            should_post_tweet=True  # Post the X tweet on the first call
+            should_post_tweet=True
         )
     finally:
         is_posting = False
@@ -360,7 +291,7 @@ def curate_from_rss():
             pixabay_url=pixabay_url,
             interest_score=interest_score,
             post_id=post_id,
-            should_post_tweet=False  # Skip X tweet on the update call
+            should_post_tweet=False
         )
     finally:
         is_posting = False