use cursor to optimize files
This commit is contained in:
+213
-282
@@ -10,6 +10,7 @@ import sys
|
||||
import re
|
||||
import email.utils
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import List, Dict, Optional, Tuple, Set
|
||||
from bs4 import BeautifulSoup
|
||||
from openai import OpenAI
|
||||
from urllib.parse import quote
|
||||
@@ -18,7 +19,8 @@ from requests.adapters import HTTPAdapter
|
||||
from foodie_config import (
|
||||
RSS_FEEDS, RSS_FEED_NAMES, AUTHORS, RECIPE_KEYWORDS, PROMO_KEYWORDS,
|
||||
HOME_KEYWORDS, PRODUCT_KEYWORDS, PERSONA_CONFIGS, CATEGORIES,
|
||||
get_clean_source_name, X_API_CREDENTIALS
|
||||
get_clean_source_name, X_API_CREDENTIALS, FILE_PATHS, EXPIRATION_DAYS,
|
||||
IMAGE_EXPIRATION_DAYS, LIGHT_TASK_MODEL
|
||||
)
|
||||
from foodie_utils import (
|
||||
load_json_file, save_json_file, get_image, generate_image_query,
|
||||
@@ -30,42 +32,50 @@ from foodie_utils import (
|
||||
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv()
|
||||
|
||||
# Global state
|
||||
is_posting = False
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def signal_handler(sig, frame):
|
||||
logging.info("Received termination signal, checking if safe to exit...")
|
||||
if is_posting:
|
||||
logging.info("Currently posting, will exit after completion.")
|
||||
else:
|
||||
logging.info("Safe to exit immediately.")
|
||||
sys.exit(0)
|
||||
class RSSScraper:
|
||||
def __init__(self):
|
||||
self.setup_logging()
|
||||
self.setup_signal_handlers()
|
||||
self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
self.posted_titles = self.load_posted_titles()
|
||||
self.used_images = self.load_used_images()
|
||||
self.session = self.setup_http_session()
|
||||
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
def setup_logging(self) -> None:
|
||||
"""Configure logging for the scraper."""
|
||||
log_file = FILE_PATHS["posted_rss_titles"].with_suffix('.log')
|
||||
self.prune_old_logs(log_file)
|
||||
|
||||
logging.basicConfig(
|
||||
filename=str(log_file),
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
logging.getLogger("requests").setLevel(logging.WARNING)
|
||||
console_handler = logging.StreamHandler()
|
||||
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
||||
logging.getLogger().addHandler(console_handler)
|
||||
logger.info("Logging initialized for RSS scraper")
|
||||
|
||||
LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log"
|
||||
LOG_PRUNE_DAYS = 30
|
||||
FEED_TIMEOUT = 15
|
||||
MAX_RETRIES = 3
|
||||
def prune_old_logs(self, log_file: str) -> None:
|
||||
"""Prune log entries older than LOG_PRUNE_DAYS."""
|
||||
if not os.path.exists(log_file):
|
||||
return
|
||||
|
||||
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
|
||||
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
|
||||
EXPIRATION_HOURS = 24
|
||||
IMAGE_EXPIRATION_DAYS = 7
|
||||
|
||||
posted_titles_data = load_json_file(POSTED_TITLES_FILE, EXPIRATION_HOURS)
|
||||
posted_titles = set(entry["title"] for entry in posted_titles_data)
|
||||
used_images = set(entry["title"] for entry in load_json_file(USED_IMAGES_FILE, IMAGE_EXPIRATION_DAYS) if "title" in entry)
|
||||
|
||||
def setup_logging():
|
||||
if os.path.exists(LOG_FILE):
|
||||
with open(LOG_FILE, 'r') as f:
|
||||
with open(log_file, 'r') as f:
|
||||
lines = f.readlines()
|
||||
cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
|
||||
|
||||
cutoff = datetime.now(timezone.utc) - timedelta(days=30) # LOG_PRUNE_DAYS
|
||||
pruned_lines = []
|
||||
malformed_count = 0
|
||||
|
||||
for line in lines:
|
||||
if len(line) < 19 or not line[:19].replace('-', '').replace(':', '').replace(' ', '').isdigit():
|
||||
malformed_count += 1
|
||||
@@ -77,290 +87,211 @@ def setup_logging():
|
||||
except ValueError:
|
||||
malformed_count += 1
|
||||
continue
|
||||
|
||||
if malformed_count > 0:
|
||||
logging.info(f"Skipped {malformed_count} malformed log lines during pruning")
|
||||
with open(LOG_FILE, 'w') as f:
|
||||
logger.warning(f"Skipped {malformed_count} malformed log lines during pruning")
|
||||
|
||||
with open(log_file, 'w') as f:
|
||||
f.writelines(pruned_lines)
|
||||
|
||||
logging.basicConfig(
|
||||
filename=LOG_FILE,
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S"
|
||||
)
|
||||
console_handler = logging.StreamHandler()
|
||||
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
||||
logging.getLogger().addHandler(console_handler)
|
||||
logging.getLogger("requests").setLevel(logging.WARNING)
|
||||
logging.info("Logging initialized for foodie_automator_rss.py")
|
||||
|
||||
setup_logging()
|
||||
def setup_signal_handlers(self) -> None:
|
||||
"""Set up signal handlers for graceful shutdown."""
|
||||
def signal_handler(sig, frame):
|
||||
logger.info("Received termination signal, checking if safe to exit...")
|
||||
if is_posting:
|
||||
logger.info("Currently posting, will exit after completion.")
|
||||
else:
|
||||
logger.info("Safe to exit immediately.")
|
||||
sys.exit(0)
|
||||
|
||||
def create_http_session() -> requests.Session:
|
||||
session = requests.Session()
|
||||
retry_strategy = Retry(
|
||||
total=MAX_RETRIES,
|
||||
backoff_factor=2,
|
||||
status_forcelist=[403, 429, 500, 502, 503, 504],
|
||||
allowed_methods=["GET", "POST"]
|
||||
)
|
||||
adapter = HTTPAdapter(
|
||||
max_retries=retry_strategy,
|
||||
pool_connections=10,
|
||||
pool_maxsize=10
|
||||
)
|
||||
session.mount("http://", adapter)
|
||||
session.mount("https://", adapter)
|
||||
session.headers.update({
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
|
||||
})
|
||||
return session
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
|
||||
def parse_date(date_str):
|
||||
try:
|
||||
parsed_date = email.utils.parsedate_to_datetime(date_str)
|
||||
if parsed_date.tzinfo is None:
|
||||
parsed_date = parsed_date.replace(tzinfo=timezone.utc)
|
||||
return parsed_date
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to parse date '{date_str}': {e}")
|
||||
return datetime.now(timezone.utc)
|
||||
def setup_http_session(self) -> requests.Session:
|
||||
"""Set up a requests session with retry logic."""
|
||||
session = requests.Session()
|
||||
retry_strategy = Retry(
|
||||
total=3,
|
||||
backoff_factor=2,
|
||||
status_forcelist=[403, 429, 500, 502, 503, 504],
|
||||
allowed_methods=["GET", "POST"]
|
||||
)
|
||||
adapter = HTTPAdapter(
|
||||
max_retries=retry_strategy,
|
||||
pool_connections=10,
|
||||
pool_maxsize=10
|
||||
)
|
||||
session.mount("http://", adapter)
|
||||
session.mount("https://", adapter)
|
||||
session.headers.update({
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
|
||||
})
|
||||
return session
|
||||
|
||||
def fetch_rss_feeds():
|
||||
logging.info("Starting fetch_rss_feeds")
|
||||
articles = []
|
||||
cutoff_date = datetime.now(timezone.utc) - timedelta(hours=EXPIRATION_HOURS)
|
||||
session = create_http_session()
|
||||
|
||||
if not RSS_FEEDS:
|
||||
logging.error("RSS_FEEDS is empty in foodie_config.py")
|
||||
return articles
|
||||
|
||||
for feed_url in RSS_FEEDS:
|
||||
logging.info(f"Processing feed: {feed_url}")
|
||||
def load_posted_titles(self) -> Set[str]:
|
||||
"""Load and return the set of posted titles."""
|
||||
try:
|
||||
response = session.get(feed_url, timeout=FEED_TIMEOUT)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.content, 'xml')
|
||||
items = soup.find_all('item')
|
||||
|
||||
feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
|
||||
for item in items:
|
||||
try:
|
||||
title = item.find('title').text.strip() if item.find('title') else "Untitled"
|
||||
link = item.find('link').text.strip() if item.find('link') else ""
|
||||
pub_date = item.find('pubDate')
|
||||
pub_date = parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc)
|
||||
|
||||
if pub_date < cutoff_date:
|
||||
logging.info(f"Skipping old article: {title} (Published: {pub_date})")
|
||||
continue
|
||||
|
||||
description = item.find('description')
|
||||
summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description else ""
|
||||
content = item.find('content:encoded')
|
||||
content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content else summary
|
||||
|
||||
articles.append({
|
||||
"title": title,
|
||||
"link": link,
|
||||
"summary": summary,
|
||||
"content": content_text,
|
||||
"feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title,
|
||||
"pub_date": pub_date
|
||||
})
|
||||
logging.debug(f"Processed article: {title}")
|
||||
except Exception as e:
|
||||
logging.warning(f"Error processing entry in {feed_url}: {e}")
|
||||
continue
|
||||
logging.info(f"Filtered to {len(articles)} articles from {feed_url}")
|
||||
data = load_json_file(FILE_PATHS["posted_rss_titles"], EXPIRATION_DAYS)
|
||||
return {entry["title"] for entry in data if "title" in entry}
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to fetch RSS feed {feed_url}: {e}")
|
||||
continue
|
||||
logger.error(f"Error loading posted titles: {e}")
|
||||
return set()
|
||||
|
||||
articles.sort(key=lambda x: x["pub_date"], reverse=True)
|
||||
logging.info(f"Total RSS articles fetched: {len(articles)}")
|
||||
return articles
|
||||
def load_used_images(self) -> Set[str]:
|
||||
"""Load and return the set of used images."""
|
||||
try:
|
||||
data = load_json_file(FILE_PATHS["used_images"], IMAGE_EXPIRATION_DAYS)
|
||||
return {entry["title"] for entry in data if "title" in entry}
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading used images: {e}")
|
||||
return set()
|
||||
|
||||
def curate_from_rss():
|
||||
articles = fetch_rss_feeds()
|
||||
if not articles:
|
||||
print("No RSS articles available")
|
||||
logging.info("No RSS articles available")
|
||||
return None, None, random.randint(600, 1800)
|
||||
def parse_date(self, date_str: str) -> datetime:
|
||||
"""Parse a date string into a datetime object."""
|
||||
try:
|
||||
parsed_date = email.utils.parsedate_to_datetime(date_str)
|
||||
if parsed_date.tzinfo is None:
|
||||
parsed_date = parsed_date.replace(tzinfo=timezone.utc)
|
||||
return parsed_date
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse date '{date_str}': {e}")
|
||||
return datetime.now(timezone.utc)
|
||||
|
||||
attempts = 0
|
||||
max_attempts = 10
|
||||
while attempts < max_attempts and articles:
|
||||
article = articles.pop(0)
|
||||
title = article["title"]
|
||||
link = article["link"]
|
||||
summary = article["summary"]
|
||||
content = article["content"]
|
||||
source_name = article["feed_title"]
|
||||
original_source = f'<a href="{link}">{source_name}</a>'
|
||||
def fetch_rss_feeds(self) -> List[Dict]:
|
||||
"""Fetch and process RSS feeds."""
|
||||
logger.info("Starting fetch_rss_feeds")
|
||||
articles = []
|
||||
cutoff_date = datetime.now(timezone.utc) - timedelta(hours=24)
|
||||
|
||||
if title in posted_titles:
|
||||
print(f"Skipping already posted article: {title}")
|
||||
logging.info(f"Skipping already posted article: {title}")
|
||||
attempts += 1
|
||||
continue
|
||||
if not RSS_FEEDS:
|
||||
logger.error("RSS_FEEDS is empty in foodie_config.py")
|
||||
return articles
|
||||
|
||||
print(f"Trying RSS Article: {title} from {source_name}")
|
||||
logging.info(f"Trying RSS Article: {title} from {source_name}")
|
||||
for feed_url in RSS_FEEDS:
|
||||
logger.info(f"Processing feed: {feed_url}")
|
||||
try:
|
||||
response = self.session.get(feed_url, timeout=15)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.content, 'xml')
|
||||
items = soup.find_all('item')
|
||||
|
||||
image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
|
||||
if skip:
|
||||
print(f"Skipping filtered RSS article: {title}")
|
||||
logging.info(f"Skipping filtered RSS article: {title}")
|
||||
attempts += 1
|
||||
continue
|
||||
feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
|
||||
for item in items:
|
||||
try:
|
||||
title = item.find('title').text.strip() if item.find('title') else "Untitled"
|
||||
link = item.find('link').text.strip() if item.find('link') else ""
|
||||
pub_date = item.find('pubDate')
|
||||
pub_date = self.parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc)
|
||||
|
||||
scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
|
||||
interest_score = is_interesting(scoring_content)
|
||||
logging.info(f"Interest score for '{title}': {interest_score}")
|
||||
if interest_score < 6:
|
||||
print(f"RSS Interest Too Low: {interest_score}")
|
||||
logging.info(f"RSS Interest Too Low: {interest_score}")
|
||||
attempts += 1
|
||||
continue
|
||||
if pub_date < cutoff_date:
|
||||
logger.info(f"Skipping old article: {title} (Published: {pub_date})")
|
||||
continue
|
||||
|
||||
num_paragraphs = determine_paragraph_count(interest_score)
|
||||
extra_prompt = (
|
||||
f"Generate exactly {num_paragraphs} paragraphs.\n"
|
||||
f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
|
||||
f"Do NOT introduce unrelated concepts.\n"
|
||||
f"Expand on the core idea with relevant context about its appeal or significance.\n"
|
||||
f"Do not include emojis in the summary."
|
||||
)
|
||||
content_to_summarize = scoring_content
|
||||
final_summary = summarize_with_gpt4o(
|
||||
content_to_summarize,
|
||||
source_name,
|
||||
link,
|
||||
interest_score=interest_score,
|
||||
extra_prompt=extra_prompt
|
||||
)
|
||||
if not final_summary:
|
||||
logging.info(f"Summary failed for '{title}'")
|
||||
attempts += 1
|
||||
continue
|
||||
description = item.find('description')
|
||||
summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description else ""
|
||||
content = item.find('content:encoded')
|
||||
content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content else summary
|
||||
|
||||
# Remove the original title from the summary while preserving paragraphs
|
||||
title_pattern = re.compile(
|
||||
r'\*\*' + re.escape(title) + r'\*\*|' + re.escape(title),
|
||||
re.IGNORECASE
|
||||
)
|
||||
paragraphs = final_summary.split('\n')
|
||||
cleaned_paragraphs = []
|
||||
for para in paragraphs:
|
||||
if para.strip():
|
||||
cleaned_para = title_pattern.sub('', para).strip()
|
||||
cleaned_para = re.sub(r'\s+', ' ', cleaned_para)
|
||||
cleaned_paragraphs.append(cleaned_para)
|
||||
final_summary = '\n'.join(cleaned_paragraphs)
|
||||
|
||||
final_summary = insert_link_naturally(final_summary, source_name, link)
|
||||
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
|
||||
if not post_data:
|
||||
attempts += 1
|
||||
continue
|
||||
|
||||
# Fetch image
|
||||
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords)
|
||||
if not image_url:
|
||||
logging.info(f"Flickr fetch failed for '{image_query}'. Falling back to Pixabay.")
|
||||
image_url, image_source, uploader, page_url = get_image(image_query)
|
||||
if not image_url:
|
||||
logging.info(f"Pixabay fetch failed for '{image_query}'. Skipping article '{title}'.")
|
||||
attempts += 1
|
||||
articles.append({
|
||||
"title": title,
|
||||
"link": link,
|
||||
"summary": summary,
|
||||
"content": content_text,
|
||||
"feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title,
|
||||
"pub_date": pub_date
|
||||
})
|
||||
except Exception as e:
|
||||
logger.warning(f"Error processing entry in {feed_url}: {e}")
|
||||
continue
|
||||
logger.info(f"Filtered to {len(articles)} articles from {feed_url}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to fetch RSS feed {feed_url}: {e}")
|
||||
continue
|
||||
|
||||
hook = get_dynamic_hook(post_data["title"]).strip()
|
||||
articles.sort(key=lambda x: x["pub_date"], reverse=True)
|
||||
logger.info(f"Total RSS articles fetched: {len(articles)}")
|
||||
return articles
|
||||
|
||||
# Generate viral share prompt
|
||||
share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
|
||||
share_links_template = (
|
||||
f'<p>{share_prompt} '
|
||||
f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
|
||||
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
|
||||
)
|
||||
post_data["content"] = f"{final_summary}\n\n{share_links_template}" # Removed cta from content
|
||||
def curate_from_rss(self) -> Tuple[Optional[Dict], Optional[str], int]:
|
||||
"""Curate content from RSS feeds."""
|
||||
articles = self.fetch_rss_feeds()
|
||||
if not articles:
|
||||
logger.info("No RSS articles available")
|
||||
return None, None, random.randint(600, 1800)
|
||||
|
||||
global is_posting
|
||||
is_posting = True
|
||||
try:
|
||||
post_id, post_url = post_to_wp(
|
||||
post_data=post_data,
|
||||
category=category,
|
||||
link=link,
|
||||
author=author,
|
||||
image_url=image_url,
|
||||
original_source=original_source,
|
||||
image_source=image_source,
|
||||
uploader=uploader,
|
||||
pixabay_url=pixabay_url,
|
||||
interest_score=interest_score,
|
||||
should_post_tweet=True
|
||||
for article in articles:
|
||||
title = article["title"]
|
||||
link = article["link"]
|
||||
summary = article["summary"]
|
||||
content = article["content"]
|
||||
source_name = article["feed_title"]
|
||||
|
||||
if title in self.posted_titles:
|
||||
logger.info(f"Skipping already posted article: {title}")
|
||||
continue
|
||||
|
||||
logger.info(f"Processing RSS Article: {title} from {source_name}")
|
||||
|
||||
image_query, relevance_keywords, skip = smart_image_and_filter(title, summary)
|
||||
if skip:
|
||||
logger.info(f"Skipping filtered RSS article: {title}")
|
||||
continue
|
||||
|
||||
scoring_content = f"{title}\n\n{summary}\n\nContent: {content}"
|
||||
interest_score = is_interesting(scoring_content)
|
||||
logger.info(f"Interest score for '{title}': {interest_score}")
|
||||
|
||||
if interest_score < 6:
|
||||
logger.info(f"RSS Interest Too Low: {interest_score}")
|
||||
continue
|
||||
|
||||
num_paragraphs = determine_paragraph_count(interest_score)
|
||||
extra_prompt = (
|
||||
f"Generate exactly {num_paragraphs} paragraphs.\n"
|
||||
f"FOCUS: Summarize ONLY the provided content, explicitly mentioning '{title}' and sticking to its specific topic and details.\n"
|
||||
f"Do NOT introduce unrelated concepts.\n"
|
||||
f"Expand on the core idea with relevant context about its appeal or significance.\n"
|
||||
f"Do not include emojis in the summary."
|
||||
)
|
||||
finally:
|
||||
is_posting = False
|
||||
|
||||
if post_id:
|
||||
share_text = f"Check out this foodie gem! {post_data['title']}"
|
||||
share_text_encoded = quote(share_text)
|
||||
post_url_encoded = quote(post_url)
|
||||
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
|
||||
# Removed: cta = select_best_cta(post_data["title"], final_summary, post_url=post_url)
|
||||
post_data["content"] = f"{final_summary}\n\n{share_links}" # Removed cta from content
|
||||
is_posting = True
|
||||
try:
|
||||
post_to_wp(
|
||||
post_data=post_data,
|
||||
category=category,
|
||||
link=link,
|
||||
author=author,
|
||||
image_url=image_url,
|
||||
original_source=original_source,
|
||||
image_source=image_source,
|
||||
uploader=uploader,
|
||||
pixabay_url=pixabay_url,
|
||||
interest_score=interest_score,
|
||||
post_id=post_id,
|
||||
should_post_tweet=False
|
||||
)
|
||||
finally:
|
||||
is_posting = False
|
||||
final_summary = summarize_with_gpt4o(
|
||||
scoring_content,
|
||||
source_name,
|
||||
link,
|
||||
interest_score=interest_score,
|
||||
extra_prompt=extra_prompt
|
||||
)
|
||||
|
||||
timestamp = datetime.now(timezone.utc).isoformat()
|
||||
save_json_file(POSTED_TITLES_FILE, title, timestamp)
|
||||
posted_titles.add(title)
|
||||
logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
|
||||
if not final_summary:
|
||||
logger.info(f"Summary failed for '{title}'")
|
||||
continue
|
||||
|
||||
if image_url:
|
||||
save_json_file(USED_IMAGES_FILE, image_url, timestamp)
|
||||
used_images.add(image_url)
|
||||
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
|
||||
final_summary = insert_link_naturally(final_summary, source_name, link)
|
||||
post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
|
||||
|
||||
print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
|
||||
logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
|
||||
return post_data, category, random.randint(0, 1800)
|
||||
if post_data and author:
|
||||
return post_data, author, random.randint(600, 1800)
|
||||
|
||||
attempts += 1
|
||||
logging.info(f"WP posting failed for '{post_data['title']}'")
|
||||
|
||||
print("No interesting RSS article found after attempts")
|
||||
logging.info("No interesting RSS article found after attempts")
|
||||
return None, None, random.randint(600, 1800)
|
||||
return None, None, random.randint(600, 1800)
|
||||
|
||||
def run_rss_automator():
|
||||
print(f"{datetime.now(timezone.utc)} - INFO - ***** RSS Automator Launched *****")
|
||||
logging.info("***** RSS Automator Launched *****")
|
||||
post_data, category, sleep_time = curate_from_rss()
|
||||
print(f"Sleeping for {sleep_time}s")
|
||||
logging.info(f"Completed run with sleep time: {sleep_time} seconds")
|
||||
time.sleep(sleep_time)
|
||||
return post_data, category, sleep_time
|
||||
"""Main function to run the RSS automator."""
|
||||
scraper = RSSScraper()
|
||||
while True:
|
||||
try:
|
||||
post_data, author, sleep_time = scraper.curate_from_rss()
|
||||
if post_data and author:
|
||||
global is_posting
|
||||
is_posting = True
|
||||
try:
|
||||
post_to_wp(post_data, author)
|
||||
logger.info(f"Successfully posted: {post_data['title']}")
|
||||
finally:
|
||||
is_posting = False
|
||||
time.sleep(sleep_time)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in RSS automator: {e}")
|
||||
time.sleep(300) # Wait 5 minutes before retrying
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_rss_automator()
|
||||
Reference in New Issue
Block a user