add lock files and update weekly tweet to include last tweet to follow

This commit is contained in:
2025-05-06 09:40:04 +10:00
parent 331979ca9e
commit 028dfc3fc8
6 changed files with 1481 additions and 904 deletions
+230 -197
View File
@@ -31,10 +31,12 @@ from foodie_utils import (
)
from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
from dotenv import load_dotenv
import fcntl
load_dotenv()
is_posting = False
LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_rss.lock"
def signal_handler(sig, frame):
logging.info("Received termination signal, checking if safe to exit...")
@@ -47,10 +49,11 @@ def signal_handler(sig, frame):
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log"
LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_rss.log"
LOG_PRUNE_DAYS = 30
FEED_TIMEOUT = 15
MAX_RETRIES = 3
RETRY_BACKOFF = 2
POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
@@ -96,21 +99,27 @@ def setup_logging():
logging.getLogger("requests").setLevel(logging.WARNING)
logging.info("Logging initialized for foodie_automator_rss.py")
setup_logging()
def acquire_lock():
    """Take the single-instance lock for this script, or exit if it is held.

    Creates the directory of LOCK_FILE when needed, opens the file, requests
    an exclusive non-blocking ``flock`` on it and then records this process's
    PID in the file.

    Returns:
        The open file object holding the lock. The caller must keep it open
        while the lock is needed and unlock/close it afterwards.

    Raises:
        SystemExit: with status 0 when another process already holds the lock.
        OSError: if the PID cannot be written after the lock was obtained.
    """
    os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
    # 'a+' creates the file when it is missing but, unlike 'w', does not
    # truncate it on open. A losing attempt therefore cannot erase the PID
    # recorded by the instance that currently holds the lock.
    lock_fd = open(LOCK_FILE, 'a+')
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:
        # Lock is held elsewhere: release our descriptor before exiting.
        lock_fd.close()
        logging.info("Another instance of foodie_automator_rss.py is running")
        sys.exit(0)
    try:
        # The lock is ours now, so it is safe to discard any stale PID.
        lock_fd.seek(0)
        lock_fd.truncate(0)
        lock_fd.write(str(os.getpid()))
        lock_fd.flush()
    except OSError:
        # A write failure is not "another instance running"; drop the lock
        # and let the caller see the real error.
        lock_fd.close()
        raise
    return lock_fd
def create_http_session() -> requests.Session:
session = requests.Session()
retry_strategy = Retry(
total=MAX_RETRIES,
backoff_factor=2,
backoff_factor=RETRY_BACKOFF,
status_forcelist=[403, 429, 500, 502, 503, 504],
allowed_methods=["GET", "POST"]
)
adapter = HTTPAdapter(
max_retries=retry_strategy,
pool_connections=10,
pool_maxsize=10
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("http://", adapter)
session.mount("https://", adapter)
session.headers.update({
@@ -140,189 +149,169 @@ def fetch_rss_feeds():
logging.info(f"Processing feeds: {RSS_FEEDS}")
for feed_url in RSS_FEEDS:
logging.info(f"Processing feed: {feed_url}")
try:
response = session.get(feed_url, timeout=FEED_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'xml')
items = soup.find_all('item')
for attempt in range(MAX_RETRIES):
logging.info(f"Processing feed: {feed_url} (attempt {attempt + 1})")
try:
response = session.get(feed_url, timeout=FEED_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'xml')
items = soup.find_all('item')
feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
for item in items:
try:
title = item.find('title').text.strip() if item.find('title') else "Untitled"
link = item.find('link').text.strip() if item.find('link') else ""
pub_date = item.find('pubDate')
pub_date = parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc)
feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
for item in items:
try:
title = item.find('title').text.strip() if item.find('title') else "Untitled"
link = item.find('link').text.strip() if item.find('link') else ""
pub_date = item.find('pubDate')
pub_date = parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc)
if pub_date < cutoff_date:
logging.info(f"Skipping old article: {title} (Published: {pub_date})")
if pub_date < cutoff_date:
logging.info(f"Skipping old article: {title} (Published: {pub_date})")
continue
description = item.find('description')
summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description else ""
content = item.find('content:encoded')
content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content else summary
articles.append({
"title": title,
"link": link,
"summary": summary,
"content": content_text,
"feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title,
"pub_date": pub_date
})
logging.debug(f"Processed article: {title}")
except Exception as e:
logging.warning(f"Error processing entry in {feed_url}: {e}")
continue
description = item.find('description')
summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description else ""
content = item.find('content:encoded')
content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content else summary
articles.append({
"title": title,
"link": link,
"summary": summary,
"content": content_text,
"feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title,
"pub_date": pub_date
})
logging.debug(f"Processed article: {title}")
except Exception as e:
logging.warning(f"Error processing entry in {feed_url}: {e}")
continue
logging.info(f"Filtered to {len(articles)} articles from {feed_url}")
except Exception as e:
logging.error(f"Failed to fetch RSS feed {feed_url}: {e}")
continue
logging.info(f"Filtered to {len(articles)} articles from {feed_url}")
break
except Exception as e:
logging.error(f"Failed to fetch RSS feed {feed_url}: {e}")
if attempt < MAX_RETRIES - 1:
time.sleep(RETRY_BACKOFF * (2 ** attempt))
continue
articles.sort(key=lambda x: x["pub_date"], reverse=True)
logging.info(f"Total RSS articles fetched: {len(articles)}")
return articles
def fetch_duckduckgo_news_context(title, hours=24):
    """Collect recent DuckDuckGo News headlines related to *title*.

    Queries ``DDGS().news`` for ``"<title> news"`` (day time limit, at most
    5 results) and keeps the lower-cased titles of results whose date falls
    within the last *hours* hours. The whole query is retried up to
    MAX_RETRIES times, sleeping ``RETRY_BACKOFF * 2**attempt`` seconds
    between attempts.

    Args:
        title: Article title used to build the search query.
        hours: Size of the recency window in hours.

    Returns:
        The kept titles joined by single spaces, a "No recent news found"
        message when none qualify, or *title* itself when every attempt
        raised an exception.
    """
    for attempt in range(MAX_RETRIES):
        try:
            with DDGS() as ddgs:
                results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
                cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
                titles = []
                for r in results:
                    try:
                        date_str = r["date"]
                        # %z (Python 3.7+) accepts both "+00:00"-style offsets
                        # and a literal "Z". %Z only matches UTC/GMT/local
                        # zone names, so it rejects "...Z" timestamps.
                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S%z")
                    except ValueError as e:
                        logging.warning(f"Date parsing failed for '{date_str}': {e}")
                        continue
                    if dt.astimezone(timezone.utc) > cutoff:
                        titles.append(r["title"].lower())
                context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
                logging.info(f"DuckDuckGo News context for '{title}': {context}")
                return context
        except Exception as e:
            logging.warning(f"DuckDuckGo News context fetch failed for '{title}' (attempt {attempt + 1}): {e}")
            # No point sleeping after the final attempt.
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF * (2 ** attempt))
    logging.error(f"Failed to fetch DuckDuckGo News context for '{title}' after {MAX_RETRIES} attempts")
    # Fall back to the bare title so callers still have some context string.
    return title
def curate_from_rss():
articles = fetch_rss_feeds() # Corrected from fetch_rss_articles to fetch_rss_feeds
if not articles:
print("No RSS articles available")
logging.info("No RSS articles available")
return None, None, random.randint(600, 1800)
try:
articles = fetch_rss_feeds()
if not articles:
logging.info("No RSS articles available")
return None, None, False # Continue running
attempts = 0
max_attempts = 10
while attempts < max_attempts and articles:
article = articles.pop(0)
title = article["title"]
link = article["link"]
summary = article.get("summary", "")
source_name = article.get("feed_title", "Unknown Source") # Adjusted to match fetch_rss_feeds output
original_source = f'<a href="{link}">{source_name}</a>'
attempts = 0
max_attempts = 10
while attempts < max_attempts and articles:
article = articles.pop(0)
title = article["title"]
link = article["link"]
summary = article.get("summary", "")
source_name = article.get("feed_title", "Unknown Source")
original_source = f'<a href="{link}">{source_name}</a>'
if title in posted_titles:
print(f"Skipping already posted article: {title}")
logging.info(f"Skipping already posted article: {title}")
attempts += 1
continue
if title in posted_titles:
logging.info(f"Skipping already posted article: {title}")
attempts += 1
continue
print(f"Trying RSS Article: {title} from {source_name}")
logging.info(f"Trying RSS Article: {title} from {source_name}")
logging.info(f"Trying RSS Article: {title} from {source_name}")
image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
if skip:
print(f"Skipping filtered RSS article: {title}")
logging.info(f"Skipping filtered RSS article: {title}")
attempts += 1
continue
image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
if skip:
logging.info(f"Skipping filtered RSS article: {title}")
attempts += 1
continue
ddg_context = fetch_duckduckgo_news_context(title)
scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
interest_score = is_interesting(scoring_content)
logging.info(f"Interest score for '{title}': {interest_score}")
if interest_score < 6:
print(f"RSS Interest Too Low: {interest_score}")
logging.info(f"RSS Interest Too Low: {interest_score}")
attempts += 1
continue
ddg_context = fetch_duckduckgo_news_context(title)
scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
interest_score = is_interesting(scoring_content)
logging.info(f"Interest score for '{title}': {interest_score}")
if interest_score < 6:
logging.info(f"RSS Interest Too Low: {interest_score}")
attempts += 1
continue
num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n"
f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
f"Do not include emojis in the summary."
)
content_to_summarize = scoring_content
final_summary = summarize_with_gpt4o(
content_to_summarize,
source_name,
link,
interest_score=interest_score,
extra_prompt=extra_prompt
)
if not final_summary:
logging.info(f"Summary failed for '{title}'")
attempts += 1
continue
final_summary = insert_link_naturally(final_summary, source_name, link)
post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
if not post_data:
attempts += 1
continue
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
if not image_url:
image_url, image_source, uploader, page_url = get_image(image_query)
hook = get_dynamic_hook(post_data["title"]).strip()
share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
share_links_template = (
f'<p>{share_prompt} '
f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
)
post_data["content"] = f"{final_summary}\n\n{share_links_template}"
global is_posting
is_posting = True
try:
post_id, post_url = post_to_wp(
post_data=post_data,
category=category,
link=link,
author=author,
image_url=image_url,
original_source=original_source,
image_source=image_source,
uploader=uploader,
page_url=page_url,
interest_score=interest_score,
should_post_tweet=True
num_paragraphs = determine_paragraph_count(interest_score)
extra_prompt = (
f"Generate exactly {num_paragraphs} paragraphs.\n"
f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
f"Do not include emojis in the summary."
)
finally:
is_posting = False
content_to_summarize = scoring_content
final_summary = summarize_with_gpt4o(
content_to_summarize,
source_name,
link,
interest_score=interest_score,
extra_prompt=extra_prompt
)
if not final_summary:
logging.info(f"Summary failed for '{title}'")
attempts += 1
continue
if post_id:
share_text = f"Check out this foodie gem! {post_data['title']}"
share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
post_data["content"] = f"{final_summary}\n\n{share_links}"
final_summary = insert_link_naturally(final_summary, source_name, link)
post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
if not post_data:
attempts += 1
continue
image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
if not image_url:
image_url, image_source, uploader, page_url = get_image(image_query)
hook = get_dynamic_hook(post_data["title"]).strip()
share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
share_links_template = (
f'<p>{share_prompt} '
f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
)
post_data["content"] = f"{final_summary}\n\n{share_links_template}"
global is_posting
is_posting = True
try:
post_to_wp(
post_id, post_url = post_to_wp(
post_data=post_data,
category=category,
link=link,
@@ -333,41 +322,85 @@ def curate_from_rss():
uploader=uploader,
page_url=page_url,
interest_score=interest_score,
post_id=post_id,
should_post_tweet=False
should_post_tweet=True
)
except Exception as e:
logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
attempts += 1
continue
finally:
is_posting = False
timestamp = datetime.now(timezone.utc).isoformat()
save_json_file(POSTED_TITLES_FILE, title, timestamp)
posted_titles.add(title)
logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
if post_id:
share_text = f"Check out this foodie gem! {post_data['title']}"
share_text_encoded = quote(share_text)
post_url_encoded = quote(post_url)
share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
post_data["content"] = f"{final_summary}\n\n{share_links}"
is_posting = True
try:
post_to_wp(
post_data=post_data,
category=category,
link=link,
author=author,
image_url=image_url,
original_source=original_source,
image_source=image_source,
uploader=uploader,
page_url=page_url,
interest_score=interest_score,
post_id=post_id,
should_post_tweet=False
)
except Exception as e:
logging.error(f"Failed to update WordPress post '{title}' with share links: {e}", exc_info=True)
finally:
is_posting = False
if image_url:
save_json_file(USED_IMAGES_FILE, image_url, timestamp)
used_images.add(image_url)
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
timestamp = datetime.now(timezone.utc).isoformat()
save_json_file(POSTED_TITLES_FILE, title, timestamp)
posted_titles.add(title)
logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
return post_data, category, random.randint(0, 1800)
if image_url:
save_json_file(USED_IMAGES_FILE, image_url, timestamp)
used_images.add(image_url)
logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
attempts += 1
logging.info(f"WP posting failed for '{post_data['title']}'")
logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
return post_data, category, True # Run again immediately
attempts += 1
logging.info(f"WP posting failed for '{post_data['title']}'")
print("No interesting RSS article found after attempts")
logging.info("No interesting RSS article found after attempts")
return None, None, random.randint(600, 1800)
logging.info("No interesting RSS article found after attempts")
return None, None, False # Wait before running again
except Exception as e:
logging.error(f"Unexpected error in curate_from_rss: {e}", exc_info=True)
return None, None, False
def run_rss_automator():
    """Run one RSS curation pass under the single-instance lock.

    Acquires the lock via acquire_lock(), calls curate_from_rss() once and
    always releases the lock and removes LOCK_FILE afterwards.

    Returns:
        The ``(post_data, category, should_continue)`` tuple produced by
        curate_from_rss(), or ``(None, None, False)`` when an unexpected
        exception is raised during the run.
    """
    lock_fd = None
    try:
        lock_fd = acquire_lock()
        logging.info("***** RSS Automator Launched *****")
        post_data, category, should_continue = curate_from_rss()
        if not post_data:
            logging.info("No postable RSS article found")
        else:
            logging.info("Completed RSS run")
        return post_data, category, should_continue
    except Exception as e:
        logging.error(f"Fatal error in run_rss_automator: {e}", exc_info=True)
        return None, None, False
    finally:
        if lock_fd:
            try:
                # Remove the file while the lock is still held, so this
                # process never deletes a lock file that a different
                # instance has already locked.
                try:
                    os.remove(LOCK_FILE)
                except FileNotFoundError:
                    pass
                except OSError as e:
                    # Cleanup problems must not replace the run's result.
                    logging.warning(f"Could not remove lock file {LOCK_FILE}: {e}")
                fcntl.flock(lock_fd, fcntl.LOCK_UN)
            finally:
                # Closing also drops the flock if the explicit unlock failed.
                lock_fd.close()
if __name__ == "__main__":
run_rss_automator()
setup_logging()
post_data, category, should_continue = run_rss_automator()
# Remove sleep timer, let manage_scripts.sh control execution
logging.info(f"Run completed, should_continue: {should_continue}")