add lock files and update weekly tweet to include last tweet to follow

2025-05-06 09:40:04 +10:00
parent 331979ca9e
commit 028dfc3fc8
6 changed files with 1481 additions and 904 deletions
@@ -31,10 +31,12 @@ from foodie_utils import (
 )
 from foodie_hooks import get_dynamic_hook, get_viral_share_prompt
 from dotenv import load_dotenv
+import fcntl

 load_dotenv()

 is_posting = False
+LOCK_FILE = "/home/shane/foodie_automator/locks/foodie_automator_rss.lock"

 def signal_handler(sig, frame):
    logging.info("Received termination signal, checking if safe to exit...")
@@ -47,10 +49,11 @@ def signal_handler(sig, frame):
 signal.signal(signal.SIGTERM, signal_handler)
 signal.signal(signal.SIGINT, signal_handler)

-LOG_FILE = "/home/shane/foodie_automator/foodie_automator_rss.log"
+LOG_FILE = "/home/shane/foodie_automator/logs/foodie_automator_rss.log"
 LOG_PRUNE_DAYS = 30
 FEED_TIMEOUT = 15
 MAX_RETRIES = 3
+RETRY_BACKOFF = 2

 POSTED_TITLES_FILE = '/home/shane/foodie_automator/posted_rss_titles.json'
 USED_IMAGES_FILE = '/home/shane/foodie_automator/used_images.json'
@@ -96,21 +99,27 @@ def setup_logging():
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.info("Logging initialized for foodie_automator_rss.py")

-setup_logging()
+def acquire_lock():
+    """Take the single-instance lock for this script, or exit if it is held.
+
+    Creates the lock directory if needed, takes a non-blocking exclusive
+    ``flock`` on ``LOCK_FILE`` and records the current PID in the file.
+
+    Returns:
+        The open lock file object. The lock is tied to this object, so the
+        caller must keep it open while the lock is needed.
+
+    Raises:
+        OSError: If the PID cannot be written after the lock was obtained.
+
+    Exits the process with status 0 (``sys.exit``) when the lock cannot be
+    taken, which is treated as "another instance is running".
+    """
+    os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
+    # Append mode: the file must NOT be truncated before the lock is held,
+    # otherwise a second instance wipes the PID of the running one.
+    lock_fd = open(LOCK_FILE, 'a+')
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+    except OSError:
+        # Lock unavailable: release the descriptor before exiting.
+        lock_fd.close()
+        logging.info("Another instance of foodie_automator_rss.py is running")
+        sys.exit(0)
+    try:
+        # The lock is ours now, so replacing the stored PID is safe.
+        lock_fd.seek(0)
+        lock_fd.truncate()
+        lock_fd.write(str(os.getpid()))
+        lock_fd.flush()
+    except OSError:
+        # A failed PID write is a real error, not "another instance".
+        lock_fd.close()
+        raise
+    return lock_fd

 def create_http_session() -> requests.Session:
    session = requests.Session()
    retry_strategy = Retry(
        total=MAX_RETRIES,
-        backoff_factor=2,
+        backoff_factor=RETRY_BACKOFF,
        status_forcelist=[403, 429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST"]
    )
-    adapter = HTTPAdapter(
-        max_retries=retry_strategy,
-        pool_connections=10,
-        pool_maxsize=10
-    )
+    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    session.headers.update({
@@ -140,189 +149,169 @@ def fetch_rss_feeds():

    logging.info(f"Processing feeds: {RSS_FEEDS}")
    for feed_url in RSS_FEEDS:
-        logging.info(f"Processing feed: {feed_url}")
-        try:
-            response = session.get(feed_url, timeout=FEED_TIMEOUT)
-            response.raise_for_status()
-            soup = BeautifulSoup(response.content, 'xml')
-            items = soup.find_all('item')
+        for attempt in range(MAX_RETRIES):
+            logging.info(f"Processing feed: {feed_url} (attempt {attempt + 1})")
+            try:
+                response = session.get(feed_url, timeout=FEED_TIMEOUT)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.content, 'xml')
+                items = soup.find_all('item')

-            feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
-            for item in items:
-                try:
-                    title = item.find('title').text.strip() if item.find('title') else "Untitled"
-                    link = item.find('link').text.strip() if item.find('link') else ""
-                    pub_date = item.find('pubDate')
-                    pub_date = parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc)
+                feed_title = RSS_FEED_NAMES.get(feed_url, (get_clean_source_name(feed_url), feed_url))
+                for item in items:
+                    try:
+                        title = item.find('title').text.strip() if item.find('title') else "Untitled"
+                        link = item.find('link').text.strip() if item.find('link') else ""
+                        pub_date = item.find('pubDate')
+                        pub_date = parse_date(pub_date.text) if pub_date else datetime.now(timezone.utc)

-                    if pub_date < cutoff_date:
-                        logging.info(f"Skipping old article: {title} (Published: {pub_date})")
+                        if pub_date < cutoff_date:
+                            logging.info(f"Skipping old article: {title} (Published: {pub_date})")
+                            continue
+
+                        description = item.find('description')
+                        summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description else ""
+                        content = item.find('content:encoded')
+                        content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content else summary
+
+                        articles.append({
+                            "title": title,
+                            "link": link,
+                            "summary": summary,
+                            "content": content_text,
+                            "feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title,
+                            "pub_date": pub_date
+                        })
+                        logging.debug(f"Processed article: {title}")
+                    except Exception as e:
+                        logging.warning(f"Error processing entry in {feed_url}: {e}")
                        continue
-
-                    description = item.find('description')
-                    summary = BeautifulSoup(description.text, 'html.parser').get_text().strip() if description else ""
-                    content = item.find('content:encoded')
-                    content_text = BeautifulSoup(content.text, 'html.parser').get_text().strip() if content else summary
-
-                    articles.append({
-                        "title": title,
-                        "link": link,
-                        "summary": summary,
-                        "content": content_text,
-                        "feed_title": feed_title[0] if isinstance(feed_title, tuple) else feed_title,
-                        "pub_date": pub_date
-                    })
-                    logging.debug(f"Processed article: {title}")
-                except Exception as e:
-                    logging.warning(f"Error processing entry in {feed_url}: {e}")
-                    continue
-            logging.info(f"Filtered to {len(articles)} articles from {feed_url}")
-        except Exception as e:
-            logging.error(f"Failed to fetch RSS feed {feed_url}: {e}")
-            continue
-
+                logging.info(f"Filtered to {len(articles)} articles from {feed_url}")
+                break
+            except Exception as e:
+                logging.error(f"Failed to fetch RSS feed {feed_url}: {e}")
+                if attempt < MAX_RETRIES - 1:
+                    time.sleep(RETRY_BACKOFF * (2 ** attempt))
+                continue
    articles.sort(key=lambda x: x["pub_date"], reverse=True)
    logging.info(f"Total RSS articles fetched: {len(articles)}")
    return articles

 def fetch_duckduckgo_news_context(title, hours=24):
-    try:
-        with DDGS() as ddgs:
-            results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
-            titles = []
-            for r in results:
-                try:
-                    date_str = r["date"]
-                    if '+00:00' in date_str:
-                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S+00:00").replace(tzinfo=timezone.utc)
-                    else:
-                        dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
-                    if dt > (datetime.now(timezone.utc) - timedelta(hours=24)):
-                        titles.append(r["title"].lower())
-                except ValueError as e:
-                    logging.warning(f"Date parsing failed for '{date_str}': {e}")
-                    continue
-            context = " ".join(titles) if titles else "No recent news found within 24 hours"
-            logging.info(f"DuckDuckGo News context for '{title}': {context}")
-            return context
-    except Exception as e:
-        logging.warning(f"DuckDuckGo News context fetch failed for '{title}': {e}")
-        return title
+    """Collect recent DuckDuckGo News headlines to use as context for *title*.
+
+    Searches DuckDuckGo News for ``"<title> news"`` (past day, at most 5
+    results) and keeps the lower-cased titles of results dated within the
+    last ``hours`` hours. The whole lookup is retried up to ``MAX_RETRIES``
+    times, sleeping ``RETRY_BACKOFF * 2 ** attempt`` seconds between tries.
+
+    Args:
+        title: Article title to search for.
+        hours: Maximum age, in hours, of a result for it to be kept. The
+            search itself is still limited to the past day (``timelimit="d"``).
+
+    Returns:
+        The kept headlines joined by spaces, a "No recent news found ..."
+        message when none qualify, or ``title`` itself when every attempt
+        fails.
+    """
+    for attempt in range(MAX_RETRIES):
+        try:
+            with DDGS() as ddgs:
+                results = ddgs.news(f"{title} news", timelimit="d", max_results=5)
+                cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
+                titles = []
+                for r in results:
+                    date_str = r["date"]
+                    try:
+                        # strptime's %Z does not match a bare "Z", and
+                        # fromisoformat() only accepts it from Python 3.11,
+                        # so rewrite it as an explicit UTC offset first.
+                        dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
+                    except ValueError as e:
+                        logging.warning(f"Date parsing failed for '{date_str}': {e}")
+                        continue
+                    if dt.tzinfo is None:
+                        # Offset-less timestamps are treated as UTC, as before.
+                        dt = dt.replace(tzinfo=timezone.utc)
+                    if dt > cutoff:
+                        titles.append(r["title"].lower())
+                context = " ".join(titles) if titles else f"No recent news found within {hours} hours"
+                logging.info(f"DuckDuckGo News context for '{title}': {context}")
+                return context
+        except Exception as e:
+            logging.warning(f"DuckDuckGo News context fetch failed for '{title}' (attempt {attempt + 1}): {e}")
+            # No sleep after the final attempt; fall through to the error path.
+            if attempt < MAX_RETRIES - 1:
+                time.sleep(RETRY_BACKOFF * (2 ** attempt))
+    logging.error(f"Failed to fetch DuckDuckGo News context for '{title}' after {MAX_RETRIES} attempts")
+    return title

 def curate_from_rss():
-    articles = fetch_rss_feeds()  # Corrected from fetch_rss_articles to fetch_rss_feeds
-    if not articles:
-        print("No RSS articles available")
-        logging.info("No RSS articles available")
-        return None, None, random.randint(600, 1800)
+    try:
+        articles = fetch_rss_feeds()
+        if not articles:
+            logging.info("No RSS articles available")
+            return None, None, False  # Continue running

-    attempts = 0
-    max_attempts = 10
-    while attempts < max_attempts and articles:
-        article = articles.pop(0)
-        title = article["title"]
-        link = article["link"]
-        summary = article.get("summary", "")
-        source_name = article.get("feed_title", "Unknown Source")  # Adjusted to match fetch_rss_feeds output
-        original_source = f'<a href="{link}">{source_name}</a>'
+        attempts = 0
+        max_attempts = 10
+        while attempts < max_attempts and articles:
+            article = articles.pop(0)
+            title = article["title"]
+            link = article["link"]
+            summary = article.get("summary", "")
+            source_name = article.get("feed_title", "Unknown Source")
+            original_source = f'<a href="{link}">{source_name}</a>'

-        if title in posted_titles:
-            print(f"Skipping already posted article: {title}")
-            logging.info(f"Skipping already posted article: {title}")
-            attempts += 1
-            continue
+            if title in posted_titles:
+                logging.info(f"Skipping already posted article: {title}")
+                attempts += 1
+                continue

-        print(f"Trying RSS Article: {title} from {source_name}")
-        logging.info(f"Trying RSS Article: {title} from {source_name}")
+            logging.info(f"Trying RSS Article: {title} from {source_name}")

-        image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
-        if skip:
-            print(f"Skipping filtered RSS article: {title}")
-            logging.info(f"Skipping filtered RSS article: {title}")
-            attempts += 1
-            continue
+            image_query, relevance_keywords, main_topic, skip = smart_image_and_filter(title, summary)
+            if skip:
+                logging.info(f"Skipping filtered RSS article: {title}")
+                attempts += 1
+                continue

-        ddg_context = fetch_duckduckgo_news_context(title)
-        scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
-        interest_score = is_interesting(scoring_content)
-        logging.info(f"Interest score for '{title}': {interest_score}")
-        if interest_score < 6:
-            print(f"RSS Interest Too Low: {interest_score}")
-            logging.info(f"RSS Interest Too Low: {interest_score}")
-            attempts += 1
-            continue
+            ddg_context = fetch_duckduckgo_news_context(title)
+            scoring_content = f"{title}\n\n{summary}\n\nAdditional Context: {ddg_context}"
+            interest_score = is_interesting(scoring_content)
+            logging.info(f"Interest score for '{title}': {interest_score}")
+            if interest_score < 6:
+                logging.info(f"RSS Interest Too Low: {interest_score}")
+                attempts += 1
+                continue

-        num_paragraphs = determine_paragraph_count(interest_score)
-        extra_prompt = (
-            f"Generate exactly {num_paragraphs} paragraphs.\n"
-            f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
-            f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
-            f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
-            f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
-            f"Do not include emojis in the summary."
-        )
-        content_to_summarize = scoring_content
-        final_summary = summarize_with_gpt4o(
-            content_to_summarize,
-            source_name,
-            link,
-            interest_score=interest_score,
-            extra_prompt=extra_prompt
-        )
-        if not final_summary:
-            logging.info(f"Summary failed for '{title}'")
-            attempts += 1
-            continue
-
-        final_summary = insert_link_naturally(final_summary, source_name, link)
-
-        post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
-        if not post_data:
-            attempts += 1
-            continue
-
-        image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
-        if not image_url:
-            image_url, image_source, uploader, page_url = get_image(image_query)
-
-        hook = get_dynamic_hook(post_data["title"]).strip()
-
-        share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
-        share_links_template = (
-            f'<p>{share_prompt} '
-            f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
-            f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
-        )
-        post_data["content"] = f"{final_summary}\n\n{share_links_template}"
-
-        global is_posting
-        is_posting = True
-        try:
-            post_id, post_url = post_to_wp(
-                post_data=post_data,
-                category=category,
-                link=link,
-                author=author,
-                image_url=image_url,
-                original_source=original_source,
-                image_source=image_source,
-                uploader=uploader,
-                page_url=page_url,
-                interest_score=interest_score,
-                should_post_tweet=True
+            num_paragraphs = determine_paragraph_count(interest_score)
+            extra_prompt = (
+                f"Generate exactly {num_paragraphs} paragraphs.\n"
+                f"FOCUS: Summarize ONLY the provided content, focusing on its specific topic and details without mentioning the original title.\n"
+                f"Incorporate relevant insights from this additional context if available: {ddg_context}.\n"
+                f"Do NOT introduce unrelated concepts unless in the content or additional context.\n"
+                f"Expand on the core idea with relevant context about its appeal or significance in food trends.\n"
+                f"Do not include emojis in the summary."
            )
-        finally:
-            is_posting = False
+            content_to_summarize = scoring_content
+            final_summary = summarize_with_gpt4o(
+                content_to_summarize,
+                source_name,
+                link,
+                interest_score=interest_score,
+                extra_prompt=extra_prompt
+            )
+            if not final_summary:
+                logging.info(f"Summary failed for '{title}'")
+                attempts += 1
+                continue

-        if post_id:
-            share_text = f"Check out this foodie gem! {post_data['title']}"
-            share_text_encoded = quote(share_text)
-            post_url_encoded = quote(post_url)
-            share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
-            post_data["content"] = f"{final_summary}\n\n{share_links}"
+            final_summary = insert_link_naturally(final_summary, source_name, link)
+
+            post_data, author, category, image_url, image_source, uploader, page_url = prepare_post_data(final_summary, title, main_topic)
+            if not post_data:
+                attempts += 1
+                continue
+
+            image_url, image_source, uploader, page_url = get_flickr_image(image_query, relevance_keywords, main_topic)
+            if not image_url:
+                image_url, image_source, uploader, page_url = get_image(image_query)
+
+            hook = get_dynamic_hook(post_data["title"]).strip()
+
+            share_prompt = get_viral_share_prompt(post_data["title"], final_summary)
+            share_links_template = (
+                f'<p>{share_prompt} '
+                f'<a href="https://x.com/intent/tweet?url={{post_url}}&text={{share_text}}" target="_blank"><i class="tsi tsi-twitter"></i></a> '
+                f'<a href="https://www.facebook.com/sharer/sharer.php?u={{post_url}}" target="_blank"><i class="tsi tsi-facebook"></i></a></p>'
+            )
+            post_data["content"] = f"{final_summary}\n\n{share_links_template}"
+
+            global is_posting
            is_posting = True
            try:
-                post_to_wp(
+                post_id, post_url = post_to_wp(
                    post_data=post_data,
                    category=category,
                    link=link,
@@ -333,41 +322,85 @@ def curate_from_rss():
                    uploader=uploader,
                    page_url=page_url,
                    interest_score=interest_score,
-                    post_id=post_id,
-                    should_post_tweet=False
+                    should_post_tweet=True
                )
+            except Exception as e:
+                logging.error(f"Failed to post to WordPress for '{title}': {e}", exc_info=True)
+                attempts += 1
+                continue
            finally:
                is_posting = False

-            timestamp = datetime.now(timezone.utc).isoformat()
-            save_json_file(POSTED_TITLES_FILE, title, timestamp)
-            posted_titles.add(title)
-            logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")
+            if post_id:
+                share_text = f"Check out this foodie gem! {post_data['title']}"
+                share_text_encoded = quote(share_text)
+                post_url_encoded = quote(post_url)
+                share_links = share_links_template.format(post_url=post_url_encoded, share_text=share_text_encoded)
+                post_data["content"] = f"{final_summary}\n\n{share_links}"
+                is_posting = True
+                try:
+                    post_to_wp(
+                        post_data=post_data,
+                        category=category,
+                        link=link,
+                        author=author,
+                        image_url=image_url,
+                        original_source=original_source,
+                        image_source=image_source,
+                        uploader=uploader,
+                        page_url=page_url,
+                        interest_score=interest_score,
+                        post_id=post_id,
+                        should_post_tweet=False
+                    )
+                except Exception as e:
+                    logging.error(f"Failed to update WordPress post '{title}' with share links: {e}", exc_info=True)
+                finally:
+                    is_posting = False

-            if image_url:
-                save_json_file(USED_IMAGES_FILE, image_url, timestamp)
-                used_images.add(image_url)
-                logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")
+                timestamp = datetime.now(timezone.utc).isoformat()
+                save_json_file(POSTED_TITLES_FILE, title, timestamp)
+                posted_titles.add(title)
+                logging.info(f"Successfully saved '{title}' to {POSTED_TITLES_FILE}")

-            print(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
-            logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
-            return post_data, category, random.randint(0, 1800)
+                if image_url:
+                    save_json_file(USED_IMAGES_FILE, image_url, timestamp)
+                    used_images.add(image_url)
+                    logging.info(f"Saved image '{image_url}' to {USED_IMAGES_FILE}")

-        attempts += 1
-        logging.info(f"WP posting failed for '{post_data['title']}'")
+                logging.info(f"***** SUCCESS: Posted '{post_data['title']}' (ID: {post_id}) from RSS *****")
+                return post_data, category, True  # Run again immediately
+            attempts += 1
+            logging.info(f"WP posting failed for '{post_data['title']}'")

-    print("No interesting RSS article found after attempts")
-    logging.info("No interesting RSS article found after attempts")
-    return None, None, random.randint(600, 1800)
+        logging.info("No interesting RSS article found after attempts")
+        return None, None, False  # Wait before running again
+    except Exception as e:
+        logging.error(f"Unexpected error in curate_from_rss: {e}", exc_info=True)
+        return None, None, False

 def run_rss_automator():
-    print(f"{datetime.now(timezone.utc)} - INFO - ***** RSS Automator Launched *****")
-    logging.info("***** RSS Automator Launched *****")
-    post_data, category, sleep_time = curate_from_rss()
-    print(f"Sleeping for {sleep_time}s")
-    logging.info(f"Completed run with sleep time: {sleep_time} seconds")
-    time.sleep(sleep_time)
-    return post_data, category, sleep_time
+    """Run one curation pass while holding the single-instance lock.
+
+    ``acquire_lock()`` exits the process when another instance already holds
+    the lock, so the code below only runs for the lock owner.
+
+    Returns:
+        The ``(post_data, category, should_continue)`` tuple produced by
+        ``curate_from_rss()``, or ``(None, None, False)`` when an unexpected
+        exception is raised.
+    """
+    lock_fd = None
+    try:
+        lock_fd = acquire_lock()
+        logging.info("***** RSS Automator Launched *****")
+        post_data, category, should_continue = curate_from_rss()
+        if not post_data:
+            logging.info("No postable RSS article found")
+        else:
+            logging.info("Completed RSS run")
+        return post_data, category, should_continue
+    except Exception as e:
+        logging.error(f"Fatal error in run_rss_automator: {e}", exc_info=True)
+        return None, None, False
+    finally:
+        if lock_fd:
+            try:
+                # Unlink while the lock is still held: removing the file
+                # after unlocking lets another process lock the doomed inode
+                # while a third one locks a freshly created file.
+                try:
+                    os.remove(LOCK_FILE)
+                except FileNotFoundError:
+                    pass
+                fcntl.flock(lock_fd, fcntl.LOCK_UN)
+            finally:
+                # Always close the file, even if unlink/unlock failed.
+                lock_fd.close()

 if __name__ == "__main__":
-    run_rss_automator()
+    # Logging is configured here, when run as a script (the module-level
+    # setup_logging() call was removed in this change).
+    setup_logging()
+    post_data, category, should_continue = run_rss_automator()
+    # No sleep timer here any more: a single pass runs and exits, leaving the
+    # scheduling of the next run to manage_scripts.sh.
+    logging.info(f"Run completed, should_continue: {should_continue}")