From 90be324fe4cef701e458cc0d8ae60c7a13b496f2 Mon Sep 17 00:00:00 2001 From: Shane Date: Thu, 1 May 2025 19:24:20 +1000 Subject: [PATCH] test --- foodie_automator_rss.py | 18 ++++++++- foodie_utils.py | 81 ++++++++++++++++++++--------------------- 2 files changed, 57 insertions(+), 42 deletions(-) diff --git a/foodie_automator_rss.py b/foodie_automator_rss.py index 7862e5b..6014523 100644 --- a/foodie_automator_rss.py +++ b/foodie_automator_rss.py @@ -65,14 +65,20 @@ def setup_logging(): lines = f.readlines() cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS) pruned_lines = [] + malformed_count = 0 for line in lines: + if len(line) < 19 or not line[:19].replace('-', '').replace(':', '').replace(' ', '').isdigit(): + malformed_count += 1 + continue try: timestamp = datetime.strptime(line[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc) if timestamp > cutoff: pruned_lines.append(line) except ValueError: - logging.warning(f"Skipping malformed log line: {line.strip()[:50]}...") + malformed_count += 1 continue + if malformed_count > 0: + logging.info(f"Skipped {malformed_count} malformed log lines during pruning") with open(LOG_FILE, 'w') as f: f.writelines(pruned_lines) @@ -240,6 +246,16 @@ def curate_from_rss(): attempts += 1 continue + # Remove the original title from the summary + title_pattern = re.compile( + r'\*\*' + re.escape(title) + r'\*\*|' + re.escape(title), + re.IGNORECASE + ) + final_summary = title_pattern.sub('', final_summary).strip() + # Clean up any extra spaces or newlines left after removal + final_summary = re.sub(r'\s+', ' ', final_summary) + final_summary = '\n'.join(para.strip() for para in final_summary.split('\n') if para.strip()) + final_summary = insert_link_naturally(final_summary, source_name, link) post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title) if not post_data: diff --git a/foodie_utils.py b/foodie_utils.py index a81b46c..0571c57 100644 --- a/foodie_utils.py +++ b/foodie_utils.py @@ -137,7 +137,7 @@ def generate_article_tweet(author, post, persona): author_handle = f"@{author['username']}" prompt = ( - f"Craft a sharp tweet (under 280 characters) for {author_handle} with the voice of '{persona}'. " + f"Craft a sharp tweet (under 230 characters) for {author_handle} with the voice of '{persona}'. " f"Distill the essence of the article '{title}' and include the raw URL '{url}' at the end. " f"Make it bold, spark curiosity, and invite engagement with a human touch. " f"Swap 'elevate' for dynamic terms like 'ignite' or 'unleash'. " @@ -414,53 +414,46 @@ def get_image(search_query): logging.error(f"Pixabay image fetch failed for query '{search_query}': {e}") return None, None, None, None -def generate_image_query(content): - prompt = ( - "Given the following content, generate a concise image search query (max 5 words) that would likely yield relevant, visually appealing images on platforms like Flickr or Pixabay. " - "Identify and prioritize specific entities like brand names or unique terms over abstract or generic concepts. " - "Focus on concrete, visual concepts related to food, dining, or restaurants. " - "Also provide relevance keywords (max 5 words) to filter results, using general themes related to the content. " - "Return the result as a JSON object with 'search' and 'relevance' keys.\n\n" - "Content:\n" - f"{content}\n\n" - "Example output:\n" - "```json\n" - "{\n" - " \"search\": \"Wingstop dining\",\n" - " \"relevance\": \"fast food dining\"\n" - "}\n```" - ) - +def generate_image_query(title, summary): try: + prompt = ( + "Given the following article title and summary, generate a concise image search query (max 5 words) to find a relevant image. " + "Also provide a list of relevance keywords (max 5 words) that should be associated with the image. " + "Return the result as a JSON object with 'search' and 'relevance' keys.\n\n" + f"Title: {title}\n\n" + f"Summary: {summary}\n\n" + "Example output:\n" + "```json\n" + "{\"search\": \"Italian cuisine trends\", \"relevance\": \"pasta wine dining culture\"}\n" + "```" + ) response = client.chat.completions.create( model=LIGHT_TASK_MODEL, messages=[ - {"role": "system", "content": "You are a helpful assistant that generates concise image search queries."}, - {"role": "user", "content": prompt} + {"role": "system", "content": prompt}, + {"role": "user", "content": "Generate an image search query and relevance keywords."} ], max_tokens=100, temperature=0.5 ) - raw_response = response.choices[0].message.content - logging.debug(f"Raw GPT image query response: '{raw_response}'") - - # Extract JSON from the response json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response) if not json_match: - logging.warning(f"Failed to parse image query JSON from GPT response: {raw_response}") - return "food dining", ["dining", "trends"] + logging.warning(f"Failed to parse image query JSON: {raw_response}") + return title, [], True query_data = json.loads(json_match.group(1)) - search_query = query_data.get("search", "food dining") - relevance_keywords = query_data.get("relevance", ["dining", "trends"]) + search_query = query_data.get("search", title) + relevance_keywords = query_data.get("relevance", "").split() - logging.debug(f"Image query from content: {query_data}") - return search_query, relevance_keywords + # Log the JSON object in a single line + log_json = json.dumps(query_data).replace('\n', ' ').replace('\r', ' ') + logging.debug(f"Image query from content: {log_json}") + return search_query, relevance_keywords, False except Exception as e: - logging.warning(f"Failed to generate image query: {e}. Using fallback.") - return "food dining", ["dining", "trends"] + logging.warning(f"Image query generation failed: {e}. Using title as fallback.") + return title, [], True def smart_image_and_filter(title, summary): try: @@ -655,6 +648,7 @@ def summarize_with_gpt4o(content, source_name, link, interest_score=0, extra_pro full_prompt = ( f"{prompt}\n\n" + f"Do not include the article title in the summary.\n\n" f"{extra_prompt}\n\n" f"Avoid using the word 'elevate'—use more humanized language like 'level up' or 'bring to life'.\n" f"Content to summarize:\n{content}\n\n" @@ -673,6 +667,14 @@ def summarize_with_gpt4o(content, source_name, link, interest_score=0, extra_pro ) summary = response.choices[0].message.content.strip() + + # Post-process to remove the original title if it still appears + # Extract the title from the content (assuming it's the first line or part of the prompt) + # For simplicity, we can pass the title as an additional parameter if needed + # Here, we'll assume the title is passed via the calling function (e.g., from foodie_automator_rss.py) + # For now, we'll use a placeholder for the title removal logic + # In foodie_automator_rss.py, the title is available as entry.title + # We'll handle the title removal in the calling script instead logging.info(f"Processed summary (Persona: {persona}): {summary}") return summary @@ -682,13 +684,12 @@ def summarize_with_gpt4o(content, source_name, link, interest_score=0, extra_pro def insert_link_naturally(summary, source_name, source_url): try: - # Log the input summary to debug its structure logging.info(f"Input summary to insert_link_naturally: {summary!r}") prompt = ( "Take this summary and insert a single HTML link naturally into one paragraph (randomly chosen). " "Use the format '{source_name}' and weave it into the text seamlessly, " - "e.g., 'The latest scoop from {source_name} reveals...' or '{source_name} uncovers this wild shift.' " + "e.g., 'The latest scoop from {source_name} reveals...' or '{source_name} shares this insight.' " "Vary the phrasing creatively to avoid repetition (don’t always use 'dives into'). " "Place the link at a sentence boundary (after a period, not within numbers like '6.30am' or '1.5'). " "Maintain the original tone, flow, and paragraph structure, preserving all existing newlines exactly as they are. " @@ -711,10 +712,7 @@ def insert_link_naturally(summary, source_name, source_url): new_summary = response.choices[0].message.content.strip() link_pattern = f'{source_name}' if new_summary and new_summary.count(link_pattern) == 1: - # Normalize paragraph separation to ensure a single \n break - # Split by newlines, but do not filter out paragraphs to preserve the count paragraphs = new_summary.split('\n') - # Strip each paragraph, but keep all paragraphs even if empty paragraphs = [p.strip() for p in paragraphs] new_summary = '\n'.join(paragraphs) logging.info(f"Summary with naturally embedded link (normalized): {new_summary!r}") @@ -733,11 +731,12 @@ def insert_link_naturally(summary, source_name, source_url): return summary target_para = random.choice([p for p in paragraphs if p.strip()]) + link_pattern = f'{source_name}' phrases = [ - f"The scoop from {link_pattern} spills the details", - f"{link_pattern} uncovers this wild shift", - f"This gem via {link_pattern} drops some truth", - f"{link_pattern} breaks down the buzz" + f"Learn more from {link_pattern}", + f"{link_pattern} shares this insight", + f"Discover more at {link_pattern}", + f"Check out {link_pattern} for details" ] insertion_phrase = random.choice(phrases)