my-fix-branch
Shane committed 7 months ago
parent 022b52a8a7, commit 90be324fe4

Changed files:
  foodie_automator_rss.py (18 changed lines)
  foodie_utils.py (73 changed lines)

foodie_automator_rss.py

@@ -65,14 +65,20 @@ def setup_logging():
         lines = f.readlines()
         cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
         pruned_lines = []
+        malformed_count = 0
         for line in lines:
+            if len(line) < 19 or not line[:19].replace('-', '').replace(':', '').replace(' ', '').isdigit():
+                malformed_count += 1
+                continue
             try:
                 timestamp = datetime.strptime(line[:19], '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
                 if timestamp > cutoff:
                     pruned_lines.append(line)
             except ValueError:
-                logging.warning(f"Skipping malformed log line: {line.strip()[:50]}...")
+                malformed_count += 1
                 continue
+        if malformed_count > 0:
+            logging.info(f"Skipped {malformed_count} malformed log lines during pruning")
     with open(LOG_FILE, 'w') as f:
         f.writelines(pruned_lines)
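
The new guard runs a cheap character-shape check before strptime, so obviously malformed lines are counted and skipped without raising. A minimal standalone sketch of the pruning pass, with LOG_PRUNE_DAYS standing in for the module's real constant:

```python
# Sketch of the pruning logic above; LOG_PRUNE_DAYS is a placeholder
# for the module-level constant used in the real script.
import logging
from datetime import datetime, timedelta, timezone

LOG_PRUNE_DAYS = 7  # assumption: the actual value lives in the script's config

def prune_log_lines(lines):
    cutoff = datetime.now(timezone.utc) - timedelta(days=LOG_PRUNE_DAYS)
    pruned, malformed = [], 0
    for line in lines:
        head = line[:19]
        # Cheap shape check: a valid prefix is 19 chars of digits plus
        # dashes, colons, and a space ("YYYY-MM-DD HH:MM:SS").
        if len(line) < 19 or not head.replace('-', '').replace(':', '').replace(' ', '').isdigit():
            malformed += 1
            continue
        try:
            ts = datetime.strptime(head, '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
            if ts > cutoff:
                pruned.append(line)
        except ValueError:
            # Right characters but impossible values (month 13, hour 25)
            # still land here, so the counter covers both cases.
            malformed += 1
    if malformed:
        logging.info(f"Skipped {malformed} malformed log lines during pruning")
    return pruned
```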
@@ -240,6 +246,16 @@ def curate_from_rss():
             attempts += 1
             continue
+        # Remove the original title from the summary
+        title_pattern = re.compile(
+            r'\*\*' + re.escape(title) + r'\*\*|' + re.escape(title),
+            re.IGNORECASE
+        )
+        final_summary = title_pattern.sub('', final_summary).strip()
+        # Collapse runs of spaces/tabs (keeping newlines), then drop empty paragraphs
+        final_summary = re.sub(r'[ \t]+', ' ', final_summary)
+        final_summary = '\n'.join(para.strip() for para in final_summary.split('\n') if para.strip())
         final_summary = insert_link_naturally(final_summary, source_name, link)
         post_data, author, category, image_url, image_source, uploader, pixabay_url = prepare_post_data(final_summary, title)
         if not post_data:
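
The alternation tries the bold-wrapped form first, so a leading '**Title**' disappears in one pass instead of leaving stray asterisks behind. A quick demonstration of the cleanup on a made-up summary:

```python
import re

# Hypothetical inputs, just to show the pattern's behavior.
title = "Wingstop's Secret Menu"
summary = "**Wingstop's Secret Menu** just leaked.\nFans are thrilled."

# Bold-wrapped form first, then the bare title, case-insensitively.
title_pattern = re.compile(
    r'\*\*' + re.escape(title) + r'\*\*|' + re.escape(title),
    re.IGNORECASE
)
cleaned = title_pattern.sub('', summary).strip()
cleaned = re.sub(r'[ \t]+', ' ', cleaned)  # collapse spaces, keep newlines
cleaned = '\n'.join(p.strip() for p in cleaned.split('\n') if p.strip())
print(cleaned)  # -> "just leaked.\nFans are thrilled."
```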

foodie_utils.py

@@ -137,7 +137,7 @@ def generate_article_tweet(author, post, persona):
     author_handle = f"@{author['username']}"
     prompt = (
-        f"Craft a sharp tweet (under 280 characters) for {author_handle} with the voice of '{persona}'. "
+        f"Craft a sharp tweet (under 230 characters) for {author_handle} with the voice of '{persona}'. "
         f"Distill the essence of the article '{title}' and include the raw URL '{url}' at the end. "
         f"Make it bold, spark curiosity, and invite engagement with a human touch. "
         f"Swap 'elevate' for dynamic terms like 'ignite' or 'unleash'. "
@@ -414,53 +414,46 @@ def get_image(search_query):
         logging.error(f"Pixabay image fetch failed for query '{search_query}': {e}")
         return None, None, None, None
 
-def generate_image_query(content):
-    try:
-        prompt = (
-            "Given the following content, generate a concise image search query (max 5 words) that would likely yield relevant, visually appealing images on platforms like Flickr or Pixabay. "
-            "Identify and prioritize specific entities like brand names or unique terms over abstract or generic concepts. "
-            "Focus on concrete, visual concepts related to food, dining, or restaurants. "
-            "Also provide relevance keywords (max 5 words) to filter results, using general themes related to the content. "
-            "Return the result as a JSON object with 'search' and 'relevance' keys.\n\n"
-            "Content:\n"
-            f"{content}\n\n"
-            "Example output:\n"
-            "```json\n"
-            "{\n"
-            " \"search\": \"Wingstop dining\",\n"
-            " \"relevance\": \"fast food dining\"\n"
-            "}\n```"
-        )
+def generate_image_query(title, summary):
+    prompt = (
+        "Given the following article title and summary, generate a concise image search query (max 5 words) to find a relevant image. "
+        "Also provide a list of relevance keywords (max 5 words) that should be associated with the image. "
+        "Return the result as a JSON object with 'search' and 'relevance' keys.\n\n"
+        f"Title: {title}\n\n"
+        f"Summary: {summary}\n\n"
+        "Example output:\n"
+        "```json\n"
+        "{\"search\": \"Italian cuisine trends\", \"relevance\": \"pasta wine dining culture\"}\n"
+        "```"
+    )
+    try:
         response = client.chat.completions.create(
             model=LIGHT_TASK_MODEL,
             messages=[
-                {"role": "system", "content": "You are a helpful assistant that generates concise image search queries."},
-                {"role": "user", "content": prompt}
+                {"role": "system", "content": prompt},
+                {"role": "user", "content": "Generate an image search query and relevance keywords."}
             ],
             max_tokens=100,
             temperature=0.5
         )
         raw_response = response.choices[0].message.content
-        logging.debug(f"Raw GPT image query response: '{raw_response}'")
-        # Extract JSON from the response
         json_match = re.search(r'```json\n([\s\S]*?)\n```', raw_response)
         if not json_match:
-            logging.warning(f"Failed to parse image query JSON from GPT response: {raw_response}")
-            return "food dining", ["dining", "trends"]
+            logging.warning(f"Failed to parse image query JSON: {raw_response}")
+            return title, [], True
         query_data = json.loads(json_match.group(1))
-        search_query = query_data.get("search", "food dining")
-        relevance_keywords = query_data.get("relevance", ["dining", "trends"])
-        logging.debug(f"Image query from content: {query_data}")
-        return search_query, relevance_keywords
+        search_query = query_data.get("search", title)
+        relevance_keywords = query_data.get("relevance", "").split()
+        # Log the JSON object in a single line
+        log_json = json.dumps(query_data).replace('\n', ' ').replace('\r', ' ')
+        logging.debug(f"Image query from content: {log_json}")
+        return search_query, relevance_keywords, False
     except Exception as e:
-        logging.warning(f"Failed to generate image query: {e}. Using fallback.")
-        return "food dining", ["dining", "trends"]
+        logging.warning(f"Image query generation failed: {e}. Using title as fallback.")
+        return title, [], True
 
 def smart_image_and_filter(title, summary):
     try:
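
generate_image_query now takes the title and summary separately and returns a third value flagging the fallback path, with the relevance keywords whitespace-split from a string. Call sites have to unpack all three; a sketch of what an updated caller might look like (the pick_image wrapper and its logging are illustrative, not code from this repo):

```python
import logging

from foodie_utils import generate_image_query, get_image  # assumed import path

def pick_image(title, summary):
    # New contract: (search_query, relevance_keywords, used_fallback)
    search_query, relevance_keywords, used_fallback = generate_image_query(title, summary)
    if used_fallback:
        # Generation failed or the JSON didn't parse; search_query is just
        # the raw title and relevance_keywords is an empty list.
        logging.info(f"Image query fell back to the title: '{search_query}'")
    # get_image still returns four values, per the hunk above.
    return get_image(search_query)
```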
@@ -655,6 +648,7 @@ def summarize_with_gpt4o(content, source_name, link, interest_score=0, extra_pro
     full_prompt = (
         f"{prompt}\n\n"
+        f"Do not include the article title in the summary.\n\n"
         f"{extra_prompt}\n\n"
         f"Avoid using the word 'elevate'—use more humanized language like 'level up' or 'bring to life'.\n"
         f"Content to summarize:\n{content}\n\n"
@@ -673,6 +667,10 @@ def summarize_with_gpt4o(content, source_name, link, interest_score=0, extra_pro
         )
         summary = response.choices[0].message.content.strip()
+        # The title can still slip into the summary despite the prompt instruction.
+        # Stripping it here would require passing the title in as a parameter, so
+        # removal is handled by the calling script (foodie_automator_rss.py),
+        # which has the original entry.title in scope.
         logging.info(f"Processed summary (Persona: {persona}): {summary}")
         return summary
@@ -682,13 +684,12 @@ def summarize_with_gpt4o(content, source_name, link, interest_score=0, extra_pro
 def insert_link_naturally(summary, source_name, source_url):
     try:
-        # Log the input summary to debug its structure
         logging.info(f"Input summary to insert_link_naturally: {summary!r}")
         prompt = (
             "Take this summary and insert a single HTML link naturally into one paragraph (randomly chosen). "
             "Use the format '<a href=\"{source_url}\">{source_name}</a>' and weave it into the text seamlessly, "
-            "e.g., 'The latest scoop from {source_name} reveals...' or '{source_name} uncovers this wild shift.' "
+            "e.g., 'The latest scoop from {source_name} reveals...' or '{source_name} shares this insight.' "
             "Vary the phrasing creatively to avoid repetition (don’t always use 'dives into'). "
             "Place the link at a sentence boundary (after a period, not within numbers like '6.30am' or '1.5'). "
             "Maintain the original tone, flow, and paragraph structure, preserving all existing newlines exactly as they are. "
@@ -711,10 +712,7 @@ def insert_link_naturally(summary, source_name, source_url):
         new_summary = response.choices[0].message.content.strip()
         link_pattern = f'<a href="{source_url}">{source_name}</a>'
         if new_summary and new_summary.count(link_pattern) == 1:
-            # Normalize paragraph separation to ensure a single \n break
-            # Split by newlines, but do not filter out paragraphs to preserve the count
             paragraphs = new_summary.split('\n')
-            # Strip each paragraph, but keep all paragraphs even if empty
             paragraphs = [p.strip() for p in paragraphs]
             new_summary = '\n'.join(paragraphs)
             logging.info(f"Summary with naturally embedded link (normalized): {new_summary!r}")
@@ -733,11 +731,12 @@ def insert_link_naturally(summary, source_name, source_url):
             return summary
         target_para = random.choice([p for p in paragraphs if p.strip()])
+        link_pattern = f'<a href="{source_url}">{source_name}</a>'
         phrases = [
-            f"The scoop from {link_pattern} spills the details",
-            f"{link_pattern} uncovers this wild shift",
-            f"This gem via {link_pattern} drops some truth",
-            f"{link_pattern} breaks down the buzz"
+            f"Learn more from {link_pattern}",
+            f"{link_pattern} shares this insight",
+            f"Discover more at {link_pattern}",
+            f"Check out {link_pattern} for details"
         ]
         insertion_phrase = random.choice(phrases)
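
The hunk ends before the chosen phrase is actually spliced in. A minimal sketch of how the fallback insertion might complete, assuming the phrase is appended to the chosen paragraph as its own sentence (that final step is not shown in the diff):

```python
import random

def fallback_insert(summary, source_name, source_url):
    # Mirrors the fallback branch above; only the final splice is assumed.
    paragraphs = summary.split('\n')
    target_para = random.choice([p for p in paragraphs if p.strip()])
    link_pattern = f'<a href="{source_url}">{source_name}</a>'
    phrases = [
        f"Learn more from {link_pattern}",
        f"{link_pattern} shares this insight",
        f"Discover more at {link_pattern}",
        f"Check out {link_pattern} for details",
    ]
    insertion_phrase = random.choice(phrases)
    # Assumption: attach the phrase as its own sentence at the paragraph's end.
    # (If identical paragraphs exist, each copy would get the phrase.)
    return '\n'.join(
        f"{p.rstrip()} {insertion_phrase}." if p == target_para else p
        for p in paragraphs
    )
```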
