Fix insert_link_naturally: rewrite the chosen sentence with GPT, falling back to manual link insertion

This commit is contained in:
2025-05-13 19:09:26 +10:00
parent 5f38374abd
commit 79cc367579
4 changed files with 286 additions and 557 deletions
+49 -14
View File
@@ -615,23 +615,28 @@ def insert_link_naturally(summary, source_name, source_url):
try:
logging.info(f"Input summary to insert_link_naturally: {summary!r}")
# Split summary into paragraphs using \n\n (correct separator)
# Split summary into paragraphs using \n\n
paragraphs = summary.split('\n\n')
if not paragraphs or all(not p.strip() for p in paragraphs):
logging.error("No valid paragraphs to insert link.")
return summary
return append_link_as_fallback(summary, source_name, source_url)
# Find paragraphs with at least two sentences
eligible_paragraphs = [p for p in paragraphs if p.strip() and len(re.split(r'(?<=[.!?])\s+', p.strip())) >= 2]
eligible_paragraphs = [
p for p in paragraphs
if p.strip() and len(re.split(r'(?<=[.!?])\s+', p.strip())) >= 2
]
if not eligible_paragraphs:
logging.warning("No paragraph with multiple sentences found, using fallback.")
return append_link_as_fallback(summary, source_name, source_url)
# Alternative phrases for variety
# Alternative phrases for manual insertion (as a fallback)
link_phrases = [
"according to {source}",
"as reported by {source}",
"{source} notes that"
"{source} notes that",
"per {source}",
"says {source}"
]
best_candidate = None
@@ -643,22 +648,28 @@ def insert_link_naturally(summary, source_name, source_url):
sentences = re.split(r'(?<=[.!?])\s+', para.strip())
eligible_sentences = [
(i, s) for i, s in enumerate(sentences)
if s.strip() and not s.endswith('?') # Exclude sentences ending with '?'
and not s.endswith('!') # Exclude exclamations for smoother integration
if s.strip()
and not s.endswith('?') # Exclude questions
and not s.endswith('!') # Exclude exclamations
and '<a href=' not in s # Avoid sentences with existing links
and len(s.split()) >= 5 # Require at least 5 words (shorter sentences are excluded)
]
if not eligible_sentences:
continue
# Score sentences based on suitability (prefer declarative sentences)
# Score sentences based on suitability
for idx, sentence in eligible_sentences:
score = 0
# Favor sentences with factual content (simplified heuristic)
# Favor sentences with factual content
if any(word in sentence.lower() for word in ["is", "are", "has", "shows", "reveals"]):
score += 2
# Prefer longer sentences for better context
score += len(sentence.split()) // 5
# Prefer middle sentences for natural flow
score += abs(idx - len(sentences) / 2) * -1 # Penalize sentences far from the middle
# Boost score for sentences that already mention the source name
if source_name.lower() in sentence.lower():
score += 3
if score > best_score:
best_score = score
@@ -669,14 +680,38 @@ def insert_link_naturally(summary, source_name, source_url):
logging.warning("No suitable sentence found, using fallback.")
return append_link_as_fallback(summary, source_name, source_url)
# Select a link phrase based on sentence structure
# Select a link phrase for fallback manual insertion
sentence_idx, sentence = best_candidate
link_phrase = random.choice(link_phrases)
link_pattern = f'<a href="{source_url}">{source_name}</a>'
formatted_link = link_phrase.format(source=link_pattern)
# Insert the link at the end of the selected sentence (no capitalization needed)
new_sentence = f"{sentence.rstrip('.')} {formatted_link}."
# Use GPT to rewrite the sentence with the link
prompt = (
f"Rewrite the following sentence to naturally include a reference to the source '{source_name}' "
f"with a hyperlink in HTML format: <a href=\"{source_url}\">{source_name}</a>. "
"Integrate the link into the sentence seamlessly, maintaining the original tone and style. "
"Do not add extra sentences, change the meaning, or include additional punctuation like a trailing period. "
"Return only the rewritten sentence."
)
response = client.chat.completions.create(
model=LIGHT_TASK_MODEL,
messages=[
{"role": "system", "content": prompt},
{"role": "user", "content": sentence}
],
max_tokens=100,
temperature=0.7
)
new_sentence = response.choices[0].message.content.strip()
if not new_sentence or '<a href=' not in new_sentence:
logging.warning("GPT failed to rewrite sentence, using manual insertion")
new_sentence = f"{sentence.rstrip('.')} {formatted_link}."
else:
# Ensure the sentence ends with a period if the original did
if sentence.rstrip().endswith('.'):
new_sentence = new_sentence.rstrip('.') + '.'
sentences[sentence_idx] = new_sentence
new_para = ' '.join(sentences)
paragraphs[paragraphs.index(best_paragraph)] = new_para
@@ -838,12 +873,12 @@ def post_to_wp(post_data, category, link, author, image_url, original_source, im
tags.append(picks_tag_id)
logger.info(f"Added 'Picks' tag (ID: {picks_tag_id}) due to high interest score: {interest_score}")
# Format content with <p> tags
# Format content with <p> tags, splitting on \n\n to match summary format
content = post_data["content"]
if content is None:
logger.error(f"Post content is None for title '{post_data['title']}' - using fallback")
content = "Content unavailable. Check the original source for details."
formatted_content = "\n".join(f"<p>{para}</p>" for para in content.split('\n') if para.strip())
formatted_content = "\n".join(f"<p>{para}</p>" for para in content.split('\n\n') if para.strip())
# Upload image before posting
image_id = None