fix(title): strip markdown labels and skip empty placeholders in auto-title (#611)

Squash-merges PR #611 (@franksong2702). Fixes two edge cases in auto-generated session titles.

1. Strip Markdown labels (`**Session Title:**`, `Title:`) from sanitizer output — these were being persisted verbatim when the LLM emitted them.
2. Skip empty assistant tool-call placeholder messages when extracting the first exchange for title generation — previously the empty row could be latched onto instead of the first real answer.

Also tightens the title prompt to explicitly forbid Markdown, bullets, and label prefixes.

1371 tests passing, QA harness green.

Co-authored-by: Frank Song <franksong2702@gmail.com>
This commit is contained in:
franksong2702
2026-04-17 09:51:00 +08:00
committed by GitHub
parent 2484409b7a
commit 692ba68e42
2 changed files with 47 additions and 2 deletions

View File

@@ -76,8 +76,14 @@ def _strip_thinking_markup(text: str) -> str:
def _sanitize_generated_title(text: str) -> str: def _sanitize_generated_title(text: str) -> str:
"""Sanitize LLM-generated title text before persisting to session.""" """Sanitize LLM-generated title text before persisting to session."""
s = _strip_thinking_markup(text or '') s = _strip_thinking_markup(text or '')
s = re.sub(
r'^\s*(?:[*_`~]+\s*)?(?:session\s+title|title)\s*[:]\s*(?:[*_`~]+\s*)?',
'',
s,
flags=re.IGNORECASE,
)
s = re.sub(r'^\s*title\s*:\s*', '', s, flags=re.IGNORECASE) s = re.sub(r'^\s*title\s*:\s*', '', s, flags=re.IGNORECASE)
s = s.strip(" \t\r\n\"'`") s = s.strip(" \t\r\n\"'`*_~")
s = re.sub(r'\s+', ' ', s).strip() s = re.sub(r'\s+', ' ', s).strip()
# Guard against chain-of-thought leakage and meta-reasoning patterns. # Guard against chain-of-thought leakage and meta-reasoning patterns.
if _looks_invalid_generated_title(s): if _looks_invalid_generated_title(s):
@@ -128,7 +134,9 @@ def _first_exchange_snippets(messages):
if role == 'user' and not user_text: if role == 'user' and not user_text:
user_text = _message_text(m.get('content')) user_text = _message_text(m.get('content'))
elif role == 'assistant' and not asst_text: elif role == 'assistant' and not asst_text:
asst_text = _message_text(m.get('content')) candidate = _message_text(m.get('content'))
if candidate:
asst_text = candidate
if user_text and asst_text: if user_text and asst_text:
break break
return user_text[:500], asst_text[:500] return user_text[:500], asst_text[:500]
@@ -149,6 +157,7 @@ def _title_prompts(user_text: str, assistant_text: str) -> tuple[str, list[str]]
"Generate a short session title from this conversation start.\n" "Generate a short session title from this conversation start.\n"
"Use BOTH the user's question and the assistant's visible answer.\n" "Use BOTH the user's question and the assistant's visible answer.\n"
"Return only the title text, 3-8 words, as a topic label.\n" "Return only the title text, 3-8 words, as a topic label.\n"
"Do not use markdown, bullets, labels, or prefixes like Session Title:.\n"
"Do not output a full sentence.\n" "Do not output a full sentence.\n"
"Do not output acknowledgements or completion phrases like OK, done, all set, 测试完成.\n" "Do not output acknowledgements or completion phrases like OK, done, all set, 测试完成.\n"
"Do not describe internal reasoning.\n" "Do not describe internal reasoning.\n"
@@ -159,6 +168,7 @@ def _title_prompts(user_text: str, assistant_text: str) -> tuple[str, list[str]]
"Rewrite this conversation start as a concise noun-phrase title.\n" "Rewrite this conversation start as a concise noun-phrase title.\n"
"Use the actual topic, not the task outcome.\n" "Use the actual topic, not the task outcome.\n"
"Return title text only.\n" "Return title text only.\n"
"Do not use markdown, bullets, labels, or prefixes like Session Title:.\n"
"Never output acknowledgements, completion status, or meta commentary." "Never output acknowledgements, completion status, or meta commentary."
), ),
] ]

View File

@@ -0,0 +1,35 @@
import unittest
from api.streaming import _first_exchange_snippets, _sanitize_generated_title
class TestGeneratedTitleSanitization(unittest.TestCase):
    """Regression tests for generated-title cleanup and first-exchange extraction."""

    def test_strips_session_title_markdown_prefix(self):
        """A bold ``**Session Title:**`` label is dropped from the title."""
        raw_title = "**Session Title:** Clarifying Topic for Discussion"
        cleaned = _sanitize_generated_title(raw_title)
        self.assertEqual(cleaned, "Clarifying Topic for Discussion")

    def test_strips_plain_title_prefix(self):
        """A plain ``Title:`` label is dropped from the title."""
        raw_title = "Title: Clarifying Topic for Discussion"
        cleaned = _sanitize_generated_title(raw_title)
        self.assertEqual(cleaned, "Clarifying Topic for Discussion")

    def test_strips_wrapping_markdown_emphasis(self):
        """Bold markers wrapped around the whole title are removed."""
        raw_title = "**Clarifying Topic for Discussion**"
        cleaned = _sanitize_generated_title(raw_title)
        self.assertEqual(cleaned, "Clarifying Topic for Discussion")

    def test_first_exchange_skips_empty_assistant_tool_call_placeholder(self):
        """An empty assistant tool-call row must not be taken as the answer."""
        user_turn = {"role": "user", "content": "What time is it in San Francisco?"}
        # Assistant row with no visible text: it only carries the tool call.
        placeholder_turn = {"role": "assistant", "content": "", "tool_calls": [{"id": "call_1"}]}
        tool_turn = {"role": "tool", "content": "tool output", "tool_call_id": "call_1"}
        answer_turn = {"role": "assistant", "content": "It is 6:16 PM in San Francisco."}
        conversation = [user_turn, placeholder_turn, tool_turn, answer_turn]

        expected = (
            "What time is it in San Francisco?",
            "It is 6:16 PM in San Francisco.",
        )
        self.assertEqual(_first_exchange_snippets(conversation), expected)