fix(title): strip markdown labels and skip empty placeholders in auto-title (#611)

Squash-merges PR #611 (@franksong2702). Fixes two edge cases in auto-generated session titles. 1. Strips Markdown labels (`**Session Title:**`, `Title:`) and wrapping emphasis markers from LLM-generated titles in the sanitizer — these were being persisted verbatim when the LLM emitted them. 2. Skips empty assistant tool-call placeholder messages when extracting the first exchange for title generation — previously the empty row could be latched onto instead of the first real answer. Also tightens the title prompts to explicitly forbid Markdown, bullets, and label prefixes. 1371 tests passing, QA harness green. Co-authored-by: Frank Song <franksong2702@gmail.com>
2026-04-17 09:51:00 +08:00
parent 2484409b7a
commit 692ba68e42
2 changed files with 47 additions and 2 deletions
--- a/api/streaming.py
+++ b/api/streaming.py
@@ -76,8 +76,14 @@ def _strip_thinking_markup(text: str) -> str:
 def _sanitize_generated_title(text: str) -> str:
    """Sanitize LLM-generated title text before persisting to session."""
    s = _strip_thinking_markup(text or '')
    s = re.sub(
        r'^\s*(?:[*_`~]+\s*)?(?:session\s+title|title)\s*[:：]\s*(?:[*_`~]+\s*)?',
        '',
        s,
        flags=re.IGNORECASE,
    )
    s = re.sub(r'^\s*title\s*:\s*', '', s, flags=re.IGNORECASE)
-    s = s.strip(" \t\r\n\"'`")
+    s = s.strip(" \t\r\n\"'`*_~")
    s = re.sub(r'\s+', ' ', s).strip()
    # Guard against chain-of-thought leakage and meta-reasoning patterns.
    if _looks_invalid_generated_title(s):
@@ -128,7 +134,9 @@ def _first_exchange_snippets(messages):
        if role == 'user' and not user_text:
            user_text = _message_text(m.get('content'))
        elif role == 'assistant' and not asst_text:
-            asst_text = _message_text(m.get('content'))
+            candidate = _message_text(m.get('content'))
            if candidate:
                asst_text = candidate
        if user_text and asst_text:
            break
    return user_text[:500], asst_text[:500]
@@ -149,6 +157,7 @@ def _title_prompts(user_text: str, assistant_text: str) -> tuple[str, list[str]]
            "Generate a short session title from this conversation start.\n"
            "Use BOTH the user's question and the assistant's visible answer.\n"
            "Return only the title text, 3-8 words, as a topic label.\n"
            "Do not use markdown, bullets, labels, or prefixes like Session Title:.\n"
            "Do not output a full sentence.\n"
            "Do not output acknowledgements or completion phrases like OK, done, all set, 测试完成.\n"
            "Do not describe internal reasoning.\n"
@@ -159,6 +168,7 @@ def _title_prompts(user_text: str, assistant_text: str) -> tuple[str, list[str]]
            "Rewrite this conversation start as a concise noun-phrase title.\n"
            "Use the actual topic, not the task outcome.\n"
            "Return title text only.\n"
            "Do not use markdown, bullets, labels, or prefixes like Session Title:.\n"
            "Never output acknowledgements, completion status, or meta commentary."
        ),
    ]
--- a/tests/test_title_sanitization.py
+++ b/tests/test_title_sanitization.py
@@ -0,0 +1,35 @@
 import unittest
 from api.streaming import _first_exchange_snippets, _sanitize_generated_title
 class TestGeneratedTitleSanitization(unittest.TestCase):
    def test_strips_session_title_markdown_prefix(self):
        self.assertEqual(
            _sanitize_generated_title("**Session Title:** Clarifying Topic for Discussion"),
            "Clarifying Topic for Discussion",
        )
    def test_strips_plain_title_prefix(self):
        self.assertEqual(
            _sanitize_generated_title("Title: Clarifying Topic for Discussion"),
            "Clarifying Topic for Discussion",
        )
    def test_strips_wrapping_markdown_emphasis(self):
        self.assertEqual(
            _sanitize_generated_title("**Clarifying Topic for Discussion**"),
            "Clarifying Topic for Discussion",
        )
    def test_first_exchange_skips_empty_assistant_tool_call_placeholder(self):
        messages = [
            {"role": "user", "content": "What time is it in San Francisco?"},
            {"role": "assistant", "content": "", "tool_calls": [{"id": "call_1"}]},
            {"role": "tool", "content": "tool output", "tool_call_id": "call_1"},
            {"role": "assistant", "content": "It is 6:16 PM in San Francisco."},
        ]
        self.assertEqual(
            _first_exchange_snippets(messages),
            ("What time is it in San Francisco?", "It is 6:16 PM in San Francisco."),
        )