From 692ba68e42f5a484920a59c8f43dedf12b094471 Mon Sep 17 00:00:00 2001
From: franksong2702 <138988108+franksong2702@users.noreply.github.com>
Date: Fri, 17 Apr 2026 09:51:00 +0800
Subject: [PATCH] fix(title): strip markdown labels and skip empty placeholders
 in auto-title (#611)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Squash-merges PR #611 (@franksong2702). Fixes two edge cases in auto-generated session titles.

1. Strip Markdown labels (`**Session Title:**`, `Title:`) and leading/trailing emphasis markers (`*`, `_`, `~`) from LLM-generated titles in the sanitizer — these were being persisted verbatim when the LLM emitted them.
2. Skip empty assistant tool-call placeholder messages when extracting the first exchange for title generation — previously the empty row could be latched onto instead of the first real answer.

Also tightens both title-generation prompts to explicitly forbid Markdown, bullets, and label prefixes such as `Session Title:`.

1371 tests passing, QA harness green.

Co-authored-by: Frank Song <franksong2702@gmail.com>
---
 api/streaming.py                 | 14 +++++++++++--
 tests/test_title_sanitization.py | 35 ++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_title_sanitization.py

diff --git a/api/streaming.py b/api/streaming.py
index 0e1d6a4..fb0eba7 100644
--- a/api/streaming.py
+++ b/api/streaming.py
@@ -76,8 +76,14 @@ def _strip_thinking_markup(text: str) -> str:
 def _sanitize_generated_title(text: str) -> str:
     """Sanitize LLM-generated title text before persisting to session."""
     s = _strip_thinking_markup(text or '')
+    s = re.sub(
+        r'^\s*(?:[*_`~]+\s*)?(?:session\s+title|title)\s*[:：]\s*(?:[*_`~]+\s*)?',
+        '',
+        s,
+        flags=re.IGNORECASE,
+    )
     s = re.sub(r'^\s*title\s*:\s*', '', s, flags=re.IGNORECASE)
-    s = s.strip(" \t\r\n\"'`")
+    s = s.strip(" \t\r\n\"'`*_~")
     s = re.sub(r'\s+', ' ', s).strip()
     # Guard against chain-of-thought leakage and meta-reasoning patterns.
     if _looks_invalid_generated_title(s):
@@ -128,7 +134,9 @@ def _first_exchange_snippets(messages):
         if role == 'user' and not user_text:
             user_text = _message_text(m.get('content'))
         elif role == 'assistant' and not asst_text:
-            asst_text = _message_text(m.get('content'))
+            candidate = _message_text(m.get('content'))
+            if candidate:
+                asst_text = candidate
         if user_text and asst_text:
             break
     return user_text[:500], asst_text[:500]
@@ -149,6 +157,7 @@ def _title_prompts(user_text: str, assistant_text: str) -> tuple[str, list[str]]
             "Generate a short session title from this conversation start.\n"
             "Use BOTH the user's question and the assistant's visible answer.\n"
             "Return only the title text, 3-8 words, as a topic label.\n"
+            "Do not use markdown, bullets, labels, or prefixes like Session Title:.\n"
             "Do not output a full sentence.\n"
             "Do not output acknowledgements or completion phrases like OK, done, all set, 测试完成.\n"
             "Do not describe internal reasoning.\n"
@@ -159,6 +168,7 @@ def _title_prompts(user_text: str, assistant_text: str) -> tuple[str, list[str]]
             "Rewrite this conversation start as a concise noun-phrase title.\n"
             "Use the actual topic, not the task outcome.\n"
             "Return title text only.\n"
+            "Do not use markdown, bullets, labels, or prefixes like Session Title:.\n"
             "Never output acknowledgements, completion status, or meta commentary."
         ),
     ]
diff --git a/tests/test_title_sanitization.py b/tests/test_title_sanitization.py
new file mode 100644
index 0000000..89f00f4
--- /dev/null
+++ b/tests/test_title_sanitization.py
@@ -0,0 +1,35 @@
+import unittest
+
+from api.streaming import _first_exchange_snippets, _sanitize_generated_title
+
+
+class TestGeneratedTitleSanitization(unittest.TestCase):
+    def test_strips_session_title_markdown_prefix(self):
+        self.assertEqual(
+            _sanitize_generated_title("**Session Title:** Clarifying Topic for Discussion"),
+            "Clarifying Topic for Discussion",
+        )
+
+    def test_strips_plain_title_prefix(self):
+        self.assertEqual(
+            _sanitize_generated_title("Title: Clarifying Topic for Discussion"),
+            "Clarifying Topic for Discussion",
+        )
+
+    def test_strips_wrapping_markdown_emphasis(self):
+        self.assertEqual(
+            _sanitize_generated_title("**Clarifying Topic for Discussion**"),
+            "Clarifying Topic for Discussion",
+        )
+
+    def test_first_exchange_skips_empty_assistant_tool_call_placeholder(self):
+        messages = [
+            {"role": "user", "content": "What time is it in San Francisco?"},
+            {"role": "assistant", "content": "", "tool_calls": [{"id": "call_1"}]},
+            {"role": "tool", "content": "tool output", "tool_call_id": "call_1"},
+            {"role": "assistant", "content": "It is 6:16 PM in San Francisco."},
+        ]
+        self.assertEqual(
+            _first_exchange_snippets(messages),
+            ("What time is it in San Francisco?", "It is 6:16 PM in San Francisco."),
+        )