From 00eb13b3167b383870670b07d11cf1bf00e1db47 Mon Sep 17 00:00:00 2001
From: nesquena-hermes <nesquena+hermes@gmail.com>
Date: Mon, 13 Apr 2026 15:43:01 -0700
Subject: [PATCH] fix: unicode filenames in Content-Disposition headers (#378)

* Fix unicode filenames in file download headers

* docs: v0.50.19 CHANGELOG entry for unicode filename fix (PR #378)

* docs: fix test count in v0.50.19 CHANGELOG (924 not 926)

---------

Co-authored-by: shaoxianbilly <40623436+shaoxianbilly@users.noreply.github.com>
Co-authored-by: Nathan Esquenazi <nesquena@gmail.com>
---
 CHANGELOG.md           |  5 ++++
 api/routes.py          | 30 ++++++++++++++++++++----
 tests/test_sprint29.py | 53 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index df60a8f..3e107c9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,11 @@
 
 ---
 
+## [v0.50.19] Fix UnicodeEncodeError when downloading files with non-ASCII filenames (PR #378)
+
+- **Workspace file downloads no longer crash for Unicode filenames** (`api/routes.py`): Clicking a PDF or other file with Chinese, Japanese, Arabic, or other non-ASCII characters in its name caused a `UnicodeEncodeError` because Python's HTTP server requires header values to be latin-1 encodable. A new `_content_disposition_value(disposition, filename)` helper centralises `Content-Disposition` generation: it strips CR/LF (injection guard), builds an ASCII fallback for the legacy `filename=` parameter (non-ASCII characters, control characters, double quotes, and backslashes are replaced with `_`; if nothing remains after trimming spaces and dots, `download` plus the sanitised extension is used), and preserves the full UTF-8 name in `filename*=UTF-8''...` per RFC 5987. Both `attachment` and `inline` responses use it.
+  - 2 new integration tests in `tests/test_sprint29.py` covering Chinese filenames for both download and inline responses, verifying the header is latin-1 encodable and `filename*=UTF-8''` is present; 924 tests total (up from 922)
+
 ## [v0.50.18] Recover from invalid default workspace paths (PR #366)
 
 - **WebUI no longer breaks when the configured default workspace is unavailable** (`api/config.py`): The workspace resolution path was refactored into three composable functions — `_workspace_candidates()`, `_ensure_workspace_dir()`, and `resolve_default_workspace()`. When the configured workspace (from env var, settings file, or passed path) cannot be created or accessed, the server falls back through an ordered priority list: `HERMES_WEBUI_DEFAULT_WORKSPACE` env var → `~/workspace` (if exists) → `~/work` (if exists) → `~/workspace` (create it) → `STATE_DIR/workspace`.
diff --git a/api/routes.py b/api/routes.py
index 4875ffe..861a9b6 100644
--- a/api/routes.py
+++ b/api/routes.py
@@ -1302,6 +1302,29 @@ def _handle_gateway_sse_stream(handler):
     return True
 
 
+def _content_disposition_value(disposition: str, filename: str) -> str:
+    """Build a latin-1-safe Content-Disposition value with RFC 5987 filename*."""
+    import urllib.parse as _up
+
+    safe_name = Path(filename).name.replace("\r", "").replace("\n", "")  # final path component only; CR/LF removed so the name cannot inject extra header lines
+    ascii_fallback = "".join(
+        ch if 32 <= ord(ch) < 127 and ch not in {'"', '\\'} else "_"  # keep printable ASCII; `"` and `\` would break the quoted-string, so they become "_" too
+        for ch in safe_name
+    ).strip(" .")  # trim leading/trailing spaces and dots
+    if not ascii_fallback:  # nothing usable left: fall back to a generic name
+        suffix = Path(safe_name).suffix
+        ascii_suffix = "".join(
+            ch if 32 <= ord(ch) < 127 and ch not in {'"', '\\'} else "_"  # same sanitisation as above, applied to the extension
+            for ch in suffix
+        )
+        ascii_fallback = f"download{ascii_suffix}" if ascii_suffix else "download"
+    quoted_name = _up.quote(safe_name, safe="")  # percent-encoded UTF-8; safe="" means "/" is escaped as well
+    return (
+        f'{disposition}; filename="{ascii_fallback}"; '  # legacy ASCII-only parameter
+        f"filename*=UTF-8''{quoted_name}"  # extended parameter carrying the full original name
+    )
+
+
 def _handle_file_raw(handler, parsed):
     qs = parse_qs(parsed.query)
     sid = qs.get("session_id", [""])[0]
@@ -1319,9 +1342,6 @@ def _handle_file_raw(handler, parsed):
     ext = target.suffix.lower()
     mime = MIME_MAP.get(ext, "application/octet-stream")
     raw_bytes = target.read_bytes()
-    import urllib.parse as _up
-
-    safe_name = _up.quote(target.name, safe="")
     handler.send_response(200)
     handler.send_header("Content-Type", mime)
     handler.send_header("Content-Length", str(len(raw_bytes)))
@@ -1331,12 +1351,12 @@ def _handle_file_raw(handler, parsed):
     if force_download or mime in dangerous_types:
         handler.send_header(
             "Content-Disposition",
-            f"attachment; filename=\"{target.name}\"; filename*=UTF-8''{safe_name}",
+            _content_disposition_value("attachment", target.name),
         )
     else:
         handler.send_header(
             "Content-Disposition",
-            f"inline; filename=\"{target.name}\"; filename*=UTF-8''{safe_name}",
+            _content_disposition_value("inline", target.name),
         )
     handler.end_headers()
     handler.wfile.write(raw_bytes)
diff --git a/tests/test_sprint29.py b/tests/test_sprint29.py
index c80903e..5a1ecd5 100644
--- a/tests/test_sprint29.py
+++ b/tests/test_sprint29.py
@@ -21,6 +21,7 @@ import pathlib
 import sys
 import time
 import urllib.error
+import urllib.parse
 import urllib.request
 
 sys.path.insert(0, str(pathlib.Path(__file__).parent))
@@ -51,6 +52,12 @@ def post(path, body=None, headers=None):
         return json.loads(e.read()), e.code
 
 
+def get_raw_with_headers(path):  # GET BASE + path; returns (body bytes, headers dict, HTTP status)
+    req = urllib.request.Request(BASE + path)
+    with urllib.request.urlopen(req, timeout=10) as r:  # 10 s timeout; response is closed on exit
+        return r.read(), dict(r.headers.items()), r.status  # headers flattened into a plain dict
+
+
 # ── 1. CSRF Protection ─────────────────────────────────────────────────────
 
 
@@ -550,6 +557,52 @@ class TestContentDisposition:
         assert "image/svg+xml" in src
         assert "dangerous_types" in src
 
+    def test_unicode_filename_download_header_is_latin1_safe(self, cleanup_test_sessions):
+        """Unicode filenames must not crash download responses."""
+        body, status = post("/api/session/new", {})  # create a fresh session to get a workspace
+        assert status == 200, body
+        sid = body["session"]["session_id"]
+        cleanup_test_sessions.append(sid)  # presumably the fixture removes these sessions on teardown - confirm in conftest
+        ws = pathlib.Path(body["session"]["workspace"])
+        filename = "中文对照表.pdf"
+        pdf_bytes = b"%PDF-1.3\n1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF\n"
+        (ws / filename).write_bytes(pdf_bytes)  # place the non-ASCII-named file in the session workspace
+
+        encoded = urllib.parse.quote(filename)  # percent-encode the name for use in the query string
+        raw, headers, raw_status = get_raw_with_headers(
+            f"/api/file/raw?session_id={sid}&path={encoded}&download=1"  # download=1 requests the attachment disposition asserted below
+        )
+
+        assert raw_status == 200
+        assert raw == pdf_bytes  # body must round-trip unchanged
+        disp = headers["Content-Disposition"]
+        assert disp.startswith("attachment; ")
+        assert "filename*=UTF-8''" in disp
+        disp.encode("latin-1")  # raises UnicodeEncodeError (failing the test) if the header is not latin-1 encodable
+
+    def test_unicode_filename_inline_header_is_latin1_safe(self, cleanup_test_sessions):
+        """Inline responses must also work for unicode filenames."""
+        body, status = post("/api/session/new", {})  # create a fresh session to get a workspace
+        assert status == 200, body
+        sid = body["session"]["session_id"]
+        cleanup_test_sessions.append(sid)  # presumably the fixture removes these sessions on teardown - confirm in conftest
+        ws = pathlib.Path(body["session"]["workspace"])
+        filename = "预览.pdf"
+        pdf_bytes = b"%PDF-1.3\n1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF\n"
+        (ws / filename).write_bytes(pdf_bytes)  # place the non-ASCII-named file in the session workspace
+
+        encoded = urllib.parse.quote(filename)  # percent-encode the name for use in the query string
+        raw, headers, raw_status = get_raw_with_headers(
+            f"/api/file/raw?session_id={sid}&path={encoded}"  # no download flag: the inline disposition is asserted below
+        )
+
+        assert raw_status == 200
+        assert raw == pdf_bytes  # body must round-trip unchanged
+        disp = headers["Content-Disposition"]
+        assert disp.startswith("inline; ")
+        assert "filename*=UTF-8''" in disp
+        disp.encode("latin-1")  # raises UnicodeEncodeError (failing the test) if the header is not latin-1 encodable
+
 
 # ── 9. PBKDF2 Password Hashing ───────────────────────────────────────────