Merge pull request #276 from CJackHwang/dev

Fix citation link mapping for duplicate URLs and unstable cite_index
Initialize LICENSE file
2026-05-03 16:05:26 +08:00 · 2026-04-21 18:46:10 +08:00 · 2026-04-21 11:53:38 +08:00 · 2026-04-21 11:43:36 +08:00 · 2026-04-21 11:42:41 +08:00 · 2026-04-21 10:52:17 +08:00
341 changed files with 92659 additions and 8147 deletions
--- a/.env.example
+++ b/.env.example
@@ -1,5 +1,8 @@
 # DS2API runtime
+# Runtime listen port inside the app/container
 PORT=5001
+# Docker Compose host port (compose only; container still listens on PORT)
+DS2API_HOST_PORT=6011
 LOG_LEVEL=INFO

 # Admin authentication
@@ -10,6 +13,8 @@ DS2API_ADMIN_KEY=change-me
 DS2API_CONFIG_PATH=/app/config.json
 # 2) inline JSON or Base64 JSON
 # DS2API_CONFIG_JSON=
+# 3) legacy compatibility alias
+# CONFIG_JSON=

 # Optional: static admin assets path
 # DS2API_STATIC_ADMIN_DIR=/app/static/admin
--- a/.github/workflows/quality-gates.yml
+++ b/.github/workflows/quality-gates.yml
@@ -19,7 +19,7 @@ jobs:
      - name: Setup Go
        uses: actions/setup-go@v5
        with:
-          go-version: "1.24.x"
+          go-version: "1.26.x"

      - name: Setup Node
        uses: actions/setup-node@v4
@@ -28,6 +28,16 @@ jobs:
          cache: "npm"
          cache-dependency-path: webui/package-lock.json

+      - name: Setup golangci-lint
+        uses: golangci/golangci-lint-action@v8
+        with:
+          version: v2.11.4
+          install-mode: binary
+          verify: true
+
+      - name: Go Format & Lint Gates
+        run: ./scripts/lint.sh
+
      - name: Refactor Line Gate
        run: ./tests/scripts/check-refactor-line-gate.sh

--- a/.github/workflows/release-artifacts.yml
+++ b/.github/workflows/release-artifacts.yml
@@ -27,7 +27,7 @@ jobs:
      - name: Setup Go
        uses: actions/setup-go@v5
        with:
-          go-version: "1.24.x"
+          go-version: "1.26.x"

      - name: Setup Node
        uses: actions/setup-node@v4
@@ -79,7 +79,7 @@ jobs:
            CGO_ENABLED=0 GOOS="${GOOS}" GOARCH="${GOARCH}" \
              go build -trimpath -ldflags="-s -w -X ds2api/internal/version.BuildVersion=${BUILD_VERSION}" -o "${STAGE}/${BIN}" ./cmd/ds2api

-            cp config.example.json .env.example sha3_wasm_bg.7b9ca65ddd.wasm LICENSE README.MD README.en.md "${STAGE}/"
+            cp config.example.json .env.example LICENSE README.MD README.en.md "${STAGE}/"
            cp -R static/admin "${STAGE}/static/admin"

            if [ "${GOOS}" = "windows" ]; then
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ config.json
 *.swo
 *~
 .DS_Store
+opencode.json

 # Logs
 *.log
@@ -58,3 +59,6 @@ Thumbs.db
 # Claude Code
 .claude/
 CLAUDE.local.md
+
+# Local tool bootstrap cache
+.tmp/
--- a/.golangci.yml
+++ b/.golangci.yml
@@ -0,0 +1,73 @@
+version: "2"
+
+run:
+  tests: true
+
+linters:
+  default: standard
+  enable:
+    - errcheck
+    - govet
+    - ineffassign
+    - staticcheck
+    - unused
+  settings:
+    dupl:
+      threshold: 100
+    goconst:
+      min-len: 2
+      min-occurrences: 2
+    gocritic:
+      enabled-tags:
+        - diagnostic
+        - experimental
+        - opinionated
+        - performance
+        - style
+      disabled-checks:
+        - wrapperFunc
+        - rangeValCopy
+        - hugeParam
+    gocyclo:
+      min-complexity: 15
+    lll:
+      line-length: 140
+    misspell:
+      locale: US
+    nakedret:
+      max-func-lines: 30
+    prealloc:
+      simple: true
+      range-loops: true
+      for-loops: false
+  exclusions:
+    generated: lax
+    rules:
+      - path: (.+)\.go$
+        text: "ST1000: at least one file in a package should have a package comment"
+    paths:
+      - third_party$
+      - builtin$
+      - examples$
+      - vendor$
+      - webui/node_modules$
+
+issues:
+  max-issues-per-linter: 0
+  max-same-issues: 0
+
+formatters:
+  enable:
+    - gofmt
+  settings:
+    goimports:
+      local-prefixes:
+        - ds2api
+  exclusions:
+    generated: lax
+    paths:
+      - third_party$
+      - builtin$
+      - examples$
+      - vendor$
+      - webui/node_modules$
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -0,0 +1,23 @@
+# AGENTS.md
+
+These rules apply to all agent-made changes in this repository.
+
+## PR Gate
+
+- Before opening or updating a PR, run the same local gates as `.github/workflows/quality-gates.yml`.
+- Required commands:
+  - `./scripts/lint.sh`
+  - `./tests/scripts/check-refactor-line-gate.sh`
+  - `./tests/scripts/run-unit-all.sh`
+  - `npm run build --prefix webui`
+
+## Go Lint Rules
+
+- Run `gofmt -w` on every changed Go file before commit or push.
+- Do not ignore error returns from I/O-style cleanup calls such as `Close`, `Flush`, `Sync`, or similar methods.
+- If a cleanup error cannot be returned, log it explicitly.
+
+## Change Scope
+
+- Keep changes additive and tightly scoped to the requested feature or bugfix.
+- Do not mix unrelated refactors into feature PRs unless they are required to make the change pass gates.
--- a/API.en.md
+++ b/API.en.md
@@ -4,6 +4,8 @@ Language: [中文](API.md) | [English](API.en.md)

 This document describes the actual behavior of the current Go codebase.

+Docs: [Overview](README.en.md) / [Architecture](docs/ARCHITECTURE.en.md) / [Deployment](docs/DEPLOY.en.md) / [Testing](docs/TESTING.md)
+
 ---

 ## Table of Contents
@@ -31,6 +33,13 @@ This document describes the actual behavior of the current Go codebase.
 | Health probes | `GET /healthz`, `GET /readyz` |
 | CORS | Enabled (`Access-Control-Allow-Origin: *`, allows `Content-Type`, `Authorization`, `X-API-Key`, `X-Ds2-Target-Account`, `X-Vercel-Protection-Bypass`) |

+### 3.0 Adapter-Layer Notes
+
+- OpenAI / Claude / Gemini protocols are now mounted on one shared `chi` router tree assembled in `internal/server/router.go`.
+- Adapter responsibilities are streamlined to: **request normalization → DeepSeek invocation → protocol-shaped rendering**, reducing legacy split-logic paths.
+- Tool-calling semantics are aligned between the Go and Node runtimes: parsing is now centered on XML/Markup-family tool syntax (`<tool_call>` / `<function_call>` / `<invoke>` / `tool_use` / antml variants), with anti-leak filtering applied while streaming.
+- `Admin API` separates static config from runtime policy: `/admin/config*` for configuration state, `/admin/settings*` for runtime behavior.
+
 ---

 ## Configuration Best Practice
@@ -45,7 +54,7 @@ cp config.example.json config.json
 Use it per deployment mode:

 - Local run: read `config.json` directly
- Docker / Vercel: generate Base64 from `config.json`, then set `DS2API_CONFIG_JSON`
+- Docker / Vercel: generate Base64 from `config.json` and set it as `DS2API_CONFIG_JSON`, or paste the raw JSON into `DS2API_CONFIG_JSON` directly

 ```bash
 DS2API_CONFIG_JSON="$(base64 < config.json | tr -d '\n')"
@@ -65,6 +74,7 @@ Two header formats accepted:
 | --- | --- |
 | Bearer Token | `Authorization: Bearer <token>` |
 | API Key Header | `x-api-key: <token>` (no `Bearer` prefix) |
+| Gemini-compatible | `x-goog-api-key: <token>` or `?key=<token>` / `?api_key=<token>` |

 **Auth behavior**:

@@ -72,6 +82,7 @@ Two header formats accepted:
 - Token is not in `config.keys` → **Direct token mode**: treated as a DeepSeek token directly

 **Optional header**: `X-Ds2-Target-Account: <email_or_mobile>` — Pin a specific managed account.
+Gemini-compatible clients can also send `x-goog-api-key`, `?key=`, or `?api_key=` as the caller credential source.

 ### Admin Endpoints (`/admin/*`)

@@ -88,13 +99,16 @@ Two header formats accepted:
 | Method | Path | Auth | Description |
 | --- | --- | --- | --- |
 | GET | `/healthz` | None | Liveness probe |
+| HEAD | `/healthz` | None | Liveness probe (no body) |
 | GET | `/readyz` | None | Readiness probe |
+| HEAD | `/readyz` | None | Readiness probe (no body) |
 | GET | `/v1/models` | None | OpenAI model list |
 | GET | `/v1/models/{id}` | None | OpenAI single-model query (alias accepted) |
 | POST | `/v1/chat/completions` | Business | OpenAI chat completions |
 | POST | `/v1/responses` | Business | OpenAI Responses API (stream/non-stream) |
 | GET | `/v1/responses/{response_id}` | Business | Query stored response (in-memory TTL) |
 | POST | `/v1/embeddings` | Business | OpenAI Embeddings API |
+| POST | `/v1/files` | Business | OpenAI Files upload (multipart/form-data) |
 | GET | `/anthropic/v1/models` | None | Claude model list |
 | POST | `/anthropic/v1/messages` | Business | Claude messages |
 | POST | `/anthropic/v1/messages/count_tokens` | Business | Claude token counting |
@@ -118,19 +132,31 @@ Two header formats accepted:
 | GET | `/admin/config/export` | Admin | Export full config (`config`/`json`/`base64`) |
 | POST | `/admin/keys` | Admin | Add API key |
 | DELETE | `/admin/keys/{key}` | Admin | Delete API key |
+| GET | `/admin/proxies` | Admin | List proxies |
+| POST | `/admin/proxies` | Admin | Add proxy |
+| PUT | `/admin/proxies/{proxyID}` | Admin | Update proxy (empty password keeps old secret) |
+| DELETE | `/admin/proxies/{proxyID}` | Admin | Delete proxy (auto-unbind referenced accounts) |
+| POST | `/admin/proxies/test` | Admin | Test proxy connectivity |
 | GET | `/admin/accounts` | Admin | Paginated account list |
 | POST | `/admin/accounts` | Admin | Add account |
 | DELETE | `/admin/accounts/{identifier}` | Admin | Delete account |
+| PUT | `/admin/accounts/{identifier}/proxy` | Admin | Bind/unbind proxy for an account |
 | GET | `/admin/queue/status` | Admin | Account queue status |
 | POST | `/admin/accounts/test` | Admin | Test one account |
 | POST | `/admin/accounts/test-all` | Admin | Test all accounts |
+| POST | `/admin/accounts/sessions/delete-all` | Admin | Delete all sessions for one account |
 | POST | `/admin/import` | Admin | Batch import keys/accounts |
 | POST | `/admin/test` | Admin | Test API through service |
+| POST | `/admin/dev/raw-samples/capture` | Admin | Fire one request and persist it as a raw sample |
+| GET | `/admin/dev/raw-samples/query` | Admin | Search current in-memory capture chains by prompt keyword |
+| POST | `/admin/dev/raw-samples/save` | Admin | Persist a selected in-memory capture chain as a raw sample |
 | POST | `/admin/vercel/sync` | Admin | Sync config to Vercel |
 | GET | `/admin/vercel/status` | Admin | Vercel sync status |
+| POST | `/admin/vercel/status` | Admin | Vercel sync status / draft compare |
 | GET | `/admin/export` | Admin | Export config JSON/Base64 |
 | GET | `/admin/dev/captures` | Admin | Read local packet-capture entries |
 | DELETE | `/admin/dev/captures` | Admin | Clear local packet-capture entries |
+| GET | `/admin/version` | Admin | Check current version and latest Release |

 ---

@@ -154,7 +180,7 @@ Two header formats accepted:

 ### `GET /v1/models`

-No auth required. Returns supported models.
+No auth required. Returns the currently supported DeepSeek native model list.

 **Response**:

@@ -165,11 +191,21 @@ No auth required. Returns supported models.
    {"id": "deepseek-chat", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
    {"id": "deepseek-reasoner", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
    {"id": "deepseek-chat-search", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
-    {"id": "deepseek-reasoner-search", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []}
+    {"id": "deepseek-reasoner-search", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-expert-chat", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-expert-reasoner", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-expert-chat-search", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-expert-reasoner-search", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-vision-chat", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-vision-reasoner", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-vision-chat-search", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-vision-reasoner-search", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []}
  ]
 }
 ```

+> Note: `/v1/models` returns normalized DeepSeek native model IDs. Common aliases are accepted only as request input and are not expanded as separate items in this endpoint.
+
 ### Model Alias Resolution

 For `chat` / `responses` / `embeddings`, DS2API follows a wide-input/strict-output policy:
@@ -179,6 +215,13 @@ For `chat` / `responses` / `embeddings`, DS2API follows a wide-input/strict-outp
 3. If still unmatched, fall back by known family heuristics (`o*`, `gpt-*`, `claude-*`, etc.).
 4. If still unmatched, return `invalid_request_error`.

+Current built-in default aliases (excerpt):
+
+- OpenAI: `gpt-4o`, `gpt-4.1`, `gpt-4.1-mini`, `gpt-4.1-nano`, `gpt-5`, `gpt-5-mini`, `gpt-5-codex`
+- OpenAI reasoning: `o1`, `o1-mini`, `o3`, `o3-mini`
+- Claude: `claude-sonnet-4-5`, `claude-haiku-4-5`, `claude-opus-4-6` (plus compatibility aliases `claude-3-5-sonnet` / `claude-3-5-haiku` / `claude-3-opus`)
+- Gemini: `gemini-2.5-pro`, `gemini-2.5-flash`
+
 ### `POST /v1/chat/completions`

 **Headers**:
@@ -192,7 +235,7 @@ Content-Type: application/json

 | Field | Type | Required | Notes |
 | --- | --- | --- | --- |
-| `model` | string | ✅ | DeepSeek native models + common aliases (`gpt-4o`, `gpt-5-codex`, `o3`, `claude-sonnet-4-5`, etc.) |
+| `model` | string | ✅ | DeepSeek native models + common aliases (`gpt-5`, `gpt-5-mini`, `gpt-5-codex`, `o3`, `claude-opus-4-6`, `gemini-2.5-pro`, `gemini-2.5-flash`, etc.) |
 | `messages` | array | ✅ | OpenAI-style messages |
 | `stream` | boolean | ❌ | Default `false` |
 | `tools` | array | ❌ | Function calling schema |
@@ -250,6 +293,7 @@ data: [DONE]
 - `deepseek-reasoner` / `deepseek-reasoner-search` models emit `delta.reasoning_content`
 - Text emits `delta.content`
 - Last chunk includes `finish_reason` and `usage`
+- Token usage is passed through from the upstream DeepSeek SSE (`accumulated_token_usage` / `token_usage`) when present; local estimation is used only as a fallback when upstream usage is absent

 #### Tool Calls

@@ -282,7 +326,12 @@ When `tools` is present, DS2API performs anti-leak handling:
 }
 ```

-**Stream**: Once high-confidence toolcall features are matched, DS2API emits `delta.tool_calls` immediately (without waiting for full JSON closure), then keeps sending argument deltas; confirmed raw tool JSON is never forwarded as `delta.content`.
+**Stream**: Once high-confidence toolcall features are matched, DS2API emits `delta.tool_calls` immediately (without waiting for full argument closure), then keeps sending argument deltas; confirmed tool-call fragments are not forwarded as `delta.content`.
+
+Additional notes:
+
+- The parser currently follows XML/Markup-family tool payloads (`<tool_call>`, `<function_call>`, `<invoke>`, `tool_use`, antml variants). Standalone JSON `tool_calls` payloads are not treated as executable tool calls by default.
+- `tool_calls` shown inside fenced Markdown code blocks (for example, a fenced `json` block) are treated as examples, not executable calls.

 ---

@@ -341,7 +390,8 @@ data: [DONE]
 ```

 If `tool_choice=required` is violated in stream mode, DS2API emits `response.failed` then `[DONE]` (no `response.completed`).
-Unknown tool names (outside declared `tools`) are rejected and will not be emitted as valid tool calls.
+
+> Current behavior: the parser tries to extract structured tool calls and does not hard-reject tool names that are missing from the declared `tools`; your tool executor should still validate each call against an allow-list before executing it.

 ### `GET /v1/responses/{response_id}`

@@ -360,11 +410,27 @@ Business auth required. Returns OpenAI-compatible embeddings shape.

 > Requires `embeddings.provider`. Current supported values: `mock` / `deterministic` / `builtin`. If missing/unsupported, returns standard error shape with HTTP 501.

+### `POST /v1/files`
+
+Business auth required. OpenAI Files-compatible upload endpoint; currently only `multipart/form-data` is supported.
+
+| Field | Type | Required | Notes |
+| --- | --- | --- | --- |
+| `file` | file | ✅ | Binary payload |
+| `purpose` | string | ❌ | Forwarded purpose field |
+
+Constraints and behavior:
+
+- `Content-Type` must be `multipart/form-data` (otherwise `400`).
+- Total request size limit is `100 MiB` (over-limit returns `413`).
+- Success returns an OpenAI `file` object (`id/object/bytes/filename/purpose/status`, etc.) and includes `account_id` for source-account tracing.
+
 ---

 ## Claude-Compatible API

 Besides `/anthropic/v1/*`, DS2API also supports shortcut paths: `/v1/messages`, `/messages`, `/v1/messages/count_tokens`, `/messages/count_tokens`.
+Internally, these routes reuse the OpenAI Chat Completions parse-and-translate pipeline, so no divergent parsing chain has to be maintained.

 ### `GET /anthropic/v1/models`

@@ -386,7 +452,7 @@ No auth required.
 }
 ```

-> Note: the example is partial; the real response includes historical Claude 1.x/2.x/3.x/4.x IDs and common aliases.
+> Note: the example is partial; besides the current primary aliases, the real response also includes Claude 4.x snapshots plus historical 3.x / 2.x / 1.x IDs and common aliases.

 ### `POST /anthropic/v1/messages`

@@ -499,6 +565,7 @@ Supported paths:
 - `/v1/models/{model}:streamGenerateContent` (compat path)

 Authentication is the same as other business routes (`Authorization: Bearer <token>` or `x-api-key`).
+Internally, these routes reuse the OpenAI Chat Completions parse-and-translate pipeline, so no divergent parsing chain has to be maintained.

 ### `POST /v1beta/models/{model}:generateContent`

@@ -517,6 +584,7 @@ Returns SSE (`text/event-stream`), each chunk as `data: <json>`:
 - regular text: incremental text chunks
 - `tools` mode: buffered and emitted as `functionCall` at finalize phase
 - final chunk: includes `finishReason: "STOP"` and `usageMetadata`
+- Token usage is passed through from the upstream DeepSeek SSE (`accumulated_token_usage` / `token_usage`) when present; local estimation is used only as a fallback when upstream usage is absent

 ---

@@ -580,6 +648,10 @@ Returns sanitized config.
 ```json
 {
  "keys": ["k1", "k2"],
+  "env_backed": false,
+  "env_source_present": true,
+  "env_writeback_enabled": true,
+  "config_path": "/data/config.json",
  "accounts": [
    {
      "identifier": "user@example.com",
@@ -599,7 +671,7 @@ Returns sanitized config.

 ### `POST /admin/config`

-Updatable fields: `keys`, `accounts`, `claude_mapping`.
+Only updates `keys`, `accounts`, and `claude_mapping`.

 **Request**:

@@ -620,25 +692,29 @@ Updatable fields: `keys`, `accounts`, `claude_mapping`.

 Reads runtime settings and status, including:

- `admin` (JWT expiry, default-password warning, etc.)
- `runtime` (`account_max_inflight`, `account_max_queue`, `global_max_inflight`)
- `toolcall` / `responses` / `embeddings`
- `auto_delete` (`sessions`)
+- `success`
+- `admin` (`has_password_hash`, `jwt_expire_hours`, `jwt_valid_after_unix`, `default_password_warning`)
+- `runtime` (`account_max_inflight`, `account_max_queue`, `global_max_inflight`, `token_refresh_interval_hours`)
+- `compat` (`wide_input_strict_output`, `strip_reference_markers`)
+- `responses` / `embeddings`
+- `auto_delete` (`mode`: `none` / `single` / `all`; legacy `sessions=true` is still treated as `all`)
 - `claude_mapping` / `model_aliases`
 - `env_backed`, `needs_vercel_sync`
+- Not included: `toolcall` — its policy is fixed to `feature_match + high` and is no longer returned or editable via settings

 ### `PUT /admin/settings`

 Hot-updates runtime settings. Supported fields:

 - `admin.jwt_expire_hours`
- `runtime.account_max_inflight` / `runtime.account_max_queue` / `runtime.global_max_inflight`
- `toolcall.mode` / `toolcall.early_emit_confidence`
+- `runtime.account_max_inflight` / `runtime.account_max_queue` / `runtime.global_max_inflight` / `runtime.token_refresh_interval_hours`
+- `compat.wide_input_strict_output` / `compat.strip_reference_markers`
 - `responses.store_ttl_seconds`
 - `embeddings.provider`
- `auto_delete.sessions`
+- `auto_delete.mode`
 - `claude_mapping`
 - `model_aliases`
+- Not supported: `toolcall` — its policy is fixed and is no longer writable through settings

 ### `POST /admin/settings/password`

@@ -650,6 +726,8 @@ Request example:
 {"new_password":"your-new-password"}
 ```

+The endpoint also accepts `{"password":"your-new-password"}` as an equivalent request body.
+
 ### `POST /admin/config/import`

 Imports full config with:
@@ -658,6 +736,10 @@ Imports full config with:
 - `mode=replace`

 The request can send config directly, or wrapped as `{"config": {...}, "mode":"merge"}`.
+Query params `?mode=merge` / `?mode=replace` are also supported.
+Import accepts `keys`, `accounts`, `claude_mapping` / `claude_model_mapping`, `model_aliases`, `admin`, `runtime`, `responses`, `embeddings`, and `auto_delete`; legacy `toolcall` fields are ignored.
+
+> `compat` fields are managed via `/admin/settings` or the config file; this import endpoint does not update `compat`.

 ### `GET /admin/config/export`

@@ -675,6 +757,26 @@ Exports full config in three forms: `config`, `json`, and `base64`.

 **Response**: `{"success": true, "total_keys": 2}`

+### `GET /admin/proxies`
+
+Lists proxy configs (password is never returned; use `has_password` as a marker).
+
+### `POST /admin/proxies`
+
+Adds a proxy. Request accepts `id` (optional; auto-generated when omitted), `name`, `type` (`http` / `socks5`), `host`, `port`, `username`, `password`.
+
+### `PUT /admin/proxies/{proxyID}`
+
+Updates a proxy. If `password` is an empty string, the existing secret is preserved.
+
+### `DELETE /admin/proxies/{proxyID}`
+
+Deletes a proxy and automatically clears `proxy_id` on all accounts that reference it.
+
+### `POST /admin/proxies/test`
+
+Tests proxy connectivity: provide `proxy_id` to test a saved proxy; omit it to run a one-off test using proxy fields in the request body.
+
 ### `GET /admin/accounts`

 **Query params**:
@@ -682,7 +784,8 @@ Exports full config in three forms: `config`, `json`, and `base64`.
 | Param | Default | Range |
 | --- | --- | --- |
 | `page` | `1` | ≥ 1 |
-| `page_size` | `10` | 1–100 |
+| `page_size` | `10` | 1–5000 |
+| `q` | empty | Filter by identifier / email / mobile |

 **Response**:

@@ -695,7 +798,8 @@ Exports full config in three forms: `config`, `json`, and `base64`.
      "mobile": "",
      "has_password": true,
      "has_token": true,
-      "token_preview": "abc..."
+      "token_preview": "abc...",
+      "test_status": "ok"
    }
  ],
  "total": 25,
@@ -705,6 +809,8 @@ Exports full config in three forms: `config`, `json`, and `base64`.
 }
 ```

+Returned items also include `test_status`, usually `ok` or `failed`.
+
 ### `POST /admin/accounts`

 ```json
@@ -719,6 +825,14 @@ Exports full config in three forms: `config`, `json`, and `base64`.

 **Response**: `{"success": true, "total_accounts": 5}`

+### `PUT /admin/accounts/{identifier}/proxy`
+
+Updates proxy binding for a specific account.
+
+- Request body: `{"proxy_id":"..."}`.
+- Use an empty `proxy_id` to unbind the proxy.
+- `identifier` accepts an email, a mobile number, or a token-only synthetic ID.
+
 ### `GET /admin/queue/status`

 ```json
@@ -729,17 +843,25 @@ Exports full config in three forms: `config`, `json`, and `base64`.
  "available_accounts": ["a@example.com"],
  "in_use_accounts": ["b@example.com"],
  "max_inflight_per_account": 2,
-  "recommended_concurrency": 8
+  "global_max_inflight": 8,
+  "recommended_concurrency": 8,
+  "waiting": 0,
+  "max_queue_size": 8
 }
 ```

 | Field | Description |
 | --- | --- |
-| `available` | Currently available accounts |
-| `in_use` | Currently in-use accounts |
+| `available` | Number of accounts that still have spare inflight capacity |
+| `in_use` | Number of occupied inflight slots |
 | `total` | Total accounts |
+| `available_accounts` | List of account IDs with remaining inflight capacity |
+| `in_use_accounts` | List of account IDs currently in use |
 | `max_inflight_per_account` | Per-account inflight limit |
+| `global_max_inflight` | Global inflight limit |
 | `recommended_concurrency` | Suggested concurrency (`total × max_inflight_per_account`) |
+| `waiting` | Number of queued requests currently waiting |
+| `max_queue_size` | Waiting queue limit |

 ### `POST /admin/accounts/test`

@@ -757,10 +879,14 @@ Exports full config in three forms: `config`, `json`, and `base64`.
  "success": true,
  "response_time": 1240,
  "message": "API test successful (session creation only)",
-  "model": "deepseek-chat"
+  "model": "deepseek-chat",
+  "session_count": 0,
+  "config_writable": true
 }
 ```

+If the request includes a `message`, the response may also include `thinking` when the upstream response carries reasoning text.
+
 ### `POST /admin/accounts/test-all`

 Optional request field: `model`.
@@ -774,6 +900,25 @@ Optional request field: `model`.
 }
 ```

+The internal concurrency limit is currently fixed at 5.
+
+### `POST /admin/accounts/sessions/delete-all`
+
+Deletes all DeepSeek sessions for a specific account. Request example:
+
+```json
+{"identifier":"user@example.com"}
+```
+
+Response:
+
+```json
+{"success": true, "message": "删除成功"}
+```
+
+If the account is missing or deletion fails, `success` becomes `false` and `message` contains the error.
+The current handler returns the Chinese literal `删除成功` ("deleted successfully") on success.
+
 ### `POST /admin/import`

 Batch import keys and accounts.
@@ -819,6 +964,74 @@ Test API availability through the service itself.
 }
 ```

+### `POST /admin/dev/raw-samples/capture`
+
+Internally issues one `/v1/chat/completions` request through the service, then persists the request metadata and raw upstream SSE into `tests/raw_stream_samples/<sample-id>/`.
+
+Common request fields:
+
+| Field | Required | Default | Notes |
+| --- | --- | --- | --- |
+| `message` | No | `你好` | Convenience single-turn user message |
+| `messages` | No | Auto-derived from `message` | OpenAI-style message array |
+| `model` | No | `deepseek-chat` | Target model |
+| `stream` | No | `true` | Recommended to keep streaming enabled so raw SSE is recorded |
+| `api_key` | No | First configured key | Business API key to use |
+| `sample_id` | No | Auto-generated | Sample directory name |
+
+On success, the response headers include:
+
+- `X-Ds2-Sample-Id`
+- `X-Ds2-Sample-Dir`
+- `X-Ds2-Sample-Meta`
+- `X-Ds2-Sample-Upstream`
+
+If the request itself succeeds but the process did not record a new upstream capture, the endpoint returns:
+
+```json
+{"detail":"no upstream capture was recorded"}
+```
+
+### `GET /admin/dev/raw-samples/query`
+
+Searches the current process's in-memory capture entries and groups `completion + continue` rounds by `chat_session_id`.
+
+**Query parameters**:
+
+| Param | Default | Notes |
+| --- | --- | --- |
+| `q` | empty | Fuzzy match against request/response text |
+| `limit` | `20` | Max number of chains returned |
+
+**Response fields** include:
+
+- `items[].chain_key`
+- `items[].capture_ids`
+- `items[].round_count`
+- `items[].initial_label`
+- `items[].request_preview`
+- `items[].response_preview`
+
+### `POST /admin/dev/raw-samples/save`
+
+Persists one selected in-memory capture chain into `tests/raw_stream_samples/<sample-id>/`.
+
+Any one of these selectors is accepted:
+
+```json
+{"chain_key":"session:xxxx","sample_id":"tmp-from-memory"}
+```
+
+```json
+{"capture_id":"cap_xxx","sample_id":"tmp-from-memory"}
+```
+
+```json
+{"query":"Guangzhou weather","sample_id":"tmp-from-memory"}
+```
+
+The success payload includes `sample_id`, `dir`, `meta_path`, and `upstream_path`.
+
 ### `POST /admin/vercel/sync`

 | Field | Required | Notes |
@@ -851,16 +1064,25 @@ Or manual deploy required:
 }
 ```

+Failed account checks are returned in `failed_accounts`, and any saved Vercel credentials are returned in `saved_credentials`.
+
 ### `GET /admin/vercel/status`

 ```json
 {
  "synced": true,
  "last_sync_time": 1738400000,
-  "has_synced_before": true
+  "has_synced_before": true,
+  "env_backed": false,
+  "config_hash": "....",
+  "last_synced_hash": "....",
+  "draft_hash": "....",
+  "draft_differs": false
 }
 ```

+`POST /admin/vercel/status` can also accept `config_override` to compare a draft config against the current synced config.
+
 ### `GET /admin/export`

 ```json
@@ -870,6 +1092,29 @@ Or manual deploy required:
 }
 ```

+This is the same payload as `GET /admin/config/export`, just with a shorter path.
+
+### `GET /admin/version`
+
+Checks the current build version and the latest GitHub Release:
+
+```json
+{
+  "success": true,
+  "current_version": "3.0.0",
+  "current_tag": "v3.0.0",
+  "source": "file:VERSION",
+  "checked_at": "2026-03-29T00:00:00Z",
+  "latest_tag": "v3.0.0",
+  "latest_version": "3.0.0",
+  "release_url": "https://github.com/CJackHwang/ds2api/releases/tag/v3.0.0",
+  "published_at": "2026-03-28T12:00:00Z",
+  "has_update": false
+}
+```
+
+If GitHub API access fails, the response includes `check_error` while still returning HTTP 200.
+
 ### `GET /admin/dev/captures`

 Reads local packet-capture status and recent entries (Admin auth required):
--- a/API.md
+++ b/API.md
@@ -4,6 +4,8 @@

 本文档描述当前 Go 代码库的实际 API 行为。

+文档导航：[总览](README.MD) / [架构说明](docs/ARCHITECTURE.md) / [部署指南](docs/DEPLOY.md) / [测试指南](docs/TESTING.md)
+
 ---

 ## 目录
@@ -31,6 +33,13 @@
 | 健康检查 | `GET /healthz`、`GET /readyz` |
 | CORS | 已启用（`Access-Control-Allow-Origin: *`，允许 `Content-Type`, `Authorization`, `X-API-Key`, `X-Ds2-Target-Account`, `X-Vercel-Protection-Bypass`） |

+### 3.0 接口适配层说明
+
+- OpenAI / Claude / Gemini 三套协议已统一挂在同一 `chi` 路由树上，由 `internal/server/router.go` 负责装配。
+- 适配器层职责收敛为：**请求归一化 → DeepSeek 调用 → 协议形态渲染**，减少历史版本中“同能力多处实现”的分叉。
+- Tool Calling 的解析策略在 Go 与 Node Runtime 间保持一致：当前以 XML/Markup 家族解析为主（含 `<tool_call>` / `<function_call>` / `<invoke>` / `tool_use` / antml 变体），并在流式场景执行防泄漏筛分。
+- `Admin API` 将配置与运行时策略分开：`/admin/config*` 管静态配置，`/admin/settings*` 管运行时行为。
+
 ---

 ## 配置最佳实践
@@ -45,7 +54,7 @@ cp config.example.json config.json
 按部署方式使用：

 - 本地运行：直接读取 `config.json`
- Docker / Vercel：从 `config.json` 生成 Base64，填入 `DS2API_CONFIG_JSON`
+- Docker / Vercel：从 `config.json` 生成 Base64，填入 `DS2API_CONFIG_JSON`，也可以直接填原始 JSON

 ```bash
 DS2API_CONFIG_JSON="$(base64 < config.json | tr -d '\n')"
@@ -65,6 +74,7 @@ Vercel 一键部署可先只填 `DS2API_ADMIN_KEY`，部署后在 `/admin` 导
 | --- | --- |
 | Bearer Token | `Authorization: Bearer <token>` |
 | API Key Header | `x-api-key: <token>`（无 `Bearer` 前缀） |
+| Gemini 兼容 | `x-goog-api-key: <token>` 或 `?key=<token>` / `?api_key=<token>` |

 **鉴权行为**：

@@ -72,6 +82,7 @@ Vercel 一键部署可先只填 `DS2API_ADMIN_KEY`，部署后在 `/admin` 导
 - token 不在 `config.keys` 中 → **直通 token 模式**，直接作为 DeepSeek token 使用

 **可选请求头**：`X-Ds2-Target-Account: <email_or_mobile>` — 指定使用某个托管账号。
+Gemini 兼容客户端还可以使用 `x-goog-api-key`、`?key=` 或 `?api_key=` 作为凭据来源。

 ### Admin 接口（`/admin/*`）

@@ -88,13 +99,16 @@ Vercel 一键部署可先只填 `DS2API_ADMIN_KEY`，部署后在 `/admin` 导
 | 方法 | 路径 | 鉴权 | 说明 |
 | --- | --- | --- | --- |
 | GET | `/healthz` | 无 | 存活探针 |
+| HEAD | `/healthz` | 无 | 存活探针（无响应体） |
 | GET | `/readyz` | 无 | 就绪探针 |
+| HEAD | `/readyz` | 无 | 就绪探针（无响应体） |
 | GET | `/v1/models` | 无 | OpenAI 模型列表 |
 | GET | `/v1/models/{id}` | 无 | OpenAI 单模型查询（支持 alias 入参） |
 | POST | `/v1/chat/completions` | 业务 | OpenAI 对话补全 |
 | POST | `/v1/responses` | 业务 | OpenAI Responses 接口（流式/非流式） |
 | GET | `/v1/responses/{response_id}` | 业务 | 查询已生成 response（内存 TTL） |
 | POST | `/v1/embeddings` | 业务 | OpenAI Embeddings 接口 |
+| POST | `/v1/files` | 业务 | OpenAI Files 上传（multipart/form-data） |
 | GET | `/anthropic/v1/models` | 无 | Claude 模型列表 |
 | POST | `/anthropic/v1/messages` | 业务 | Claude 消息接口 |
 | POST | `/anthropic/v1/messages/count_tokens` | 业务 | Claude token 计数 |
@@ -118,19 +132,31 @@ Vercel 一键部署可先只填 `DS2API_ADMIN_KEY`，部署后在 `/admin` 导
 | GET | `/admin/config/export` | Admin | 导出完整配置（含 `config`/`json`/`base64`） |
 | POST | `/admin/keys` | Admin | 添加 API key |
 | DELETE | `/admin/keys/{key}` | Admin | 删除 API key |
+| GET | `/admin/proxies` | Admin | 代理列表 |
+| POST | `/admin/proxies` | Admin | 添加代理 |
+| PUT | `/admin/proxies/{proxyID}` | Admin | 更新代理（留空 password 表示保留原密码） |
+| DELETE | `/admin/proxies/{proxyID}` | Admin | 删除代理（自动解绑引用该代理的账号） |
+| POST | `/admin/proxies/test` | Admin | 测试代理连通性 |
 | GET | `/admin/accounts` | Admin | 分页账号列表 |
 | POST | `/admin/accounts` | Admin | 添加账号 |
 | DELETE | `/admin/accounts/{identifier}` | Admin | 删除账号 |
+| PUT | `/admin/accounts/{identifier}/proxy` | Admin | 为账号绑定/解绑代理 |
 | GET | `/admin/queue/status` | Admin | 账号队列状态 |
 | POST | `/admin/accounts/test` | Admin | 测试单个账号 |
 | POST | `/admin/accounts/test-all` | Admin | 测试全部账号 |
+| POST | `/admin/accounts/sessions/delete-all` | Admin | 删除某账号的全部会话 |
 | POST | `/admin/import` | Admin | 批量导入 keys/accounts |
 | POST | `/admin/test` | Admin | 测试当前 API 可用性 |
+| POST | `/admin/dev/raw-samples/capture` | Admin | 直接发起一次请求并保存为 raw sample |
+| GET | `/admin/dev/raw-samples/query` | Admin | 按问题关键词查询当前内存抓包链 |
+| POST | `/admin/dev/raw-samples/save` | Admin | 把命中的内存抓包链保存为 raw sample |
 | POST | `/admin/vercel/sync` | Admin | 同步配置到 Vercel |
 | GET | `/admin/vercel/status` | Admin | Vercel 同步状态 |
+| POST | `/admin/vercel/status` | Admin | Vercel 同步状态 / 草稿对比 |
 | GET | `/admin/export` | Admin | 导出配置 JSON/Base64 |
 | GET | `/admin/dev/captures` | Admin | 查看本地抓包记录 |
 | DELETE | `/admin/dev/captures` | Admin | 清空本地抓包记录 |
+| GET | `/admin/version` | Admin | 查询当前版本与最新 Release |

 ---

@@ -154,7 +180,7 @@ Vercel 一键部署可先只填 `DS2API_ADMIN_KEY`，部署后在 `/admin` 导

 ### `GET /v1/models`

-无需鉴权。返回当前支持的模型列表。
+无需鉴权。返回当前支持的 DeepSeek 原生模型列表。

 **响应示例**：

@@ -165,11 +191,21 @@ Vercel 一键部署可先只填 `DS2API_ADMIN_KEY`，部署后在 `/admin` 导
    {"id": "deepseek-chat", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
    {"id": "deepseek-reasoner", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
    {"id": "deepseek-chat-search", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
-    {"id": "deepseek-reasoner-search", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []}
+    {"id": "deepseek-reasoner-search", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-expert-chat", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-expert-reasoner", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-expert-chat-search", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-expert-reasoner-search", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-vision-chat", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-vision-reasoner", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-vision-chat-search", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []},
+    {"id": "deepseek-vision-reasoner-search", "object": "model", "created": 1677610602, "owned_by": "deepseek", "permission": []}
  ]
 }
 ```

+> 说明：`/v1/models` 返回的是规范化后的 DeepSeek 原生模型 ID；常见 alias 仅用于请求入参解析，不会在该接口中单独展开返回。
+
 ### 模型 alias 解析策略

 对 `chat` / `responses` / `embeddings` 的 `model` 字段采用“宽进严出”：
@@ -179,6 +215,13 @@ Vercel 一键部署可先只填 `DS2API_ADMIN_KEY`，部署后在 `/admin` 导
 3. 未命中时按模型家族规则回退（如 `o*`、`gpt-*`、`claude-*`）。
 4. 仍未命中则返回 `invalid_request_error`。

+当前内置默认 alias（节选）：
+
+- OpenAI：`gpt-4o`、`gpt-4.1`、`gpt-4.1-mini`、`gpt-4.1-nano`、`gpt-5`、`gpt-5-mini`、`gpt-5-codex`
+- OpenAI Reasoning：`o1`、`o1-mini`、`o3`、`o3-mini`
+- Claude：`claude-sonnet-4-5`、`claude-haiku-4-5`、`claude-opus-4-6`（及 `claude-3-5-sonnet` / `claude-3-5-haiku` / `claude-3-opus` 兼容别名）
+- Gemini：`gemini-2.5-pro`、`gemini-2.5-flash`
+
 ### `POST /v1/chat/completions`

 **请求头**：
@@ -192,7 +235,7 @@ Content-Type: application/json

 | 字段 | 类型 | 必填 | 说明 |
 | --- | --- | --- | --- |
-| `model` | string | ✅ | 支持 DeepSeek 原生模型 + 常见 alias（如 `gpt-4o`、`gpt-5-codex`、`o3`、`claude-sonnet-4-5`） |
+| `model` | string | ✅ | 支持 DeepSeek 原生模型 + 常见 alias（如 `gpt-5`、`gpt-5-mini`、`gpt-5-codex`、`o3`、`claude-opus-4-6`、`gemini-2.5-pro`、`gemini-2.5-flash` 等） |
 | `messages` | array | ✅ | OpenAI 风格消息数组 |
 | `stream` | boolean | ❌ | 默认 `false` |
 | `tools` | array | ❌ | Function Calling 定义 |
@@ -250,6 +293,7 @@ data: [DONE]
 - `deepseek-reasoner` / `deepseek-reasoner-search` 模型输出 `delta.reasoning_content`
 - 普通文本输出 `delta.content`
 - 最后一段包含 `finish_reason` 和 `usage`
+- token 计数优先透传上游 DeepSeek SSE（如 `accumulated_token_usage` / `token_usage`）；仅在上游缺失时回退本地估算

 #### Tool Calls

@@ -282,11 +326,12 @@ data: [DONE]
 }
 ```

-**流式**：命中高置信特征后立即输出 `delta.tool_calls`（不等待完整 JSON 闭合），并持续发送 arguments 增量；已确认的 toolcall 原始 JSON 不会回流到 `delta.content`。
+**流式**：命中高置信特征后立即输出 `delta.tool_calls`（不等待完整工具参数闭合），并持续发送 arguments 增量；已确认的工具调用片段不会回流到 `delta.content`。

 补充说明：

- **非代码块上下文**下，工具 JSON 即使与普通文本混合，也会按特征识别并产出可执行 tool call（前后普通文本仍可透传）。
+- **非代码块上下文**下，工具负载即使与普通文本混合，也会按特征识别并产出可执行 tool call（前后普通文本仍可透传）。
+- 解析器当前走 XML/Markup 家族（包含 `<tool_call>`、`<function_call>`、`<invoke>`、`tool_use`、antml 风格）；纯 JSON `tool_calls` 片段默认不会直接作为可执行调用解析。
 - Markdown fenced code block（例如 ```json ... ```）中的 `tool_calls` 仅视为示例文本，不会被执行。

 ---
@@ -346,7 +391,8 @@ data: [DONE]
 ```

 流式场景下若 `tool_choice=required` 违规，会返回 `response.failed` 后结束（不再发送 `response.completed`）。
-未在 `tools` 声明中的工具名会被严格拒绝，不会作为有效 tool call 下发。
+
+> 当前版本说明：解析层默认“尽量提取结构化 tool call”，未启用基于 `tools` allow-list 的硬拒绝；是否执行仍应由你的工具执行器做白名单校验。

 ### `GET /v1/responses/{response_id}`

@@ -365,11 +411,27 @@ data: [DONE]

 > 需配置 `embeddings.provider`。当前支持：`mock` / `deterministic` / `builtin`。未配置或不支持时返回标准错误结构（HTTP 501）。

+### `POST /v1/files`
+
+需要业务鉴权。兼容 OpenAI Files 上传接口，当前仅支持 `multipart/form-data`。
+
+| 字段 | 类型 | 必填 | 说明 |
+| --- | --- | --- | --- |
+| `file` | file | ✅ | 上传文件二进制 |
+| `purpose` | string | ❌ | 透传到上游用途字段 |
+
+约束与行为：
+
+- 请求必须为 `multipart/form-data`，否则返回 `400`。
+- 请求体总大小上限 `100 MiB`（超限返回 `413`）。
+- 成功返回 OpenAI `file` 对象（`id/object/bytes/filename/purpose/status` 等字段），并附带 `account_id` 便于定位来源账号。
+
 ---

 ## Claude 兼容接口

 除标准路径 `/anthropic/v1/*` 外，还支持快捷路径 `/v1/messages`、`/messages`、`/v1/messages/count_tokens`、`/messages/count_tokens`。
+实现上统一走 OpenAI Chat Completions 解析与回译链路，避免多套解析逻辑分叉维护。

 ### `GET /anthropic/v1/models`

@@ -391,7 +453,7 @@ data: [DONE]
 }
 ```

-> 说明：示例仅展示部分模型；实际返回包含 Claude 1.x/2.x/3.x/4.x 历史模型 ID 与常见别名。
+> 说明：示例仅展示部分模型；实际返回除当前主别名外，还包含 Claude 4.x snapshots，以及 3.x / 2.x / 1.x 历史模型 ID 与常见别名。

 ### `POST /anthropic/v1/messages`

@@ -504,6 +566,7 @@ data: {"type":"message_stop"}
 - `/v1/models/{model}:streamGenerateContent`（兼容路径）

 鉴权方式同业务接口（`Authorization: Bearer <token>` 或 `x-api-key`）。
+实现上统一走 OpenAI Chat Completions 解析与回译链路，避免多套解析逻辑分叉维护。

 ### `POST /v1beta/models/{model}:generateContent`

@@ -522,6 +585,7 @@ data: {"type":"message_stop"}
 - 常规文本：持续返回增量文本 chunk
 - `tools` 场景：会缓冲并在结束时输出 `functionCall` 结构
 - 结束 chunk：包含 `finishReason: "STOP"` 与 `usageMetadata`
+- token 计数优先透传上游 DeepSeek SSE（如 `accumulated_token_usage` / `token_usage`）；仅在上游缺失时回退本地估算

 ---

@@ -585,6 +649,10 @@ data: {"type":"message_stop"}
 ```json
 {
  "keys": ["k1", "k2"],
+  "env_backed": false,
+  "env_source_present": true,
+  "env_writeback_enabled": true,
+  "config_path": "/data/config.json",
  "accounts": [
    {
      "identifier": "user@example.com",
@@ -604,7 +672,7 @@ data: {"type":"message_stop"}

 ### `POST /admin/config`

-可更新 `keys`、`accounts`、`claude_mapping`。
+该接口仅更新 `keys`、`accounts`、`claude_mapping` 这三类字段。

 **请求**：

@@ -625,25 +693,29 @@ data: {"type":"message_stop"}

 读取运行时设置与状态，返回：

- `admin`（JWT 过期、默认密码告警等）
- `runtime`（`account_max_inflight`、`account_max_queue`、`global_max_inflight`）
- `toolcall` / `responses` / `embeddings`
- `auto_delete`（`sessions`）
+- `success`
+- `admin`（`has_password_hash`、`jwt_expire_hours`、`jwt_valid_after_unix`、`default_password_warning`）
+- `runtime`（`account_max_inflight`、`account_max_queue`、`global_max_inflight`、`token_refresh_interval_hours`）
+- `compat`（`wide_input_strict_output`、`strip_reference_markers`）
+- `responses` / `embeddings`
+- `auto_delete`（`mode`：`none` / `single` / `all`；旧配置 `sessions=true` 仍按 `all` 处理）
 - `claude_mapping` / `model_aliases`
 - `env_backed`、`needs_vercel_sync`
+- 注：`toolcall` 不在返回字段中——其策略已固定为 `feature_match + high`，不再通过 settings 返回或修改

 ### `PUT /admin/settings`

 热更新运行时设置。支持更新：

 - `admin.jwt_expire_hours`
- `runtime.account_max_inflight` / `runtime.account_max_queue` / `runtime.global_max_inflight`
- `toolcall.mode` / `toolcall.early_emit_confidence`
+- `runtime.account_max_inflight` / `runtime.account_max_queue` / `runtime.global_max_inflight` / `runtime.token_refresh_interval_hours`
+- `compat.wide_input_strict_output` / `compat.strip_reference_markers`
 - `responses.store_ttl_seconds`
 - `embeddings.provider`
- `auto_delete.sessions`
+- `auto_delete.mode`
 - `claude_mapping`
 - `model_aliases`
+- 注：`toolcall` 策略已固定，不再作为可写入字段（不属于上述可更新项）

 ### `POST /admin/settings/password`

@@ -655,6 +727,8 @@ data: {"type":"message_stop"}
 {"new_password":"your-new-password"}
 ```

+也兼容 `{"password":"your-new-password"}`。
+
 ### `POST /admin/config/import`

 导入完整配置，支持：
@@ -663,6 +737,10 @@ data: {"type":"message_stop"}
 - `mode=replace`

 请求可直接传配置对象，或使用 `{"config": {...}, "mode":"merge"}` 包裹格式。
+也支持在查询参数里传 `?mode=merge` / `?mode=replace`。
+导入时会接受 `keys`、`accounts`、`claude_mapping` / `claude_model_mapping`、`model_aliases`、`admin`、`runtime`、`responses`、`embeddings`、`auto_delete` 等字段；`toolcall` 相关字段会被忽略。
+
+> `compat` 相关字段请通过 `/admin/settings` 或配置文件管理；该导入接口不会更新 `compat`。

 ### `GET /admin/config/export`

@@ -680,6 +758,26 @@ data: {"type":"message_stop"}

 **响应**：`{"success": true, "total_keys": 2}`

+### `GET /admin/proxies`
+
+列出代理配置（密码不回传，仅返回 `has_password` 标记）。
+
+### `POST /admin/proxies`
+
+新增代理。请求体支持 `id`（可选，未传则自动生成）、`name`、`type`（`http` / `socks5`）、`host`、`port`、`username`、`password`。
+
+### `PUT /admin/proxies/{proxyID}`
+
+更新指定代理。若请求中 `password` 为空字符串，则保留原密码。
+
+### `DELETE /admin/proxies/{proxyID}`
+
+删除代理，并自动清空所有引用该代理的账号的 `proxy_id`。
+
+### `POST /admin/proxies/test`
+
+测试代理连通性：传 `proxy_id` 时测试已保存代理；不传时按请求体代理字段做临时连通性测试。
+
 ### `GET /admin/accounts`

 **查询参数**：
@@ -687,7 +785,8 @@ data: {"type":"message_stop"}
 | 参数 | 默认 | 范围 |
 | --- | --- | --- |
 | `page` | `1` | ≥ 1 |
-| `page_size` | `10` | 1–100 |
+| `page_size` | `10` | 1–5000 |
+| `q` | 空 | 按 identifier / email / mobile 过滤 |

 **响应**：

@@ -700,7 +799,8 @@ data: {"type":"message_stop"}
      "mobile": "",
      "has_password": true,
      "has_token": true,
-      "token_preview": "abc..."
+      "token_preview": "abc...",
+      "test_status": "ok"
    }
  ],
  "total": 25,
@@ -724,6 +824,14 @@ data: {"type":"message_stop"}

 **响应**：`{"success": true, "total_accounts": 5}`

+### `PUT /admin/accounts/{identifier}/proxy`
+
+更新指定账号绑定的代理。
+
+- 请求体：`{"proxy_id":"..."}`；
+- `proxy_id` 传空字符串时表示解绑代理；
+- `identifier` 支持 email / mobile / token-only 合成标识。
+
 ### `GET /admin/queue/status`

 ```json
@@ -734,17 +842,25 @@ data: {"type":"message_stop"}
  "available_accounts": ["a@example.com"],
  "in_use_accounts": ["b@example.com"],
  "max_inflight_per_account": 2,
-  "recommended_concurrency": 8
+  "global_max_inflight": 8,
+  "recommended_concurrency": 8,
+  "waiting": 0,
+  "max_queue_size": 8
 }
 ```

 | 字段 | 说明 |
 | --- | --- |
-| `available` | 当前可用账号数 |
-| `in_use` | 当前使用中的账号数 |
+| `available` | 仍有剩余并发槽位的账号数 |
+| `in_use` | 当前已占用的 in-flight 槽位数 |
 | `total` | 总账号数 |
+| `available_accounts` | 仍有剩余并发槽位的账号 ID 列表 |
+| `in_use_accounts` | 当前处于使用中的账号 ID 列表 |
 | `max_inflight_per_account` | 每账号并发上限 |
+| `global_max_inflight` | 全局并发上限 |
 | `recommended_concurrency` | 建议并发值（`total × max_inflight_per_account`） |
+| `waiting` | 当前等待中的请求数 |
+| `max_queue_size` | 等待队列上限 |

 ### `POST /admin/accounts/test`

@@ -762,10 +878,14 @@ data: {"type":"message_stop"}
  "success": true,
  "response_time": 1240,
  "message": "API 测试成功（仅会话创建）",
-  "model": "deepseek-chat"
+  "model": "deepseek-chat",
+  "session_count": 0,
+  "config_writable": true
 }
 ```

+如果传入 `message`，还会附带 `thinking`（当上游返回思考内容时）。
+
 ### `POST /admin/accounts/test-all`

 可选请求字段：`model`
@@ -779,6 +899,24 @@ data: {"type":"message_stop"}
 }
 ```

+内部并发上限当前固定为 5。
+
+### `POST /admin/accounts/sessions/delete-all`
+
+清空指定账号的所有 DeepSeek 会话。请求体示例：
+
+```json
+{"identifier":"user@example.com"}
+```
+
+响应：
+
+```json
+{"success": true, "message": "删除成功"}
+```
+
+如果账号不存在或删除失败，`success` 会是 `false`，`message` 会返回错误原因。
+
 ### `POST /admin/import`

 批量导入 keys 与 accounts。
@@ -824,6 +962,74 @@ data: {"type":"message_stop"}
 }
 ```

+### `POST /admin/dev/raw-samples/capture`
+
+直接通过服务自身发起一次 `/v1/chat/completions` 请求，并把请求元信息和上游原始 SSE 保存到 `tests/raw_stream_samples/<sample-id>/`。
+
+常用请求字段：
+
+| 字段 | 必填 | 默认值 | 说明 |
+| --- | --- | --- | --- |
+| `message` | 否 | `你好` | 便捷单轮用户消息 |
+| `messages` | 否 | 自动由 `message` 生成 | OpenAI 风格消息数组 |
+| `model` | 否 | `deepseek-chat` | 目标模型 |
+| `stream` | 否 | `true` | 建议保留流式，以记录原始 SSE |
+| `api_key` | 否 | 配置中第一个 key | 调用业务接口使用的 key |
+| `sample_id` | 否 | 自动生成 | 样本目录名 |
+
+成功时会在响应头里附带：
+
+- `X-Ds2-Sample-Id`
+- `X-Ds2-Sample-Dir`
+- `X-Ds2-Sample-Meta`
+- `X-Ds2-Sample-Upstream`
+
+如果请求本身成功，但当前进程没有记录到新的上游抓包，会返回：
+
+```json
+{"detail":"no upstream capture was recorded"}
+```
+
+### `GET /admin/dev/raw-samples/query`
+
+按关键词查询当前进程内存里的抓包记录，并按 `chat_session_id` 归并 `completion + continue` 链。
+
+**查询参数**：
+
+| 参数 | 默认值 | 说明 |
+| --- | --- | --- |
+| `q` | 空 | 按请求体/响应体关键词模糊匹配 |
+| `limit` | `20` | 返回链条数上限 |
+
+**响应字段**包含：
+
+- `items[].chain_key`
+- `items[].capture_ids`
+- `items[].round_count`
+- `items[].initial_label`
+- `items[].request_preview`
+- `items[].response_preview`
+
+### `POST /admin/dev/raw-samples/save`
+
+把当前内存中的某条抓包链落盘为 `tests/raw_stream_samples/<sample-id>/`。
+
+支持以下任一种选中方式：
+
+```json
+{"chain_key":"session:xxxx","sample_id":"tmp-from-memory"}
+```
+
+```json
+{"capture_id":"cap_xxx","sample_id":"tmp-from-memory"}
+```
+
+```json
+{"query":"广州天气","sample_id":"tmp-from-memory"}
+```
+
+成功响应会返回 `sample_id`、`dir`、`meta_path`、`upstream_path`。
+
 ### `POST /admin/vercel/sync`

 | 字段 | 必填 | 说明 |
@@ -856,16 +1062,25 @@ data: {"type":"message_stop"}
 }
 ```

+校验失败的账号会通过 `failed_accounts` 返回；成功保存到 Vercel 的凭据会通过 `saved_credentials` 返回。
+
 ### `GET /admin/vercel/status`

 ```json
 {
  "synced": true,
  "last_sync_time": 1738400000,
-  "has_synced_before": true
+  "has_synced_before": true,
+  "env_backed": false,
+  "config_hash": "....",
+  "last_synced_hash": "....",
+  "draft_hash": "....",
+  "draft_differs": false
 }
 ```

+`POST /admin/vercel/status` 还可以携带 `config_override`，用于对比“草稿配置”和当前已同步配置。
+
 ### `GET /admin/export`

 ```json
@@ -875,6 +1090,29 @@ data: {"type":"message_stop"}
 }
 ```

+该接口与 `GET /admin/config/export` 返回相同内容，只是路径更短。
+
+### `GET /admin/version`
+
+查询当前构建版本与 GitHub 最新 Release：
+
+```json
+{
+  "success": true,
+  "current_version": "3.0.0",
+  "current_tag": "v3.0.0",
+  "source": "file:VERSION",
+  "checked_at": "2026-03-29T00:00:00Z",
+  "latest_tag": "v3.0.0",
+  "latest_version": "3.0.0",
+  "release_url": "https://github.com/CJackHwang/ds2api/releases/tag/v3.0.0",
+  "published_at": "2026-03-28T12:00:00Z",
+  "has_update": false
+}
+```
+
+如果 GitHub API 不可用，响应里会额外包含 `check_error`，但 HTTP 状态仍为 200。
+
 ### `GET /admin/dev/captures`

 查看本地抓包状态与最近记录（需 Admin 鉴权）：
--- a/CONTRIBUTING.en.md
+++ b/CONTRIBUTING.en.md
@@ -1,145 +0,0 @@
-# Contributing Guide
-
-Language: [中文](CONTRIBUTING.md) | [English](CONTRIBUTING.en.md)
-
-Thanks for your interest in contributing to DS2API!
-
-## Development Setup
-
-### Prerequisites
-
- Go 1.24+
- Node.js 20+ (for WebUI development)
- npm (bundled with Node.js)
-
-### Backend Development
-
-```bash
-# 1. Clone
-git clone https://github.com/CJackHwang/ds2api.git
-cd ds2api
-
-# 2. Configure
-cp config.example.json config.json
-# Edit config.json with test accounts
-
-# 3. Run backend
-go run ./cmd/ds2api
-# Default: http://localhost:5001
-```
-
-### Frontend Development (WebUI)
-
-```bash
-# 1. Navigate to WebUI directory
-cd webui
-
-# 2. Install dependencies
-npm install
-
-# 3. Start dev server (hot reload)
-npm run dev
-# Default: http://localhost:5173, auto-proxies API to backend
-```
-
-WebUI tech stack:
- React + Vite
- Tailwind CSS
- Bilingual language packs: `webui/src/locales/zh.json` / `en.json`
-
-### Docker Development
-
-```bash
-docker-compose -f docker-compose.dev.yml up
-```
-
-## Code Standards
-
-| Language | Standards |
-| --- | --- |
-| **Go** | Run `gofmt` and ensure `go test ./...` passes before committing |
-| **JavaScript/React** | Follow existing project style (functional components) |
-| **Commit messages** | Use semantic prefixes: `feat:`, `fix:`, `docs:`, `refactor:`, `style:`, `perf:`, `chore:` |
-
-## Submitting a PR
-
-1. Fork the repo
-2. Create a branch (e.g. `feature/xxx` or `fix/xxx`)
-3. Commit changes
-4. Push your branch
-5. Open a Pull Request
-
-> 💡 If you modify files under `webui/`, no manual build is needed — CI handles it automatically.
-
-## Build WebUI
-
-Manually build WebUI to `static/admin/`:
-
-```bash
-./scripts/build-webui.sh
-```
-
-## Running Tests
-
-```bash
-# Go + Node unit tests (recommended)
-./tests/scripts/run-unit-all.sh
-
-# End-to-end live tests (real accounts)
-./tests/scripts/run-live.sh
-```
-
-## Project Structure
-
-```text
-ds2api/
-├── cmd/
-│   ├── ds2api/              # Local/container entrypoint
-│   └── ds2api-tests/        # End-to-end testsuite entrypoint
-├── api/
-│   ├── index.go             # Vercel Serverless Go entry
-│   ├── chat-stream.js       # Vercel Node.js stream relay
-│   └── (rewrite targets in vercel.json)
-├── internal/
-│   ├── account/             # Account pool and concurrency queue
-│   ├── adapter/
-│   │   ├── openai/          # OpenAI adapter
-│   │   ├── claude/          # Claude adapter
-│   │   └── gemini/          # Gemini adapter
-│   ├── admin/               # Admin API handlers
-│   ├── auth/                # Auth and JWT
-│   ├── claudeconv/          # Claude message conversion
-│   ├── compat/              # Compatibility helpers
-│   ├── config/              # Config loading and hot-reload
-│   ├── deepseek/            # DeepSeek client, PoW WASM
-│   ├── js/                  # Node runtime stream/compat logic
-│   ├── devcapture/          # Dev packet capture
-│   ├── format/              # Output formatting
-│   ├── prompt/              # Prompt building
-│   ├── server/              # HTTP routing (chi router)
-│   ├── sse/                 # SSE parsing utilities
-│   ├── stream/              # Unified stream consumption engine
-│   ├── testsuite/           # Testsuite core logic
-│   ├── util/                # Common utilities
-│   └── webui/               # WebUI static hosting
-├── webui/                   # React WebUI source
-│   └── src/
-│       ├── app/             # Routing, auth, config state
-│       ├── features/        # Feature modules
-│       ├── components/      # Shared components
-│       └── locales/         # Language packs
-├── scripts/                 # Build and test scripts
-├── static/admin/            # WebUI build output (not committed)
-├── Dockerfile               # Multi-stage build
-├── docker-compose.yml       # Production
-├── docker-compose.dev.yml   # Development
-└── vercel.json              # Vercel config
-```
-
-## Reporting Issues
-
-Please use [GitHub Issues](https://github.com/CJackHwang/ds2api/issues) and include:
-
- Steps to reproduce
- Relevant log output
- Environment info (OS, Go version, deployment method)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,145 +0,0 @@
-# 贡献指南
-
-语言 / Language: [中文](CONTRIBUTING.md) | [English](CONTRIBUTING.en.md)
-
-感谢你对 DS2API 的关注与贡献！
-
-## 开发环境设置
-
-### 前置要求
-
- Go 1.24+
- Node.js 20+（WebUI 开发时）
- npm（随 Node.js 提供）
-
-### 后端开发
-
-```bash
-# 1. 克隆仓库
-git clone https://github.com/CJackHwang/ds2api.git
-cd ds2api
-
-# 2. 配置
-cp config.example.json config.json
-# 编辑 config.json，填入测试账号
-
-# 3. 启动后端
-go run ./cmd/ds2api
-# 默认监听 http://localhost:5001
-```
-
-### 前端开发（WebUI）
-
-```bash
-# 1. 进入 WebUI 目录
-cd webui
-
-# 2. 安装依赖
-npm install
-
-# 3. 启动开发服务器（热更新）
-npm run dev
-# 默认监听 http://localhost:5173，自动代理 API 到后端
-```
-
-WebUI 技术栈：
- React + Vite
- Tailwind CSS
- 中英文语言包：`webui/src/locales/zh.json` / `en.json`
-
-### Docker 开发环境
-
-```bash
-docker-compose -f docker-compose.dev.yml up
-```
-
-## 代码规范
-
-| 语言 | 规范 |
-| --- | --- |
-| **Go** | 提交前运行 `gofmt`，确保 `go test ./...` 通过 |
-| **JavaScript/React** | 保持现有代码风格（函数组件） |
-| **提交信息** | 使用语义化前缀：`feat:`、`fix:`、`docs:`、`refactor:`、`style:`、`perf:`、`chore:` |
-
-## 提交 PR
-
-1. Fork 仓库
-2. 创建分支（如 `feature/xxx` 或 `fix/xxx`）
-3. 提交更改
-4. 推送分支
-5. 发起 Pull Request
-
-> 💡 如果修改了 `webui/` 目录下的文件，无需手动构建——CI 会自动处理。
-
-## WebUI 构建
-
-手动构建 WebUI 到 `static/admin/`：
-
-```bash
-./scripts/build-webui.sh
-```
-
-## 运行测试
-
-```bash
-# Go + Node 单元测试（推荐）
-./tests/scripts/run-unit-all.sh
-
-# 端到端全链路测试（真实账号）
-./tests/scripts/run-live.sh
-```
-
-## 项目结构
-
-```text
-ds2api/
-├── cmd/
-│   ├── ds2api/              # 本地/容器启动入口
-│   └── ds2api-tests/        # 端到端测试集入口
-├── api/
-│   ├── index.go             # Vercel Serverless Go 入口
-│   ├── chat-stream.js       # Vercel Node.js 流式转发
-│   └── (rewrite targets in vercel.json)
-├── internal/
-│   ├── account/             # 账号池与并发队列
-│   ├── adapter/
-│   │   ├── openai/          # OpenAI 兼容适配器
-│   │   ├── claude/          # Claude 兼容适配器
-│   │   └── gemini/          # Gemini 兼容适配器
-│   ├── admin/               # Admin API handlers
-│   ├── auth/                # 鉴权与 JWT
-│   ├── claudeconv/          # Claude 消息格式转换
-│   ├── compat/              # 兼容性辅助
-│   ├── config/              # 配置加载与热更新
-│   ├── deepseek/            # DeepSeek 客户端、PoW WASM
-│   ├── js/                  # Node 运行时流式/兼容逻辑
-│   ├── devcapture/          # 开发抓包
-│   ├── format/              # 输出格式化
-│   ├── prompt/              # Prompt 构建
-│   ├── server/              # HTTP 路由（chi router）
-│   ├── sse/                 # SSE 解析工具
-│   ├── stream/              # 统一流式消费引擎
-│   ├── testsuite/           # 测试集核心逻辑
-│   ├── util/                # 通用工具
-│   └── webui/               # WebUI 静态托管
-├── webui/                   # React WebUI 源码
-│   └── src/
-│       ├── app/             # 路由、鉴权、配置状态
-│       ├── features/        # 业务功能模块
-│       ├── components/      # 通用组件
-│       └── locales/         # 语言包
-├── scripts/                 # 构建与测试脚本
-├── static/admin/            # WebUI 构建产物（不提交）
-├── Dockerfile               # 多阶段构建
-├── docker-compose.yml       # 生产环境
-├── docker-compose.dev.yml   # 开发环境
-└── vercel.json              # Vercel 配置
-```
-
-## 问题反馈
-
-请使用 [GitHub Issues](https://github.com/CJackHwang/ds2api/issues) 并附上：
-
- 复现步骤
- 相关日志输出
- 运行环境信息（OS、Go 版本、部署方式）
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM node:20 AS webui-builder
+FROM node:24 AS webui-builder

 WORKDIR /app/webui
 COPY webui/package.json webui/package-lock.json ./
@@ -6,7 +6,7 @@ RUN npm ci
 COPY webui ./
 RUN npm run build

-FROM golang:1.24 AS go-builder
+FROM golang:1.26 AS go-builder
 WORKDIR /app
 ARG TARGETOS
 ARG TARGETARCH
@@ -34,7 +34,7 @@ CMD ["/usr/local/bin/ds2api"]

 FROM runtime-base AS runtime-from-source
 COPY --from=go-builder /out/ds2api /usr/local/bin/ds2api
-COPY --from=go-builder /app/sha3_wasm_bg.7b9ca65ddd.wasm /app/sha3_wasm_bg.7b9ca65ddd.wasm
+
 COPY --from=go-builder /app/config.example.json /app/config.example.json
 COPY --from=webui-builder /app/static/admin /app/static/admin

@@ -53,13 +53,13 @@ RUN set -eux; \
    test -n "${PKG_DIR}"; \
    mkdir -p /out/static; \
    cp "${PKG_DIR}/ds2api" /out/ds2api; \
-    cp "${PKG_DIR}/sha3_wasm_bg.7b9ca65ddd.wasm" /out/sha3_wasm_bg.7b9ca65ddd.wasm; \
+
    cp "${PKG_DIR}/config.example.json" /out/config.example.json; \
    cp -R "${PKG_DIR}/static/admin" /out/static/admin

 FROM runtime-base AS runtime-from-dist
 COPY --from=dist-extract /out/ds2api /usr/local/bin/ds2api
-COPY --from=dist-extract /out/sha3_wasm_bg.7b9ca65ddd.wasm /app/sha3_wasm_bg.7b9ca65ddd.wasm
+
 COPY --from=dist-extract /out/config.example.json /app/config.example.json
 COPY --from=dist-extract /out/static/admin /app/static/admin

--- a/LICENSE
+++ b/LICENSE
@@ -1,5 +1,5 @@
-                    GNU GENERAL PUBLIC LICENSE
-                       Version 3, 29 June 2007
+                    GNU AFFERO GENERAL PUBLIC LICENSE
+                       Version 3, 19 November 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
@@ -7,17 +7,15 @@

                            Preamble

-  The GNU General Public License is a free, copyleft license for
-software and other kinds of works.
+  The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.

  The licenses for most software and other practical works are designed
 to take away your freedom to share and change the works.  By contrast,
-the GNU General Public License is intended to guarantee your freedom to
+our General Public Licenses are intended to guarantee your freedom to
 share and change all versions of a program--to make sure it remains free
-software for all its users.  We, the Free Software Foundation, use the
-GNU General Public License for most of our software; it applies also to
-any other work released this way by its authors.  You can apply it to
-your programs, too.
+software for all its users.

  When we speak of free software, we are referring to freedom, not
 price.  Our General Public Licenses are designed to make sure that you
@@ -26,44 +24,34 @@ them if you wish), that you receive source code or can get it if you
 want it, that you can change the software or use pieces of it in new
 free programs, and that you know you can do these things.

-  To protect your rights, we need to prevent others from denying you
-these rights or asking you to surrender the rights.  Therefore, you have
-certain responsibilities if you distribute copies of the software, or if
-you modify it: responsibilities to respect the freedom of others.
+  Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.

-  For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must pass on to the recipients the same
-freedoms that you received.  You must make sure that they, too, receive
-or can get the source code.  And you must show them these terms so they
-know their rights.
+  A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate.  Many developers of free software are heartened and
+encouraged by the resulting cooperation.  However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.

-  Developers that use the GNU GPL protect your rights with two steps:
-(1) assert copyright on the software, and (2) offer you this License
-giving you legal permission to copy, distribute and/or modify it.
+  The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community.  It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server.  Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.

-  For the developers' and authors' protection, the GPL clearly explains
-that there is no warranty for this free software.  For both users' and
-authors' sake, the GPL requires that modified versions be marked as
-changed, so that their problems will not be attributed erroneously to
-authors of previous versions.
-
-  Some devices are designed to deny users access to install or run
-modified versions of the software inside them, although the manufacturer
-can do so.  This is fundamentally incompatible with the aim of
-protecting users' freedom to change the software.  The systematic
-pattern of such abuse occurs in the area of products for individuals to
-use, which is precisely where it is most unacceptable.  Therefore, we
-have designed this version of the GPL to prohibit the practice for those
-products.  If such problems arise substantially in other domains, we
-stand ready to extend this provision to those domains in future versions
-of the GPL, as needed to protect the freedom of users.
-
-  Finally, every program is threatened constantly by software patents.
-States should not allow patents to restrict development and use of
-software on general-purpose computers, but in those that do, we wish to
-avoid the special danger that patents applied to a free program could
-make it effectively proprietary.  To prevent this, the GPL assures that
-patents cannot be used to render the program non-free.
+  An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals.  This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.

  The precise terms and conditions for copying, distribution and
 modification follow.
@@ -72,7 +60,7 @@ modification follow.

  0. Definitions.

-  "This License" refers to version 3 of the GNU General Public License.
+  "This License" refers to version 3 of the GNU Affero General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
 works, such as semiconductor masks.
@@ -549,35 +537,45 @@ to collect a royalty for further conveying from those to whom you convey
 the Program, the only way you could satisfy both those terms and this
 License would be to refrain entirely from conveying the Program.

-  13. Use with the GNU Affero General Public License.
+  13. Remote Network Interaction; Use with the GNU General Public License.
+
+  Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software.  This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.

  Notwithstanding any other provision of this License, you have
 permission to link or combine any covered work with a work licensed
-under version 3 of the GNU Affero General Public License into a single
+under version 3 of the GNU General Public License into a single
 combined work, and to convey the resulting work.  The terms of this
 License will continue to apply to the part which is the covered work,
-but the special requirements of the GNU Affero General Public License,
-section 13, concerning interaction through a network will apply to the
-combination as such.
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.

  14. Revised Versions of this License.

  The Free Software Foundation may publish revised and/or new versions of
-the GNU General Public License from time to time.  Such new versions will
-be similar in spirit to the present version, but may differ in detail to
+the GNU Affero General Public License from time to time.  Such new versions
+will be similar in spirit to the present version, but may differ in detail to
 address new problems or concerns.

  Each version is given a distinguishing version number.  If the
-Program specifies that a certain numbered version of the GNU General
+Program specifies that a certain numbered version of the GNU Affero General
 Public License "or any later version" applies to it, you have the
 option of following the terms and conditions either of that numbered
 version or of any later version published by the Free Software
 Foundation.  If the Program does not specify a version number of the
-GNU General Public License, you may choose any version ever published
+GNU Affero General Public License, you may choose any version ever published
 by the Free Software Foundation.

  If the Program specifies that a proxy can decide which future
-versions of the GNU General Public License can be used, that proxy's
+versions of the GNU Affero General Public License can be used, that proxy's
 public statement of acceptance of a version permanently authorizes you
 to choose that version for the Program.

@@ -635,40 +633,29 @@ the "copyright" line and a pointer to where the full notice is found.
    Copyright (C) <year>  <name of author>

    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
+    it under the terms of the GNU Affero General Public License as published
+    by the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+    GNU Affero General Public License for more details.

-    You should have received a copy of the GNU General Public License
+    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.

 Also add information on how to contact you by electronic and paper mail.

-  If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
-
-    <program>  Copyright (C) <year>  <name of author>
-    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
-    This is free software, and you are welcome to redistribute it
-    under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License.  Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
+  If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source.  For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code.  There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.

  You should also get your employer (if you work as a programmer) or school,
 if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
+For more information on this, and how to apply and follow the GNU AGPL, see
 <https://www.gnu.org/licenses/>.
-
-  The GNU General Public License does not permit incorporating your program
-into proprietary programs.  If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library.  If this is what you want to do, use the GNU Lesser General
-Public License instead of this License.  But first, please read
-<https://www.gnu.org/licenses/why-not-lgpl.html>.
--- a/README.MD
+++ b/README.MD
@@ -8,7 +8,7 @@
 ![Stars](https://img.shields.io/github/stars/CJackHwang/ds2api.svg)
 ![Forks](https://img.shields.io/github/forks/CJackHwang/ds2api.svg)
 [![Release](https://img.shields.io/github/v/release/CJackHwang/ds2api?display_name=tag)](https://github.com/CJackHwang/ds2api/releases)
-[![Docker](https://img.shields.io/badge/docker-ready-blue.svg)](DEPLOY.md)
+[![Docker](https://img.shields.io/badge/docker-ready-blue.svg)](docs/DEPLOY.md)
 [![Deploy on Zeabur](https://zeabur.com/button.svg)](https://zeabur.com/templates/L4CFHP)
 [![Deploy with Vercel](https://vercel.com/button)](https://vercel.com/new/clone?repository-url=https://github.com/CJackHwang/ds2api)

@@ -16,43 +16,68 @@

 将 DeepSeek Web 对话能力转换为 OpenAI、Claude 与 Gemini 兼容 API。后端为 **Go 全量实现**，前端为 React WebUI 管理台（源码在 `webui/`，部署时自动构建到 `static/admin`）。

-## 架构概览
+文档入口：[文档导航](docs/README.md) / [架构说明](docs/ARCHITECTURE.md) / [接口文档](API.md)
+
+【感谢 Linux.do 社区及 GitHub 社区各位开发者对项目的支持与贡献】
+
+> **重要免责声明**
+>
+> 本仓库仅供学习、研究、个人实验和内部验证使用，不提供任何形式的商业授权、适用性保证或结果保证。
+>
+> 作者及仓库维护者不对因使用、修改、分发、部署或依赖本项目而产生的任何直接或间接损失、账号封禁、数据丢失、法律风险或第三方索赔负责。
+>
+> 请勿将本项目用于违反服务条款、协议、法律法规或平台规则的场景。商业使用前请自行确认 `LICENSE`、相关协议以及你是否获得了作者的书面许可。
+
+## 架构概览（摘要）

 ```mermaid
 flowchart LR
-    Client["🖥️ 客户端\n(OpenAI / Claude / Gemini 兼容)"]
+    Client["🖥️ 客户端 / SDK\n(OpenAI / Claude / Gemini)"]
+    Upstream["☁️ DeepSeek API"]

-    subgraph DS2API["DS2API 服务"]
-        direction TB
-        CORS["CORS 中间件"]
-        Auth["🔐 鉴权中间件"]
+    subgraph DS2API["DS2API 3.x（统一 OpenAI 内核）"]
+        Router["chi Router + 中间件\n(RequestID / RealIP / Logger / Recoverer / CORS)"]

-        subgraph Adapters["适配器层"]
-            OA["OpenAI 适配器\n/v1/*"]
-            CA["Claude 适配器\n/anthropic/*"]
-            GA["Gemini 适配器\n/v1beta/models/*"]
+        subgraph Adapters["协议适配层"]
+            OA["OpenAI\n/v1/*"]
+            CA["Claude\n/anthropic/* + /v1/messages"]
+            GA["Gemini\n/v1beta/models/* + /v1/models/*"]
+            Admin["Admin API\n/admin/*"]
+            WebUI["WebUI\n/admin（静态托管）"]
        end

-        subgraph Support["支撑模块"]
-            Pool["📦 账号池 / 并发队列"]
-            PoW["⚙️ PoW WASM\n(wazero)"]
+        subgraph Runtime["运行时核心能力"]
+            Bridge["CLIProxy 转换桥\n(多协议 <-> OpenAI)"]
+            OAEngine["OpenAI ChatCompletions\n(统一工具调用与流式语义)"]
+            Auth["Auth Resolver\n(API key / bearer / x-goog-api-key)"]
+            Pool["Account Pool + Queue\n(并发槽位 + 等待队列)"]
+            DSClient["DeepSeek Client\n(Session / Auth / HTTP)"]
+            Pow["PoW 实现\n(纯 Go 毫秒级)"]
+            Tool["Tool Sieve\n(Go/Node 语义对齐)"]
        end
-
-        Admin["🛠️ Admin API\n/admin/*"]
-        WebUI["🌐 WebUI\n(/admin)"]
    end

-    DS["☁️ DeepSeek API"]
+    Client --> Router
+    Router --> OA & CA & GA
+    Router --> Admin
+    Router --> WebUI

-    Client -- "请求" --> CORS --> Auth
-    Auth --> OA & CA & GA
-    OA & CA & GA -- "调用" --> DS
-    Auth --> Admin
-    OA & CA & GA -. "轮询选账号" .-> Pool
-    OA & CA & GA -. "计算 PoW" .-> PoW
-    DS -- "响应" --> Client
+    OA --> OAEngine
+    CA & GA --> Bridge
+    Bridge --> OAEngine
+    OAEngine --> Auth
+    OAEngine -.账号轮询.-> Pool
+    OAEngine -.工具调用解析.-> Tool
+    OAEngine -.PoW 计算.-> Pow
+    Auth --> DSClient
+    DSClient --> Upstream
+    Upstream --> DSClient
+    OAEngine --> Bridge
+    Bridge --> Client
 ```

+详细架构拆分与目录职责见 [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md)。
+
 - **后端**：Go（`cmd/ds2api/`、`api/`、`internal/`），不依赖 Python 运行时
 - **前端**：React 管理台（`webui/`），运行时托管静态构建产物
 - **部署**：本地运行、Docker、Vercel Serverless、Linux systemd
@@ -61,14 +86,14 @@ flowchart LR

 | 能力 | 说明 |
 | --- | --- |
-| OpenAI 兼容 | `GET /v1/models`、`GET /v1/models/{id}`、`POST /v1/chat/completions`、`POST /v1/responses`、`GET /v1/responses/{response_id}`、`POST /v1/embeddings` |
+| OpenAI 兼容 | `GET /v1/models`、`GET /v1/models/{id}`、`POST /v1/chat/completions`、`POST /v1/responses`、`GET /v1/responses/{response_id}`、`POST /v1/embeddings`、`POST /v1/files` |
 | Claude 兼容 | `GET /anthropic/v1/models`、`POST /anthropic/v1/messages`、`POST /anthropic/v1/messages/count_tokens`（及快捷路径 `/v1/messages`、`/messages`） |
 | Gemini 兼容 | `POST /v1beta/models/{model}:generateContent`、`POST /v1beta/models/{model}:streamGenerateContent`（及 `/v1/models/{model}:*` 路径） |
 | 多账号轮询 | 自动 token 刷新、邮箱/手机号双登录方式 |
 | 并发队列控制 | 每账号 in-flight 上限 + 等待队列，动态计算建议并发值 |
-| DeepSeek PoW | WASM 计算（`wazero`），无需外部 Node.js 依赖 |
+| DeepSeek PoW | 纯 Go 高性能实现（DeepSeekHashV1），毫秒级响应 |
 | Tool Calling | 防泄漏处理：非代码块高置信特征识别、`delta.tool_calls` 早发、结构化增量输出 |
-| Admin API | 配置管理、运行时设置热更新、账号测试 / 批量测试、导入导出、Vercel 同步 |
+| Admin API | 配置管理、运行时设置热更新、代理管理、账号测试 / 批量测试、会话清理、导入导出、Vercel 同步、版本检查 |
 | WebUI 管理台 | `/admin` 单页应用（中英文双语、深色模式） |
 | 运维探针 | `GET /healthz`（存活）、`GET /readyz`（就绪） |

@@ -82,37 +107,45 @@ flowchart LR
 | P0 | Anthropic SDK（messages） | ✅ |
 | P0 | Google Gemini SDK（generateContent） | ✅ |
 | P1 | LangChain / LlamaIndex / OpenWebUI（OpenAI 兼容接入） | ✅ |
-| P2 | MCP 独立桥接层 | 规划中 |

 ## 模型支持

-### OpenAI 接口
+### OpenAI 接口（`GET /v1/models`）

-| 模型 | thinking | search |
-| --- | --- | --- |
-| `deepseek-chat` | ❌ | ❌ |
-| `deepseek-reasoner` | ✅ | ❌ |
-| `deepseek-chat-search` | ❌ | ✅ |
-| `deepseek-reasoner-search` | ✅ | ✅ |
+| 模型类型 | 模型 ID | thinking | search |
+| --- | --- | --- | --- |
+| default | `deepseek-chat` | ❌ | ❌ |
+| default | `deepseek-reasoner` | ✅ | ❌ |
+| default | `deepseek-chat-search` | ❌ | ✅ |
+| default | `deepseek-reasoner-search` | ✅ | ✅ |
+| expert | `deepseek-expert-chat` | ❌ | ❌ |
+| expert | `deepseek-expert-reasoner` | ✅ | ❌ |
+| expert | `deepseek-expert-chat-search` | ❌ | ✅ |
+| expert | `deepseek-expert-reasoner-search` | ✅ | ✅ |
+| vision | `deepseek-vision-chat` | ❌ | ❌ |
+| vision | `deepseek-vision-reasoner` | ✅ | ❌ |
+| vision | `deepseek-vision-chat-search` | ❌ | ✅ |
+| vision | `deepseek-vision-reasoner-search` | ✅ | ✅ |

-### Claude 接口
+除原生模型外，也支持常见 alias 输入（如 `gpt-5`、`gpt-5-mini`、`gpt-5-codex`、`gpt-4.1`、`o3`、`claude-opus-4-6`、`claude-sonnet-4-5`、`gemini-2.5-pro`、`gemini-2.5-flash` 等），但 `/v1/models` 返回的是规范化后的 DeepSeek 原生模型 ID。

-| 模型 | 默认映射 |
+### Claude 接口（`GET /anthropic/v1/models`）
+
+| 当前常用模型 | 默认映射 |
 | --- | --- |
 | `claude-sonnet-4-5` | `deepseek-chat` |
 | `claude-haiku-4-5`（兼容 `claude-3-5-haiku-latest`） | `deepseek-chat` |
 | `claude-opus-4-6` | `deepseek-reasoner` |

 可通过配置中的 `claude_mapping` 或 `claude_model_mapping` 覆盖映射关系。
-另外，`/anthropic/v1/models` 现已包含 Claude 1.x/2.x/3.x/4.x 历史模型 ID 与常见别名，便于旧客户端直接兼容。
-
+`/anthropic/v1/models` 除上述当前主别名外，还会返回 Claude 4.x snapshots，以及 3.x / 2.x / 1.x 历史模型 ID 与常见 alias，便于旧客户端直接兼容。

 #### Claude Code 接入避坑（实测）

 - `ANTHROPIC_BASE_URL` 推荐直接指向 DS2API 根地址（例如 `http://127.0.0.1:5001`），Claude Code 会请求 `/v1/messages?beta=true`。
 - `ANTHROPIC_API_KEY` 需要与 `config.json` 中 `keys` 一致；建议同时保留常规 key 与 `sk-ant-*` 形态 key，兼容不同客户端校验习惯。
 - 若系统设置了代理，建议对 DS2API 地址配置 `NO_PROXY=127.0.0.1,localhost,<你的主机IP>`，避免本地回环请求被代理拦截。
- 如遇“工具调用输出成文本、未执行”问题，请升级到包含 Claude 工具调用多格式解析（JSON/XML/ANTML/invoke）的版本。
+- 如遇“工具调用输出成文本、未执行”问题，请优先检查模型输出是否为受支持的 XML/Markup 工具块（例如 `<tool_call>` / `<function_call>` / `<invoke>` / `tool_use`），而不是纯 JSON `tool_calls` 片段。

 ### Gemini 接口

@@ -120,6 +153,15 @@ Gemini 适配器将模型名通过 `model_aliases` 或内置规则映射到 Deep

 ## 快速开始

+### 部署方式优先级建议
+
+推荐按以下顺序选择部署方式：
+
+1. **下载 Release 构建包运行**：最省事，产物已编译完成，最适合大多数用户。
+2. **Docker / GHCR 镜像部署**：适合需要容器化、编排或云环境部署。
+3. **Vercel 部署**：适合已有 Vercel 环境且接受其平台约束的场景。
+4. **本地源码运行 / 自行编译**：适合开发、调试或需要自行修改代码的场景。
+
 ### 通用第一步（所有部署方式）

 把 `config.json` 作为唯一配置源（推荐做法）：
@@ -131,36 +173,29 @@ cp config.example.json config.json

 后续部署建议：
 - 本地运行：直接读取 `config.json`
- Docker / Vercel：由 `config.json` 生成 `DS2API_CONFIG_JSON`（Base64）注入环境变量
+- Docker / Vercel：由 `config.json` 生成 `DS2API_CONFIG_JSON`（Base64）注入环境变量，也可以直接写原始 JSON

-### 方式一：本地运行
+### 方式一：下载 Release 构建包

-**前置要求**：Go 1.24+，Node.js 20+（仅在需要构建 WebUI 时）
+每次发布 Release 时，GitHub Actions 会自动构建多平台二进制包：

 ```bash
-# 1. 克隆仓库
-git clone https://github.com/CJackHwang/ds2api.git
-cd ds2api
-
-# 2. 配置
+# 下载对应平台的压缩包后
+tar -xzf ds2api_<tag>_linux_amd64.tar.gz
+cd ds2api_<tag>_linux_amd64
 cp config.example.json config.json
-# 编辑 config.json，填入你的 DeepSeek 账号信息和 API key
-
-# 3. 启动
-go run ./cmd/ds2api
+# 编辑 config.json
+./ds2api
 ```

-默认监听地址：`http://localhost:5001`
-
-> **WebUI 自动构建**：本地首次启动时，若 `static/admin` 不存在，会自动尝试执行 `npm install && npm run build`（需要本机有 Node.js）。你也可以手动构建：`./scripts/build-webui.sh`
-
 ### 方式二：Docker 运行

 ```bash
-# 1. 准备环境变量文件
+# 1. 准备环境变量和配置文件
 cp .env.example .env
+cp config.example.json config.json

-# 2. 编辑 .env（至少设置 DS2API_ADMIN_KEY）
+# 2. 编辑 .env（至少设置 DS2API_ADMIN_KEY；如需修改宿主机端口，可额外设置 DS2API_HOST_PORT）
 #    DS2API_ADMIN_KEY=请替换为强密码

 # 3. 启动
@@ -170,6 +205,8 @@ docker-compose up -d
 docker-compose logs -f
 ```

+默认 `docker-compose.yml` 会把宿主机 `6011` 映射到容器内的 `5001`。如果你希望直接对外暴露 `5001`，请设置 `DS2API_HOST_PORT=5001`（或者手动调整 `ports` 配置）。
+
 更新镜像：`docker-compose up -d --build`

 #### Zeabur 一键部署（Dockerfile）
@@ -202,37 +239,30 @@ base64 < config.json | tr -d '\n'

 > **流式说明**：`/v1/chat/completions` 在 Vercel 上默认走 `api/chat-stream.js`（Node Runtime）以保证实时 SSE。鉴权、账号选择、会话/PoW 准备仍由 Go 内部 prepare 接口完成；流式响应（含 `tools`）在 Node 侧执行与 Go 对齐的输出组装与防泄漏处理。

-详细部署说明请参阅 [部署指南](DEPLOY.md)。
+详细部署说明请参阅 [部署指南](docs/DEPLOY.md)。

-### 方式四：下载 Release 构建包
+### 方式四：本地源码运行

-每次发布 Release 时，GitHub Actions 会自动构建多平台二进制包：
+**前置要求**：Go 1.26+，Node.js `20.19+` 或 `22.12+`（仅在需要构建 WebUI 时）

 ```bash
-# 下载对应平台的压缩包后
-tar -xzf ds2api_<tag>_linux_amd64.tar.gz
-cd ds2api_<tag>_linux_amd64
+# 1. 克隆仓库
+git clone https://github.com/CJackHwang/ds2api.git
+cd ds2api
+
+# 2. 配置
 cp config.example.json config.json
-# 编辑 config.json
-./ds2api
+# 编辑 config.json，填入你的 DeepSeek 账号信息和 API key
+
+# 3. 启动
+go run ./cmd/ds2api
 ```

-### 方式五：OpenCode CLI 接入
+默认本地访问地址：`http://127.0.0.1:5001`

-1. 复制示例配置：
+服务实际绑定：`0.0.0.0:5001`，因此同一局域网设备通常也可以通过你的内网 IP 访问。

-```bash
-cp opencode.json.example opencode.json
-```
-
-2. 编辑 `opencode.json`：
- 将 `baseURL` 改为你的 DS2API 地址（例如 `https://your-domain.com/v1`）
- 将 `apiKey` 改为你的 DS2API key（对应 `config.keys`）
-
-3. 在项目目录启动 OpenCode CLI（按你的安装方式运行 `opencode`）。
-
-> 建议优先使用 OpenAI 兼容路径（`/v1/*`），即示例里的 `@ai-sdk/openai-compatible` provider。
-> 若客户端支持 `wire_api`，可分别测试 `responses` 与 `chat`，DS2API 两条链路都兼容。
+> **WebUI 自动构建**：本地首次启动时，若 `static/admin` 不存在，会自动尝试执行 `npm ci`（仅在缺少依赖时）和 `npm run build -- --outDir static/admin --emptyOutDir`（需要本机有 Node.js）。你也可以手动构建：`./scripts/build-webui.sh`

 ## 配置说明

@@ -253,15 +283,16 @@ cp opencode.json.example opencode.json
  ],
  "model_aliases": {
    "gpt-4o": "deepseek-chat",
+    "gpt-5": "deepseek-chat",
+    "gpt-5-mini": "deepseek-chat",
    "gpt-5-codex": "deepseek-reasoner",
-    "o3": "deepseek-reasoner"
+    "o3": "deepseek-reasoner",
+    "claude-opus-4-6": "deepseek-reasoner",
+    "gemini-2.5-flash": "deepseek-chat"
  },
  "compat": {
-    "wide_input_strict_output": true
-  },
-  "toolcall": {
-    "mode": "feature_match",
-    "early_emit_confidence": "high"
+    "wide_input_strict_output": true,
+    "strip_reference_markers": true
  },
  "responses": {
    "store_ttl_seconds": 900
@@ -279,10 +310,11 @@ cp opencode.json.example opencode.json
  "runtime": {
    "account_max_inflight": 2,
    "account_max_queue": 0,
-    "global_max_inflight": 0
+    "global_max_inflight": 0,
+    "token_refresh_interval_hours": 6
  },
  "auto_delete": {
-    "sessions": false
+    "mode": "none"
  }
 }
 ```
@@ -292,13 +324,14 @@ cp opencode.json.example opencode.json
 - `token`：配置文件中即使填写也会在加载时被清空（不会从 `config.json` 读取 token）；实际 token 仅在运行时内存中维护并自动刷新
 - `model_aliases`：常见模型名（如 GPT/Codex/Claude）到 DeepSeek 模型的映射
 - `compat.wide_input_strict_output`：建议保持 `true`（当前实现默认宽进严出）
- `toolcall`：固定采用特征匹配 + 高置信早发策略
+- `compat.strip_reference_markers`：建议保持 `true`，用于清理可见输出中的引用/标记
+- `toolcall`：旧字段，当前实现已固定为特征匹配 + 高置信早发；即使保留在配置里也会被忽略
 - `responses.store_ttl_seconds`：`/v1/responses/{id}` 的内存缓存 TTL
 - `embeddings.provider`：embedding 提供方（当前内置 `deterministic/mock/builtin`）
 - `claude_mapping`：字典中 `fast`/`slow` 后缀映射到对应 DeepSeek 模型（兼容读取 `claude_model_mapping`）
 - `admin`：管理后台设置（JWT 过期时间、密码哈希等），可通过 Admin Settings API 热更新
- `runtime`：运行时参数（并发限制、队列大小），可通过 Admin Settings API 热更新；`account_max_queue=0`/`global_max_inflight=0` 表示按推荐值自动计算
- `auto_delete.sessions`：是否在请求结束后自动清理 DeepSeek 会话（默认 `false`，可在 Settings 热更新）
+- `runtime`：运行时参数（并发限制、队列大小、托管账号 token 刷新间隔），可通过 Admin Settings API 热更新；`account_max_queue=0`/`global_max_inflight=0` 表示按推荐值自动计算，`token_refresh_interval_hours=6` 为默认强制重登间隔
+- `auto_delete.mode`：请求结束后如何清理 DeepSeek 远端聊天记录，支持 `none`（默认，不删除）、`single`（仅删除当前会话）、`all`（清空全部会话）；旧配置里的 `auto_delete.sessions=true` 仍会被视为 `all`

 ### 环境变量

@@ -311,25 +344,24 @@ cp opencode.json.example opencode.json
 | `DS2API_JWT_EXPIRE_HOURS` | Admin JWT 过期小时数 | `24` |
 | `DS2API_CONFIG_PATH` | 配置文件路径 | `config.json` |
 | `DS2API_CONFIG_JSON` | 直接注入配置（JSON 或 Base64） | — |
-| `DS2API_WASM_PATH` | PoW WASM 文件路径 | 自动查找 |
+| `DS2API_ENV_WRITEBACK` | 环境变量模式下自动写回配置文件并切换文件模式（`1/true/yes/on`） | 关闭 |
 | `DS2API_STATIC_ADMIN_DIR` | 管理台静态文件目录 | `static/admin` |
 | `DS2API_AUTO_BUILD_WEBUI` | 启动时自动构建 WebUI | 本地开启，Vercel 关闭 |
+| `DS2API_DEV_PACKET_CAPTURE` | 本地开发抓包开关（记录最近会话请求/响应体） | 本地非 Vercel 默认开启 |
+| `DS2API_DEV_PACKET_CAPTURE_LIMIT` | 本地抓包保留条数（超出自动淘汰） | `20` |
+| `DS2API_DEV_PACKET_CAPTURE_MAX_BODY_BYTES` | 单条响应体最大记录字节数 | `5242880` |
 | `DS2API_ACCOUNT_MAX_INFLIGHT` | 每账号最大并发 in-flight 请求数 | `2` |
-| `DS2API_ACCOUNT_CONCURRENCY` | 同上（兼容旧名） | — |
 | `DS2API_ACCOUNT_MAX_QUEUE` | 等待队列上限 | `recommended_concurrency` |
-| `DS2API_ACCOUNT_QUEUE_SIZE` | 同上（兼容旧名） | — |
 | `DS2API_GLOBAL_MAX_INFLIGHT` | 全局最大 in-flight 请求数 | `recommended_concurrency` |
-| `DS2API_MAX_INFLIGHT` | 同上（兼容旧名） | — |
 | `DS2API_VERCEL_INTERNAL_SECRET` | Vercel 混合流式内部鉴权密钥 | 回退用 `DS2API_ADMIN_KEY` |
 | `DS2API_VERCEL_STREAM_LEASE_TTL_SECONDS` | 流式 lease 过期秒数 | `900` |
-| `DS2API_DEV_PACKET_CAPTURE` | 本地开发抓包开关（记录最近会话请求/响应体） | 本地非 Vercel 默认开启 |
-| `DS2API_DEV_PACKET_CAPTURE_LIMIT` | 本地抓包保留条数（超出自动淘汰） | `5` |
-| `DS2API_DEV_PACKET_CAPTURE_MAX_BODY_BYTES` | 单条响应体最大记录字节数 | `2097152` |
 | `VERCEL_TOKEN` | Vercel 同步 token | — |
 | `VERCEL_PROJECT_ID` | Vercel 项目 ID | — |
 | `VERCEL_TEAM_ID` | Vercel 团队 ID | — |
 | `DS2API_VERCEL_PROTECTION_BYPASS` | Vercel 部署保护绕过密钥（内部 Node→Go 调用） | — |

+> 提示：当检测到 `DS2API_CONFIG_JSON` 时，管理台会显示当前模式风险与自动持久化状态（含 `DS2API_CONFIG_PATH` 路径与模式切换说明）。
+
 ## 鉴权模式

 调用业务接口（`/v1/*`、`/anthropic/*`、Gemini 路由）时支持两种模式：
@@ -340,6 +372,7 @@ cp opencode.json.example opencode.json
 | **直通 token 模式** | 传入 token 不在 `config.keys` 中时，直接作为 DeepSeek token 使用 |

 可选请求头 `X-Ds2-Target-Account`：指定使用某个托管账号（值为 email 或 mobile）。
+Gemini 路由还可以使用 `x-goog-api-key`，或在没有认证头时使用 `?key=` / `?api_key=` 作为调用方凭据。

 ## 并发模型

@@ -356,23 +389,27 @@ cp opencode.json.example opencode.json

 ## Tool Call 适配

-当请求中带 `tools` 时，DS2API 会做防泄漏处理：
+当请求中带 `tools` 时，DS2API 会做防泄漏处理与结构化转译：

-1. 只在**非代码块上下文**启用 toolcall 特征识别（代码块示例不会触发）
-2. `responses` 流式严格使用官方 item 生命周期事件（`response.output_item.*`、`response.content_part.*`、`response.function_call_arguments.*`）
-3. 未在 `tools` 声明中的工具名会被严格拒绝，不会下发为有效 tool call
+1. 只在**非代码块上下文**启用执行型 toolcall 识别（代码块示例默认不触发）
+2. 解析层当前以 XML/Markup 家族为准（`<tool_call>` / `<function_call>` / `<invoke>` / `tool_use` / antml 变体）；纯 JSON `tool_calls` 片段默认不作为可执行调用解析
+3. `responses` 流式严格使用官方 item 生命周期事件（`response.output_item.*`、`response.content_part.*`、`response.function_call_arguments.*`）
 4. `responses` 支持并执行 `tool_choice`（`auto`/`none`/`required`/强制函数）；`required` 违规时非流式返回 `422`，流式返回 `response.failed`
-5. 仅在通过策略校验后才会发出有效工具调用事件，避免错误工具名进入客户端执行链
+5. 客户端请求哪种协议，就按该协议返回工具调用（OpenAI/Claude/Gemini 各自原生结构）；模型侧优先约束输出规范 XML，再由兼容层转译
+
+> 说明：当前版本在 parser 层仍以“尽量解析成功”为优先，未启用基于 allow-list 的工具名硬拒绝。
+>
+> 想评估“把工具调用封装成 XML 再输入模型”的方案，可参考：`docs/toolcall-semantics.md`。

 ## 本地开发抓包工具

-用于定位「responses 思考流/工具调用」等问题。开启后会自动记录最近 N 条 DeepSeek 对话上游请求体与响应体（默认 5 条，超出自动淘汰）。
+用于定位「responses 思考流/工具调用」等问题。开启后会自动记录最近 N 条 DeepSeek 对话上游请求体与响应体（默认 20 条，超出自动淘汰；单条响应体默认最多记录 5 MB）。

 启用示例：

 ```bash
 DS2API_DEV_PACKET_CAPTURE=true \
-DS2API_DEV_PACKET_CAPTURE_LIMIT=5 \
+DS2API_DEV_PACKET_CAPTURE_LIMIT=20 \
 go run ./cmd/ds2api
 ```

@@ -380,6 +417,8 @@ go run ./cmd/ds2api

 - `GET /admin/dev/captures`：查看抓包列表（最新在前）
 - `DELETE /admin/dev/captures`：清空抓包
+- `GET /admin/dev/raw-samples/query?q=关键词&limit=20`：按问题关键词查询当前内存抓包，并按 `chat_session_id` 归并 `completion + continue` 链
+- `POST /admin/dev/raw-samples/save`：把命中的某条抓包链保存为 `tests/raw_stream_samples/<sample-id>/` 回放样本

 返回字段包含：

@@ -387,61 +426,10 @@ go run ./cmd/ds2api
 - `response_body`：上游返回的原始流式内容拼接文本
 - `response_truncated`：是否触发单条大小截断

-## 项目结构
+保存接口支持用 `query`、`chain_key` 或 `capture_id` 选中目标。例如：

-```text
-ds2api/
-├── cmd/
-│   ├── ds2api/              # 本地 / 容器启动入口
-│   └── ds2api-tests/        # 端到端测试集入口
-├── api/
-│   ├── index.go             # Vercel Serverless Go 入口
-│   ├── chat-stream.js       # Vercel Node.js 流式转发
-│   └── (rewrite targets in vercel.json)
-├── internal/
-│   ├── account/             # 账号池与并发队列
-│   ├── adapter/
-│   │   ├── openai/          # OpenAI 兼容适配器（含 Tool Call 解析、Vercel 流式 prepare/release）
-│   │   ├── claude/          # Claude 兼容适配器
-│   │   └── gemini/          # Gemini 兼容适配器（generateContent / streamGenerateContent）
-│   ├── admin/               # Admin API handlers（含 Settings 热更新）
-│   ├── auth/                # 鉴权与 JWT
-│   ├── claudeconv/          # Claude 消息格式转换
-│   ├── compat/              # 兼容性辅助
-│   ├── config/              # 配置加载与热更新
-│   ├── deepseek/            # DeepSeek API 客户端、PoW WASM
-│   ├── js/                  # Node 运行时流式处理与兼容逻辑
-│   ├── devcapture/          # 开发抓包模块
-│   ├── format/              # 输出格式化
-│   ├── prompt/              # Prompt 构建
-│   ├── server/              # HTTP 路由与中间件（chi router）
-│   ├── sse/                 # SSE 解析工具
-│   ├── stream/              # 统一流式消费引擎
-│   ├── util/                # 通用工具函数
-│   └── webui/               # WebUI 静态文件托管与自动构建
-├── webui/                   # React WebUI 源码（Vite + Tailwind）
-│   └── src/
-│       ├── app/             # 路由、鉴权、配置状态管理
-│       ├── features/        # 业务功能模块（account/settings/vercel/apiTester）
-│       ├── components/      # 登录/落地页等通用组件
-│       └── locales/         # 中英文语言包（zh.json / en.json）
-├── scripts/
-│   └── build-webui.sh       # WebUI 手动构建脚本
-├── tests/
-│   ├── compat/              # 兼容性测试夹具与期望输出
-│   └── scripts/             # 统一测试脚本入口（unit/e2e）
-├── static/admin/            # WebUI 构建产物（不提交到 Git）
-├── .github/
-│   ├── workflows/           # GitHub Actions（质量门禁 + Release 自动构建）
-│   ├── ISSUE_TEMPLATE/      # Issue 模板
-│   └── PULL_REQUEST_TEMPLATE.md
-├── config.example.json      # 配置文件示例
-├── .env.example             # 环境变量示例
-├── Dockerfile               # 多阶段构建（WebUI + Go）
-├── docker-compose.yml       # 生产环境 Docker Compose
-├── docker-compose.dev.yml   # 开发环境 Docker Compose
-├── vercel.json              # Vercel 路由与构建配置
-└── go.mod / go.sum          # Go 模块依赖
+```json
+{"query":"广州天气","sample_id":"gz-weather-from-memory"}
 ```

 ## 文档索引
@@ -449,9 +437,9 @@ ds2api/
 | 文档 | 说明 |
 | --- | --- |
 | [API.md](API.md) / [API.en.md](API.en.md) | API 接口文档（含请求/响应示例） |
-| [DEPLOY.md](DEPLOY.md) / [DEPLOY.en.md](DEPLOY.en.md) | 部署指南（本地/Docker/Vercel/systemd） |
-| [CONTRIBUTING.md](CONTRIBUTING.md) / [CONTRIBUTING.en.md](CONTRIBUTING.en.md) | 贡献指南 |
-| [TESTING.md](TESTING.md) | 测试集使用指南 |
+| [DEPLOY.md](docs/DEPLOY.md) / [DEPLOY.en.md](docs/DEPLOY.en.md) | 部署指南（本地/Docker/Vercel/systemd） |
+| [CONTRIBUTING.md](docs/CONTRIBUTING.md) / [CONTRIBUTING.en.md](docs/CONTRIBUTING.en.md) | 贡献指南 |
+| [TESTING.md](docs/TESTING.md) | 测试集使用指南 |

 ## 测试

@@ -481,7 +469,7 @@ npm ci --prefix webui && npm run build --prefix webui

 ## 测试

-详细测试指南请参阅 [TESTING.md](TESTING.md)。
+详细测试指南请参阅 [docs/TESTING.md](docs/TESTING.md)。

 ### 快速测试命令

@@ -490,7 +478,7 @@ npm ci --prefix webui && npm run build --prefix webui
 go test ./...

 # 运行 tool calls 相关测试（调试工具调用问题）
-go test -v -run 'TestParseToolCalls|TestRepair' ./internal/util/
+go test -v -run 'TestParseToolCalls|TestRepair' ./internal/toolcall/

 # 运行端到端测试
 ./tests/scripts/run-live.sh
@@ -507,4 +495,7 @@ go test -v -run 'TestParseToolCalls|TestRepair' ./internal/util/

 ## 免责声明

-本项目基于逆向方式实现，仅供学习与研究使用。稳定性和可用性不作保证，请勿用于违反服务条款或法律法规的场景。
+本项目基于逆向方式实现，仅供学习、研究、个人实验和内部验证使用，不提供任何商业授权、稳定性保证或可用性保证。
+作者及仓库维护者不对因使用、修改、分发、部署或依赖本项目而产生的任何直接或间接损失、账号封禁、数据丢失、法律风险或第三方索赔负责。
+
+请勿将本项目用于违反服务条款、协议、法律法规或平台规则的场景。商业使用前请自行确认 `LICENSE`、相关协议以及你是否获得了作者的书面许可。
--- a/README.en.md
+++ b/README.en.md
@@ -8,7 +8,7 @@
 ![Stars](https://img.shields.io/github/stars/CJackHwang/ds2api.svg)
 ![Forks](https://img.shields.io/github/forks/CJackHwang/ds2api.svg)
 [![Release](https://img.shields.io/github/v/release/CJackHwang/ds2api?display_name=tag)](https://github.com/CJackHwang/ds2api/releases)
-[![Docker](https://img.shields.io/badge/docker-ready-blue.svg)](DEPLOY.en.md)
+[![Docker](https://img.shields.io/badge/docker-ready-blue.svg)](docs/DEPLOY.en.md)
 [![Deploy on Zeabur](https://zeabur.com/button.svg)](https://zeabur.com/templates/L4CFHP)
 [![Deploy with Vercel](https://vercel.com/button)](https://vercel.com/new/clone?repository-url=https://github.com/CJackHwang/ds2api)

@@ -16,43 +16,66 @@ Language: [中文](README.MD) | [English](README.en.md)

 DS2API converts DeepSeek Web chat capability into OpenAI-compatible, Claude-compatible, and Gemini-compatible APIs. The backend is a **pure Go implementation**, with a React WebUI admin panel (source in `webui/`, build output auto-generated to `static/admin` during deployment).

-## Architecture Overview
+Documentation entry: [Docs Index](docs/README.md) / [Architecture](docs/ARCHITECTURE.en.md) / [API Reference](API.en.md)
+
+> **Important Disclaimer**
+>
+> This repository is provided for learning, research, personal experimentation, and internal validation only. It does not grant any commercial authorization and comes with no warranty of fitness, stability, or results.
+>
+> The author and repository maintainers are not responsible for any direct or indirect loss, account suspension, data loss, legal risk, or third-party claims arising from use, modification, distribution, deployment, or reliance on this project.
+>
+> Do not use this project in ways that violate service terms, agreements, laws, or platform rules. Before any commercial use, review the `LICENSE` and the relevant terms, and confirm that you have the author's written permission.
+
+## Architecture Overview (Summary)

 ```mermaid
 flowchart LR
-    Client["🖥️ Clients\n(OpenAI / Claude / Gemini compat)"]
+    Client["🖥️ Clients / SDKs\n(OpenAI / Claude / Gemini)"]
+    Upstream["☁️ DeepSeek API"]

-    subgraph DS2API["DS2API Service"]
-        direction TB
-        CORS["CORS Middleware"]
-        Auth["🔐 Auth Middleware"]
+    subgraph DS2API["DS2API 3.x (Unified OpenAI Core)"]
+        Router["chi Router + Middleware\n(RequestID / RealIP / Logger / Recoverer / CORS)"]

-        subgraph Adapters["Adapter Layer"]
-            OA["OpenAI Adapter\n/v1/*"]
-            CA["Claude Adapter\n/anthropic/*"]
-            GA["Gemini Adapter\n/v1beta/models/*"]
+        subgraph Adapters["Protocol Adapters"]
+            OA["OpenAI\n/v1/*"]
+            CA["Claude\n/anthropic/* + /v1/messages"]
+            GA["Gemini\n/v1beta/models/* + /v1/models/*"]
+            Admin["Admin API\n/admin/*"]
+            WebUI["WebUI\n/admin (static hosting)"]
        end

-        subgraph Support["Support Modules"]
-            Pool["📦 Account Pool / Queue"]
-            PoW["⚙️ PoW WASM\n(wazero)"]
+        subgraph Runtime["Runtime + Core Capabilities"]
+            Bridge["CLIProxy Bridge\n(multi-protocol <-> OpenAI)"]
+            OAEngine["OpenAI ChatCompletions\n(unified tools + stream semantics)"]
+            Auth["Auth Resolver\n(API key / bearer / x-goog-api-key)"]
+            Pool["Account Pool + Queue\n(in-flight slots + wait queue)"]
+            DSClient["DeepSeek Client\n(session / auth / HTTP)"]
+            Pow["PoW Solver\n(Pure Go ms-level)"]
+            Tool["Tool Sieve\n(Go/Node semantic parity)"]
        end
-
-        Admin["🛠️ Admin API\n/admin/*"]
-        WebUI["🌐 WebUI\n(/admin)"]
    end

-    DS["☁️ DeepSeek API"]
+    Client --> Router
+    Router --> OA & CA & GA
+    Router --> Admin
+    Router --> WebUI

-    Client -- "Request" --> CORS --> Auth
-    Auth --> OA & CA & GA
-    OA & CA & GA -- "Call" --> DS
-    Auth --> Admin
-    OA & CA & GA -. "Rotate accounts" .-> Pool
-    OA & CA & GA -. "Compute PoW" .-> PoW
-    DS -- "Response" --> Client
+    OA --> OAEngine
+    CA & GA --> Bridge
+    Bridge --> OAEngine
+    OAEngine --> Auth
+    OAEngine -.account rotation.-> Pool
+    OAEngine -.tool-call parsing.-> Tool
+    OAEngine -.PoW solving.-> Pow
+    Auth --> DSClient
+    DSClient --> Upstream
+    Upstream --> DSClient
+    OAEngine --> Bridge
+    Bridge --> Client
 ```

+For the full module-by-module architecture and directory responsibilities, see [docs/ARCHITECTURE.en.md](docs/ARCHITECTURE.en.md).
+
 - **Backend**: Go (`cmd/ds2api/`, `api/`, `internal/`), no Python runtime
 - **Frontend**: React admin panel (`webui/`), served as static build at runtime
 - **Deployment**: local run, Docker, Vercel serverless, Linux systemd
@@ -61,14 +84,14 @@ flowchart LR

 | Capability | Details |
 | --- | --- |
-| OpenAI compatible | `GET /v1/models`, `GET /v1/models/{id}`, `POST /v1/chat/completions`, `POST /v1/responses`, `GET /v1/responses/{response_id}`, `POST /v1/embeddings` |
+| OpenAI compatible | `GET /v1/models`, `GET /v1/models/{id}`, `POST /v1/chat/completions`, `POST /v1/responses`, `GET /v1/responses/{response_id}`, `POST /v1/embeddings`, `POST /v1/files` |
 | Claude compatible | `GET /anthropic/v1/models`, `POST /anthropic/v1/messages`, `POST /anthropic/v1/messages/count_tokens` (plus shortcut paths `/v1/messages`, `/messages`) |
 | Gemini compatible | `POST /v1beta/models/{model}:generateContent`, `POST /v1beta/models/{model}:streamGenerateContent` (plus `/v1/models/{model}:*` paths) |
 | Multi-account rotation | Auto token refresh, email/mobile dual login |
 | Concurrency control | Per-account in-flight limit + waiting queue, dynamic recommended concurrency |
-| DeepSeek PoW | WASM solving via `wazero`, no external Node.js dependency |
+| DeepSeek PoW | Pure Go high-performance solver (DeepSeekHashV1), ms-level response |
 | Tool Calling | Anti-leak handling: non-code-block feature match, early `delta.tool_calls`, structured incremental output |
-| Admin API | Config management, runtime settings hot-reload, account testing/batch test, import/export, Vercel sync |
+| Admin API | Config management, runtime settings hot-reload, proxy management, account testing/batch test, session cleanup, import/export, Vercel sync, version check |
 | WebUI Admin Panel | SPA at `/admin` (bilingual Chinese/English, dark mode) |
 | Health Probes | `GET /healthz` (liveness), `GET /readyz` (readiness) |

@@ -82,37 +105,45 @@ flowchart LR
 | P0 | Anthropic SDK (messages) | ✅ |
 | P0 | Google Gemini SDK (generateContent) | ✅ |
 | P1 | LangChain / LlamaIndex / OpenWebUI (OpenAI-compatible integration) | ✅ |
-| P2 | MCP standalone bridge | Planned |

 ## Model Support

-### OpenAI Endpoint
+### OpenAI Endpoint (`GET /v1/models`)

-| Model | thinking | search |
-| --- | --- | --- |
-| `deepseek-chat` | ❌ | ❌ |
-| `deepseek-reasoner` | ✅ | ❌ |
-| `deepseek-chat-search` | ❌ | ✅ |
-| `deepseek-reasoner-search` | ✅ | ✅ |
+| Family | Model ID | thinking | search |
+| --- | --- | --- | --- |
+| default | `deepseek-chat` | ❌ | ❌ |
+| default | `deepseek-reasoner` | ✅ | ❌ |
+| default | `deepseek-chat-search` | ❌ | ✅ |
+| default | `deepseek-reasoner-search` | ✅ | ✅ |
+| expert | `deepseek-expert-chat` | ❌ | ❌ |
+| expert | `deepseek-expert-reasoner` | ✅ | ❌ |
+| expert | `deepseek-expert-chat-search` | ❌ | ✅ |
+| expert | `deepseek-expert-reasoner-search` | ✅ | ✅ |
+| vision | `deepseek-vision-chat` | ❌ | ❌ |
+| vision | `deepseek-vision-reasoner` | ✅ | ❌ |
+| vision | `deepseek-vision-chat-search` | ❌ | ✅ |
+| vision | `deepseek-vision-reasoner-search` | ✅ | ✅ |

-### Claude Endpoint
+Besides native IDs, DS2API also accepts common aliases as input (for example `gpt-5`, `gpt-5-mini`, `gpt-5-codex`, `gpt-4.1`, `o3`, `claude-opus-4-6`, `claude-sonnet-4-5`, `gemini-2.5-pro`, `gemini-2.5-flash`), but `/v1/models` returns normalized DeepSeek native model IDs.

-| Model | Default Mapping |
+### Claude Endpoint (`GET /anthropic/v1/models`)
+
+| Current common model | Default Mapping |
 | --- | --- |
 | `claude-sonnet-4-5` | `deepseek-chat` |
 | `claude-haiku-4-5` (compatible with `claude-3-5-haiku-latest`) | `deepseek-chat` |
 | `claude-opus-4-6` | `deepseek-reasoner` |

 Override mapping via `claude_mapping` or `claude_model_mapping` in config.
-In addition, `/anthropic/v1/models` now includes historical Claude 1.x/2.x/3.x/4.x IDs and common aliases for legacy client compatibility.
-
+Besides the current primary aliases above, `/anthropic/v1/models` also returns Claude 4.x snapshots plus historical 3.x / 2.x / 1.x IDs and common aliases for legacy client compatibility.

 #### Claude Code integration pitfalls (validated)

 - Set `ANTHROPIC_BASE_URL` to the DS2API root URL (for example `http://127.0.0.1:5001`). Claude Code sends requests to `/v1/messages?beta=true`.
 - `ANTHROPIC_API_KEY` must match an entry in `keys` from `config.json`. Keeping both a regular key and an `sk-ant-*` style key improves client compatibility.
 - If your environment has proxy variables, set `NO_PROXY=127.0.0.1,localhost,<your_host_ip>` for DS2API to avoid proxy interception of local traffic.
- If tool calls are rendered as plain text and not executed, upgrade to a build that includes multi-format Claude tool-call parsing (JSON/XML/ANTML/invoke).
+- If tool calls are rendered as plain text and not executed, first verify the model output uses supported XML/Markup tool blocks (`<tool_call>` / `<function_call>` / `<invoke>` / `tool_use`) rather than standalone JSON `tool_calls`.

 ### Gemini Endpoint

@@ -120,6 +151,15 @@ The Gemini adapter maps model names to DeepSeek native models via `model_aliases

 ## Quick Start

+### Recommended deployment priority
+
+Recommended order when choosing a deployment method:
+
+1. **Download and run release binaries**: the easiest path for most users because the artifacts are already built.
+2. **Docker / GHCR image deployment**: suitable for containerized, orchestrated, or cloud environments.
+3. **Vercel deployment**: suitable if you already use Vercel and accept its platform constraints.
+4. **Run from source / build locally**: suitable for development, debugging, or when you need to modify the code yourself.
+
 ### Universal First Step (all deployment modes)

 Use `config.json` as the single source of truth (recommended):
@@ -131,45 +171,40 @@ cp config.example.json config.json

 Recommended per deployment mode:
 - Local run: read `config.json` directly
- Docker / Vercel: generate Base64 from `config.json` and inject as `DS2API_CONFIG_JSON`
+- Docker / Vercel: generate Base64 from `config.json` and inject as `DS2API_CONFIG_JSON`, or paste raw JSON directly

-### Option 1: Local Run
+### Option 1: Download Release Binaries

-**Prerequisites**: Go 1.24+, Node.js 20+ (only if building WebUI locally)
+GitHub Actions automatically builds multi-platform archives on each Release:

 ```bash
-# 1. Clone
-git clone https://github.com/CJackHwang/ds2api.git
-cd ds2api
-
-# 2. Configure
+# After downloading the archive for your platform
+tar -xzf ds2api_<tag>_linux_amd64.tar.gz
+cd ds2api_<tag>_linux_amd64
 cp config.example.json config.json
-# Edit config.json with your DeepSeek account info and API keys
-
-# 3. Start
-go run ./cmd/ds2api
+# Edit config.json
+./ds2api
 ```

-Default URL: `http://localhost:5001`
-
-> **WebUI auto-build**: On first local startup, if `static/admin` is missing, DS2API will auto-run `npm install && npm run build` (requires Node.js). You can also build manually: `./scripts/build-webui.sh`
-
-### Option 2: Docker
+### Option 2: Docker / GHCR

 ```bash
-# 1. Prepare env file
+# Pull prebuilt image
+docker pull ghcr.io/cjackhwang/ds2api:latest
+
+# Or run a pinned version
+# docker pull ghcr.io/cjackhwang/ds2api:v3.0.0
+
+# Prepare env file and config file
 cp .env.example .env
+cp config.example.json config.json

-# 2. Edit .env (at least set DS2API_ADMIN_KEY)
-#    DS2API_ADMIN_KEY=replace-with-a-strong-secret
-
-# 3. Start
+# Start with compose
 docker-compose up -d
-
-# 4. View logs
-docker-compose logs -f
 ```

+The default `docker-compose.yml` uses `ghcr.io/cjackhwang/ds2api:latest` and maps host port `6011` to container port `5001`. If you want the service reachable on host port `5001` instead, set `DS2API_HOST_PORT=5001` (or adjust the `ports` mapping).
+
 Rebuild after updates: `docker-compose up -d --build`

 #### Zeabur One-Click (Dockerfile)
@@ -202,37 +237,30 @@ base64 < config.json | tr -d '\n'

 > **Streaming note**: `/v1/chat/completions` on Vercel is routed to `api/chat-stream.js` (Node Runtime) for real-time SSE. Auth, account selection, and session/PoW preparation are still handled by the Go internal prepare endpoint; streaming output (including `tools`) is assembled on Node with Go-aligned anti-leak handling.

-For detailed deployment instructions, see the [Deployment Guide](DEPLOY.en.md).
+For detailed deployment instructions, see the [Deployment Guide](docs/DEPLOY.en.md).

-### Option 4: Download Release Binaries
+### Option 4: Local Run

-GitHub Actions automatically builds multi-platform archives on each Release:
+**Prerequisites**: Go 1.26+, Node.js `20.19+` or `22.12+` (only if building WebUI locally)

 ```bash
-# After downloading the archive for your platform
-tar -xzf ds2api_<tag>_linux_amd64.tar.gz
-cd ds2api_<tag>_linux_amd64
+# 1. Clone
+git clone https://github.com/CJackHwang/ds2api.git
+cd ds2api
+
+# 2. Configure
 cp config.example.json config.json
-# Edit config.json
-./ds2api
+# Edit config.json with your DeepSeek account info and API keys
+
+# 3. Start
+go run ./cmd/ds2api
 ```

-### Option 5: OpenCode CLI
+Default local URL: `http://127.0.0.1:5001`

-1. Copy the example config:
+The server actually binds to `0.0.0.0:5001`, so devices on the same LAN can usually reach it through your private IP as well.

-```bash
-cp opencode.json.example opencode.json
-```
-
-2. Edit `opencode.json`:
- Set `baseURL` to your DS2API endpoint (for example, `https://your-domain.com/v1`)
- Set `apiKey` to your DS2API key (from `config.keys`)
-
-3. Start OpenCode CLI in the project directory (run `opencode` using your installed method).
-
-> Recommended: use the OpenAI-compatible path (`/v1/*`) via `@ai-sdk/openai-compatible` as shown in the example.
-> If your client supports `wire_api`, test both `responses` and `chat`; DS2API supports both paths.
+> **WebUI auto-build**: On first local startup, if `static/admin` is missing, DS2API will auto-run `npm ci` (only when dependencies are missing) and `npm run build -- --outDir static/admin --emptyOutDir` (requires Node.js). You can also build manually: `./scripts/build-webui.sh`

 ## Configuration

@@ -253,15 +281,16 @@ cp opencode.json.example opencode.json
  ],
  "model_aliases": {
    "gpt-4o": "deepseek-chat",
+    "gpt-5": "deepseek-chat",
+    "gpt-5-mini": "deepseek-chat",
    "gpt-5-codex": "deepseek-reasoner",
-    "o3": "deepseek-reasoner"
+    "o3": "deepseek-reasoner",
+    "claude-opus-4-6": "deepseek-reasoner",
+    "gemini-2.5-flash": "deepseek-chat"
  },
  "compat": {
-    "wide_input_strict_output": true
-  },
-  "toolcall": {
-    "mode": "feature_match",
-    "early_emit_confidence": "high"
+    "wide_input_strict_output": true,
+    "strip_reference_markers": true
  },
  "responses": {
    "store_ttl_seconds": 900
@@ -279,10 +308,11 @@ cp opencode.json.example opencode.json
  "runtime": {
    "account_max_inflight": 2,
    "account_max_queue": 0,
-    "global_max_inflight": 0
+    "global_max_inflight": 0,
+    "token_refresh_interval_hours": 6
  },
  "auto_delete": {
-    "sessions": false
+    "mode": "none"
  }
 }
 ```
@@ -292,13 +322,14 @@ cp opencode.json.example opencode.json
 - `token`: Even if set in `config.json`, it is cleared during load (DS2API does not read persisted tokens from config); runtime tokens are maintained/refreshed in memory only
 - `model_aliases`: Map common model names (GPT/Codex/Claude) to DeepSeek models
 - `compat.wide_input_strict_output`: Keep `true` (current default policy)
- `toolcall`: Fixed to feature matching + high-confidence early emit
+- `compat.strip_reference_markers`: Keep `true`; it strips reference markers from visible output
+- `toolcall`: Legacy field; the current behavior is fixed to feature matching + high-confidence early emit, and any config value is ignored
 - `responses.store_ttl_seconds`: In-memory TTL for `/v1/responses/{id}`
 - `embeddings.provider`: Embeddings provider (`deterministic/mock/builtin` built-in)
 - `claude_mapping`: Maps `fast`/`slow` suffixes to corresponding DeepSeek models (still compatible with `claude_model_mapping`)
 - `admin`: Admin panel settings (JWT expiry, password hash, etc.), hot-reloadable via Admin Settings API
- `runtime`: Runtime parameters (concurrency limits, queue sizes), hot-reloadable via Admin Settings API; `account_max_queue=0`/`global_max_inflight=0` means auto-calculate from recommended values
- `auto_delete.sessions`: Whether to auto-delete DeepSeek sessions after request completion (default `false`, hot-reloadable via Settings)
+- `runtime`: Runtime parameters (concurrency limits, queue sizes, managed token refresh interval), hot-reloadable via Admin Settings API; `account_max_queue=0`/`global_max_inflight=0` means auto-calculate from recommended values, `token_refresh_interval_hours=6` is the default forced re-login interval
+- `auto_delete.mode`: How to clean up DeepSeek remote chat records after each request completes. Supported values: `none` (default, no deletion), `single` (delete only the current session), `all` (delete all sessions); legacy `auto_delete.sessions=true` is still treated as `all`

 ### Environment Variables

@@ -311,25 +342,24 @@ cp opencode.json.example opencode.json
 | `DS2API_JWT_EXPIRE_HOURS` | Admin JWT TTL in hours | `24` |
 | `DS2API_CONFIG_PATH` | Config file path | `config.json` |
 | `DS2API_CONFIG_JSON` | Inline config (JSON or Base64) | — |
-| `DS2API_WASM_PATH` | PoW WASM file path | Auto-detect |
+| `DS2API_ENV_WRITEBACK` | Auto-write env-backed config to file and transition to file mode (`1/true/yes/on`) | Disabled |
 | `DS2API_STATIC_ADMIN_DIR` | Admin static assets dir | `static/admin` |
 | `DS2API_AUTO_BUILD_WEBUI` | Auto-build WebUI on startup | Enabled locally, disabled on Vercel |
 | `DS2API_ACCOUNT_MAX_INFLIGHT` | Max in-flight requests per account | `2` |
-| `DS2API_ACCOUNT_CONCURRENCY` | Alias (legacy compat) | — |
 | `DS2API_ACCOUNT_MAX_QUEUE` | Waiting queue limit | `recommended_concurrency` |
-| `DS2API_ACCOUNT_QUEUE_SIZE` | Alias (legacy compat) | — |
 | `DS2API_GLOBAL_MAX_INFLIGHT` | Global max in-flight requests | `recommended_concurrency` |
-| `DS2API_MAX_INFLIGHT` | Alias (legacy compat) | — |
 | `DS2API_VERCEL_INTERNAL_SECRET` | Vercel hybrid streaming internal auth | Falls back to `DS2API_ADMIN_KEY` |
 | `DS2API_VERCEL_STREAM_LEASE_TTL_SECONDS` | Stream lease TTL seconds | `900` |
 | `DS2API_DEV_PACKET_CAPTURE` | Local dev packet capture switch (record recent request/response bodies) | Enabled by default on non-Vercel local runtime |
-| `DS2API_DEV_PACKET_CAPTURE_LIMIT` | Number of captured sessions to retain (auto-evict overflow) | `5` |
-| `DS2API_DEV_PACKET_CAPTURE_MAX_BODY_BYTES` | Max recorded bytes per captured response body | `2097152` |
+| `DS2API_DEV_PACKET_CAPTURE_LIMIT` | Number of captured sessions to retain (auto-evict overflow) | `20` |
+| `DS2API_DEV_PACKET_CAPTURE_MAX_BODY_BYTES` | Max recorded bytes per captured response body | `5242880` |
 | `VERCEL_TOKEN` | Vercel sync token | — |
 | `VERCEL_PROJECT_ID` | Vercel project ID | — |
 | `VERCEL_TEAM_ID` | Vercel team ID | — |
 | `DS2API_VERCEL_PROTECTION_BYPASS` | Vercel deployment protection bypass for internal Node→Go calls | — |

+> Note: when `DS2API_CONFIG_JSON` is detected, the Admin UI shows the risks of running in env-backed config mode together with the auto-persistence status (including `DS2API_CONFIG_PATH` and hints for transitioning to file mode).
+
 ## Authentication Modes

 For business endpoints (`/v1/*`, `/anthropic/*`, Gemini routes), DS2API supports two modes:
@@ -340,6 +370,7 @@ For business endpoints (`/v1/*`, `/anthropic/*`, Gemini routes), DS2API supports
 | **Direct token** | If the token is not in `config.keys`, DS2API treats it as a DeepSeek token directly |

 Optional header `X-Ds2-Target-Account`: Pin a specific managed account (value is email or mobile).
+Gemini routes also accept the `x-goog-api-key` header, or the `?key=` / `?api_key=` query parameters when no auth header is present.

 ## Concurrency Model

@@ -359,21 +390,22 @@ Queue limit = DS2API_ACCOUNT_MAX_QUEUE (default = recommended concurrency)
 When `tools` is present in the request, DS2API performs anti-leak handling:

 1. Toolcall feature matching is enabled only in **non-code-block context** (fenced examples are ignored)
-   - In non-code-block context, tool JSON may still be recognized even when mixed with normal prose; surrounding prose can remain as text output.
-2. `responses` streaming strictly uses official item lifecycle events (`response.output_item.*`, `response.content_part.*`, `response.function_call_arguments.*`)
-3. Tool names not declared in the `tools` schema are strictly rejected and will not be emitted as valid tool calls
+2. The parser currently targets XML/Markup-family tool syntax (`<tool_call>` / `<function_call>` / `<invoke>` / `tool_use` / antml variants); standalone JSON `tool_calls` payloads are not treated as executable calls by default
+3. `responses` streaming strictly uses official item lifecycle events (`response.output_item.*`, `response.content_part.*`, `response.function_call_arguments.*`)
 4. `responses` supports and enforces `tool_choice` (`auto`/`none`/`required`/forced function); `required` violations return `422` for non-stream and `response.failed` for stream
-5. Valid tool call events are only emitted after passing policy validation, preventing invalid tool names from entering the client execution chain
+5. The output protocol follows the client request (OpenAI / Claude / Gemini native shapes); model-side prompting can prefer XML, and the compatibility layer handles the protocol-specific translation
+
+> Note: the current parser still prioritizes “parse successfully whenever possible”; hard allow-list rejection for undeclared tool names is not enabled yet.

 ## Local Dev Packet Capture

-This is for debugging issues such as Responses reasoning streaming and tool-call handoff. When enabled, DS2API stores the latest N DeepSeek conversation payload pairs (request body + upstream response body), defaulting to 5 entries with auto-eviction.
+This is for debugging issues such as Responses reasoning streaming and tool-call handoff. When enabled, DS2API stores the latest N DeepSeek conversation payload pairs (request body + upstream response body), defaulting to 20 entries with auto-eviction; each response body is capped at 5 MB by default.

 Enable example:

 ```bash
 DS2API_DEV_PACKET_CAPTURE=true \
-DS2API_DEV_PACKET_CAPTURE_LIMIT=5 \
+DS2API_DEV_PACKET_CAPTURE_LIMIT=20 \
 go run ./cmd/ds2api
 ```

@@ -381,6 +413,8 @@ Inspect/clear (Admin JWT required):

 - `GET /admin/dev/captures`: list captured items (newest first)
 - `DELETE /admin/dev/captures`: clear captured items
+- `GET /admin/dev/raw-samples/query?q=keyword&limit=20`: search current in-memory captures by prompt keyword and group `completion + continue` by `chat_session_id`
+- `POST /admin/dev/raw-samples/save`: persist a selected capture chain as `tests/raw_stream_samples/<sample-id>/`

 Response fields include:

@@ -388,61 +422,10 @@ Response fields include:
 - `response_body`: concatenated raw upstream stream body text
 - `response_truncated`: whether body-size truncation happened

-## Project Structure
+The save endpoint can target a chain by `query`, `chain_key`, or `capture_id`. Example:

-```text
-ds2api/
-├── cmd/
-│   ├── ds2api/              # Local / container entrypoint
-│   └── ds2api-tests/        # End-to-end testsuite entrypoint
-├── api/
-│   ├── index.go             # Vercel Serverless Go entry
-│   ├── chat-stream.js       # Vercel Node.js stream relay
-│   └── (rewrite targets in vercel.json)
-├── internal/
-│   ├── account/             # Account pool and concurrency queue
-│   ├── adapter/
-│   │   ├── openai/          # OpenAI adapter (incl. tool call parsing, Vercel stream prepare/release)
-│   │   ├── claude/          # Claude adapter
-│   │   └── gemini/          # Gemini adapter (generateContent / streamGenerateContent)
-│   ├── admin/               # Admin API handlers (incl. Settings hot-reload)
-│   ├── auth/                # Auth and JWT
-│   ├── claudeconv/          # Claude message format conversion
-│   ├── compat/              # Compatibility helpers
-│   ├── config/              # Config loading and hot-reload
-│   ├── deepseek/            # DeepSeek API client, PoW WASM
-│   ├── js/                  # Node runtime stream/compat logic
-│   ├── devcapture/          # Dev packet capture module
-│   ├── format/              # Output formatting
-│   ├── prompt/              # Prompt construction
-│   ├── server/              # HTTP routing and middleware (chi router)
-│   ├── sse/                 # SSE parsing utilities
-│   ├── stream/              # Unified stream consumption engine
-│   ├── util/                # Common utilities
-│   └── webui/               # WebUI static file serving and auto-build
-├── webui/                   # React WebUI source (Vite + Tailwind)
-│   └── src/
-│       ├── app/             # Routing, auth, config state
-│       ├── features/        # Feature modules (account/settings/vercel/apiTester)
-│       ├── components/      # Shared UI pieces (login/landing, etc.)
-│       └── locales/         # Language packs (zh.json / en.json)
-├── scripts/
-│   └── build-webui.sh       # Manual WebUI build script
-├── tests/
-│   ├── compat/              # Compatibility fixtures and expected outputs
-│   └── scripts/             # Unified test script entrypoints (unit/e2e)
-├── static/admin/            # WebUI build output (not committed to Git)
-├── .github/
-│   ├── workflows/           # GitHub Actions (quality gates + release automation)
-│   ├── ISSUE_TEMPLATE/      # Issue templates
-│   └── PULL_REQUEST_TEMPLATE.md
-├── config.example.json      # Config file template
-├── .env.example             # Environment variable template
-├── Dockerfile               # Multi-stage build (WebUI + Go)
-├── docker-compose.yml       # Production Docker Compose
-├── docker-compose.dev.yml   # Development Docker Compose
-├── vercel.json              # Vercel routing and build config
-└── go.mod / go.sum          # Go module dependencies
+```json
+{"query":"Guangzhou weather","sample_id":"gz-weather-from-memory"}
 ```

 ## Documentation Index
@@ -450,9 +433,9 @@ ds2api/
 | Document | Description |
 | --- | --- |
 | [API.md](API.md) / [API.en.md](API.en.md) | API reference with request/response examples |
-| [DEPLOY.md](DEPLOY.md) / [DEPLOY.en.md](DEPLOY.en.md) | Deployment guide (local/Docker/Vercel/systemd) |
-| [CONTRIBUTING.md](CONTRIBUTING.md) / [CONTRIBUTING.en.md](CONTRIBUTING.en.md) | Contributing guide |
-| [TESTING.md](TESTING.md) | Testsuite guide |
+| [DEPLOY.md](docs/DEPLOY.md) / [DEPLOY.en.md](docs/DEPLOY.en.md) | Deployment guide (local/Docker/Vercel/systemd) |
+| [CONTRIBUTING.md](docs/CONTRIBUTING.md) / [CONTRIBUTING.en.md](docs/CONTRIBUTING.en.md) | Contributing guide |
+| [TESTING.md](docs/TESTING.md) | Testsuite guide |

 ## Testing

@@ -491,4 +474,7 @@ Workflow: `.github/workflows/release-artifacts.yml`

 ## Disclaimer

-This project is built through reverse engineering and is provided for learning and research only. Stability is not guaranteed. Do not use it in scenarios that violate terms of service or laws.
+This project is built through reverse engineering and is provided for learning, research, personal experimentation, and internal validation only. No commercial authorization is granted, and no warranty of stability, fitness, or results is provided.
+The author and repository maintainers are not responsible for any direct or indirect loss, account suspension, data loss, legal risk, or third-party claims arising from use, modification, distribution, deployment, or reliance on this project.
+
+Do not use this project in ways that violate service terms, agreements, laws, or platform rules. Before any commercial use, review the `LICENSE` and the relevant terms, and confirm that you have the author's written permission.
--- a/2
+++ b/2
@@ -1 +1 @@
-2.4.1
+3.5.2
--- a/app/handler.go
+++ b/app/handler.go
@@ -3,9 +3,17 @@ package app
 import (
 	"net/http"

+	"ds2api/internal/config"
 	"ds2api/internal/server"
 )

 func NewHandler() http.Handler {
-	return server.NewApp().Router
+	app, err := server.NewApp()
+	if err != nil {
+		config.Logger.Error("[app] init failed", "error", err)
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			server.WriteUnhandledError(w, err)
+		})
+	}
+	return app.Router
 }
--- a/cmd/ds2api-tests/main.go
+++ b/cmd/ds2api-tests/main.go
@@ -30,8 +30,8 @@ func main() {
 	opts.Timeout = time.Duration(timeoutSeconds) * time.Second

 	if err := testsuite.Run(context.Background(), opts); err != nil {
-		fmt.Fprintln(os.Stderr, err.Error())
+		_, _ = fmt.Fprintln(os.Stderr, err.Error())
 		os.Exit(1)
 	}
-	fmt.Fprintln(os.Stdout, "testsuite completed successfully")
+	_, _ = fmt.Fprintln(os.Stdout, "testsuite completed successfully")
 }
--- a/cmd/ds2api/main.go
+++ b/cmd/ds2api/main.go
@@ -18,9 +18,17 @@ import (
 )

 func main() {
+	if err := config.LoadDotEnv(); err != nil {
+		config.Logger.Warn("[dotenv] load failed", "error", err)
+	}
+	config.RefreshLogger()
 	webui.EnsureBuiltOnStartup()
 	_ = auth.AdminKey()
-	app := server.NewApp()
+	app, err := server.NewApp()
+	if err != nil {
+		config.Logger.Error("server initialization failed", "error", err)
+		os.Exit(1)
+	}
 	port := strings.TrimSpace(os.Getenv("PORT"))
 	if port == "" {
 		port = "5001"
--- a/config.example.json
+++ b/config.example.json
@@ -28,11 +28,8 @@
    "o3": "deepseek-reasoner"
  },
  "compat": {
-    "wide_input_strict_output": true
-  },
-  "toolcall": {
-    "mode": "feature_match",
-    "early_emit_confidence": "high"
+    "wide_input_strict_output": true,
+    "strip_reference_markers": true
  },
  "responses": {
    "store_ttl_seconds": 900
@@ -50,9 +47,10 @@
  "runtime": {
    "account_max_inflight": 2,
    "account_max_queue": 0,
-    "global_max_inflight": 0
+    "global_max_inflight": 0,
+    "token_refresh_interval_hours": 6
  },
  "auto_delete": {
-    "sessions": false
+    "mode": "none"
  }
 }
--- a/docker-compose.dev.yml
+++ b/docker-compose.dev.yml
@@ -16,7 +16,8 @@ services:
    container_name: ds2api-dev
    command: ["go", "run", "./cmd/ds2api"]
    ports:
-      - "${PORT:-5001}:${PORT:-5001}"
+      # Host port is configurable via DS2API_HOST_PORT; container port stays fixed at 5001.
+      - "${DS2API_HOST_PORT:-6011}:5001"
    env_file:
      - .env
    environment:
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,14 +1,16 @@
 services:
  ds2api:
    image: ghcr.io/cjackhwang/ds2api:latest
-    container_name: ds2api
-    restart: always
-    ports:
-      - "6011:5001"
-    volumes:
-      - ./config.json:/app/config.json    # 配置文件
-      - ./.env:/app/.env                  # 环境变量
-    environment:
-      - TZ=Asia/Shanghai
-      - LOG_LEVEL=INFO
-      - DS2API_ADMIN_KEY=${DS2API_ADMIN_KEY:-ds2api}
+    container_name: ds2api
+    restart: always
+    env_file:
+      - .env
+    ports:
+      # Host port is configurable via DS2API_HOST_PORT; container port stays fixed at 5001.
+      - "${DS2API_HOST_PORT:-6011}:5001"
+    volumes:
+      - ./config.json:/app/config.json    # 配置文件
+    environment:
+      - TZ=Asia/Shanghai
+      - LOG_LEVEL=INFO
+      - DS2API_ADMIN_KEY=${DS2API_ADMIN_KEY:-ds2api}
--- a/docs/ARCHITECTURE.en.md
+++ b/docs/ARCHITECTURE.en.md
@@ -0,0 +1,136 @@
+# DS2API Architecture & Project Layout
+
+Language: [中文](ARCHITECTURE.md) | [English](ARCHITECTURE.en.md)
+
+> This file is the single source of truth for the project's architecture: directory layout, module boundaries, and execution flow.
+
+## 1. Top-level Layout (expanded)
+
+> Note: the following is the **fully expanded** project directory list (excluding metadata/dependency dirs such as `.git/` and `webui/node_modules/`), with each folder annotated with its purpose.
+
+```text
+ds2api/
+├── .github/                              # GitHub collaboration and CI config
+│   ├── ISSUE_TEMPLATE/                   # Issue templates
+│   └── workflows/                        # GitHub Actions workflows
+├── api/                                  # Serverless entrypoints (Vercel Go/Node)
+├── app/                                  # Application-level handler assembly
+├── cmd/                                  # Executable entrypoints
+│   ├── ds2api/                           # Main service bootstrap
+│   └── ds2api-tests/                     # E2E testsuite CLI bootstrap
+├── docs/                                 # Project documentation
+├── internal/                             # Core implementation (non-public packages)
+│   ├── account/                          # Account pool, inflight slots, waiting queue
+│   ├── adapter/                          # Multi-protocol adapters
+│   │   ├── claude/                       # Claude protocol adapter
+│   │   ├── gemini/                       # Gemini protocol adapter
+│   │   └── openai/                       # OpenAI adapter and shared execution core
+│   ├── admin/                            # Admin API (config/accounts/ops)
+│   ├── auth/                             # Auth/JWT/credential resolution
+│   ├── claudeconv/                       # Claude message conversion helpers
+│   ├── compat/                           # Compatibility and regression helpers
+│   ├── config/                           # Config loading/validation/hot reload
+│   ├── deepseek/                         # DeepSeek upstream client capabilities
+│   │   └── transport/                    # DeepSeek transport details
+│   ├── devcapture/                       # Dev capture and troubleshooting
+│   ├── format/                           # Response formatting layer
+│   │   ├── claude/                       # Claude output formatting
+│   │   └── openai/                       # OpenAI output formatting
+│   ├── js/                               # Node runtime related logic
+│   │   ├── chat-stream/                  # Node streaming bridge
+│   │   ├── helpers/                      # JS helper modules
+│   │   │   └── stream-tool-sieve/        # JS implementation of tool sieve
+│   │   └── shared/                       # Shared semantics between Go/Node
+│   ├── prompt/                           # Prompt composition
+│   ├── rawsample/                        # Raw sample read/write and management
+│   ├── server/                           # Router and middleware assembly
+│   ├── sse/                              # SSE parsing utilities
+│   ├── stream/                           # Unified stream consumption engine
+│   ├── testsuite/                        # Testsuite execution framework
+│   ├── textclean/                        # Text cleanup
+│   ├── toolcall/                         # Tool-call parsing and repair
+│   ├── translatorcliproxy/               # Cross-protocol translation bridge
+│   ├── util/                             # Shared utility helpers
+│   ├── version/                          # Version query/compare
+│   └── webui/                            # WebUI static hosting logic
+├── plans/                                # Stage plans and manual QA records
+├── pow/                                  # PoW standalone implementation + benchmarks
+├── scripts/                              # Build/release helper scripts
+├── tests/                                # Test assets and scripts
+│   ├── compat/                           # Compatibility fixtures + expected outputs
+│   │   ├── expected/                     # Expected output samples
+│   │   └── fixtures/                     # Fixture inputs
+│   │       ├── sse_chunks/               # SSE chunk fixtures
+│   │       └── toolcalls/                # Tool-call fixtures
+│   ├── node/                             # Node unit tests
+│   ├── raw_stream_samples/               # Upstream raw SSE samples
+│   │   ├── content-filter-trigger-20260405-jwt3/          # Content-filter terminal sample
+│   │   ├── continue-thinking-snapshot-replay-20260405/    # Continue-thinking sample
+│   │   ├── guangzhou-weather-reasoner-search-20260404/    # Search/reference sample
+│   │   ├── markdown-format-example-20260405/              # Markdown sample
+│   │   └── markdown-format-example-20260405-spacefix/     # Space-fix sample
+│   ├── scripts/                          # Test entry scripts
+│   └── tools/                            # Testing helper tools
+└── webui/                                # React admin console source
+    ├── public/                           # Static assets
+    └── src/                              # Frontend source code
+        ├── app/                          # Routing/state scaffolding
+        ├── components/                   # Shared UI components
+        ├── features/                     # Feature modules
+        │   ├── account/                  # Account management page
+        │   ├── apiTester/                # API tester page
+        │   ├── settings/                 # Settings page
+        │   └── vercel/                   # Vercel sync page
+        ├── layout/                       # Layout components
+        ├── locales/                      # i18n strings
+        └── utils/                        # Frontend utilities
+```
+
+## 2. Primary Request Flow
+
+```mermaid
+flowchart LR
+    C[Client/SDK] --> R[internal/server/router.go]
+    R --> OA[OpenAI Adapter]
+    R --> CA[Claude Adapter]
+    R --> GA[Gemini Adapter]
+    R --> AD[Admin API]
+
+    CA --> BR[translatorcliproxy]
+    GA --> BR
+    BR --> CORE[internal/adapter/openai ChatCompletions]
+    OA --> CORE
+
+    CORE --> AUTH[internal/auth + config key/account resolver]
+    CORE --> POOL[internal/account queue + concurrency]
+    CORE --> TOOL[internal/toolcall parser + sieve]
+    CORE --> DS[internal/deepseek client]
+    DS --> U[DeepSeek upstream]
+```
+
+## 3. Responsibilities in `internal/`
+
+- `internal/server`: router tree + middlewares (health, protocol routes, Admin/WebUI).
+- `internal/adapter/openai`: shared execution core (chat/responses/embeddings + tool semantics).
+- `internal/adapter/{claude,gemini}`: protocol wrappers only (no duplicated upstream execution).
+- `internal/translatorcliproxy`: structure translation between Claude/Gemini and OpenAI.
+- `internal/deepseek`: upstream request/session/PoW/SSE handling.
+- `internal/stream` + `internal/sse`: stream parsing and incremental assembly.
+- `internal/toolcall`: XML/Markup-family tool-call parsing + anti-leak sieve (`<tool_call>` / `<function_call>` / `<invoke>` / `tool_use` / antml variants).
+- `internal/admin`: config/accounts/vercel sync/version/dev-capture endpoints.
+- `internal/config`: config loading/validation + runtime settings hot-reload.
+- `internal/account`: managed account pool, inflight slots, waiting queue.
+
+## 4. WebUI Runtime Relation
+
+- `webui/` stores frontend source (Vite + React).
+- Runtime serves static output from `static/admin`.
+- On first local startup, if `static/admin` is missing, DS2API may auto-build it (Node.js required).
+
+## 5. Documentation Split Strategy
+
+- Onboarding & quick start: `README.MD` / `README.en.md`
+- Architecture & layout: `docs/ARCHITECTURE*.md` (this file)
+- API contracts: `API.md` / `API.en.md`
+- Deployment/testing/contributing: `docs/DEPLOY*`, `docs/TESTING.md`, `docs/CONTRIBUTING*`
+- Deep topics: `docs/toolcall-semantics.md`, `docs/DeepSeekSSE行为结构说明-2026-04-05.md`
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -0,0 +1,136 @@
+# DS2API 架构与项目结构说明
+
+语言 / Language: [中文](ARCHITECTURE.md) | [English](ARCHITECTURE.en.md)
+
+> 本文档用于集中维护“代码目录结构 + 模块边界 + 主链路调用关系”。
+
+## 1. 顶层目录结构（展开）
+
+> 说明：以下为仓库内业务相关目录的**完整展开**（排除 `.git/` 与 `webui/node_modules/` 这类依赖/元数据目录），并标注每个文件夹作用。
+
+```text
+ds2api/
+├── .github/                              # GitHub 协作与 CI 配置
+│   ├── ISSUE_TEMPLATE/                   # Issue 模板
+│   └── workflows/                        # GitHub Actions 工作流
+├── api/                                  # Serverless 入口（Vercel Go/Node）
+├── app/                                  # 应用级 handler 装配层
+├── cmd/                                  # 可执行程序入口
+│   ├── ds2api/                           # 主服务启动入口
+│   └── ds2api-tests/                     # E2E 测试集 CLI 入口
+├── docs/                                 # 项目文档目录
+├── internal/                             # 核心业务实现（不对外暴露）
+│   ├── account/                          # 账号池、并发槽位、等待队列
+│   ├── adapter/                          # 多协议适配层
+│   │   ├── claude/                       # Claude 协议适配
+│   │   ├── gemini/                       # Gemini 协议适配
+│   │   └── openai/                       # OpenAI 协议与统一执行核心
+│   ├── admin/                            # Admin API（配置/账号/运维）
+│   ├── auth/                             # 鉴权/JWT/凭证解析
+│   ├── claudeconv/                       # Claude 消息格式转换工具
+│   ├── compat/                           # 兼容性辅助与回归支持
+│   ├── config/                           # 配置加载、校验、热更新
+│   ├── deepseek/                         # DeepSeek 上游客户端能力
+│   │   └── transport/                    # DeepSeek 传输层细节
+│   ├── devcapture/                       # 开发抓包与调试采集
+│   ├── format/                           # 响应格式化层
+│   │   ├── claude/                       # Claude 输出格式化
+│   │   └── openai/                       # OpenAI 输出格式化
+│   ├── js/                               # Node Runtime 相关逻辑
+│   │   ├── chat-stream/                  # Node 流式输出桥接
+│   │   ├── helpers/                      # JS 辅助函数
+│   │   │   └── stream-tool-sieve/        # Tool sieve JS 实现
+│   │   └── shared/                       # Go/Node 共用语义片段
+│   ├── prompt/                           # Prompt 组装
+│   ├── rawsample/                        # raw sample 读写与管理
+│   ├── server/                           # 路由与中间件装配
+│   ├── sse/                              # SSE 解析工具
+│   ├── stream/                           # 统一流式消费引擎
+│   ├── testsuite/                        # 测试集执行框架
+│   ├── textclean/                        # 文本清洗
+│   ├── toolcall/                         # 工具调用解析与修复
+│   ├── translatorcliproxy/               # 多协议互转桥
+│   ├── util/                             # 通用工具函数
+│   ├── version/                          # 版本查询/比较
+│   └── webui/                            # WebUI 静态托管相关逻辑
+├── plans/                                # 阶段计划与人工验收记录
+├── pow/                                  # PoW 独立实现与基准
+├── scripts/                              # 构建/发布/辅助脚本
+├── tests/                                # 测试资源与脚本
+│   ├── compat/                           # 兼容性夹具与期望输出
+│   │   ├── expected/                     # 预期结果样本
+│   │   └── fixtures/                     # 测试输入夹具
+│   │       ├── sse_chunks/               # SSE chunk 夹具
+│   │       └── toolcalls/                # toolcall 夹具
+│   ├── node/                             # Node 单元测试
+│   ├── raw_stream_samples/               # 上游原始 SSE 样本
+│   │   ├── content-filter-trigger-20260405-jwt3/          # 风控终态样本
+│   │   ├── continue-thinking-snapshot-replay-20260405/    # continue 样本
+│   │   ├── guangzhou-weather-reasoner-search-20260404/    # 搜索+引用样本
+│   │   ├── markdown-format-example-20260405/              # Markdown 样本
+│   │   └── markdown-format-example-20260405-spacefix/     # 空格修复样本
+│   ├── scripts/                          # 测试脚本入口
+│   └── tools/                            # 测试辅助工具
+└── webui/                                # React 管理台源码
+    ├── public/                           # 静态资源
+    └── src/                              # 前端源码
+        ├── app/                          # 路由/状态框架
+        ├── components/                   # 共享组件
+        ├── features/                     # 功能模块
+        │   ├── account/                  # 账号管理页面
+        │   ├── apiTester/                # API 测试页面
+        │   ├── settings/                 # 设置页面
+        │   └── vercel/                   # Vercel 同步页面
+        ├── layout/                       # 布局组件
+        ├── locales/                      # 国际化文案
+        └── utils/                        # 前端工具函数
+```
+
+## 2. 请求主链路
+
+```mermaid
+flowchart LR
+    C[Client/SDK] --> R[internal/server/router.go]
+    R --> OA[OpenAI Adapter]
+    R --> CA[Claude Adapter]
+    R --> GA[Gemini Adapter]
+    R --> AD[Admin API]
+
+    CA --> BR[translatorcliproxy]
+    GA --> BR
+    BR --> CORE[internal/adapter/openai ChatCompletions]
+    OA --> CORE
+
+    CORE --> AUTH[internal/auth + config key/account resolver]
+    CORE --> POOL[internal/account queue + concurrency]
+    CORE --> TOOL[internal/toolcall parser + sieve]
+    CORE --> DS[internal/deepseek client]
+    DS --> U[DeepSeek upstream]
+```
+
+## 3. internal/ 子模块职责
+
+- `internal/server`：路由树和中间件挂载（健康检查、协议入口、Admin/WebUI）。
+- `internal/adapter/openai`：统一执行内核（chat/responses/embeddings 与 tool calling 语义）。
+- `internal/adapter/{claude,gemini}`：协议输入输出适配，不重复实现上游调用逻辑。
+- `internal/translatorcliproxy`：Claude/Gemini 与 OpenAI 结构互转。
+- `internal/deepseek`：上游请求、会话、PoW、SSE 消费。
+- `internal/stream` + `internal/sse`：流式解析与增量处理。
+- `internal/toolcall`：以 XML/Markup 家族为核心的工具调用解析与防泄漏筛分（`<tool_call>` / `<function_call>` / `<invoke>` / `tool_use` / antml 变体）。
+- `internal/admin`：配置管理、账号管理、Vercel 同步、版本检查、开发抓包。
+- `internal/config`：配置加载、校验、运行时 settings 热更新。
+- `internal/account`：托管账号池、并发槽位、等待队列。
+
+## 4. WebUI 与运行时关系
+
+- `webui/` 是前端源码（Vite + React）。
+- 运行时托管目录是 `static/admin`（构建产物）。
+- 本地首次启动若 `static/admin` 缺失，会尝试自动构建（依赖 Node.js）。
+
+## 5. 文档拆分策略
+
+- 总览与快速开始：`README.MD` / `README.en.md`
+- 架构与目录：`docs/ARCHITECTURE*.md`（本文件）
+- 接口协议：`API.md` / `API.en.md`
+- 部署、测试、贡献：`docs/DEPLOY*`、`docs/TESTING.md`、`docs/CONTRIBUTING*`
+- 专题：`docs/toolcall-semantics.md`、`docs/DeepSeekSSE行为结构说明-2026-04-05.md`
--- a/docs/CONTRIBUTING.en.md
+++ b/docs/CONTRIBUTING.en.md
@@ -0,0 +1,110 @@
+# Contributing Guide
+
+Language: [中文](CONTRIBUTING.md) | [English](CONTRIBUTING.en.md)
+
+Thanks for your interest in contributing to DS2API!
+
+## Development Setup
+
+### Prerequisites
+
+- Go 1.26+
+- Node.js `20.19+` or `22.12+` (for WebUI development)
+- npm (bundled with Node.js)
+
+### Backend Development
+
+```bash
+# 1. Clone
+git clone https://github.com/CJackHwang/ds2api.git
+cd ds2api
+
+# 2. Configure
+cp config.example.json config.json
+# Edit config.json with test accounts
+
+# 3. Run backend
+go run ./cmd/ds2api
+# Local access: http://127.0.0.1:5001
+# Actual bind: 0.0.0.0:5001, so LAN access is available via your private IP
+```
+
+### Frontend Development (WebUI)
+
+```bash
+# 1. Navigate to WebUI directory
+cd webui
+
+# 2. Install dependencies
+npm install
+
+# 3. Start dev server (hot reload)
+npm run dev
+# Default: http://localhost:5173, auto-proxies API to backend
+# host: 0.0.0.0 is not configured, so LAN access is not enabled by default
+```
+
+WebUI tech stack:
+- React + Vite
+- Tailwind CSS
+- Bilingual language packs: `webui/src/locales/zh.json` / `en.json`
+
+### Docker Development
+
+```bash
+docker-compose -f docker-compose.dev.yml up
+```
+
+## Code Standards
+
+| Language | Standards |
+| --- | --- |
+| **Go** | Run `./scripts/lint.sh` (gofmt + golangci-lint) and ensure `go test ./...` passes before committing |
+| **JavaScript/React** | Follow existing project style (functional components) |
+| **Commit messages** | Use semantic prefixes: `feat:`, `fix:`, `docs:`, `refactor:`, `style:`, `perf:`, `chore:` |
+
+## Submitting a PR
+
+1. Fork the repo
+2. Create a branch (e.g. `feature/xxx` or `fix/xxx`)
+3. Commit changes
+4. Push your branch
+5. Open a Pull Request
+
+> 💡 If you modify files under `webui/`, no manual build is needed — CI handles it automatically.
+> If you want to verify the generated `static/admin/` assets locally, you can still run `./scripts/build-webui.sh`.
+
+## Build WebUI
+
+Manually build WebUI to `static/admin/`:
+
+```bash
+./scripts/build-webui.sh
+```
+
+## Running Tests
+
+```bash
+# Go + Node unit tests (recommended)
+./tests/scripts/run-unit-all.sh
+
+# End-to-end live tests (real accounts)
+./tests/scripts/run-live.sh
+```
+
+## Project Structure
+
+To avoid documentation drift, directory layout and module responsibilities were moved to:
+
+- [docs/ARCHITECTURE.en.md](./ARCHITECTURE.en.md)
+- [docs/README.md](./README.md)
+
+Before contributing, review the architecture doc sections for request flow and `internal/` module boundaries.
+
+## Reporting Issues
+
+Please use [GitHub Issues](https://github.com/CJackHwang/ds2api/issues) and include:
+
+- Steps to reproduce
+- Relevant log output
+- Environment info (OS, Go version, deployment method)
--- a/docs/CONTRIBUTING.md
+++ b/docs/CONTRIBUTING.md
@@ -0,0 +1,110 @@
+# 贡献指南
+
+语言 / Language: [中文](CONTRIBUTING.md) | [English](CONTRIBUTING.en.md)
+
+感谢你对 DS2API 的关注与贡献！
+
+## 开发环境设置
+
+### 前置要求
+
+- Go 1.26+
+- Node.js `20.19+` 或 `22.12+`（WebUI 开发时）
+- npm（随 Node.js 提供）
+
+### 后端开发
+
+```bash
+# 1. 克隆仓库
+git clone https://github.com/CJackHwang/ds2api.git
+cd ds2api
+
+# 2. 配置
+cp config.example.json config.json
+# 编辑 config.json，填入测试账号
+
+# 3. 启动后端
+go run ./cmd/ds2api
+# 本地访问 http://127.0.0.1:5001
+# 实际绑定 0.0.0.0:5001，可通过局域网 IP 访问
+```
+
+### 前端开发（WebUI）
+
+```bash
+# 1. 进入 WebUI 目录
+cd webui
+
+# 2. 安装依赖
+npm install
+
+# 3. 启动开发服务器（热更新）
+npm run dev
+# 默认监听 http://localhost:5173，自动代理 API 到后端
+# 当前未配置 host: 0.0.0.0，因此默认不对局域网开放
+```
+
+WebUI 技术栈：
+- React + Vite
+- Tailwind CSS
+- 中英文语言包：`webui/src/locales/zh.json` / `en.json`
+
+### Docker 开发环境
+
+```bash
+docker-compose -f docker-compose.dev.yml up
+```
+
+## 代码规范
+
+| 语言 | 规范 |
+| --- | --- |
+| **Go** | 提交前运行 `./scripts/lint.sh`（包含 gofmt + golangci-lint）并确保 `go test ./...` 通过 |
+| **JavaScript/React** | 保持现有代码风格（函数组件） |
+| **提交信息** | 使用语义化前缀：`feat:`、`fix:`、`docs:`、`refactor:`、`style:`、`perf:`、`chore:` |
+
+## 提交 PR
+
+1. Fork 仓库
+2. 创建分支（如 `feature/xxx` 或 `fix/xxx`）
+3. 提交更改
+4. 推送分支
+5. 发起 Pull Request
+
+> 💡 如果修改了 `webui/` 目录下的文件，无需手动构建——CI 会自动处理。
+> 但如果你本地想验证 `static/admin/` 产物，还是可以手动运行 `./scripts/build-webui.sh`。
+
+## WebUI 构建
+
+手动构建 WebUI 到 `static/admin/`：
+
+```bash
+./scripts/build-webui.sh
+```
+
+## 运行测试
+
+```bash
+# Go + Node 单元测试（推荐）
+./tests/scripts/run-unit-all.sh
+
+# 端到端全链路测试（真实账号）
+./tests/scripts/run-live.sh
+```
+
+## 项目结构
+
+为避免与其他文档重复维护，目录结构与模块职责已迁移到：
+
+- [docs/ARCHITECTURE.md](./ARCHITECTURE.md)
+- [docs/README.md](./README.md)
+
+贡献前建议先阅读架构文档中的“请求主链路”和 `internal/` 模块职责，再定位改动范围。
+
+## 问题反馈
+
+请使用 [GitHub Issues](https://github.com/CJackHwang/ds2api/issues) 并附上：
+
+- 复现步骤
+- 相关日志输出
+- 运行环境信息（OS、Go 版本、部署方式）
--- a/docs/DEPLOY.en.md
+++ b/docs/DEPLOY.en.md
@@ -4,15 +4,18 @@ Language: [中文](DEPLOY.md) | [English](DEPLOY.en.md)

 This guide covers all deployment methods for the current Go-based codebase.

+Doc map: [Index](./README.md) | [Architecture](./ARCHITECTURE.en.md) | [API](../API.en.md) | [Testing](./TESTING.md)
+
 ---

 ## Table of Contents

+- [Recommended deployment priority](#recommended-deployment-priority)
 - [Prerequisites](#0-prerequisites)
- [1. Local Run](#1-local-run)
- [2. Docker Deployment](#2-docker-deployment)
+- [1. Download Release Binaries](#1-download-release-binaries)
+- [2. Docker / GHCR Deployment](#2-docker--ghcr-deployment)
 - [3. Vercel Deployment](#3-vercel-deployment)
- [4. Download Release Binaries](#4-download-release-binaries)
+- [4. Local Run from Source](#4-local-run-from-source)
 - [5. Reverse Proxy (Nginx)](#5-reverse-proxy-nginx)
 - [6. Linux systemd Service](#6-linux-systemd-service)
 - [7. Post-Deploy Checks](#7-post-deploy-checks)
@@ -20,12 +23,23 @@ This guide covers all deployment methods for the current Go-based codebase.

 ---

+## Recommended deployment priority
+
+Recommended order when choosing a deployment method:
+
+1. **Download and run release binaries**: the easiest path for most users because the artifacts are already built.
+2. **Docker / GHCR image deployment**: suitable for containerized, orchestrated, or cloud environments.
+3. **Vercel deployment**: suitable if you already use Vercel and accept its platform constraints.
+4. **Run from source / build locally**: suitable for development, debugging, or when you need to modify the code yourself.
+
+---
+
 ## 0. Prerequisites

 | Dependency | Minimum Version | Notes |
 | --- | --- | --- |
-| Go | 1.24+ | Build backend |
-| Node.js | 20+ | Only needed to build WebUI locally |
+| Go | 1.26+ | Build backend |
+| Node.js | `20.19+` or `22.12+` | Only needed to build WebUI locally |
 | npm | Bundled with Node.js | Install WebUI dependencies |

 Config source (choose one):
@@ -46,75 +60,67 @@ Use `config.json` as the single source of truth:

 ---

-## 1. Local Run
+## 1. Download Release Binaries

-### 1.1 Basic Steps
+Built-in GitHub Actions workflow: `.github/workflows/release-artifacts.yml`
+
+- **Trigger**: only on Release `published` (no build on normal push)
+- **Outputs**: multi-platform binary archives + `sha256sums.txt`
+- **Container publishing**: GHCR only (`ghcr.io/cjackhwang/ds2api`)
+
+| Platform | Architecture | Format |
+| --- | --- | --- |
+| Linux | amd64, arm64 | `.tar.gz` |
+| macOS | amd64, arm64 | `.tar.gz` |
+| Windows | amd64 | `.zip` |
+
+Each archive includes:
+
+- `ds2api` executable (`ds2api.exe` on Windows)
+- `static/admin/` (built WebUI assets)
+- `config.example.json`, `.env.example`
+- `README.MD`, `README.en.md`, `LICENSE`
+
+### Usage

 ```bash
-# Clone
-git clone https://github.com/CJackHwang/ds2api.git
-cd ds2api
+# 1. Download the archive for your platform
+# 2. Extract
+tar -xzf ds2api_<tag>_linux_amd64.tar.gz
+cd ds2api_<tag>_linux_amd64

-# Copy and edit config
+# 3. Configure
 cp config.example.json config.json
-# Open config.json and fill in:
-#   - keys: your API access keys
-#   - accounts: DeepSeek accounts (email or mobile + password)
+# Edit config.json

-# Start
-go run ./cmd/ds2api
-```
-
-Default address: `http://0.0.0.0:5001` (override with `PORT`).
-
-### 1.2 WebUI Build
-
-On first local startup, if `static/admin/` is missing, DS2API will automatically attempt to build the WebUI (requires Node.js/npm).
-
-Manual build:
-
-```bash
-./scripts/build-webui.sh
-```
-
-Or step by step:
-
-```bash
-cd webui
-npm install
-npm run build
-# Output goes to static/admin/
-```
-
-Control auto-build via environment variable:
-
-```bash
-# Disable auto-build
-DS2API_AUTO_BUILD_WEBUI=false go run ./cmd/ds2api
-
-# Force enable auto-build
-DS2API_AUTO_BUILD_WEBUI=true go run ./cmd/ds2api
-```
-
-### 1.3 Compile to Binary
-
-```bash
-go build -o ds2api ./cmd/ds2api
+# 4. Start
 ./ds2api
 ```

+### Maintainer Release Flow
+
+1. Create and publish a GitHub Release (with tag, for example `vX.Y.Z`)
+2. Wait for the `Release Artifacts` workflow to complete
+3. Download the matching archive from Release Assets
+
 ---

-## 2. Docker Deployment
+## 2. Docker / GHCR Deployment

 ### 2.1 Basic Steps

 ```bash
-# Copy env template
+# Pull prebuilt image
+docker pull ghcr.io/cjackhwang/ds2api:latest
+
+# Copy env template and config file
 cp .env.example .env
+cp config.example.json config.json

 # Edit .env and set at least:
 #   DS2API_ADMIN_KEY=your-admin-key
+# Optionally set the host port:
+#   DS2API_HOST_PORT=6011

 # Start
 docker-compose up -d
@@ -123,6 +129,14 @@ docker-compose up -d
 docker-compose logs -f
 ```

+The default `docker-compose.yml` directly uses `ghcr.io/cjackhwang/ds2api:latest` and maps host port `6011` to container port `5001`. If you want `5001` exposed directly, set `DS2API_HOST_PORT=5001` (or adjust the `ports` mapping).
+
+If you want a pinned version instead of `latest`, you can also pull a specific tag directly:
+
+```bash
+docker pull ghcr.io/cjackhwang/ds2api:v3.0.0
+```
+
 ### 2.2 Update

 ```bash
@@ -134,7 +148,7 @@ docker-compose up -d --build
 The `Dockerfile` now provides two image paths:

 1. **Default local/dev path (`runtime-from-source`)**: a three-stage build (WebUI build + Go build + runtime).
-2. **Release path (`runtime-from-dist`)**: CI first creates `dist/ds2api_<tag>_linux_<arch>.tar.gz`, then Docker directly reuses the binary and `static/admin` assets from those release archives, without running `npm build`/`go build` again.
+2. **Release path (`runtime-from-dist`)**: the release workflow first creates tag-named release archives, then copies the Linux bundles to `dist/docker-input/linux_amd64.tar.gz` / `linux_arm64.tar.gz`; Docker consumes those prepared inputs directly, without rerunning `npm run build`/`go build`.

 The release path keeps Docker images aligned with release archives and reduces duplicate build work.

@@ -194,10 +208,10 @@ Notes:
 2. **Import** the project on Vercel
 3. **Set environment variables** (minimum required: one variable):

-   | Variable | Description |
-   | --- | --- |
-   | `DS2API_ADMIN_KEY` | Admin key (required) |
-   | `DS2API_CONFIG_JSON` | Config content, raw JSON or Base64 (optional, recommended) |
+| Variable | Description |
+| --- | --- |
+| `DS2API_ADMIN_KEY` | Admin key (required) |
+| `DS2API_CONFIG_JSON` | Config content, raw JSON or Base64 (optional, recommended) |

 4. **Deploy**

@@ -240,11 +254,9 @@ VERCEL_TEAM_ID=team_xxxxxxxxxxxx   # optional for personal accounts
 | Variable | Description | Default |
 | --- | --- | --- |
 | `DS2API_ACCOUNT_MAX_INFLIGHT` | Per-account inflight limit | `2` |
-| `DS2API_ACCOUNT_CONCURRENCY` | Alias (legacy compat) | — |
 | `DS2API_ACCOUNT_MAX_QUEUE` | Waiting queue limit | `recommended_concurrency` |
-| `DS2API_ACCOUNT_QUEUE_SIZE` | Alias (legacy compat) | — |
 | `DS2API_GLOBAL_MAX_INFLIGHT` | Global inflight limit | `recommended_concurrency` |
-| `DS2API_MAX_INFLIGHT` | Alias (legacy compat) | — |
+| `DS2API_ENV_WRITEBACK` | When `DS2API_CONFIG_JSON` is present, auto-write it to `DS2API_CONFIG_PATH` and switch to file-backed mode after a successful write (enable with `1`/`true`/`yes`/`on`) | Disabled |
 | `DS2API_VERCEL_INTERNAL_SECRET` | Hybrid streaming internal auth | Falls back to `DS2API_ADMIN_KEY` |
 | `DS2API_VERCEL_STREAM_LEASE_TTL_SECONDS` | Stream lease TTL | `900` |
 | `VERCEL_TOKEN` | Vercel sync token | — |
@@ -309,7 +321,7 @@ Error: Command failed: go build -ldflags -s -w -o .../bootstrap ...
 1. Open Vercel Project Settings → Build and Development Settings
 2. **Clear** custom Go Build Flags / Build Command (recommended)
 3. If ldflags must be used, set `-ldflags="-s -w"` (ensure it's one argument)
-4. Verify `go.mod` uses a supported version (currently `go 1.24`)
+4. Verify `go.mod` uses a supported version (currently `go 1.26.0`)
 5. Redeploy (recommended: clear cache)

 #### Internal Package Import Error
@@ -345,58 +357,61 @@ If API responses return Vercel HTML `Authentication Required`:

 ---

-## 4. Download Release Binaries
+## 4. Local Run from Source

-Built-in GitHub Actions workflow: `.github/workflows/release-artifacts.yml`
-
- **Trigger**: only on Release `published` (no build on normal push)
- **Outputs**: multi-platform binary archives + `sha256sums.txt`
- **Container publishing**: GHCR only (`ghcr.io/cjackhwang/ds2api`)
-
-| Platform | Architecture | Format |
-| --- | --- | --- |
-| Linux | amd64, arm64 | `.tar.gz` |
-| macOS | amd64, arm64 | `.tar.gz` |
-| Windows | amd64 | `.zip` |
-
-Each archive includes:
-
- `ds2api` executable (`ds2api.exe` on Windows)
- `static/admin/` (built WebUI assets)
- `sha3_wasm_bg.7b9ca65ddd.wasm` (optional; binary has embedded fallback)
- `config.example.json`, `.env.example`
- `README.MD`, `README.en.md`, `LICENSE`
-
-### Usage
+### 4.1 Basic Steps

 ```bash
-# 1. Download the archive for your platform
-# 2. Extract
-tar -xzf ds2api_<tag>_linux_amd64.tar.gz
-cd ds2api_<tag>_linux_amd64
+# Clone
+git clone https://github.com/CJackHwang/ds2api.git
+cd ds2api

-# 3. Configure
+# Copy and edit config
 cp config.example.json config.json
-# Edit config.json
+# Open config.json and fill in:
+#   - keys: your API access keys
+#   - accounts: DeepSeek accounts (email or mobile + password)

-# 4. Start
-./ds2api
+# Start
+go run ./cmd/ds2api
 ```

-### Maintainer Release Flow
+Default local access URL: `http://127.0.0.1:5001`; the server actually binds to `0.0.0.0:5001` (override with `PORT`).

-1. Create and publish a GitHub Release (with tag, for example `vX.Y.Z`)
-2. Wait for the `Release Artifacts` workflow to complete
-3. Download the matching archive from Release Assets
+### 4.2 WebUI Build

-### Pull from GHCR (Optional)
+On first local startup, if `static/admin/` is missing, DS2API will automatically attempt to build the WebUI (requires Node.js/npm; when dependencies are missing it runs `npm ci` first, then `npm run build -- --outDir static/admin --emptyOutDir`).
+
+Manual build:

 ```bash
-# latest
-docker pull ghcr.io/cjackhwang/ds2api:latest
+./scripts/build-webui.sh
+```

-# specific version (example)
-docker pull ghcr.io/cjackhwang/ds2api:v2.1.2
+Or step by step:
+
+```bash
+cd webui
+npm install
+npm run build
+# Output goes to static/admin/
+```
+
+Control auto-build via environment variable:
+
+```bash
+# Disable auto-build
+DS2API_AUTO_BUILD_WEBUI=false go run ./cmd/ds2api
+
+# Force enable auto-build
+DS2API_AUTO_BUILD_WEBUI=true go run ./cmd/ds2api
+```
+
+### 4.3 Compile to Binary
+
+```bash
+go build -o ds2api ./cmd/ds2api
+./ds2api
 ```

 ---
@@ -453,8 +468,6 @@ server {
 # Copy compiled binary and related files to target directory
 sudo mkdir -p /opt/ds2api
 sudo cp ds2api config.json /opt/ds2api/
-# Optional: if you want to use an external WASM file (override embedded one)
-# sudo cp sha3_wasm_bg.7b9ca65ddd.wasm /opt/ds2api/
 sudo cp -r static/admin /opt/ds2api/static/admin
 ```

--- a/docs/DEPLOY.md
+++ b/docs/DEPLOY.md
@@ -4,15 +4,18 @@

 本指南基于当前 Go 代码库，详细说明各种部署方式。

+本页导航：[文档总索引](./README.md)｜[架构说明](./ARCHITECTURE.md)｜[接口文档](../API.md)｜[测试指南](./TESTING.md)
+
 ---

 ## 目录

+- [部署方式优先级建议](#部署方式优先级建议)
 - [前置要求](#0-前置要求)
- [一、本地运行](#一本地运行)
- [二、Docker 部署](#二docker-部署)
+- [一、下载 Release 构建包](#一下载-release-构建包)
+- [二、Docker / GHCR 部署](#二docker--ghcr-部署)
 - [三、Vercel 部署](#三vercel-部署)
- [四、下载 Release 构建包](#四下载-release-构建包)
+- [四、本地源码运行](#四本地源码运行)
 - [五、反向代理（Nginx）](#五反向代理nginx)
 - [六、Linux systemd 服务化](#六linux-systemd-服务化)
 - [七、部署后检查](#七部署后检查)
@@ -20,18 +23,29 @@

 ---

+## 部署方式优先级建议
+
+推荐按以下顺序选择部署方式：
+
+1. **下载 Release 构建包运行**：最省事，产物已编译完成，最适合大多数用户。
+2. **Docker / GHCR 镜像部署**：适合需要容器化、编排或云环境部署。
+3. **Vercel 部署**：适合已有 Vercel 环境且接受其平台约束的场景。
+4. **本地源码运行 / 自行编译**：适合开发、调试或需要自行修改代码的场景。
+
+---
+
 ## 0. 前置要求

 | 依赖 | 最低版本 | 说明 |
 | --- | --- | --- |
-| Go | 1.24+ | 编译后端 |
-| Node.js | 20+ | 仅在需要本地构建 WebUI 时 |
+| Go | 1.26+ | 编译后端 |
+| Node.js | `20.19+` 或 `22.12+` | 仅在需要本地构建 WebUI 时 |
 | npm | 随 Node.js 提供 | 安装 WebUI 依赖 |

 配置来源（任选其一）：

 - **文件方式**：`config.json`（推荐本地/Docker 使用）
- **环境变量方式**：`DS2API_CONFIG_JSON`（推荐 Vercel 使用，支持 JSON 字符串或 Base64 编码）
+- **环境变量方式**：`DS2API_CONFIG_JSON`（推荐 Vercel 使用，可直接填写原始 JSON 字符串，也支持 Base64 编码）

 统一建议（最优实践）：

@@ -46,75 +60,67 @@ cp config.example.json config.json

 ---

-## 一、本地运行
+## 一、下载 Release 构建包

-### 1.1 基本步骤
+仓库内置 GitHub Actions 工作流：`.github/workflows/release-artifacts.yml`
+
+- **触发条件**：仅在 Release `published` 时触发（普通 push 不会构建）
+- **构建产物**：多平台二进制压缩包 + `sha256sums.txt`
+- **容器镜像发布**：仅发布到 GHCR（`ghcr.io/cjackhwang/ds2api`）
+
+| 平台 | 架构 | 文件格式 |
+| --- | --- | --- |
+| Linux | amd64, arm64 | `.tar.gz` |
+| macOS | amd64, arm64 | `.tar.gz` |
+| Windows | amd64 | `.zip` |
+
+每个压缩包包含：
+
+- `ds2api` 可执行文件（Windows 为 `ds2api.exe`）
+- `static/admin/`（WebUI 构建产物）
+- `config.example.json`、`.env.example`
+- `README.MD`、`README.en.md`、`LICENSE`
+
+### 使用步骤

 ```bash
-# 克隆仓库
-git clone https://github.com/CJackHwang/ds2api.git
-cd ds2api
+# 1. 下载对应平台的压缩包
+# 2. 解压
+tar -xzf ds2api_<tag>_linux_amd64.tar.gz
+cd ds2api_<tag>_linux_amd64

-# 复制并编辑配置
+# 3. 配置
 cp config.example.json config.json
-# 使用你喜欢的编辑器打开 config.json，填入：
-#   - keys: 你的 API 访问密钥
-#   - accounts: DeepSeek 账号（email 或 mobile + password）
+# 编辑 config.json

-# 启动服务
-go run ./cmd/ds2api
-```
-
-默认监听 `http://0.0.0.0:5001`，可通过 `PORT` 环境变量覆盖。
-
-### 1.2 WebUI 构建
-
-本地首次启动时，若 `static/admin/` 不存在，服务会自动尝试构建 WebUI（需要 Node.js/npm）。
-
-你也可以手动构建：
-
-```bash
-./scripts/build-webui.sh
-```
-
-或手动执行：
-
-```bash
-cd webui
-npm install
-npm run build
-# 产物输出到 static/admin/
-```
-
-通过环境变量控制自动构建行为：
-
-```bash
-# 强制关闭自动构建
-DS2API_AUTO_BUILD_WEBUI=false go run ./cmd/ds2api
-
-# 强制开启自动构建
-DS2API_AUTO_BUILD_WEBUI=true go run ./cmd/ds2api
-```
-
-### 1.3 编译为二进制文件
-
-```bash
-go build -o ds2api ./cmd/ds2api
+# 4. 启动
 ./ds2api
 ```

+### 维护者发布步骤
+
+1. 在 GitHub 创建并发布 Release（带 tag，如 `vX.Y.Z`）
+2. 等待 Actions 工作流 `Release Artifacts` 完成
+3. 在 Release 的 Assets 下载对应平台压缩包
+
 ---

-## 二、Docker 部署
+## 二、Docker / GHCR 部署

 ### 2.1 基本步骤

 ```bash
-# 复制环境变量模板
+# 拉取预编译镜像
+docker pull ghcr.io/cjackhwang/ds2api:latest
+
+# 复制环境变量模板和配置文件
 cp .env.example .env
+cp config.example.json config.json

 # 编辑 .env（请改成你的强密码），至少设置：
 #   DS2API_ADMIN_KEY=your-admin-key
+# 如需修改宿主机端口，可额外设置：
+#   DS2API_HOST_PORT=6011

 # 启动
 docker-compose up -d
@@ -123,6 +129,14 @@ docker-compose up -d
 docker-compose logs -f
 ```

+默认 `docker-compose.yml` 直接使用 `ghcr.io/cjackhwang/ds2api:latest`，并把宿主机 `6011` 映射到容器内的 `5001`。如果你希望直接对外暴露 `5001`，请设置 `DS2API_HOST_PORT=5001`（或者手动调整 `ports` 配置）。
+
+如需固定版本，也可以直接拉取指定 tag：
+
+```bash
+docker pull ghcr.io/cjackhwang/ds2api:v3.0.0
+```
+
 ### 2.2 更新

 ```bash
@@ -134,7 +148,7 @@ docker-compose up -d --build
 `Dockerfile` 提供两条构建路径：

 1. **本地/开发默认路径（`runtime-from-source`）**：三阶段构建（WebUI 构建 + Go 构建 + 运行阶段）。
-2. **Release 路径（`runtime-from-dist`）**：CI 先生成 `dist/ds2api_<tag>_linux_<arch>.tar.gz`，再由 Docker 直接复用该发布包内的二进制和 `static/admin` 产物组装运行镜像，不再重复执行 `npm build`/`go build`。
+2. **Release 路径（`runtime-from-dist`）**：发布工作流先生成 tag 命名的 Release 压缩包，再把 Linux 产物复制成 `dist/docker-input/linux_amd64.tar.gz` / `linux_arm64.tar.gz`；Docker 构建阶段直接消费这些输入，不再重复执行 `npm run build`/`go build`。

 Release 路径可确保 Docker 镜像与 release 压缩包使用同一套产物，减少重复构建带来的差异。

@@ -194,10 +208,10 @@ healthcheck:
 2. **在 Vercel 上导入项目**
 3. **配置环境变量**（最少只需设置以下一项）：

-   | 变量 | 说明 |
-   | --- | --- |
-   | `DS2API_ADMIN_KEY` | 管理密钥（必填） |
-   | `DS2API_CONFIG_JSON` | 配置内容，JSON 字符串或 Base64 编码（可选，建议） |
+| 变量 | 说明 |
+| --- | --- |
+| `DS2API_ADMIN_KEY` | 管理密钥（必填） |
+| `DS2API_CONFIG_JSON` | 配置内容，JSON 字符串或 Base64 编码（可选，建议） |

 4. **部署**

@@ -240,11 +254,9 @@ VERCEL_TEAM_ID=team_xxxxxxxxxxxx   # 个人账号可留空
 | 变量 | 说明 | 默认值 |
 | --- | --- | --- |
 | `DS2API_ACCOUNT_MAX_INFLIGHT` | 每账号并发上限 | `2` |
-| `DS2API_ACCOUNT_CONCURRENCY` | 同上（兼容别名） | — |
 | `DS2API_ACCOUNT_MAX_QUEUE` | 等待队列上限 | `recommended_concurrency` |
-| `DS2API_ACCOUNT_QUEUE_SIZE` | 同上（兼容别名） | — |
 | `DS2API_GLOBAL_MAX_INFLIGHT` | 全局并发上限 | `recommended_concurrency` |
-| `DS2API_MAX_INFLIGHT` | 同上（兼容别名） | — |
+| `DS2API_ENV_WRITEBACK` | 检测到 `DS2API_CONFIG_JSON` 时自动写入 `DS2API_CONFIG_PATH`，并在成功后转为文件模式（设为 `1`/`true`/`yes`/`on` 时启用） | 关闭 |
 | `DS2API_VERCEL_INTERNAL_SECRET` | 混合流式内部鉴权 | 回退用 `DS2API_ADMIN_KEY` |
 | `DS2API_VERCEL_STREAM_LEASE_TTL_SECONDS` | 流式 lease TTL | `900` |
 | `VERCEL_TOKEN` | Vercel 同步 token | — |
@@ -309,7 +321,7 @@ Error: Command failed: go build -ldflags -s -w -o .../bootstrap ...
 1. 进入 Vercel Project Settings → Build and Development Settings
 2. **清空**自定义 Go Build Flags / Build Command（推荐）
 3. 若必须设置 ldflags，使用 `-ldflags="-s -w"`（保证它是一个参数）
-4. 确认仓库 `go.mod` 为受支持版本（当前为 `go 1.24`）
+4. 确认仓库 `go.mod` 为受支持版本（当前为 `go 1.26.0`）
 5. 重新部署（建议清缓存后 Redeploy）

 #### Internal 包导入错误
@@ -345,58 +357,61 @@ No Output Directory named "public" found after the Build completed.

 ---

-## 四、下载 Release 构建包
+## 四、本地源码运行

-仓库内置 GitHub Actions 工作流：`.github/workflows/release-artifacts.yml`
-
- **触发条件**：仅在 Release `published` 时触发（普通 push 不会构建）
- **构建产物**：多平台二进制压缩包 + `sha256sums.txt`
- **容器镜像发布**：仅发布到 GHCR（`ghcr.io/cjackhwang/ds2api`）
-
-| 平台 | 架构 | 文件格式 |
-| --- | --- | --- |
-| Linux | amd64, arm64 | `.tar.gz` |
-| macOS | amd64, arm64 | `.tar.gz` |
-| Windows | amd64 | `.zip` |
-
-每个压缩包包含：
-
- `ds2api` 可执行文件（Windows 为 `ds2api.exe`）
- `static/admin/`（WebUI 构建产物）
- `sha3_wasm_bg.7b9ca65ddd.wasm`（可选；程序内置 embed fallback）
- `config.example.json`、`.env.example`
- `README.MD`、`README.en.md`、`LICENSE`
-
-### 使用步骤
+### 4.1 基本步骤

 ```bash
-# 1. 下载对应平台的压缩包
-# 2. 解压
-tar -xzf ds2api_<tag>_linux_amd64.tar.gz
-cd ds2api_<tag>_linux_amd64
+# 克隆仓库
+git clone https://github.com/CJackHwang/ds2api.git
+cd ds2api

-# 3. 配置
+# 复制并编辑配置
 cp config.example.json config.json
-# 编辑 config.json
+# 使用你喜欢的编辑器打开 config.json，填入：
+#   - keys: 你的 API 访问密钥
+#   - accounts: DeepSeek 账号（email 或 mobile + password）

-# 4. 启动
-./ds2api
+# 启动服务
+go run ./cmd/ds2api
 ```

-### 维护者发布步骤
+默认本地访问地址是 `http://127.0.0.1:5001`；服务实际绑定 `0.0.0.0:5001`，可通过 `PORT` 环境变量覆盖。

-1. 在 GitHub 创建并发布 Release（带 tag，如 `vX.Y.Z`）
-2. 等待 Actions 工作流 `Release Artifacts` 完成
-3. 在 Release 的 Assets 下载对应平台压缩包
+### 4.2 WebUI 构建

-### 拉取 GHCR 镜像（可选）
+本地首次启动时，若 `static/admin/` 不存在，服务会自动尝试构建 WebUI（需要 Node.js/npm；缺依赖时会先执行 `npm ci`，再执行 `npm run build -- --outDir static/admin --emptyOutDir`）。
+
+你也可以手动构建：

 ```bash
-# latest
-docker pull ghcr.io/cjackhwang/ds2api:latest
+./scripts/build-webui.sh
+```

-# 指定版本（示例）
-docker pull ghcr.io/cjackhwang/ds2api:v2.1.2
+或手动执行：
+
+```bash
+cd webui
+npm install
+npm run build
+# 产物输出到 static/admin/
+```
+
+通过环境变量控制自动构建行为：
+
+```bash
+# 强制关闭自动构建
+DS2API_AUTO_BUILD_WEBUI=false go run ./cmd/ds2api
+
+# 强制开启自动构建
+DS2API_AUTO_BUILD_WEBUI=true go run ./cmd/ds2api
+```
+
+### 4.3 编译为二进制文件
+
+```bash
+go build -o ds2api ./cmd/ds2api
+./ds2api
 ```

 ---
@@ -453,8 +468,6 @@ server {
 # 将编译好的二进制文件和相关文件复制到目标目录
 sudo mkdir -p /opt/ds2api
 sudo cp ds2api config.json /opt/ds2api/
-# 可选：若你希望使用外置 WASM 文件（覆盖内置版本）
-# sudo cp sha3_wasm_bg.7b9ca65ddd.wasm /opt/ds2api/
 sudo cp -r static/admin /opt/ds2api/static/admin
 ```

--- a/docs/DeepSeekSSE行为结构说明-2026-04-05.md
+++ b/docs/DeepSeekSSE行为结构说明-2026-04-05.md
@@ -0,0 +1,315 @@
+# DeepSeek SSE 行为结构说明（第三方逆向版）
+
+> 说明：本文基于当前仓库 `tests/raw_stream_samples/` 下全部 `upstream.stream.sse` 原始流样本整理而成，属于第三方逆向观察文档，不是官方协议。
+> 当前 corpus 由 4 份原始流组成，覆盖搜索+引用、风控终态、Markdown 输出和空格敏感输出等行为。
+> 补充：文中（8.3 节及第 11、12 节）还会注明少量“当前实现已确认、但 corpus 尚未完整覆盖”的行为，例如长思考场景下的自动续写状态。
+
+文档导航：[文档总索引](./README.md) / [测试指南](./TESTING.md) / [样本目录说明](../tests/raw_stream_samples/README.md)
+
+## 1. 样本覆盖
+
+下列样本共同构成了本文的观察基础：
+
+| 样本 | 观察重点 |
+| --- | --- |
+| [guangzhou-weather-reasoner-search-20260404](../tests/raw_stream_samples/guangzhou-weather-reasoner-search-20260404/upstream.stream.sse) | 搜索+思考流程，包含 `[reference:N]` 引用标记与工具片段 |
+| [content-filter-trigger-20260405-jwt3](../tests/raw_stream_samples/content-filter-trigger-20260405-jwt3/upstream.stream.sse) | `CONTENT_FILTER` 终态分支，包含拒答模板与 `ban_regenerate` |
+| [markdown-format-example-20260405](../tests/raw_stream_samples/markdown-format-example-20260405/upstream.stream.sse) | Markdown 输出的早期样本，用于观察 token 级输出形态 |
+| [markdown-format-example-20260405-spacefix](../tests/raw_stream_samples/markdown-format-example-20260405-spacefix/upstream.stream.sse) | Markdown 输出修正样本，用于验证空格 chunk 必须保留 |
+
+当前 corpus 的整体特征是 `message` 帧占绝对多数，控制事件只占很小一部分，但它们决定了流的生命周期和最终状态。
+
+## 2. 总体结构
+
+DeepSeek 的这类输出可以分成两层看：
+
+1. SSE 事件层。
+2. JSON 载荷层。
+
+事件层负责传输边界，载荷层负责业务状态。实现时不要把 HTTP chunk、SSE block 和业务 JSON 混为一体。
+
+最常见的时序可以概括为：
+
+```text
+ready
+update_session
+message(初始化 envelope)
+message(正文 / 片段 / 状态增量)
+message(状态收口)
+finish
+update_session
+title
+close
+```
+
+`finish` 表示生成流结束，但不是唯一的终止信号；真正的语义终态通常还要结合 `response/status`、`quasi_status` 和 `close` 一起判断。
+
+## 3. SSE 事件层
+
+当前 corpus 中观察到的事件类型如下：
+
+| 事件 | 作用 | 处理建议 |
+| --- | --- | --- |
+| `ready` | 传输层就绪，通常携带 `request_message_id`、`response_message_id`、`model_type` | 记录元数据即可，不参与正文拼接 |
+| `update_session` | 会话时间戳或心跳更新 | 当作会话状态帧处理 |
+| `message` | 主体载荷，绝大多数业务信息都在这里 | 必须按顺序解析并保序累积 |
+| `finish` | 生成阶段结束 | 作为流结束标记之一 |
+| `title` | 会话标题生成结果 | 元数据帧，不参与正文拼接 |
+| `close` | 连接关闭信息 | 仅用于收尾与审计 |
+
+说明：
+
+- `message` 是默认事件名，SSE 中没有显式 `event:` 时也应按 `message` 处理。
+- 目前样本里大量 `message` 帧没有独立的业务前缀，不能靠事件名区分正文和控制帧。
+- 可能出现空 payload 的 `message` 帧；它们应被视为 no-op，但不能打乱事件顺序。
+
+## 4. 载荷层形态
+
+`message` 的 `data:` 部分不是单一 schema，而是多种结构混合。当前 corpus 里主要见到以下几种形态：
+
+| 形态 | 典型结构 | 作用 |
+| --- | --- | --- |
+| 初始化 envelope | `{"v":{"response":{...}}}` | 给出会话初始状态、模型状态和片段容器 |
+| 纯文本 token | `{"v":"..."}` | 直接输出可见文本 token |
+| 路径补丁 | `{"p":"...","o":"APPEND\|SET\|BATCH","v":...}` | 对某个状态路径做增量更新 |
+| 终态 batch | `{"v":[{"p":"status","v":"CONTENT_FILTER"}, ...]}` | 尾部状态收口，常见于风控终态 |
+
+一个简化后的典型样式如下：
+
+```json
+{"v":"输出"}
+{"p":"response/fragments/-1/content","o":"APPEND","v":"..."}
+{"p":"response/fragments","o":"APPEND","v":[...]}
+{"p":"response","o":"BATCH","v":[{"p":"accumulated_token_usage","v":211},{"p":"quasi_status","v":"FINISHED"}]}
+{"p":"response/status","o":"SET","v":"FINISHED"}
+```
+
+注意：
+
+- `v` 可能是字符串、对象、数组、布尔值或数字。
+- `o` 当前样本里主要见到 `APPEND`、`SET`、`BATCH`。
+- `v` 为数组时，通常表示一个批量 patch 集合，而不是正文数组。
+
+## 5. 初始化 envelope
+
+每条流开头，常会先出现一个 `message` 帧，内容是完整的 `response` 初始状态。当前 corpus 中，这个 envelope 常见字段包括：
+
+- `message_id`
+- `parent_id`
+- `model`
+- `role`
+- `thinking_enabled`
+- `ban_edit`
+- `ban_regenerate`
+- `status`
+- `incomplete_message`
+- `accumulated_token_usage`
+- `files`
+- `feedback`
+- `inserted_at`
+- `search_enabled`
+- `fragments`
+- `conversation_mode`
+- `has_pending_fragment`
+- `auto_continue`
+- `search_triggered`
+
+这些字段更像会话状态和策略开关，不是正文内容。第三方实现应把它们保留在内部状态树里，而不是直接拼接到最终答案。
+
+## 6. 路径结构
+
+当前 corpus 里观察到的 `p` 路径可以归成几组：
+
+### 6.1 片段级路径
+
+- `response/fragments/-N/content`
+- `response/fragments/-N/status`
+- `response/fragments/-N/results`
+- `response/fragments/-N/elapsed_secs`
+
+这类路径表示某个片段对象的增量更新。`-N` 只是样本中的索引风格，不应被写死成固定数量。
+
+### 6.2 片段容器路径
+
+- `response/fragments`
+- `fragments`
+
+这两类路径通常承载 fragment 数组。前者更像响应树中的分支，后者更像终态批处理里的片段集合。
+
+### 6.3 语义状态路径
+
+- `response/status`
+- `response/has_pending_fragment`
+- `quasi_status`
+- `status`
+- `ban_regenerate`
+
+这类路径决定流是否结束、是否被风控、是否还有待处理片段。它们不应作为正文输出。
+
+尤其是 `response/status` / `status` 这类路径上的字符串值，应被视为控制信号而不是文本 token。当前已确认需要特殊对待的值包括：
+
+- `FINISHED`：正常完成终态，应触发收口。
+- `CONTENT_FILTER`：风控终态，应走拒答/模板分支。
+- `WIP` / `INCOMPLETE` / `AUTO_CONTINUE`：未完成但可继续生成的中间状态，不应直接输出给客户端。
+
+### 6.4 统计与进度路径
+
+- `accumulated_token_usage`
+
+这类路径用于使用量或进度统计，属于元数据。
+
+### 6.5 非命名空间字段
+
+在片段对象内部，还会看到 `content`、`references`、`result`、`queries`、`stage_id` 等字段。它们不一定带 `response/...` 前缀，但仍然是协议语义的一部分。
+
+## 7. fragment 类型
+
+当前 corpus 里已经观察到的 fragment 类型如下：
+
+| 类型 | 作用 | 是否应直接渲染 |
+| --- | --- | --- |
+| `RESPONSE` | 正常回答片段 | 是，属于正文 |
+| `THINK` | 推理或阶段提示 | 通常否，按产品策略决定是否展示 |
+| `TOOL_SEARCH` | 搜索工具调用元数据 | 否 |
+| `TOOL_OPEN` | 打开 / 抽取结果的工具元数据 | 否 |
+| `TIP` | 提示 / 警告类片段，常带 `style: WARNING` | 视产品策略决定，通常作为附注 |
+| `TEMPLATE_RESPONSE` | 风控拒答模板 | 是，但它属于终态 fallback，不是普通正文 |
+
+观察到的典型片段字段：
+
+- `id`
+- `type`
+- `content`
+- `references`
+- `stage_id`
+- `status`
+- `queries`
+- `results`
+- `result`
+- `elapsed_secs`
+- `style`
+- `hide_on_wip`
+
+第三方实现不要把 `fragment.type` 和 `p` 路径混为一谈。`type` 是语义分类，`p` 是状态树位置。
+
+## 8. 终态行为
+
+当前 corpus 里有两条很重要的终态分支。
+
+### 8.1 正常完成
+
+正常回答通常会出现如下收口顺序：
+
+1. `response` 的 `BATCH` 更新 `accumulated_token_usage`。
+2. `response` 的 `BATCH` 或单独 patch 更新 `quasi_status: FINISHED`。
+3. `response/status` 置为 `FINISHED`。
+4. `finish` 事件到来。
+5. 之后可能还有 `update_session`、`title`、`close`。
+
+### 8.2 风控终态
+
+`content-filter-trigger-20260405-jwt3` 展示了另一种终态路径：
+
+1. 先继续输出一段正常正文。
+2. 出现提示类 fragment，例如 `TIP`。
+3. 可能先把 `quasi_status` 提前收口到 `FINISHED`。
+4. 之后出现一个终态 batch，把 `ban_regenerate` 设为 `true`，把 `status` 置为 `CONTENT_FILTER`，并附带 `TEMPLATE_RESPONSE`。
+5. 最后再出现 `finish`，然后是收尾事件。
+
+这个分支说明：
+
+- `finish` 不等于正常结束。
+- `CONTENT_FILTER` 是一个独立终态，不是普通异常。
+- `TEMPLATE_RESPONSE` 不应被当作常规回答流的中间片段，它是终态 fallback。
+
+一个简化的风控尾部可以写成：
+
+```json
+{"p":"response","o":"BATCH","v":[{"p":"accumulated_token_usage","v":1269},{"p":"quasi_status","v":"FINISHED"}]}
+{"v":[{"p":"ban_regenerate","v":true},{"p":"status","v":"CONTENT_FILTER"},{"p":"fragments","v":[{"id":38,"type":"TEMPLATE_RESPONSE","content":"..."}]},{"p":"quasi_status","v":"CONTENT_FILTER"}]}
+{"event":"finish"}
+```
+
+### 8.3 自动续写中间态（实现补充）
+
+这部分不是当前 corpus 的直接覆盖项，而是 2026-04-05 在长思考实测中观察到、且已在当前实现中兼容的行为：
+
+1. 上游可能先把 `response/status` 或 envelope 内的 `response.status` 置为 `WIP` / `INCOMPLETE`。
+2. 有时还会伴随 `auto_continue: true`。
+3. 这表示当前轮输出尚未真正结束，客户端或代理层可以继续调用 continue 接口续写同一条回答。
+4. 续写后的内容会承接之前的思考与正文，不应把前一轮状态值泄露成可见文本。
+
+对第三方实现，建议把这一类状态统一当作“可继续的控制信号”：
+
+- 可以据此决定是否继续拉取后续流。
+- 不能把 `INCOMPLETE`、`WIP`、`AUTO_CONTINUE` 直接拼接到最终文本。
+- `finish` 事件本身也不能单独说明回答已完全结束，仍要结合状态字段判断。
+
+## 9. 文本重建规则
+
+如果你的目标是把流重建成最终可见文本，必须遵守下面这些规则：
+
+- 按接收顺序逐个追加 token。
+- 不要对每个 `v` 做 `trim` 或 `TrimSpace`。
+- 不要丢弃只包含空格的 chunk。
+- 不要合并连续空格、换行或 Markdown 符号附近的空白。
+- 不要把 `[reference:N]` 视为协议元数据，它在当前 corpus 里就是正文的一部分。
+- 如果你要屏蔽引用标记，应当把它做成可配置的后处理，而不是在解析阶段硬删。
+- `response/status` / `status` 路径上的状态字符串不应进入正文，即使它们不是终态。
+
+这点对 Markdown、代码块、引用、表格都很关键。样本里已经证明，`#`、`-`、`>`、`|` 这类符号后面的空格必须原样保留，否则渲染结果会变形。
+
+## 10. 推荐实现方式
+
+对第三方开发者，建议把实现拆成三条线：
+
+1. 原始事件线：保留 SSE block 顺序、事件名和完整 JSON 载荷。
+2. 状态树线：维护 `response`、`fragments`、`status`、`quasi_status` 等结构。
+3. 可见文本线：只从明确应渲染的 token / fragment 中拼接最终文本。
+
+一个简单的处理顺序可以是：
+
+```text
+parse SSE block
+  -> 识别 event
+  -> 解析 JSON payload
+  -> 更新状态树
+  -> 识别 status / quasi_status / auto_continue 等控制信号
+  -> 判定是否有可见文本
+  -> 追加到输出缓冲
+  -> 遇到 WIP / INCOMPLETE / AUTO_CONTINUE 时决定是否续写
+  -> 遇到 FINISHED / CONTENT_FILTER / finish 时收口
+```
+
+实现时的兼容原则：
+
+- 未知路径保留，不要报错中断。
+- 未知 fragment.type 保留在日志里。
+- 不要假设所有模型都一定输出 `thinking_content`，当前 corpus 的推理更多是通过 fragment 类型表达。
+- 不要假设 `title` 一定存在，它只是后置元数据。
+
+## 11. 本 corpus 证明了什么
+
+当前样本足以证明以下行为：
+
+- 搜索类模型会把工具调用、结果、引用和正文混在同一条 SSE 流里。
+- 风控不会简单地“没有输出”，而是会在正常生成后切换到 `CONTENT_FILTER` 终态。
+- Markdown 和代码输出对空格非常敏感，空格 chunk 不能吞。
+- `message` 是主体承载层，`ready` / `update_session` / `finish` / `title` / `close` 是控制层。
+- `fragment.type` 是可视化和工具链分层的关键，不应只靠 `p` 路径判断。
+
+结合 2026-04-05 的长思考实测，还可以补充一条当前实现层面的结论：
+
+- 长思考场景下，上游可能先给出 `INCOMPLETE` / `WIP` / `AUTO_CONTINUE` 状态，再通过 continue 链路续写；这些状态值本身不应作为正文输出。
+
+## 12. 适用边界
+
+本文是基于当前 corpus 的逆向说明，不是恒定协议。
+
+- 新模型可能增加新的 `p` 路径。
+- 新版本可能增加新的 fragment.type。
+- `CONTENT_FILTER` 的终态模板内容可能变化。
+- 自动续写相关状态（如 `INCOMPLETE` / `AUTO_CONTINUE`）当前主要来自实测与实现兼容逻辑，后续字段形态仍可能变化。
+- 解析器应当对未知字段、未知路径、未知事件保持容忍。
+
+如果你要把这份说明用于实际开发，建议同时保留原始流样本、回放脚本和回归测试，不要只依赖本文。
--- a/docs/README.md
+++ b/docs/README.md
@@ -0,0 +1,53 @@
+# DS2API 文档导航 | Documentation Index
+
+语言 / Language: [中文](README.md) | [English](README.md#english)
+
+## 中文
+
+为减少重复维护，本仓库文档按“入口文档 + 专题文档”拆分。建议从下列顺序阅读：
+
+1. [项目总览（README）](../README.MD)
+2. [架构与目录说明](./ARCHITECTURE.md)
+3. [接口文档（API）](../API.md)
+4. [部署指南](./DEPLOY.md)
+5. [测试指南](./TESTING.md)
+6. [贡献指南](./CONTRIBUTING.md)
+
+### 专题文档
+
+- [Tool Calling 统一语义](./toolcall-semantics.md)
+- [DeepSeek SSE 行为结构说明（逆向观察）](./DeepSeekSSE行为结构说明-2026-04-05.md)
+
+### 文档维护约定
+
+- `README.MD` / `README.en.md`：面向首次接触用户，保留“是什么 + 怎么快速跑起来”。
+- `docs/ARCHITECTURE*.md`：面向开发者，集中维护项目结构、模块职责与调用链。
+- `API*.md`：面向客户端接入者，聚焦接口行为、鉴权和示例。
+- 其他 `docs/*.md`：主题化说明，避免在多个文档重复粘贴同一段内容。
+
+---
+
+## English
+
+To reduce maintenance drift, docs are split into an “entry doc + topical docs” layout.
+
+Recommended reading order:
+
+1. [Project overview (README)](../README.en.md)
+2. [Architecture and project layout](./ARCHITECTURE.en.md)
+3. [API reference](../API.en.md)
+4. [Deployment guide](./DEPLOY.en.md)
+5. [Testing guide](./TESTING.md)
+6. [Contributing guide](./CONTRIBUTING.en.md)
+
+### Topical docs
+
+- [Tool-calling unified semantics](./toolcall-semantics.md)
+- [DeepSeek SSE behavior notes (reverse-engineered)](./DeepSeekSSE行为结构说明-2026-04-05.md)
+
+### Maintenance conventions
+
+- `README.MD` / `README.en.md`: onboarding-oriented (“what + quick start”).
+- `docs/ARCHITECTURE*.md`: developer-oriented source of truth for module boundaries and execution flow.
+- `API*.md`: integration-oriented behavior/contracts.
+- Other `docs/*.md`: focused topics, avoid copy-pasting the same section into multiple files.
--- a/docs/TESTING.md
+++ b/docs/TESTING.md
@@ -1,6 +1,8 @@
 # DS2API 测试指南

-语言 / Language: [中文 + English](TESTING.md)
+语言 / Language: 中文 + English（同页）
+
+文档导航：[总览](../README.MD) / [架构说明](./ARCHITECTURE.md) / [部署指南](./DEPLOY.md) / [接口文档](../API.md)

 ## 概述 | Overview

@@ -14,6 +16,7 @@ DS2API 提供两个层级的测试：
 | 端到端测试 | `./tests/scripts/run-live.sh` | 使用真实账号执行全链路测试 |

 端到端测试集会录制完整的请求/响应日志，用于故障排查。
+Node 单元测试脚本会先做 `node --check` 语法门禁，再以 `--test-concurrency=1` 串行执行测试文件，减少模块级共享状态带来的干扰。

 ---

@@ -66,6 +69,8 @@ DS2API 提供两个层级的测试：

 4. **结果收集**：继续执行所有用例（不中断），写入最终汇总

+如果你想跳过这些 preflight 检查，可以直接运行 `go run ./cmd/ds2api-tests --no-preflight`。
+
 ---

 ## CLI 参数 | CLI Flags
@@ -177,10 +182,10 @@ go test ./...

 ```bash
 # 运行 tool calls 相关测试（推荐用于调试 tool call 解析问题）
-go test -v -run 'TestParseToolCalls|TestRepair' ./internal/util/
+go test -v -run 'TestParseToolCalls|TestRepair' ./internal/toolcall/

 # 运行单个测试用例
-go test -v -run TestParseToolCallsWithDeepSeekHallucination ./internal/util/
+go test -v -run TestParseToolCallsWithDeepSeekHallucination ./internal/toolcall/

 # 运行 format 相关测试
 go test -v ./internal/format/...
@@ -195,13 +200,13 @@ go test -v ./internal/adapter/openai/...

 ```bash
 # 1. 运行 tool calls 相关的所有测试
-go test -v -run 'TestParseToolCalls|TestRepair' ./internal/util/
+go test -v -run 'TestParseToolCalls|TestRepair' ./internal/toolcall/

 # 2. 查看测试输出中的详细调试信息
-go test -v -run TestParseToolCallsWithDeepSeekHallucination ./internal/util/ 2>&1
+go test -v -run TestParseToolCallsWithDeepSeekHallucination ./internal/toolcall/ 2>&1

 # 3. 检查具体测试用例的修复效果
-# 测试用例位于 internal/util/toolcalls_test.go，包含：
+# 测试用例位于 internal/toolcall/toolcalls_test.go，包含：
 # - TestParseToolCallsWithDeepSeekHallucination: DeepSeek 典型幻觉输出
 # - TestRepairLooseJSONWithNestedObjects: 嵌套对象的方括号修复
 # - TestParseToolCallsWithMixedWindowsPaths: Windows 路径处理
@@ -223,6 +228,56 @@ node --test tests/node/stream-tool-sieve.test.js
 go run ./cmd/ds2api-tests --no-preflight
 ```

+### 运行原始流仿真（独立工具）
+
+```bash
+./tests/scripts/run-raw-stream-sim.sh
+```
+
+说明：
+- 该工具默认重放 `tests/raw_stream_samples/manifest.json` 声明的 canonical 样本，按上游 SSE 顺序做 1:1 仿真解析。
+- 默认校验不出现 `FINISHED` 文本泄露，并要求存在结束信号。
+- 默认**不**把原始流中的 `accumulated_token_usage` 与本地解析得到的 token 数做强一致校验（当前实现以内容估算为准）；如需强校验可显式加 `--fail-on-token-mismatch`。
+- 每次运行都会把本地派生结果写入 `artifacts/raw-stream-sim/<run-id>/<sample-id>/replay.output.txt`，并输出结构化报告。
+- 如果你有历史基线目录，可以通过 `--baseline-root` 让工具直接做文本对比。
+- 更完整的协议级行为结构说明见 [DeepSeekSSE行为结构说明-2026-04-05.md](./DeepSeekSSE行为结构说明-2026-04-05.md)。
+
+### 对单个样本做回放比对
+
+```bash
+./tests/scripts/compare-raw-stream-sample.sh markdown-format-example-20260405-spacefix
+```
+
+说明：
+- 该脚本会从 raw-only 样本目录读取 `upstream.stream.sse`。
+- 回放结果会写入 `artifacts/raw-stream-sim/<run-id>/<sample-id>/`，便于直接查阅。
+- 如果传入历史基线目录，脚本会自动对比当前回放输出和基线文本。
+
+### 采集永久样本
+
+本地启动服务后，可以直接请求以下接口：
+
+```text
+POST /admin/dev/raw-samples/capture
+```
+
+这个接口会把请求元信息和上游原始流写入 `tests/raw_stream_samples/<sample-id>/`，以后可以直接拿来做回放和字段分析。派生输出会在本地回放时再生成，不再落在样本目录里。
+
+### 从内存抓包查询并保存样本
+
+如果问题刚刚在本地复现过，也可以先查当前进程内存里的抓包，再选择性落盘：
+
+```text
+GET /admin/dev/raw-samples/query?q=广州&limit=10
+POST /admin/dev/raw-samples/save
+{"chain_key":"session:xxxx","sample_id":"tmp-from-memory"}
+```
+
+说明：
+- `query` 会按 `chat_session_id` 把 `completion + continue` 归并成一条链，适合定位接续思考问题。
+- `save` 支持用 `query`、`chain_key` 或 `capture_id` 选中目标。
+- 生成的样本目录仍然是 `tests/raw_stream_samples/<sample-id>/`，可以直接喂给回放脚本。
+
 ### 指定输出目录和超时

 ```bash
--- a/docs/toolcall-semantics.md
+++ b/docs/toolcall-semantics.md
@@ -1,41 +1,74 @@
-# Tool call parsing semantics (Go canonical spec)
+# Tool call parsing semantics（Go/Node 统一语义）

-This document defines the cross-runtime contract for `ParseToolCallsDetailed` / `parseToolCallsDetailed`.
+本文档描述当前代码中工具调用解析链路的**实际行为**（以 `internal/toolcall` 与 `internal/js/helpers/stream-tool-sieve` 为准）。

-## Output contract
+文档导航：[总览](../README.MD) / [架构说明](./ARCHITECTURE.md) / [测试指南](./TESTING.md)

- `calls`: accepted tool calls with normalized tool names.
- `sawToolCallSyntax`: true when tool-call-like syntax is detected (`tool_calls`, `<tool_call>`, `<function_call>`, `<invoke>`) or a valid call is parsed.
- `rejectedByPolicy`: true when parser extracted call syntax but all calls are rejected by allow-list policy.
- `rejectedToolNames`: de-duplicated rejected tool names in first-seen order.
+## 1) 当前输出结构

-## Parse pipeline
+`ParseToolCallsDetailed` / `parseToolCallsDetailed` 返回：

-1. Strip fenced code blocks for non-standalone parsing.
-2. Build candidates from:
-   - full text,
-   - fenced JSON snippets,
-   - extracted JSON objects around `tool_calls`,
-   - first `{` to last `}` object slice.
-3. Parse each candidate in order:
-   - JSON payload parser (`tool_calls`, list, single call object),
-   - XML/Markup parser (`<tool_call>`, `<function_call>`, `<invoke>`; supports attributes + nested fields),
-   - Text KV fallback parser (`function.name: <name>` ... `function.arguments: {json}`).
-4. Stop at first candidate that yields at least one call.
+- `calls`：解析出的工具调用列表（`name` + `input`）。
+- `sawToolCallSyntax`：检测到工具调用语法特征时为 `true`。
+- `rejectedByPolicy`：当前实现固定为 `false`（预留字段）。
+- `rejectedToolNames`：当前实现固定为空数组（预留字段）。

-## Name normalization policy
+> 当前 `filterToolCallsDetailed` 仅做结构清洗，不做 allow-list 工具名硬拒绝。

-When matching parsed names against configured tools:
+## 2) 解析范围（重点）

-1. exact match,
-2. case-insensitive match,
-3. namespace tail match (`a.b.c` => `c`),
-4. loose alnum match (remove non `[a-z0-9]`, compare).
+当前版本的可执行解析以 **XML/Markup 家族**为主：

-## Standalone mode
+- `<tool_call>...</tool_call>`
+- `<function_call>...</function_call>`
+- `<invoke ...>...</invoke>`（含自闭合）
+- `<tool_use>...</tool_use>`
+- antml 变体（如 `antml:function_call` / `antml:argument`）

-Standalone mode (`ParseStandaloneToolCallsDetailed`) parses the whole input directly (no candidate slicing), while still applying:
+并支持在这些标记块内部解析：

- example-context guard,
- JSON then markup fallback,
- the same allow-list normalization policy.
+- JSON 参数字符串
+- 标签参数（`<parameter name="...">...`）
+- key/value 风格子标签
+
+## 3) 不应再假设的行为
+
+以下说法在当前实现中已不成立：
+
+1. “纯 JSON `tool_calls` 片段会被直接当作可执行工具调用解析”。
+2. “存在 `toolcall.mode` / `toolcall.early_emit_confidence` 等可配置开关可以改变解析策略”。
+
+当前策略在代码中固定为：
+
+- 特征匹配开启（feature-match on）
+- 高置信度早发开启（early emit on）
+- policy 拒绝字段保留但未启用
+
+## 4) 流式与防泄漏语义
+
+在流式链路中（OpenAI / Claude / Gemini 统一内核）：
+
+- 工具调用片段会被优先提取为结构化增量输出；
+- 已识别的工具调用原始片段不会作为普通文本再次回流；
+- fenced code block 中的示例内容按文本处理，不作为可执行工具调用。
+
+## 5) 落地建议（按当前实现）
+
+1. Prompt 里优先约束模型输出 XML/Markup 工具块。
+2. 执行器侧继续做工具名白名单与参数 schema 校验（不要依赖 parser 代替安全策略）。
+3. 需要兼容历史“纯 JSON tool_calls”模型输出时，请在上游模板层把输出规范化为 XML/Markup 风格再进入 DS2API。
+
+## 6) 回归验证建议
+
+可直接运行：
+
+```bash
+go test -v -run 'TestParseToolCalls|TestRepair' ./internal/toolcall/
+node --test tests/node/stream-tool-sieve.test.js
+```
+
+重点覆盖：
+
+- `<tool_call>` / `<function_call>` / `<invoke>` / `<tool_use>` / antml 变体
+- 参数 JSON 修复与解析
+- 流式增量下的工具调用提取与文本防泄漏
--- a/go.mod
+++ b/go.mod
@@ -1,17 +1,24 @@
 module ds2api

-go 1.24
+go 1.26.0

 require (
-	github.com/andybalholm/brotli v1.0.6
-	github.com/go-chi/chi/v5 v5.2.3
+	github.com/andybalholm/brotli v1.2.1
+	github.com/go-chi/chi/v5 v5.2.5
 	github.com/google/uuid v1.6.0
-	github.com/refraction-networking/utls v1.8.1
-	github.com/tetratelabs/wazero v1.9.0
+	github.com/refraction-networking/utls v1.8.2
+	github.com/router-for-me/CLIProxyAPI/v6 v6.9.14
 )

 require (
-	github.com/klauspost/compress v1.17.4 // indirect
-	golang.org/x/crypto v0.36.0 // indirect
-	golang.org/x/sys v0.31.0 // indirect
+	github.com/klauspost/compress v1.18.5 // indirect
+	github.com/sirupsen/logrus v1.9.4 // indirect
+	github.com/tidwall/gjson v1.18.0 // indirect
+	github.com/tidwall/match v1.2.0 // indirect
+	github.com/tidwall/pretty v1.2.1 // indirect
+	github.com/tidwall/sjson v1.2.5 // indirect
+	golang.org/x/crypto v0.49.0 // indirect
+	golang.org/x/net v0.52.0
+	golang.org/x/sys v0.42.0 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -1,16 +1,43 @@
-github.com/andybalholm/brotli v1.0.6 h1:Yf9fFpf49Zrxb9NlQaluyE92/+X7UVHlhMNJN2sxfOI=
-github.com/andybalholm/brotli v1.0.6/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
-github.com/go-chi/chi/v5 v5.2.3 h1:WQIt9uxdsAbgIYgid+BpYc+liqQZGMHRaUwp0JUcvdE=
-github.com/go-chi/chi/v5 v5.2.3/go.mod h1:L2yAIGWB3H+phAw1NxKwWM+7eUH/lU8pOMm5hHcoops=
+github.com/andybalholm/brotli v1.2.1 h1:R+f5xP285VArJDRgowrfb9DqL18yVK0gKAW/F+eTWro=
+github.com/andybalholm/brotli v1.2.1/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/go-chi/chi/v5 v5.2.5 h1:Eg4myHZBjyvJmAFjFvWgrqDTXFyOzjj7YIm3L3mu6Ug=
+github.com/go-chi/chi/v5 v5.2.5/go.mod h1:X7Gx4mteadT3eDOMTsXzmI4/rwUpOwBHLpAfupzFJP0=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
-github.com/klauspost/compress v1.17.4 h1:Ej5ixsIri7BrIjBkRZLTo6ghwrEtHFk7ijlczPW4fZ4=
-github.com/klauspost/compress v1.17.4/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM=
-github.com/refraction-networking/utls v1.8.1 h1:yNY1kapmQU8JeM1sSw2H2asfTIwWxIkrMJI0pRUOCAo=
-github.com/refraction-networking/utls v1.8.1/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM=
-github.com/tetratelabs/wazero v1.9.0 h1:IcZ56OuxrtaEz8UYNRHBrUa9bYeX9oVY93KspZZBf/I=
-github.com/tetratelabs/wazero v1.9.0/go.mod h1:TSbcXCfFP0L2FGkRPxHphadXPjo1T6W+CseNNY7EkjM=
-golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34=
-golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc=
-golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik=
-golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
+github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE=
+github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/refraction-networking/utls v1.8.2 h1:j4Q1gJj0xngdeH+Ox/qND11aEfhpgoEvV+S9iJ2IdQo=
+github.com/refraction-networking/utls v1.8.2/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM=
+github.com/router-for-me/CLIProxyAPI/v6 v6.9.14 h1:XItUHrPGE9E5xTeZIPjKGmKqfEs1AZbxl1RPfO5xtrc=
+github.com/router-for-me/CLIProxyAPI/v6 v6.9.14/go.mod h1:P1jsIPFXorYGuS2N/3BlZYkpRKi/z7+oR3+1tdG0u4k=
+github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w=
+github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g=
+github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
+github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
+github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
+github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
+github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
+github.com/tidwall/match v1.2.0 h1:0pt8FlkOwjN2fPt4bIl4BoNxb98gGHN2ObFEDkrfZnM=
+github.com/tidwall/match v1.2.0/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
+github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
+github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
+github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
+github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY=
+github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28=
+github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=
+github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
+golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4=
+golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA=
+golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0=
+golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw=
+golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
+golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
--- a/internal/account/pool_acquire.go
+++ b/internal/account/pool_acquire.go
@@ -60,16 +60,10 @@ func (p *Pool) acquireLocked(target string, exclude map[string]bool) (config.Acc
 		return acc, true
 	}

-	if acc, ok := p.tryAcquire(exclude, true); ok {
-		return acc, true
-	}
-	if acc, ok := p.tryAcquire(exclude, false); ok {
-		return acc, true
-	}
-	return config.Account{}, false
+	return p.tryAcquire(exclude)
 }

-func (p *Pool) tryAcquire(exclude map[string]bool, requireToken bool) (config.Account, bool) {
+func (p *Pool) tryAcquire(exclude map[string]bool) (config.Account, bool) {
 	for i := 0; i < len(p.queue); i++ {
 		id := p.queue[i]
 		if exclude[id] || !p.canAcquireIDLocked(id) {
@@ -79,9 +73,6 @@ func (p *Pool) tryAcquire(exclude map[string]bool, requireToken bool) (config.Ac
 		if !ok {
 			continue
 		}
-		if requireToken && acc.Token == "" {
-			continue
-		}
 		p.inUse[id]++
 		p.bumpQueue(id)
 		return acc, true
--- a/internal/account/pool_edge_test.go
+++ b/internal/account/pool_edge_test.go
@@ -13,9 +13,7 @@ import (

 func TestPoolEmptyNoAccounts(t *testing.T) {
 	t.Setenv("DS2API_ACCOUNT_MAX_INFLIGHT", "2")
-	t.Setenv("DS2API_ACCOUNT_CONCURRENCY", "")
 	t.Setenv("DS2API_ACCOUNT_MAX_QUEUE", "")
-	t.Setenv("DS2API_ACCOUNT_QUEUE_SIZE", "")
 	t.Setenv("DS2API_CONFIG_JSON", `{"keys":["k1"],"accounts":[]}`)
 	pool := NewPool(config.LoadStore())
 	if _, ok := pool.Acquire("", nil); ok {
@@ -165,9 +163,7 @@ func TestPoolAcquireWaitTargetAccount(t *testing.T) {

 func TestPoolMaxQueueSizeOverride(t *testing.T) {
 	t.Setenv("DS2API_ACCOUNT_MAX_INFLIGHT", "1")
-	t.Setenv("DS2API_ACCOUNT_CONCURRENCY", "")
 	t.Setenv("DS2API_ACCOUNT_MAX_QUEUE", "5")
-	t.Setenv("DS2API_ACCOUNT_QUEUE_SIZE", "")
 	t.Setenv("DS2API_CONFIG_JSON", `{"keys":["k1"],"accounts":[{"email":"acc1@example.com","token":"t1"}]}`)
 	pool := NewPool(config.LoadStore())
 	status := pool.Status()
@@ -176,19 +172,6 @@ func TestPoolMaxQueueSizeOverride(t *testing.T) {
 	}
 }

-func TestPoolQueueSizeAliasEnv(t *testing.T) {
-	t.Setenv("DS2API_ACCOUNT_MAX_INFLIGHT", "1")
-	t.Setenv("DS2API_ACCOUNT_CONCURRENCY", "")
-	t.Setenv("DS2API_ACCOUNT_MAX_QUEUE", "")
-	t.Setenv("DS2API_ACCOUNT_QUEUE_SIZE", "7")
-	t.Setenv("DS2API_CONFIG_JSON", `{"keys":["k1"],"accounts":[{"email":"acc1@example.com","token":"t1"}]}`)
-	pool := NewPool(config.LoadStore())
-	status := pool.Status()
-	if got, ok := status["max_queue_size"].(int); !ok || got != 7 {
-		t.Fatalf("expected max_queue_size=7, got %#v", status["max_queue_size"])
-	}
-}
-
 func TestPoolMultipleAcquireReleaseCycles(t *testing.T) {
 	pool := newSingleAccountPoolForTest(t, "1")
 	for i := 0; i < 10; i++ {
--- a/internal/account/pool_limits.go
+++ b/internal/account/pool_limits.go
@@ -29,13 +29,8 @@ func (p *Pool) ApplyRuntimeLimits(maxInflightPerAccount, maxQueueSize, globalMax
 }

 func maxInflightFromEnv() int {
-	for _, key := range []string{"DS2API_ACCOUNT_MAX_INFLIGHT", "DS2API_ACCOUNT_CONCURRENCY"} {
-		raw := strings.TrimSpace(os.Getenv(key))
-		if raw == "" {
-			continue
-		}
-		n, err := strconv.Atoi(raw)
-		if err == nil && n > 0 {
+	if raw := strings.TrimSpace(os.Getenv("DS2API_ACCOUNT_MAX_INFLIGHT")); raw != "" {
+		if n, err := strconv.Atoi(raw); err == nil && n > 0 {
 			return n
 		}
 	}
@@ -53,13 +48,8 @@ func defaultRecommendedConcurrency(accountCount, maxInflightPerAccount int) int
 }

 func maxQueueFromEnv(defaultSize int) int {
-	for _, key := range []string{"DS2API_ACCOUNT_MAX_QUEUE", "DS2API_ACCOUNT_QUEUE_SIZE"} {
-		raw := strings.TrimSpace(os.Getenv(key))
-		if raw == "" {
-			continue
-		}
-		n, err := strconv.Atoi(raw)
-		if err == nil && n >= 0 {
+	if raw := strings.TrimSpace(os.Getenv("DS2API_ACCOUNT_MAX_QUEUE")); raw != "" {
+		if n, err := strconv.Atoi(raw); err == nil && n >= 0 {
 			return n
 		}
 	}
--- a/internal/account/pool_test.go
+++ b/internal/account/pool_test.go
@@ -12,9 +12,7 @@ import (
 func newPoolForTest(t *testing.T, maxInflight string) *Pool {
 	t.Helper()
 	t.Setenv("DS2API_ACCOUNT_MAX_INFLIGHT", maxInflight)
-	t.Setenv("DS2API_ACCOUNT_CONCURRENCY", "")
 	t.Setenv("DS2API_ACCOUNT_MAX_QUEUE", "")
-	t.Setenv("DS2API_ACCOUNT_QUEUE_SIZE", "")
 	t.Setenv("DS2API_CONFIG_JSON", `{
 		"keys":["k1"],
 		"accounts":[
@@ -29,9 +27,7 @@ func newPoolForTest(t *testing.T, maxInflight string) *Pool {
 func newSingleAccountPoolForTest(t *testing.T, maxInflight string) *Pool {
 	t.Helper()
 	t.Setenv("DS2API_ACCOUNT_MAX_INFLIGHT", maxInflight)
-	t.Setenv("DS2API_ACCOUNT_CONCURRENCY", "")
 	t.Setenv("DS2API_ACCOUNT_MAX_QUEUE", "")
-	t.Setenv("DS2API_ACCOUNT_QUEUE_SIZE", "")
 	t.Setenv("DS2API_CONFIG_JSON", `{
 		"keys":["k1"],
 		"accounts":[{"email":"acc1@example.com","token":"token1"}]
@@ -170,9 +166,9 @@ func TestPoolStatusRecommendedConcurrencyRespectsOverride(t *testing.T) {
 	}
 }

-func TestPoolAccountConcurrencyAliasEnv(t *testing.T) {
-	t.Setenv("DS2API_ACCOUNT_MAX_INFLIGHT", "")
-	t.Setenv("DS2API_ACCOUNT_CONCURRENCY", "4")
+func TestPoolGlobalMaxInflightEnv(t *testing.T) {
+	t.Setenv("DS2API_ACCOUNT_MAX_INFLIGHT", "1")
+	t.Setenv("DS2API_GLOBAL_MAX_INFLIGHT", "4")
 	t.Setenv("DS2API_CONFIG_JSON", `{
 		"keys":["k1"],
 		"accounts":[
@@ -183,15 +179,15 @@ func TestPoolAccountConcurrencyAliasEnv(t *testing.T) {

 	pool := NewPool(config.LoadStore())
 	status := pool.Status()
-	if got, ok := status["max_inflight_per_account"].(int); !ok || got != 4 {
+	if got, ok := status["global_max_inflight"].(int); !ok || got != 4 {
+		t.Fatalf("unexpected global_max_inflight: %#v", status["global_max_inflight"])
+	}
+	if got, ok := status["max_inflight_per_account"].(int); !ok || got != 1 {
 		t.Fatalf("unexpected max_inflight_per_account: %#v", status["max_inflight_per_account"])
 	}
-	if got, ok := status["recommended_concurrency"].(int); !ok || got != 8 {
+	if got, ok := status["recommended_concurrency"].(int); !ok || got != 2 {
 		t.Fatalf("unexpected recommended_concurrency: %#v", status["recommended_concurrency"])
 	}
-	if got, ok := status["max_queue_size"].(int); !ok || got != 8 {
-		t.Fatalf("unexpected max_queue_size: %#v", status["max_queue_size"])
-	}
 }

 func TestPoolDropsLegacyTokenOnlyAccountOnLoad(t *testing.T) {
@@ -215,6 +211,31 @@ func TestPoolDropsLegacyTokenOnlyAccountOnLoad(t *testing.T) {
 	}
 }

+func TestPoolAcquireRotatesIntoTokenlessAccounts(t *testing.T) {
+	t.Setenv("DS2API_ACCOUNT_MAX_INFLIGHT", "1")
+	t.Setenv("DS2API_ACCOUNT_MAX_QUEUE", "")
+	t.Setenv("DS2API_CONFIG_JSON", `{
+		"keys":["k1"],
+		"accounts":[
+			{"email":"acc1@example.com","token":"token1"},
+			{"email":"acc2@example.com","token":""},
+			{"email":"acc3@example.com","token":""}
+		]
+	}`)
+
+	pool := NewPool(config.LoadStore())
+	for i, want := range []string{"acc1@example.com", "acc2@example.com", "acc3@example.com"} {
+		acc, ok := pool.Acquire("", nil)
+		if !ok {
+			t.Fatalf("expected acquire success at step %d", i+1)
+		}
+		if got := acc.Identifier(); got != want {
+			t.Fatalf("unexpected account at step %d: got %q want %q", i+1, got, want)
+		}
+		pool.Release(acc.Identifier())
+	}
+}
+
 func TestPoolAcquireWaitQueuesAndSucceedsAfterRelease(t *testing.T) {
 	pool := newSingleAccountPoolForTest(t, "1")
 	first, ok := pool.Acquire("", nil)
--- a/internal/adapter/claude/deps.go
+++ b/internal/adapter/claude/deps.go
@@ -22,6 +22,11 @@ type DeepSeekCaller interface {

 type ConfigReader interface {
 	ClaudeMapping() map[string]string
+	CompatStripReferenceMarkers() bool
+}
+
+type OpenAIChatRunner interface {
+	ChatCompletions(w http.ResponseWriter, r *http.Request)
 }

 var _ AuthResolver = (*auth.Resolver)(nil)
--- a/internal/adapter/claude/deps_injection_test.go
+++ b/internal/adapter/claude/deps_injection_test.go
@@ -7,6 +7,7 @@ type mockClaudeConfig struct {
 }

 func (m mockClaudeConfig) ClaudeMapping() map[string]string { return m.m }
+func (mockClaudeConfig) CompatStripReferenceMarkers() bool  { return true }

 func TestNormalizeClaudeRequestUsesConfigInterfaceMapping(t *testing.T) {
 	req := map[string]any{
--- a/internal/adapter/claude/handler_helpers_misc.go
+++ b/internal/adapter/claude/handler_helpers_misc.go
@@ -0,0 +1,116 @@
+package claude
+
+import (
+	"fmt"
+	"strings"
+)
+
+// hasSystemMessage reports whether any map-shaped entry in messages has its
+// "role" set to "system". Entries that are not map[string]any are ignored.
+func hasSystemMessage(messages []any) bool {
+	for _, m := range messages {
+		msg, ok := m.(map[string]any)
+		if ok && msg["role"] == "system" {
+			return true
+		}
+	}
+	return false
+}
+
+// extractClaudeToolNames returns the trimmed, non-empty tool names found in
+// tools, in input order. Non-map entries and unnamed tools are skipped.
+func extractClaudeToolNames(tools []any) []string {
+	out := make([]string, 0, len(tools))
+	for _, t := range tools {
+		m, ok := t.(map[string]any)
+		if !ok {
+			continue
+		}
+		name, _, _ := extractClaudeToolMeta(m)
+		if name != "" {
+			out = append(out, name)
+		}
+	}
+	return out
+}
+
+// extractClaudeToolMeta pulls the name, description and schema out of one tool
+// definition. Top-level "name", "description", "input_schema" and "parameters"
+// keys win; a nested "function" object only fills in whatever is still
+// missing. The returned name and description are whitespace-trimmed; the
+// schema is returned as found and may be nil.
+func extractClaudeToolMeta(m map[string]any) (string, string, any) {
+	name, _ := m["name"].(string)
+	desc, _ := m["description"].(string)
+	schemaObj := m["input_schema"]
+	if schemaObj == nil {
+		schemaObj = m["parameters"]
+	}
+
+	if fn, ok := m["function"].(map[string]any); ok {
+		if strings.TrimSpace(name) == "" {
+			name, _ = fn["name"].(string)
+		}
+		if strings.TrimSpace(desc) == "" {
+			desc, _ = fn["description"].(string)
+		}
+		if schemaObj == nil {
+			if v, ok := fn["input_schema"]; ok {
+				schemaObj = v
+			}
+		}
+		if schemaObj == nil {
+			if v, ok := fn["parameters"]; ok {
+				schemaObj = v
+			}
+		}
+	}
+	return strings.TrimSpace(name), strings.TrimSpace(desc), schemaObj
+}
+
+// toMessageMaps returns the map-shaped elements of v when v is a []any, and
+// nil when it is not. Elements of any other type are dropped.
+func toMessageMaps(v any) []map[string]any {
+	arr, ok := v.([]any)
+	if !ok {
+		return nil
+	}
+	out := make([]map[string]any, 0, len(arr))
+	for _, item := range arr {
+		if m, ok := item.(map[string]any); ok {
+			out = append(out, m)
+		}
+	}
+	return out
+}
+
+// extractMessageContent flattens a message content value into a string.
+// Strings are returned unchanged, slices have each element rendered with %v
+// and joined by newlines, and nil yields "" rather than the literal "<nil>"
+// that %v would produce. Any other value is rendered with %v.
+func extractMessageContent(v any) string {
+	switch x := v.(type) {
+	case nil:
+		return ""
+	case string:
+		return x
+	case []any:
+		parts := make([]string, 0, len(x))
+		for _, it := range x {
+			parts = append(parts, fmt.Sprintf("%v", it))
+		}
+		return strings.Join(parts, "\n")
+	default:
+		return fmt.Sprintf("%v", x)
+	}
+}
+
+// cloneMap returns a shallow copy of in: top-level keys are copied, nested
+// values are shared with the original. A nil input yields an empty map.
+func cloneMap(in map[string]any) map[string]any {
+	out := make(map[string]any, len(in))
+	for k, v := range in {
+		out[k] = v
+	}
+	return out
+}
--- a/internal/adapter/claude/handler_messages.go
+++ b/internal/adapter/claude/handler_messages.go
@@ -1,89 +1,138 @@
 package claude

 import (
+	"bytes"
 	"encoding/json"
-	"fmt"
 	"io"
 	"net/http"
+	"net/http/httptest"
 	"strings"
-	"time"

-	"ds2api/internal/auth"
 	"ds2api/internal/config"
-	claudefmt "ds2api/internal/format/claude"
-	"ds2api/internal/sse"
 	streamengine "ds2api/internal/stream"
+	"ds2api/internal/translatorcliproxy"
+	"ds2api/internal/util"
+
+	sdktranslator "github.com/router-for-me/CLIProxyAPI/v6/sdk/translator"
 )

+// Messages serves the Anthropic Messages endpoint by proxying the request
+// through the OpenAI-compatible chat handler. A missing anthropic-version
+// header is defaulted to "2023-06-01"; a handler without an OpenAI backend
+// answers 500.
 func (h *Handler) Messages(w http.ResponseWriter, r *http.Request) {
 	if strings.TrimSpace(r.Header.Get("anthropic-version")) == "" {
 		r.Header.Set("anthropic-version", "2023-06-01")
 	}
-	a, err := h.Auth.Determine(r)
-	if err != nil {
-		status := http.StatusUnauthorized
-		detail := err.Error()
-		if err == auth.ErrNoAccount {
-			status = http.StatusTooManyRequests
-		}
-		writeClaudeError(w, status, detail)
+	if h.OpenAI == nil {
+		writeClaudeError(w, http.StatusInternalServerError, "OpenAI proxy backend unavailable.")
 		return
 	}
-	defer h.Auth.Release(a)
+	if h.proxyViaOpenAI(w, r, h.Store) {
+		return
+	}
+	writeClaudeError(w, http.StatusBadGateway, "Failed to proxy Claude request.")
+}

+// proxyViaOpenAI translates the Claude-format body of r into an OpenAI chat
+// request, runs it through h.OpenAI and writes the result to w. It reports
+// whether a response was written; every path below currently returns true.
+//
+// The model handed to the translator is the one normalizeClaudeRequest
+// resolves from store when that succeeds, and the client-supplied model
+// otherwise. Three response modes exist:
+//   - ?__stream_release=1 forwards the untranslated body and relays the
+//     backend response verbatim;
+//   - stream requests (unless ?__stream_prepare=1) are translated on the fly
+//     by a stream translator writer wrapped around w;
+//   - everything else is recorded first: non-2xx and prepare responses are
+//     relayed verbatim, successful ones are converted back to Claude format.
+func (h *Handler) proxyViaOpenAI(w http.ResponseWriter, r *http.Request, store ConfigReader) bool {
+	raw, err := io.ReadAll(r.Body)
+	if err != nil {
+		writeClaudeError(w, http.StatusBadRequest, "invalid body")
+		return true
+	}
 	var req map[string]any
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+	if err := json.Unmarshal(raw, &req); err != nil {
 		writeClaudeError(w, http.StatusBadRequest, "invalid json")
-		return
+		return true
 	}
-	norm, err := normalizeClaudeRequest(h.Store, req)
-	if err != nil {
-		writeClaudeError(w, http.StatusBadRequest, err.Error())
-		return
-	}
-	stdReq := norm.Standard
+	model, _ := req["model"].(string)
+	stream := util.ToBool(req["stream"])

-	sessionID, err := h.DS.CreateSession(r.Context(), a, 3)
-	if err != nil {
-		writeClaudeError(w, http.StatusUnauthorized, "invalid token.")
-		return
+	// Preserve claude_mapping (fast/slow/opus routing) while proxying via OpenAI.
+	translateModel := model
+	if store != nil {
+		if norm, normErr := normalizeClaudeRequest(store, cloneMap(req)); normErr == nil && strings.TrimSpace(norm.Standard.ResolvedModel) != "" {
+			translateModel = strings.TrimSpace(norm.Standard.ResolvedModel)
+		}
 	}
-	pow, err := h.DS.GetPow(r.Context(), a, 3)
-	if err != nil {
-		writeClaudeError(w, http.StatusUnauthorized, "Failed to get PoW")
-		return
-	}
-	requestPayload := stdReq.CompletionPayload(sessionID)
-	resp, err := h.DS.CallCompletion(r.Context(), a, requestPayload, pow, 3)
-	if err != nil {
-		writeClaudeError(w, http.StatusInternalServerError, "Failed to get Claude response.")
-		return
-	}
-	if resp.StatusCode != http.StatusOK {
-		defer resp.Body.Close()
-		body, _ := io.ReadAll(resp.Body)
-		writeClaudeError(w, http.StatusInternalServerError, string(body))
-		return
+	translatedReq := translatorcliproxy.ToOpenAI(sdktranslator.FormatClaude, translateModel, raw, stream)
+
+	isVercelPrepare := strings.TrimSpace(r.URL.Query().Get("__stream_prepare")) == "1"
+	isVercelRelease := strings.TrimSpace(r.URL.Query().Get("__stream_release")) == "1"
+
+	if isVercelRelease {
+		proxyReq := r.Clone(r.Context())
+		proxyReq.URL.Path = "/v1/chat/completions"
+		proxyReq.Body = io.NopCloser(bytes.NewReader(raw))
+		proxyReq.ContentLength = int64(len(raw))
+		rec := httptest.NewRecorder()
+		h.OpenAI.ChatCompletions(rec, proxyReq)
+		res := rec.Result()
+		defer func() { _ = res.Body.Close() }()
+		body, _ := io.ReadAll(res.Body)
+		relayRecordedResponse(w, res, body)
+		return true
 	}

-	if stdReq.Stream {
-		h.handleClaudeStreamRealtime(w, r, resp, stdReq.ResponseModel, norm.NormalizedMessages, stdReq.Thinking, stdReq.Search, stdReq.ToolNames)
-		return
+	proxyReq := r.Clone(r.Context())
+	proxyReq.URL.Path = "/v1/chat/completions"
+	proxyReq.Body = io.NopCloser(bytes.NewReader(translatedReq))
+	proxyReq.ContentLength = int64(len(translatedReq))
+
+	if stream && !isVercelPrepare {
+		w.Header().Set("Content-Type", "text/event-stream")
+		w.Header().Set("Cache-Control", "no-cache, no-transform")
+		w.Header().Set("Connection", "keep-alive")
+		w.Header().Set("X-Accel-Buffering", "no")
+		streamWriter := translatorcliproxy.NewOpenAIStreamTranslatorWriter(w, sdktranslator.FormatClaude, model, raw, translatedReq)
+		h.OpenAI.ChatCompletions(streamWriter, proxyReq)
+		return true
 	}
-	result := sse.CollectStream(resp, stdReq.Thinking, true)
-	respBody := claudefmt.BuildMessageResponse(
-		fmt.Sprintf("msg_%d", time.Now().UnixNano()),
-		stdReq.ResponseModel,
-		norm.NormalizedMessages,
-		result.Thinking,
-		result.Text,
-		stdReq.ToolNames,
-	)
-	writeJSON(w, http.StatusOK, respBody)
+
+	rec := httptest.NewRecorder()
+	h.OpenAI.ChatCompletions(rec, proxyReq)
+	res := rec.Result()
+	defer func() { _ = res.Body.Close() }()
+	body, _ := io.ReadAll(res.Body)
+	// Backend errors and prepare responses are relayed as-is rather than converted.
+	if isVercelPrepare || res.StatusCode < 200 || res.StatusCode >= 300 {
+		relayRecordedResponse(w, res, body)
+		return true
+	}
+	converted := translatorcliproxy.FromOpenAINonStream(sdktranslator.FormatClaude, model, raw, translatedReq, body)
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(http.StatusOK)
+	_, _ = w.Write(converted)
+	return true
 }
+
+// relayRecordedResponse copies a recorded backend response to w unchanged:
+// every header value first, then the status code, then the body.
+func relayRecordedResponse(w http.ResponseWriter, res *http.Response, body []byte) {
+	for k, vv := range res.Header {
+		for _, v := range vv {
+			w.Header().Add(k, v)
+		}
+	}
+	w.WriteHeader(res.StatusCode)
+	_, _ = w.Write(body)
+}

 func (h *Handler) handleClaudeStreamRealtime(w http.ResponseWriter, r *http.Request, resp *http.Response, model string, messages []any, thinkingEnabled, searchEnabled bool, toolNames []string) {
-	defer resp.Body.Close()
+	defer func() { _ = resp.Body.Close() }()
 	if resp.StatusCode != http.StatusOK {
 		body, _ := io.ReadAll(resp.Body)
 		writeClaudeError(w, http.StatusInternalServerError, string(body))
@@ -108,6 +157,7 @@ func (h *Handler) handleClaudeStreamRealtime(w http.ResponseWriter, r *http.Requ
 		messages,
 		thinkingEnabled,
 		searchEnabled,
+		h.compatStripReferenceMarkers(),
 		toolNames,
 	)
 	streamRuntime.sendMessageStart()
--- a/internal/adapter/claude/handler_routes.go
+++ b/internal/adapter/claude/handler_routes.go
@@ -15,9 +15,21 @@ import (
 var writeJSON = util.WriteJSON

 type Handler struct {
-	Store ConfigReader
-	Auth  AuthResolver
-	DS    DeepSeekCaller
+	Store  ConfigReader
+	Auth   AuthResolver
+	DS     DeepSeekCaller
+	// OpenAI is the chat-completions backend that Messages proxies through;
+	// Messages answers 500 when it is nil.
+	OpenAI OpenAIChatRunner
+}
+
+// compatStripReferenceMarkers returns the store's CompatStripReferenceMarkers
+// setting, defaulting to true when the handler or its store is nil.
+func (h *Handler) compatStripReferenceMarkers() bool {
+	if h == nil || h.Store == nil {
+		return true
+	}
+	return h.Store.CompatStripReferenceMarkers()
 }

 var (
--- a/internal/adapter/claude/handler_stream_test.go
+++ b/internal/adapter/claude/handler_stream_test.go
@@ -138,77 +138,6 @@ func TestHandleClaudeStreamRealtimeThinkingDelta(t *testing.T) {
 	}
 }

-func TestHandleClaudeStreamRealtimeToolSafety(t *testing.T) {
-	h := &Handler{}
-	resp := makeClaudeSSEHTTPResponse(
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"search\""}`,
-		`data: {"p":"response/content","v":",\"input\":{\"q\":\"go\"}}]}"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/anthropic/v1/messages", nil)
-
-	h.handleClaudeStreamRealtime(rec, req, resp, "claude-sonnet-4-5", []any{map[string]any{"role": "user", "content": "use tool"}}, false, false, []string{"search"})
-
-	frames := parseClaudeFrames(t, rec.Body.String())
-	for _, f := range findClaudeFrames(frames, "content_block_delta") {
-		delta, _ := f.Payload["delta"].(map[string]any)
-		if delta["type"] == "text_delta" && strings.Contains(asString(delta["text"]), `"tool_calls"`) {
-			t.Fatalf("raw tool_calls JSON leaked in text delta: body=%s", rec.Body.String())
-		}
-	}
-
-	foundToolUse := false
-	for _, f := range findClaudeFrames(frames, "content_block_start") {
-		contentBlock, _ := f.Payload["content_block"].(map[string]any)
-		if contentBlock["type"] == "tool_use" {
-			foundToolUse = true
-			break
-		}
-	}
-	if !foundToolUse {
-		t.Fatalf("expected tool_use block in stream, body=%s", rec.Body.String())
-	}
-
-	foundToolUseStop := false
-	for _, f := range findClaudeFrames(frames, "message_delta") {
-		delta, _ := f.Payload["delta"].(map[string]any)
-		if delta["stop_reason"] == "tool_use" {
-			foundToolUseStop = true
-			break
-		}
-	}
-	if !foundToolUseStop {
-		t.Fatalf("expected stop_reason=tool_use, body=%s", rec.Body.String())
-	}
-}
-
-func TestHandleClaudeStreamRealtimeToolDetectionFromThinkingFallback(t *testing.T) {
-	h := &Handler{}
-	resp := makeClaudeSSEHTTPResponse(
-		`data: {"p":"response/thinking_content","v":"{\"tool_calls\":[{\"name\":\"search\""}`,
-		`data: {"p":"response/thinking_content","v":",\"input\":{\"q\":\"go\"}}]}"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/anthropic/v1/messages", nil)
-
-	h.handleClaudeStreamRealtime(rec, req, resp, "claude-sonnet-4-5", []any{map[string]any{"role": "user", "content": "use tool"}}, true, false, []string{"search"})
-
-	frames := parseClaudeFrames(t, rec.Body.String())
-	foundToolUse := false
-	for _, f := range findClaudeFrames(frames, "content_block_start") {
-		contentBlock, _ := f.Payload["content_block"].(map[string]any)
-		if contentBlock["type"] == "tool_use" && contentBlock["name"] == "search" {
-			foundToolUse = true
-			break
-		}
-	}
-	if !foundToolUse {
-		t.Fatalf("expected tool_use block from thinking fallback, body=%s", rec.Body.String())
-	}
-}
-
 func TestHandleClaudeStreamRealtimeSkipsThinkingFallbackWhenFinalTextExists(t *testing.T) {
 	h := &Handler{}
 	resp := makeClaudeSSEHTTPResponse(
--- a/internal/adapter/claude/handler_util_test.go
+++ b/internal/adapter/claude/handler_util_test.go
@@ -93,8 +93,11 @@ func TestNormalizeClaudeMessagesToolUseToAssistantToolCalls(t *testing.T) {
 		t.Fatalf("expected call id preserved, got %#v", call)
 	}
 	content, _ := m["content"].(string)
-	if !containsStr(content, "search_web") || !containsStr(content, `"arguments":"{\"query\":\"latest\"}"`) {
-		t.Fatalf("expected assistant content to include serialized tool call for prompt roundtrip, got %q", content)
+	if !containsStr(content, "<tool_calls>") || !containsStr(content, "<tool_name>search_web</tool_name>") {
+		t.Fatalf("expected assistant content to include XML tool call history, got %q", content)
+	}
+	if !containsStr(content, "<parameters>\n      <query><![CDATA[latest]]></query>\n    </parameters>") {
+		t.Fatalf("expected assistant content to include serialized parameters, got %q", content)
 	}
 }

@@ -222,6 +225,47 @@ func TestNormalizeClaudeMessagesToolResultNonTextPayloadStringified(t *testing.T
 	}
 }

+func TestNormalizeClaudeMessagesBackfillsToolResultCallIDByName(t *testing.T) {
+	msgs := []any{
+		map[string]any{
+			"role": "assistant",
+			"content": []any{
+				map[string]any{
+					"type":  "tool_use",
+					"name":  "search_web",
+					"input": map[string]any{"query": "latest"},
+				},
+			},
+		},
+		map[string]any{
+			"role": "user",
+			"content": []any{
+				map[string]any{
+					"type":    "tool_result",
+					"name":    "search_web",
+					"content": "ok",
+				},
+			},
+		},
+	}
+
+	got := normalizeClaudeMessages(msgs)
+	if len(got) != 2 {
+		t.Fatalf("expected 2 messages, got %#v", got)
+	}
+	assistant, _ := got[0].(map[string]any)
+	tc, _ := assistant["tool_calls"].([]any)
+	call, _ := tc[0].(map[string]any)
+	callID, _ := call["id"].(string)
+	if !strings.HasPrefix(callID, "call_claude_") {
+		t.Fatalf("expected generated call id, got %#v", call)
+	}
+	toolMsg, _ := got[1].(map[string]any)
+	if toolMsg["tool_call_id"] != callID {
+		t.Fatalf("expected tool_result to reuse generated id, got %#v", toolMsg)
+	}
+}
+
 // ─── buildClaudeToolPrompt ───────────────────────────────────────────

 func TestBuildClaudeToolPromptSingleTool(t *testing.T) {
@@ -248,14 +292,11 @@ func TestBuildClaudeToolPromptSingleTool(t *testing.T) {
 	if !containsStr(prompt, "Search the web") {
 		t.Fatalf("expected description in prompt")
 	}
-	if !containsStr(prompt, "tool_use") {
-		t.Fatalf("expected tool_use instruction in prompt")
+	if !containsStr(prompt, "<tool_calls>") {
+		t.Fatalf("expected XML tool_calls format in prompt")
 	}
-	if containsStr(prompt, "TOOL_CALL_HISTORY") || containsStr(prompt, "TOOL_RESULT_HISTORY") {
-		t.Fatalf("expected legacy tool history markers removed from prompt")
-	}
-	if !containsStr(prompt, "Do not print tool-call JSON in text") {
-		t.Fatalf("expected prompt to keep no tool-call-json instruction")
+	if !containsStr(prompt, "TOOL CALL FORMAT") {
+		t.Fatalf("expected tool call format header in prompt")
 	}
 }

@@ -301,12 +342,9 @@ func TestBuildClaudeToolPromptSupportsOpenAIStyleFunctionTool(t *testing.T) {
 func TestBuildClaudeToolPromptSkipsNonMap(t *testing.T) {
 	tools := []any{"not a map"}
 	prompt := buildClaudeToolPrompt(tools)
-	if prompt == "" {
-		t.Fatal("expected non-empty prompt even with invalid tools")
-	}
-	// Should still contain the intro and instruction
-	if !containsStr(prompt, "You are Claude") {
-		t.Fatalf("expected intro in prompt")
+	// No valid tools → empty prompt
+	if prompt != "" {
+		t.Fatalf("expected empty prompt for non-map tools, got: %q", prompt)
 	}
 }

--- a/internal/adapter/claude/handler_utils.go
+++ b/internal/adapter/claude/handler_utils.go
@@ -1,13 +1,21 @@
 package claude

 import (
+	"ds2api/internal/toolcall"
 	"encoding/json"
 	"fmt"
 	"strings"
+
+	"ds2api/internal/prompt"
 )

 func normalizeClaudeMessages(messages []any) []any {
 	out := make([]any, 0, len(messages))
+	state := &claudeToolCallState{
+		nameByID:       map[string]string{},
+		lastIDByName:   map[string]string{},
+		callIDSequence: 0,
+	}
 	for _, m := range messages {
 		msg, ok := m.(map[string]any)
 		if !ok {
@@ -41,7 +49,7 @@ func normalizeClaudeMessages(messages []any) []any {
 				case "tool_use":
 					if role == "assistant" {
 						flushText()
-						if toolMsg := normalizeClaudeToolUseToAssistant(b); toolMsg != nil {
+						if toolMsg := normalizeClaudeToolUseToAssistant(b, state); toolMsg != nil {
 							out = append(out, toolMsg)
 						}
 						continue
@@ -51,7 +59,7 @@ func normalizeClaudeMessages(messages []any) []any {
 					}
 				case "tool_result":
 					flushText()
-					if toolMsg := normalizeClaudeToolResultToToolMessage(b); toolMsg != nil {
+					if toolMsg := normalizeClaudeToolResultToToolMessage(b, state); toolMsg != nil {
 						out = append(out, toolMsg)
 					}
 				default:
@@ -70,24 +78,30 @@ func normalizeClaudeMessages(messages []any) []any {
 }

 func buildClaudeToolPrompt(tools []any) string {
-	parts := []string{"You are Claude, a helpful AI assistant. You have access to these tools:"}
+	toolSchemas := make([]string, 0, len(tools))
+	names := make([]string, 0, len(tools))
 	for _, t := range tools {
 		m, ok := t.(map[string]any)
 		if !ok {
 			continue
 		}
 		name, desc, schemaObj := extractClaudeToolMeta(m)
+		if name == "" {
+			continue
+		}
+		names = append(names, name)
 		schema, _ := json.Marshal(schemaObj)
-		parts = append(parts, fmt.Sprintf("Tool: %s\nDescription: %s\nParameters: %s", name, desc, schema))
+		toolSchemas = append(toolSchemas, fmt.Sprintf("Tool: %s\nDescription: %s\nParameters: %s", name, desc, schema))
 	}
-	parts = append(parts,
-		"When you need a tool, respond with Claude-native tool use (tool_use) using the provided tool schema. Do not print tool-call JSON in text.",
-		"Tool roundtrip context is included directly in the conversation messages (assistant tool_use/tool_calls and tool results).",
-		"After receiving a valid tool result, continue with final answer instead of repeating the same call unless required fields are still missing.",
-	)
-	return strings.Join(parts, "\n\n")
+	if len(toolSchemas) == 0 {
+		return ""
+	}
+	return "You have access to these tools:\n\n" +
+		strings.Join(toolSchemas, "\n\n") + "\n\n" +
+		toolcall.BuildToolCallInstructions(names)
 }

+//nolint:unused // retained for compatibility with pending Claude tool-result prompt flow.
 func formatClaudeToolResultForPrompt(block map[string]any) string {
 	if block == nil {
 		return ""
@@ -111,7 +125,7 @@ func formatClaudeToolResultForPrompt(block map[string]any) string {
 	return string(b)
 }

-func normalizeClaudeToolUseToAssistant(block map[string]any) map[string]any {
+func normalizeClaudeToolUseToAssistant(block map[string]any, state *claudeToolCallState) map[string]any {
 	if block == nil {
 		return nil
 	}
@@ -119,13 +133,15 @@ func normalizeClaudeToolUseToAssistant(block map[string]any) map[string]any {
 	if name == "" {
 		return nil
 	}
-	callID := strings.TrimSpace(fmt.Sprintf("%v", block["id"]))
+	callID := safeStringValue(block["id"])
 	if callID == "" {
-		callID = strings.TrimSpace(fmt.Sprintf("%v", block["tool_use_id"]))
+		callID = safeStringValue(block["tool_use_id"])
 	}
 	if callID == "" {
-		callID = "call_claude"
+		callID = state.nextID()
 	}
+	state.nameByID[callID] = name
+	state.lastIDByName[strings.ToLower(name)] = callID
 	arguments := block["input"]
 	if arguments == nil {
 		arguments = map[string]any{}
@@ -146,29 +162,39 @@ func normalizeClaudeToolUseToAssistant(block map[string]any) map[string]any {
 	}
 	return map[string]any{
 		"role":       "assistant",
-		"content":    marshalCompactJSON(toolCalls),
+		"content":    prompt.FormatToolCallsForPrompt(toolCalls),
 		"tool_calls": toolCalls,
 	}
 }

-func normalizeClaudeToolResultToToolMessage(block map[string]any) map[string]any {
+func normalizeClaudeToolResultToToolMessage(block map[string]any, state *claudeToolCallState) map[string]any {
 	if block == nil {
 		return nil
 	}
-	toolCallID := strings.TrimSpace(fmt.Sprintf("%v", block["tool_use_id"]))
+	name := safeStringValue(block["name"])
+	toolCallID := safeStringValue(block["tool_use_id"])
 	if toolCallID == "" {
-		toolCallID = strings.TrimSpace(fmt.Sprintf("%v", block["tool_call_id"]))
+		toolCallID = safeStringValue(block["tool_call_id"])
 	}
 	if toolCallID == "" {
-		toolCallID = "call_claude"
+		if name != "" {
+			toolCallID = strings.TrimSpace(state.lastIDByName[strings.ToLower(name)])
+		}
+	}
+	if toolCallID == "" {
+		toolCallID = state.nextID()
 	}
 	out := map[string]any{
 		"role":         "tool",
 		"tool_call_id": toolCallID,
 		"content":      normalizeClaudeToolResultContent(block["content"]),
 	}
-	if name := strings.TrimSpace(fmt.Sprintf("%v", block["name"])); name != "" {
+	if name != "" {
 		out["name"] = name
+		state.nameByID[toolCallID] = name
+		state.lastIDByName[strings.ToLower(name)] = toolCallID
+	} else if inferred := strings.TrimSpace(state.nameByID[toolCallID]); inferred != "" {
+		out["name"] = inferred
 	}
 	return out
 }
@@ -198,94 +224,3 @@ func formatClaudeBlockRaw(block map[string]any) string {
 	}
 	return string(b)
 }
-
-func hasSystemMessage(messages []any) bool {
-	for _, m := range messages {
-		msg, ok := m.(map[string]any)
-		if ok && msg["role"] == "system" {
-			return true
-		}
-	}
-	return false
-}
-
-func extractClaudeToolNames(tools []any) []string {
-	out := make([]string, 0, len(tools))
-	for _, t := range tools {
-		m, ok := t.(map[string]any)
-		if !ok {
-			continue
-		}
-		name, _, _ := extractClaudeToolMeta(m)
-		if name != "" {
-			out = append(out, name)
-		}
-	}
-	return out
-}
-
-func extractClaudeToolMeta(m map[string]any) (string, string, any) {
-	name, _ := m["name"].(string)
-	desc, _ := m["description"].(string)
-	schemaObj := m["input_schema"]
-	if schemaObj == nil {
-		schemaObj = m["parameters"]
-	}
-
-	if fn, ok := m["function"].(map[string]any); ok {
-		if strings.TrimSpace(name) == "" {
-			name, _ = fn["name"].(string)
-		}
-		if strings.TrimSpace(desc) == "" {
-			desc, _ = fn["description"].(string)
-		}
-		if schemaObj == nil {
-			if v, ok := fn["input_schema"]; ok {
-				schemaObj = v
-			}
-		}
-		if schemaObj == nil {
-			if v, ok := fn["parameters"]; ok {
-				schemaObj = v
-			}
-		}
-	}
-	return strings.TrimSpace(name), strings.TrimSpace(desc), schemaObj
-}
-
-func toMessageMaps(v any) []map[string]any {
-	arr, ok := v.([]any)
-	if !ok {
-		return nil
-	}
-	out := make([]map[string]any, 0, len(arr))
-	for _, item := range arr {
-		if m, ok := item.(map[string]any); ok {
-			out = append(out, m)
-		}
-	}
-	return out
-}
-
-func extractMessageContent(v any) string {
-	switch x := v.(type) {
-	case string:
-		return x
-	case []any:
-		parts := make([]string, 0, len(x))
-		for _, it := range x {
-			parts = append(parts, fmt.Sprintf("%v", it))
-		}
-		return strings.Join(parts, "\n")
-	default:
-		return fmt.Sprintf("%v", x)
-	}
-}
-
-func cloneMap(in map[string]any) map[string]any {
-	out := make(map[string]any, len(in))
-	for k, v := range in {
-		out[k] = v
-	}
-	return out
-}
--- a/internal/adapter/claude/handler_utils_sanitize.go
+++ b/internal/adapter/claude/handler_utils_sanitize.go
@@ -96,6 +96,7 @@ func looksLikeBase64Payload(v string) bool {
 	return true
 }

+//nolint:unused // helper kept for compatibility with upcoming sanitize pipeline.
 func marshalCompactJSON(v any) string {
 	b, err := json.Marshal(v)
 	if err != nil {
--- a/internal/adapter/claude/output_clean.go
+++ b/internal/adapter/claude/output_clean.go
@@ -0,0 +1,13 @@
+package claude
+
+import textclean "ds2api/internal/textclean"
+
+// cleanVisibleOutput returns text as it should be shown to the client. When
+// stripReferenceMarkers is set, non-empty text is passed through
+// textclean.StripReferenceMarkers; otherwise it is returned unchanged.
+func cleanVisibleOutput(text string, stripReferenceMarkers bool) string {
+	if text == "" || !stripReferenceMarkers {
+		return text
+	}
+	return textclean.StripReferenceMarkers(text)
+}
--- a/internal/adapter/claude/proxy_vercel_test.go
+++ b/internal/adapter/claude/proxy_vercel_test.go
@@ -0,0 +1,138 @@
+package claude
+
+import (
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+)
+
+// claudeProxyStoreStub is a ConfigReader backed by a fixed model mapping.
+type claudeProxyStoreStub struct {
+	mapping map[string]string
+}
+
+func (s claudeProxyStoreStub) ClaudeMapping() map[string]string {
+	return s.mapping
+}
+
+func (claudeProxyStoreStub) CompatStripReferenceMarkers() bool { return true }
+
+// openAIProxyStub is an OpenAIChatRunner that ignores the request and replies
+// with a canned JSON body; a zero status is treated as 200 OK.
+type openAIProxyStub struct {
+	status int
+	body   string
+}
+
+func (s openAIProxyStub) ChatCompletions(w http.ResponseWriter, _ *http.Request) {
+	status := s.status
+	if status == 0 {
+		status = http.StatusOK
+	}
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	_, _ = w.Write([]byte(s.body))
+}
+
+// openAIProxyCaptureStub is an OpenAIChatRunner that records the decoded
+// request it receives and answers with a fixed chat-completion response.
+type openAIProxyCaptureStub struct {
+	seenModel string
+	seenReq   map[string]any
+}
+
+func (s *openAIProxyCaptureStub) ChatCompletions(w http.ResponseWriter, r *http.Request) {
+	var req map[string]any
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		// Surface an undecodable proxied body as a failed call instead of
+		// answering 200 with nothing captured.
+		http.Error(w, "invalid proxied request body: "+err.Error(), http.StatusBadRequest)
+		return
+	}
+	s.seenReq = req
+	if m, ok := req["model"].(string); ok {
+		s.seenModel = m
+	}
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(http.StatusOK)
+	_, _ = w.Write([]byte(`{"id":"ok","choices":[{"message":{"role":"assistant","content":"ok"}}]}`))
+}
+
+// TestClaudeProxyViaOpenAIVercelPreparePassthrough checks that a request with
+// ?__stream_prepare=1 gets the backend's JSON body back untouched, here
+// detected by the presence of its lease_id field.
+func TestClaudeProxyViaOpenAIVercelPreparePassthrough(t *testing.T) {
+	h := &Handler{OpenAI: openAIProxyStub{status: http.StatusOK, body: `{"lease_id":"lease_123","payload":{"a":1}}`}}
+	req := httptest.NewRequest(http.MethodPost, "/anthropic/v1/messages?__stream_prepare=1", strings.NewReader(`{"model":"claude-sonnet-4-5","messages":[{"role":"user","content":"hi"}],"stream":true}`))
+	rec := httptest.NewRecorder()
+
+	h.Messages(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("unexpected status: %d body=%s", rec.Code, rec.Body.String())
+	}
+	var out map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &out); err != nil {
+		t.Fatalf("expected json response, got err=%v body=%s", err, rec.Body.String())
+	}
+	if _, ok := out["lease_id"]; !ok {
+		t.Fatalf("expected lease_id in prepare passthrough, got=%v", out)
+	}
+}
+
+// TestClaudeProxyViaOpenAIPreservesClaudeMapping checks that the model the
+// backend receives for a claude-3-opus request is the mapping's "slow" target
+// (deepseek-reasoner) rather than the client-supplied name.
+func TestClaudeProxyViaOpenAIPreservesClaudeMapping(t *testing.T) {
+	openAI := &openAIProxyCaptureStub{}
+	h := &Handler{
+		Store:  claudeProxyStoreStub{mapping: map[string]string{"fast": "deepseek-chat", "slow": "deepseek-reasoner"}},
+		OpenAI: openAI,
+	}
+	req := httptest.NewRequest(http.MethodPost, "/anthropic/v1/messages", strings.NewReader(`{"model":"claude-3-opus","messages":[{"role":"user","content":"hi"}],"stream":false}`))
+	rec := httptest.NewRecorder()
+
+	h.Messages(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("unexpected status: %d body=%s", rec.Code, rec.Body.String())
+	}
+	if got := strings.TrimSpace(openAI.seenModel); got != "deepseek-reasoner" {
+		t.Fatalf("expected mapped proxy model deepseek-reasoner, got %q", got)
+	}
+}
+
+// TestClaudeProxyTranslatesInlineImageToOpenAIDataURL checks that a Claude
+// base64 image block reaches the backend as an OpenAI image_url block whose
+// URL is a data:image/png;base64 URL.
+func TestClaudeProxyTranslatesInlineImageToOpenAIDataURL(t *testing.T) {
+	openAI := &openAIProxyCaptureStub{}
+	h := &Handler{OpenAI: openAI}
+	req := httptest.NewRequest(http.MethodPost, "/anthropic/v1/messages", strings.NewReader(`{"model":"claude-sonnet-4-5","messages":[{"role":"user","content":[{"type":"text","text":"hello"},{"type":"image","source":{"type":"base64","media_type":"image/png","data":"QUJDRA=="}}]}],"stream":false}`))
+	rec := httptest.NewRecorder()
+
+	h.Messages(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("unexpected status: %d body=%s", rec.Code, rec.Body.String())
+	}
+	messages, _ := openAI.seenReq["messages"].([]any)
+	if len(messages) != 1 {
+		t.Fatalf("expected one translated message, got %#v", openAI.seenReq)
+	}
+	msg, _ := messages[0].(map[string]any)
+	content, _ := msg["content"].([]any)
+	if len(content) != 2 {
+		t.Fatalf("expected translated content blocks, got %#v", msg)
+	}
+	imageBlock, _ := content[1].(map[string]any)
+	if strings.TrimSpace(asString(imageBlock["type"])) != "image_url" {
+		t.Fatalf("expected image_url block, got %#v", imageBlock)
+	}
+	imageURL, _ := imageBlock["image_url"].(map[string]any)
+	if !strings.HasPrefix(strings.TrimSpace(asString(imageURL["url"])), "data:image/png;base64,") {
+		t.Fatalf("expected translated data url, got %#v", imageBlock)
+	}
+}
--- a/internal/adapter/claude/standard_request.go
+++ b/internal/adapter/claude/standard_request.go
@@ -18,7 +18,7 @@ func normalizeClaudeRequest(store ConfigReader, req map[string]any) (claudeNorma
 	model, _ := req["model"].(string)
 	messagesRaw, _ := req["messages"].([]any)
 	if strings.TrimSpace(model) == "" || len(messagesRaw) == 0 {
-		return claudeNormalizedRequest{}, fmt.Errorf("Request must include 'model' and 'messages'.")
+		return claudeNormalizedRequest{}, fmt.Errorf("request must include 'model' and 'messages'")
 	}
 	if _, ok := req["max_tokens"]; !ok {
 		req["max_tokens"] = 8192
@@ -36,7 +36,7 @@ func normalizeClaudeRequest(store ConfigReader, req map[string]any) (claudeNorma
 		thinkingEnabled = false
 		searchEnabled = false
 	}
-	finalPrompt := deepseek.MessagesPrepare(toMessageMaps(dsPayload["messages"]))
+	finalPrompt := deepseek.MessagesPrepareWithThinking(toMessageMaps(dsPayload["messages"]), thinkingEnabled)
 	toolNames := extractClaudeToolNames(toolsRequested)
 	if len(toolNames) == 0 && len(toolsRequested) > 0 {
 		toolNames = []string{"__any_tool__"}
--- a/internal/adapter/claude/stream_runtime_core.go
+++ b/internal/adapter/claude/stream_runtime_core.go
@@ -19,9 +19,10 @@ type claudeStreamRuntime struct {
 	toolNames []string
 	messages  []any

-	thinkingEnabled   bool
-	searchEnabled     bool
-	bufferToolContent bool
+	thinkingEnabled       bool
+	searchEnabled         bool
+	bufferToolContent     bool
+	stripReferenceMarkers bool

 	messageID string
 	thinking  strings.Builder
@@ -44,21 +45,23 @@ func newClaudeStreamRuntime(
 	messages []any,
 	thinkingEnabled bool,
 	searchEnabled bool,
+	stripReferenceMarkers bool,
 	toolNames []string,
 ) *claudeStreamRuntime {
 	return &claudeStreamRuntime{
-		w:                  w,
-		rc:                 rc,
-		canFlush:           canFlush,
-		model:              model,
-		messages:           messages,
-		thinkingEnabled:    thinkingEnabled,
-		searchEnabled:      searchEnabled,
-		bufferToolContent:  len(toolNames) > 0,
-		toolNames:          toolNames,
-		messageID:          fmt.Sprintf("msg_%d", time.Now().UnixNano()),
-		thinkingBlockIndex: -1,
-		textBlockIndex:     -1,
+		w:                     w,
+		rc:                    rc,
+		canFlush:              canFlush,
+		model:                 model,
+		messages:              messages,
+		thinkingEnabled:       thinkingEnabled,
+		searchEnabled:         searchEnabled,
+		bufferToolContent:     len(toolNames) > 0,
+		stripReferenceMarkers: stripReferenceMarkers,
+		toolNames:             toolNames,
+		messageID:             fmt.Sprintf("msg_%d", time.Now().UnixNano()),
+		thinkingBlockIndex:    -1,
+		textBlockIndex:        -1,
 	}
 }

@@ -76,10 +79,11 @@ func (s *claudeStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Parse

 	contentSeen := false
 	for _, p := range parsed.Parts {
-		if p.Text == "" {
+		cleanedText := cleanVisibleOutput(p.Text, s.stripReferenceMarkers)
+		if cleanedText == "" {
 			continue
 		}
-		if p.Type != "thinking" && s.searchEnabled && sse.IsCitation(p.Text) {
+		if p.Type != "thinking" && s.searchEnabled && sse.IsCitation(cleanedText) {
 			continue
 		}
 		contentSeen = true
@@ -88,7 +92,11 @@ func (s *claudeStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Parse
 			if !s.thinkingEnabled {
 				continue
 			}
-			s.thinking.WriteString(p.Text)
+			trimmed := sse.TrimContinuationOverlap(s.thinking.String(), cleanedText)
+			if trimmed == "" {
+				continue
+			}
+			s.thinking.WriteString(trimmed)
 			s.closeTextBlock()
 			if !s.thinkingBlockOpen {
 				s.thinkingBlockIndex = s.nextBlockIndex
@@ -108,13 +116,17 @@ func (s *claudeStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Parse
 				"index": s.thinkingBlockIndex,
 				"delta": map[string]any{
 					"type":     "thinking_delta",
-					"thinking": p.Text,
+					"thinking": trimmed,
 				},
 			})
 			continue
 		}

-		s.text.WriteString(p.Text)
+		trimmed := sse.TrimContinuationOverlap(s.text.String(), cleanedText)
+		if trimmed == "" {
+			continue
+		}
+		s.text.WriteString(trimmed)
 		if s.bufferToolContent {
 			if hasUnclosedCodeFence(s.text.String()) {
 				continue
@@ -140,7 +152,7 @@ func (s *claudeStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Parse
 			"index": s.textBlockIndex,
 			"delta": map[string]any{
 				"type": "text_delta",
-				"text": p.Text,
+				"text": trimmed,
 			},
 		})
 	}
--- a/internal/adapter/claude/stream_runtime_finalize.go
+++ b/internal/adapter/claude/stream_runtime_finalize.go
@@ -1,6 +1,8 @@
 package claude

 import (
+	"ds2api/internal/toolcall"
+	"encoding/json"
 	"fmt"
 	"time"

@@ -42,12 +44,12 @@ func (s *claudeStreamRuntime) finalize(stopReason string) {
 	s.closeTextBlock()

 	finalThinking := s.thinking.String()
-	finalText := s.text.String()
+	finalText := cleanVisibleOutput(s.text.String(), s.stripReferenceMarkers)

 	if s.bufferToolContent {
-		detected := util.ParseStandaloneToolCalls(finalText, s.toolNames)
+		detected := toolcall.ParseStandaloneToolCalls(finalText, s.toolNames)
 		if len(detected) == 0 && finalText == "" && finalThinking != "" {
-			detected = util.ParseStandaloneToolCalls(finalThinking, s.toolNames)
+			detected = toolcall.ParseStandaloneToolCalls(finalThinking, s.toolNames)
 		}
 		if len(detected) > 0 {
 			stopReason = "tool_use"
@@ -60,9 +62,20 @@ func (s *claudeStreamRuntime) finalize(stopReason string) {
 						"type":  "tool_use",
 						"id":    fmt.Sprintf("toolu_%d_%d", time.Now().Unix(), idx),
 						"name":  tc.Name,
-						"input": tc.Input,
+						"input": map[string]any{},
 					},
 				})
+
+				inputBytes, _ := json.Marshal(tc.Input)
+				s.send("content_block_delta", map[string]any{
+					"type":  "content_block_delta",
+					"index": idx,
+					"delta": map[string]any{
+						"type":         "input_json_delta",
+						"partial_json": string(inputBytes),
+					},
+				})
+
 				s.send("content_block_stop", map[string]any{
 					"type":  "content_block_stop",
 					"index": idx,
--- a/internal/adapter/claude/stream_status_test.go
+++ b/internal/adapter/claude/stream_status_test.go
@@ -1,7 +1,6 @@
 package claude

 import (
-	"context"
 	"net/http"
 	"net/http/httptest"
 	"strings"
@@ -9,48 +8,17 @@ import (

 	"github.com/go-chi/chi/v5"
 	chimw "github.com/go-chi/chi/v5/middleware"
-
-	"ds2api/internal/auth"
 )

-type streamStatusClaudeAuthStub struct{}
+type streamStatusClaudeOpenAIStub struct{}

-func (streamStatusClaudeAuthStub) Determine(_ *http.Request) (*auth.RequestAuth, error) {
-	return &auth.RequestAuth{
-		UseConfigToken: false,
-		DeepSeekToken:  "direct-token",
-		CallerID:       "caller:test",
-		TriedAccounts:  map[string]bool{},
-	}, nil
+func (streamStatusClaudeOpenAIStub) ChatCompletions(w http.ResponseWriter, _ *http.Request) {
+	w.Header().Set("Content-Type", "text/event-stream")
+	w.WriteHeader(http.StatusOK)
+	_, _ = w.Write([]byte("data: {\"id\":\"chatcmpl-1\",\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{\"content\":\"hello\"},\"finish_reason\":null}]}\n\n"))
+	_, _ = w.Write([]byte("data: [DONE]\n\n"))
 }

-func (streamStatusClaudeAuthStub) Release(_ *auth.RequestAuth) {}
-
-type streamStatusClaudeDSStub struct{}
-
-func (streamStatusClaudeDSStub) CreateSession(_ context.Context, _ *auth.RequestAuth, _ int) (string, error) {
-	return "session-id", nil
-}
-
-func (streamStatusClaudeDSStub) GetPow(_ context.Context, _ *auth.RequestAuth, _ int) (string, error) {
-	return "pow", nil
-}
-
-func (streamStatusClaudeDSStub) CallCompletion(_ context.Context, _ *auth.RequestAuth, _ map[string]any, _ string, _ int) (*http.Response, error) {
-	body := "data: {\"p\":\"response/content\",\"v\":\"hello\"}\n" + "data: [DONE]\n"
-	return &http.Response{
-		StatusCode: http.StatusOK,
-		Header:     make(http.Header),
-		Body:       ioNopCloser{strings.NewReader(body)},
-	}, nil
-}
-
-type ioNopCloser struct {
-	*strings.Reader
-}
-
-func (ioNopCloser) Close() error { return nil }
-
 type streamStatusClaudeStoreStub struct{}

 func (streamStatusClaudeStoreStub) ClaudeMapping() map[string]string {
@@ -60,6 +28,8 @@ func (streamStatusClaudeStoreStub) ClaudeMapping() map[string]string {
 	}
 }

+func (streamStatusClaudeStoreStub) CompatStripReferenceMarkers() bool { return true }
+
 func captureClaudeStatusMiddleware(statuses *[]int) func(http.Handler) http.Handler {
 	return func(next http.Handler) http.Handler {
 		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -73,9 +43,8 @@ func captureClaudeStatusMiddleware(statuses *[]int) func(http.Handler) http.Hand
 func TestClaudeMessagesStreamStatusCapturedAs200(t *testing.T) {
 	statuses := make([]int, 0, 1)
 	h := &Handler{
-		Store: streamStatusClaudeStoreStub{},
-		Auth:  streamStatusClaudeAuthStub{},
-		DS:    streamStatusClaudeDSStub{},
+		Store:  streamStatusClaudeStoreStub{},
+		OpenAI: streamStatusClaudeOpenAIStub{},
 	}
 	r := chi.NewRouter()
 	r.Use(captureClaudeStatusMiddleware(&statuses))
@@ -83,7 +52,6 @@ func TestClaudeMessagesStreamStatusCapturedAs200(t *testing.T) {

 	reqBody := `{"model":"claude-sonnet-4-5","messages":[{"role":"user","content":"hi"}],"stream":true}`
 	req := httptest.NewRequest(http.MethodPost, "/anthropic/v1/messages", strings.NewReader(reqBody))
-	req.Header.Set("Authorization", "Bearer direct-token")
 	req.Header.Set("Content-Type", "application/json")
 	rec := httptest.NewRecorder()
 	r.ServeHTTP(rec, req)
--- a/internal/adapter/claude/tool_call_state.go
+++ b/internal/adapter/claude/tool_call_state.go
@@ -0,0 +1,25 @@
+package claude
+
+import (
+	"fmt"
+	"strings"
+)
+
+type claudeToolCallState struct {
+	nameByID       map[string]string
+	lastIDByName   map[string]string
+	callIDSequence int
+}
+
+func (s *claudeToolCallState) nextID() string {
+	s.callIDSequence++
+	return fmt.Sprintf("call_claude_%d", s.callIDSequence)
+}
+
+func safeStringValue(v any) string {
+	s, ok := v.(string)
+	if !ok {
+		return ""
+	}
+	return strings.TrimSpace(s)
+}
--- a/internal/adapter/gemini/convert_messages.go
+++ b/internal/adapter/gemini/convert_messages.go
@@ -1,11 +1,20 @@
 package gemini

-import "strings"
+import (
+	"fmt"
+	"strings"
+)

 const maxGeminiRawPromptChars = 1024

 func geminiMessagesFromRequest(req map[string]any) []any {
 	out := make([]any, 0, 8)
+	toolCallCounter := 0
+	nextToolCallID := func() string {
+		toolCallCounter++
+		return fmt.Sprintf("call_gemini_%d", toolCallCounter)
+	}
+	lastToolCallIDByName := map[string]string{}
 	if sys := normalizeGeminiSystemInstruction(req["systemInstruction"]); strings.TrimSpace(sys) != "" {
 		out = append(out, map[string]any{
 			"role":    "system",
@@ -61,8 +70,11 @@ func geminiMessagesFromRequest(req map[string]any) []any {
 				if name := strings.TrimSpace(asString(fnCall["name"])); name != "" {
 					callID := strings.TrimSpace(asString(fnCall["id"]))
 					if callID == "" {
-						callID = "call_gemini"
+						if callID = strings.TrimSpace(asString(fnCall["call_id"])); callID == "" {
+							callID = nextToolCallID()
+						}
 					}
+					lastToolCallIDByName[strings.ToLower(name)] = callID
 					out = append(out, map[string]any{
 						"role": "assistant",
 						"tool_calls": []any{
@@ -91,7 +103,10 @@ func geminiMessagesFromRequest(req map[string]any) []any {
 					callID = strings.TrimSpace(asString(fnResp["tool_call_id"]))
 				}
 				if callID == "" {
-					callID = "call_gemini"
+					callID = strings.TrimSpace(lastToolCallIDByName[strings.ToLower(name)])
+				}
+				if callID == "" {
+					callID = nextToolCallID()
 				}
 				content := fnResp["response"]
 				if content == nil {
--- a/internal/adapter/gemini/convert_messages_test.go
+++ b/internal/adapter/gemini/convert_messages_test.go
@@ -82,3 +82,48 @@ func TestGeminiMessagesFromRequestPreservesUnknownPartAsRawJSONText(t *testing.T
 		t.Fatalf("expected raw base64 payload not to be embedded, got %q", content)
 	}
 }
+
+func TestGeminiMessagesFromRequestBackfillsFunctionResponseCallIDByName(t *testing.T) {
+	req := map[string]any{
+		"contents": []any{
+			map[string]any{
+				"role": "model",
+				"parts": []any{
+					map[string]any{
+						"functionCall": map[string]any{
+							"name": "search_web",
+							"args": map[string]any{"query": "docs"},
+						},
+					},
+				},
+			},
+			map[string]any{
+				"role": "user",
+				"parts": []any{
+					map[string]any{
+						"functionResponse": map[string]any{
+							"name":     "search_web",
+							"response": map[string]any{"ok": true},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	got := geminiMessagesFromRequest(req)
+	if len(got) != 2 {
+		t.Fatalf("expected two normalized messages, got %#v", got)
+	}
+	assistant, _ := got[0].(map[string]any)
+	tc, _ := assistant["tool_calls"].([]any)
+	call, _ := tc[0].(map[string]any)
+	callID, _ := call["id"].(string)
+	if !strings.HasPrefix(callID, "call_gemini_") {
+		t.Fatalf("expected generated call id prefix, got %#v", call)
+	}
+	toolMsg, _ := got[1].(map[string]any)
+	if toolMsg["tool_call_id"] != callID {
+		t.Fatalf("expected tool response to inherit generated call id, tool=%#v call=%#v", toolMsg, call)
+	}
+}
--- a/internal/adapter/gemini/convert_passthrough.go
+++ b/internal/adapter/gemini/convert_passthrough.go
@@ -5,6 +5,7 @@ import (
 	"strings"
 )

+//nolint:unused // compatibility hook for native Gemini request normalization path.
 func collectGeminiPassThrough(req map[string]any) map[string]any {
 	cfg, _ := req["generationConfig"].(map[string]any)
 	if len(cfg) == 0 {
--- a/internal/adapter/gemini/convert_request.go
+++ b/internal/adapter/gemini/convert_request.go
@@ -9,6 +9,7 @@ import (
 	"ds2api/internal/util"
 )

+//nolint:unused // kept for native Gemini adapter route compatibility.
 func normalizeGeminiRequest(store ConfigReader, routeModel string, req map[string]any, stream bool) (util.StandardRequest, error) {
 	requestedModel := strings.TrimSpace(routeModel)
 	if requestedModel == "" {
@@ -17,17 +18,17 @@ func normalizeGeminiRequest(store ConfigReader, routeModel string, req map[strin

 	resolvedModel, ok := config.ResolveModel(store, requestedModel)
 	if !ok {
-		return util.StandardRequest{}, fmt.Errorf("Model '%s' is not available.", requestedModel)
+		return util.StandardRequest{}, fmt.Errorf("model %q is not available", requestedModel)
 	}
 	thinkingEnabled, searchEnabled, _ := config.GetModelConfig(resolvedModel)

 	messagesRaw := geminiMessagesFromRequest(req)
 	if len(messagesRaw) == 0 {
-		return util.StandardRequest{}, fmt.Errorf("Request must include non-empty contents.")
+		return util.StandardRequest{}, fmt.Errorf("request must include non-empty contents")
 	}

 	toolsRaw := convertGeminiTools(req["tools"])
-	finalPrompt, toolNames := openai.BuildPromptForAdapter(messagesRaw, toolsRaw, "")
+	finalPrompt, toolNames := openai.BuildPromptForAdapter(messagesRaw, toolsRaw, "", thinkingEnabled)
 	passThrough := collectGeminiPassThrough(req)

 	return util.StandardRequest{
--- a/internal/adapter/gemini/convert_tools.go
+++ b/internal/adapter/gemini/convert_tools.go
@@ -2,6 +2,7 @@ package gemini

 import "strings"

+//nolint:unused // kept for native Gemini adapter route compatibility.
 func convertGeminiTools(raw any) []any {
 	tools, _ := raw.([]any)
 	if len(tools) == 0 {
--- a/internal/adapter/gemini/deps.go
+++ b/internal/adapter/gemini/deps.go
@@ -22,6 +22,11 @@ type DeepSeekCaller interface {

 type ConfigReader interface {
 	ModelAliases() map[string]string
+	CompatStripReferenceMarkers() bool
+}
+
+type OpenAIChatRunner interface {
+	ChatCompletions(w http.ResponseWriter, r *http.Request)
 }

 var _ AuthResolver = (*auth.Resolver)(nil)
--- a/internal/adapter/gemini/handler_generate.go
+++ b/internal/adapter/gemini/handler_generate.go
@@ -1,74 +1,140 @@
 package gemini

 import (
+	"bytes"
+	"ds2api/internal/toolcall"
 	"encoding/json"
 	"io"
 	"net/http"
+	"net/http/httptest"
 	"strings"

 	"github.com/go-chi/chi/v5"

-	"ds2api/internal/auth"
 	"ds2api/internal/sse"
+	"ds2api/internal/translatorcliproxy"
 	"ds2api/internal/util"
+
+	sdktranslator "github.com/router-for-me/CLIProxyAPI/v6/sdk/translator"
 )

 func (h *Handler) handleGenerateContent(w http.ResponseWriter, r *http.Request, stream bool) {
-	a, err := h.Auth.Determine(r)
-	if err != nil {
-		status := http.StatusUnauthorized
-		detail := err.Error()
-		if err == auth.ErrNoAccount {
-			status = http.StatusTooManyRequests
-		}
-		writeGeminiError(w, status, detail)
+	if h.OpenAI == nil {
+		writeGeminiError(w, http.StatusInternalServerError, "OpenAI proxy backend unavailable.")
 		return
 	}
-	defer h.Auth.Release(a)
-
-	var req map[string]any
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		writeGeminiError(w, http.StatusBadRequest, "invalid json")
+	if h.proxyViaOpenAI(w, r, stream) {
 		return
 	}
-
-	routeModel := strings.TrimSpace(chi.URLParam(r, "model"))
-	stdReq, err := normalizeGeminiRequest(h.Store, routeModel, req, stream)
-	if err != nil {
-		writeGeminiError(w, http.StatusBadRequest, err.Error())
-		return
-	}
-
-	sessionID, err := h.DS.CreateSession(r.Context(), a, 3)
-	if err != nil {
-		if a.UseConfigToken {
-			writeGeminiError(w, http.StatusUnauthorized, "Account token is invalid. Please re-login the account in admin.")
-		} else {
-			writeGeminiError(w, http.StatusUnauthorized, "Invalid token.")
-		}
-		return
-	}
-	pow, err := h.DS.GetPow(r.Context(), a, 3)
-	if err != nil {
-		writeGeminiError(w, http.StatusUnauthorized, "Failed to get PoW (invalid token or unknown error).")
-		return
-	}
-	payload := stdReq.CompletionPayload(sessionID)
-	resp, err := h.DS.CallCompletion(r.Context(), a, payload, pow, 3)
-	if err != nil {
-		writeGeminiError(w, http.StatusInternalServerError, "Failed to get completion.")
-		return
-	}
-
-	if stream {
-		h.handleStreamGenerateContent(w, r, resp, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames)
-		return
-	}
-	h.handleNonStreamGenerateContent(w, resp, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.ToolNames)
+	writeGeminiError(w, http.StatusBadGateway, "Failed to proxy Gemini request.")
 }

+func (h *Handler) proxyViaOpenAI(w http.ResponseWriter, r *http.Request, stream bool) bool {
+	raw, err := io.ReadAll(r.Body)
+	if err != nil {
+		writeGeminiError(w, http.StatusBadRequest, "invalid body")
+		return true
+	}
+	routeModel := strings.TrimSpace(chi.URLParam(r, "model"))
+	translatedReq := translatorcliproxy.ToOpenAI(sdktranslator.FormatGemini, routeModel, raw, stream)
+	if !strings.Contains(string(translatedReq), `"stream"`) {
+		var reqMap map[string]any
+		if json.Unmarshal(translatedReq, &reqMap) == nil {
+			reqMap["stream"] = stream
+			if b, e := json.Marshal(reqMap); e == nil {
+				translatedReq = b
+			}
+		}
+	}
+
+	isVercelPrepare := strings.TrimSpace(r.URL.Query().Get("__stream_prepare")) == "1"
+	isVercelRelease := strings.TrimSpace(r.URL.Query().Get("__stream_release")) == "1"
+
+	if isVercelRelease {
+		proxyReq := r.Clone(r.Context())
+		proxyReq.URL.Path = "/v1/chat/completions"
+		proxyReq.Body = io.NopCloser(bytes.NewReader(raw))
+		proxyReq.ContentLength = int64(len(raw))
+		rec := httptest.NewRecorder()
+		h.OpenAI.ChatCompletions(rec, proxyReq)
+		res := rec.Result()
+		defer func() { _ = res.Body.Close() }()
+		body, _ := io.ReadAll(res.Body)
+		for k, vv := range res.Header {
+			for _, v := range vv {
+				w.Header().Add(k, v)
+			}
+		}
+		w.WriteHeader(res.StatusCode)
+		_, _ = w.Write(body)
+		return true
+	}
+
+	proxyReq := r.Clone(r.Context())
+	proxyReq.URL.Path = "/v1/chat/completions"
+	proxyReq.Body = io.NopCloser(bytes.NewReader(translatedReq))
+	proxyReq.ContentLength = int64(len(translatedReq))
+
+	if stream && !isVercelPrepare {
+		w.Header().Set("Content-Type", "text/event-stream")
+		w.Header().Set("Cache-Control", "no-cache, no-transform")
+		w.Header().Set("Connection", "keep-alive")
+		w.Header().Set("X-Accel-Buffering", "no")
+		streamWriter := translatorcliproxy.NewOpenAIStreamTranslatorWriter(w, sdktranslator.FormatGemini, routeModel, raw, translatedReq)
+		h.OpenAI.ChatCompletions(streamWriter, proxyReq)
+		return true
+	}
+
+	rec := httptest.NewRecorder()
+	h.OpenAI.ChatCompletions(rec, proxyReq)
+	res := rec.Result()
+	defer func() { _ = res.Body.Close() }()
+	body, _ := io.ReadAll(res.Body)
+	if res.StatusCode < 200 || res.StatusCode >= 300 {
+		for k, vv := range res.Header {
+			for _, v := range vv {
+				w.Header().Add(k, v)
+			}
+		}
+		writeGeminiErrorFromOpenAI(w, res.StatusCode, body)
+		return true
+	}
+	if isVercelPrepare {
+		for k, vv := range res.Header {
+			for _, v := range vv {
+				w.Header().Add(k, v)
+			}
+		}
+		w.WriteHeader(res.StatusCode)
+		_, _ = w.Write(body)
+		return true
+	}
+	converted := translatorcliproxy.FromOpenAINonStream(sdktranslator.FormatGemini, routeModel, raw, translatedReq, body)
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(http.StatusOK)
+	_, _ = w.Write(converted)
+	return true
+}
+
+func writeGeminiErrorFromOpenAI(w http.ResponseWriter, status int, raw []byte) {
+	message := strings.TrimSpace(string(raw))
+	var parsed map[string]any
+	if err := json.Unmarshal(raw, &parsed); err == nil {
+		if errObj, ok := parsed["error"].(map[string]any); ok {
+			if msg, ok := errObj["message"].(string); ok && strings.TrimSpace(msg) != "" {
+				message = strings.TrimSpace(msg)
+			}
+		}
+	}
+	if message == "" {
+		message = http.StatusText(status)
+	}
+	writeGeminiError(w, status, message)
+}
+
+//nolint:unused // retained for native Gemini non-stream handling path.
 func (h *Handler) handleNonStreamGenerateContent(w http.ResponseWriter, resp *http.Response, model, finalPrompt string, thinkingEnabled bool, toolNames []string) {
-	defer resp.Body.Close()
+	defer func() { _ = resp.Body.Close() }()
 	if resp.StatusCode != http.StatusOK {
 		body, _ := io.ReadAll(resp.Body)
 		writeGeminiError(w, resp.StatusCode, strings.TrimSpace(string(body)))
@@ -76,9 +142,17 @@ func (h *Handler) handleNonStreamGenerateContent(w http.ResponseWriter, resp *ht
 	}

 	result := sse.CollectStream(resp, thinkingEnabled, true)
-	writeJSON(w, http.StatusOK, buildGeminiGenerateContentResponse(model, finalPrompt, result.Thinking, result.Text, toolNames))
+	stripReferenceMarkers := h.compatStripReferenceMarkers()
+	writeJSON(w, http.StatusOK, buildGeminiGenerateContentResponse(
+		model,
+		finalPrompt,
+		cleanVisibleOutput(result.Thinking, stripReferenceMarkers),
+		cleanVisibleOutput(result.Text, stripReferenceMarkers),
+		toolNames,
+	))
 }

+//nolint:unused // retained for native Gemini non-stream handling path.
 func buildGeminiGenerateContentResponse(model, finalPrompt, finalThinking, finalText string, toolNames []string) map[string]any {
 	parts := buildGeminiPartsFromFinal(finalText, finalThinking, toolNames)
 	usage := buildGeminiUsage(finalPrompt, finalThinking, finalText)
@@ -98,6 +172,7 @@ func buildGeminiGenerateContentResponse(model, finalPrompt, finalThinking, final
 	}
 }

+//nolint:unused // retained for native Gemini non-stream handling path.
 func buildGeminiUsage(finalPrompt, finalThinking, finalText string) map[string]any {
 	promptTokens := util.EstimateTokens(finalPrompt)
 	reasoningTokens := util.EstimateTokens(finalThinking)
@@ -109,10 +184,11 @@ func buildGeminiUsage(finalPrompt, finalThinking, finalText string) map[string]a
 	}
 }

+//nolint:unused // retained for native Gemini non-stream handling path.
 func buildGeminiPartsFromFinal(finalText, finalThinking string, toolNames []string) []map[string]any {
-	detected := util.ParseToolCalls(finalText, toolNames)
-	if len(detected) == 0 && strings.TrimSpace(finalThinking) != "" {
-		detected = util.ParseToolCalls(finalThinking, toolNames)
+	detected := toolcall.ParseToolCalls(finalText, toolNames)
+	if len(detected) == 0 && finalThinking != "" {
+		detected = toolcall.ParseToolCalls(finalThinking, toolNames)
 	}
 	if len(detected) > 0 {
 		parts := make([]map[string]any, 0, len(detected))
@@ -128,7 +204,7 @@ func buildGeminiPartsFromFinal(finalText, finalThinking string, toolNames []stri
 	}

 	text := finalText
-	if strings.TrimSpace(text) == "" {
+	if text == "" {
 		text = finalThinking
 	}
 	return []map[string]any{{"text": text}}
--- a/internal/adapter/gemini/handler_routes.go
+++ b/internal/adapter/gemini/handler_routes.go
@@ -11,9 +11,18 @@ import (
 var writeJSON = util.WriteJSON

 type Handler struct {
-	Store ConfigReader
-	Auth  AuthResolver
-	DS    DeepSeekCaller
+	Store  ConfigReader
+	Auth   AuthResolver
+	DS     DeepSeekCaller
+	OpenAI OpenAIChatRunner
+}
+
+//nolint:unused // used by native Gemini stream/non-stream runtime helpers.
+func (h *Handler) compatStripReferenceMarkers() bool {
+	if h == nil || h.Store == nil {
+		return true
+	}
+	return h.Store.CompatStripReferenceMarkers()
 }

 func RegisterRoutes(r chi.Router, h *Handler) {
--- a/internal/adapter/gemini/handler_stream_runtime.go
+++ b/internal/adapter/gemini/handler_stream_runtime.go
@@ -12,8 +12,9 @@ import (
 	streamengine "ds2api/internal/stream"
 )

+//nolint:unused // retained for native Gemini stream handling path.
 func (h *Handler) handleStreamGenerateContent(w http.ResponseWriter, r *http.Request, resp *http.Response, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string) {
-	defer resp.Body.Close()
+	defer func() { _ = resp.Body.Close() }()
 	if resp.StatusCode != http.StatusOK {
 		body, _ := io.ReadAll(resp.Body)
 		writeGeminiError(w, resp.StatusCode, strings.TrimSpace(string(body)))
@@ -27,7 +28,7 @@ func (h *Handler) handleStreamGenerateContent(w http.ResponseWriter, r *http.Req

 	rc := http.NewResponseController(w)
 	_, canFlush := w.(http.Flusher)
-	runtime := newGeminiStreamRuntime(w, rc, canFlush, model, finalPrompt, thinkingEnabled, searchEnabled, toolNames)
+	runtime := newGeminiStreamRuntime(w, rc, canFlush, model, finalPrompt, thinkingEnabled, searchEnabled, h.compatStripReferenceMarkers(), toolNames)

 	initialType := "text"
 	if thinkingEnabled {
@@ -49,6 +50,7 @@ func (h *Handler) handleStreamGenerateContent(w http.ResponseWriter, r *http.Req
 	})
 }

+//nolint:unused // retained for native Gemini stream handling path.
 type geminiStreamRuntime struct {
 	w        http.ResponseWriter
 	rc       *http.ResponseController
@@ -57,15 +59,17 @@ type geminiStreamRuntime struct {
 	model       string
 	finalPrompt string

-	thinkingEnabled bool
-	searchEnabled   bool
-	bufferContent   bool
-	toolNames       []string
+	thinkingEnabled       bool
+	searchEnabled         bool
+	bufferContent         bool
+	stripReferenceMarkers bool
+	toolNames             []string

 	thinking strings.Builder
 	text     strings.Builder
 }

+//nolint:unused // retained for native Gemini stream handling path.
 func newGeminiStreamRuntime(
 	w http.ResponseWriter,
 	rc *http.ResponseController,
@@ -74,21 +78,24 @@ func newGeminiStreamRuntime(
 	finalPrompt string,
 	thinkingEnabled bool,
 	searchEnabled bool,
+	stripReferenceMarkers bool,
 	toolNames []string,
 ) *geminiStreamRuntime {
 	return &geminiStreamRuntime{
-		w:               w,
-		rc:              rc,
-		canFlush:        canFlush,
-		model:           model,
-		finalPrompt:     finalPrompt,
-		thinkingEnabled: thinkingEnabled,
-		searchEnabled:   searchEnabled,
-		bufferContent:   len(toolNames) > 0,
-		toolNames:       toolNames,
+		w:                     w,
+		rc:                    rc,
+		canFlush:              canFlush,
+		model:                 model,
+		finalPrompt:           finalPrompt,
+		thinkingEnabled:       thinkingEnabled,
+		searchEnabled:         searchEnabled,
+		bufferContent:         len(toolNames) > 0,
+		stripReferenceMarkers: stripReferenceMarkers,
+		toolNames:             toolNames,
 	}
 }

+//nolint:unused // retained for native Gemini stream handling path.
 func (s *geminiStreamRuntime) sendChunk(payload map[string]any) {
 	b, _ := json.Marshal(payload)
 	_, _ = s.w.Write([]byte("data: "))
@@ -99,6 +106,7 @@ func (s *geminiStreamRuntime) sendChunk(payload map[string]any) {
 	}
 }

+//nolint:unused // retained for native Gemini stream handling path.
 func (s *geminiStreamRuntime) onParsed(parsed sse.LineResult) streamengine.ParsedDecision {
 	if !parsed.Parsed {
 		return streamengine.ParsedDecision{}
@@ -109,20 +117,29 @@ func (s *geminiStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Parse

 	contentSeen := false
 	for _, p := range parsed.Parts {
-		if p.Text == "" {
+		cleanedText := cleanVisibleOutput(p.Text, s.stripReferenceMarkers)
+		if cleanedText == "" {
 			continue
 		}
-		if p.Type != "thinking" && s.searchEnabled && sse.IsCitation(p.Text) {
+		if p.Type != "thinking" && s.searchEnabled && sse.IsCitation(cleanedText) {
 			continue
 		}
 		contentSeen = true
 		if p.Type == "thinking" {
 			if s.thinkingEnabled {
-				s.thinking.WriteString(p.Text)
+				trimmed := sse.TrimContinuationOverlap(s.thinking.String(), cleanedText)
+				if trimmed == "" {
+					continue
+				}
+				s.thinking.WriteString(trimmed)
 			}
 			continue
 		}
-		s.text.WriteString(p.Text)
+		trimmed := sse.TrimContinuationOverlap(s.text.String(), cleanedText)
+		if trimmed == "" {
+			continue
+		}
+		s.text.WriteString(trimmed)
 		if s.bufferContent {
 			continue
 		}
@@ -132,7 +149,7 @@ func (s *geminiStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Parse
 					"index": 0,
 					"content": map[string]any{
 						"role":  "model",
-						"parts": []map[string]any{{"text": p.Text}},
+						"parts": []map[string]any{{"text": trimmed}},
 					},
 				},
 			},
@@ -142,9 +159,10 @@ func (s *geminiStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Parse
 	return streamengine.ParsedDecision{ContentSeen: contentSeen}
 }

+//nolint:unused // retained for native Gemini stream handling path.
 func (s *geminiStreamRuntime) finalize() {
 	finalThinking := s.thinking.String()
-	finalText := s.text.String()
+	finalText := cleanVisibleOutput(s.text.String(), s.stripReferenceMarkers)

 	if s.bufferContent {
 		parts := buildGeminiPartsFromFinal(finalText, finalThinking, s.toolNames)
--- a/internal/adapter/gemini/handler_test.go
+++ b/internal/adapter/gemini/handler_test.go
@@ -17,7 +17,8 @@ import (

 type testGeminiConfig struct{}

-func (testGeminiConfig) ModelAliases() map[string]string { return nil }
+func (testGeminiConfig) ModelAliases() map[string]string   { return nil }
+func (testGeminiConfig) CompatStripReferenceMarkers() bool { return true }

 type testGeminiAuth struct {
 	a   *auth.RequestAuth
@@ -41,19 +42,23 @@ func (m testGeminiAuth) Determine(_ *http.Request) (*auth.RequestAuth, error) {

 func (testGeminiAuth) Release(_ *auth.RequestAuth) {}

+//nolint:unused // reserved test double for native Gemini DS-call path coverage.
 type testGeminiDS struct {
 	resp *http.Response
 	err  error
 }

+//nolint:unused // reserved test double for native Gemini DS-call path coverage.
 func (m testGeminiDS) CreateSession(_ context.Context, _ *auth.RequestAuth, _ int) (string, error) {
 	return "session-id", nil
 }

+//nolint:unused // reserved test double for native Gemini DS-call path coverage.
 func (m testGeminiDS) GetPow(_ context.Context, _ *auth.RequestAuth, _ int) (string, error) {
 	return "pow", nil
 }

+//nolint:unused // reserved test double for native Gemini DS-call path coverage.
 func (m testGeminiDS) CallCompletion(_ context.Context, _ *auth.RequestAuth, _ map[string]any, _ string, _ int) (*http.Response, error) {
 	if m.err != nil {
 		return nil, m.err
@@ -61,6 +66,51 @@ func (m testGeminiDS) CallCompletion(_ context.Context, _ *auth.RequestAuth, _ m
 	return m.resp, nil
 }

+type geminiOpenAIErrorStub struct {
+	status  int
+	body    string
+	headers map[string]string
+}
+
+func (s geminiOpenAIErrorStub) ChatCompletions(w http.ResponseWriter, _ *http.Request) {
+	for k, v := range s.headers {
+		w.Header().Set(k, v)
+	}
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(s.status)
+	_, _ = w.Write([]byte(s.body))
+}
+
+type geminiOpenAISuccessStub struct {
+	stream  bool
+	body    string
+	seenReq map[string]any
+}
+
+func (s *geminiOpenAISuccessStub) ChatCompletions(w http.ResponseWriter, r *http.Request) {
+	if r != nil {
+		var req map[string]any
+		_ = json.NewDecoder(r.Body).Decode(&req)
+		s.seenReq = req
+	}
+	if s.stream {
+		w.Header().Set("Content-Type", "text/event-stream")
+		w.WriteHeader(http.StatusOK)
+		_, _ = w.Write([]byte("data: {\"id\":\"chatcmpl-1\",\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{\"content\":\"hello \"},\"finish_reason\":null}]}\n\n"))
+		_, _ = w.Write([]byte("data: {\"id\":\"chatcmpl-1\",\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{\"content\":\"world\"},\"finish_reason\":\"stop\"}]}\n\n"))
+		_, _ = w.Write([]byte("data: [DONE]\n\n"))
+		return
+	}
+	out := s.body
+	if strings.TrimSpace(out) == "" {
+		out = `{"id":"chatcmpl-1","object":"chat.completion","choices":[{"index":0,"message":{"role":"assistant","tool_calls":[{"id":"call_1","type":"function","function":{"name":"eval_javascript","arguments":"{\"code\":\"1+1\"}"}}]},"finish_reason":"tool_calls"}]}`
+	}
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(http.StatusOK)
+	_, _ = w.Write([]byte(out))
+}
+
+//nolint:unused // helper retained for native Gemini stream fixture tests.
 func makeGeminiUpstreamResponse(lines ...string) *http.Response {
 	body := strings.Join(lines, "\n")
 	if !strings.HasSuffix(body, "\n") {
@@ -98,14 +148,11 @@ func TestGeminiRoutesRegistered(t *testing.T) {
 }

 func TestGenerateContentReturnsFunctionCallParts(t *testing.T) {
-	upstream := makeGeminiUpstreamResponse(
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"eval_javascript\",\"input\":{\"code\":\"1+1\"}}]}"}`,
-		`data: [DONE]`,
-	)
 	h := &Handler{
 		Store: testGeminiConfig{},
-		Auth:  testGeminiAuth{},
-		DS:    testGeminiDS{resp: upstream},
+		OpenAI: &geminiOpenAISuccessStub{
+			body: `{"id":"chatcmpl-1","object":"chat.completion","choices":[{"index":0,"message":{"role":"assistant","tool_calls":[{"id":"call_1","type":"function","function":{"name":"eval_javascript","arguments":"{\"code\":\"1+1\"}"}}]},"finish_reason":"tool_calls"}]}`,
+		},
 	}
 	r := chi.NewRouter()
 	RegisterRoutes(r, h)
@@ -115,7 +162,6 @@ func TestGenerateContentReturnsFunctionCallParts(t *testing.T) {
 		"tools":[{"functionDeclarations":[{"name":"eval_javascript","description":"eval","parameters":{"type":"object","properties":{"code":{"type":"string"}}}}]}]
 	}`
 	req := httptest.NewRequest(http.MethodPost, "/v1beta/models/gemini-2.5-pro:generateContent", strings.NewReader(body))
-	req.Header.Set("Authorization", "Bearer direct-token")
 	rec := httptest.NewRecorder()
 	r.ServeHTTP(rec, req)
 	if rec.Code != http.StatusOK {
@@ -144,11 +190,7 @@ func TestGenerateContentReturnsFunctionCallParts(t *testing.T) {
 }

 func TestGenerateContentMixedToolSnippetAlsoTriggersFunctionCall(t *testing.T) {
-	upstream := makeGeminiUpstreamResponse(
-		`data: {"p":"response/content","v":"我来调用工具\n{\"tool_calls\":[{\"name\":\"eval_javascript\",\"input\":{\"code\":\"1+1\"}}]}"}`,
-		`data: [DONE]`,
-	)
-	h := &Handler{Store: testGeminiConfig{}, Auth: testGeminiAuth{}, DS: testGeminiDS{resp: upstream}}
+	h := &Handler{Store: testGeminiConfig{}, OpenAI: &geminiOpenAISuccessStub{}}
 	r := chi.NewRouter()
 	RegisterRoutes(r, h)

@@ -157,7 +199,6 @@ func TestGenerateContentMixedToolSnippetAlsoTriggersFunctionCall(t *testing.T) {
 		"tools":[{"functionDeclarations":[{"name":"eval_javascript","description":"eval","parameters":{"type":"object","properties":{"code":{"type":"string"}}}}]}]
 	}`
 	req := httptest.NewRequest(http.MethodPost, "/v1beta/models/gemini-2.5-pro:generateContent", strings.NewReader(body))
-	req.Header.Set("Authorization", "Bearer direct-token")
 	rec := httptest.NewRecorder()
 	r.ServeHTTP(rec, req)

@@ -180,38 +221,25 @@ func TestGenerateContentMixedToolSnippetAlsoTriggersFunctionCall(t *testing.T) {
 }

 func TestStreamGenerateContentEmitsSSE(t *testing.T) {
-	upstream := makeGeminiUpstreamResponse(
-		`data: {"p":"response/content","v":"hello "}`,
-		`data: {"p":"response/content","v":"world"}`,
-		`data: [DONE]`,
-	)
 	h := &Handler{
-		Store: testGeminiConfig{},
-		Auth:  testGeminiAuth{},
-		DS:    testGeminiDS{resp: upstream},
+		Store:  testGeminiConfig{},
+		OpenAI: &geminiOpenAISuccessStub{stream: true},
 	}
 	r := chi.NewRouter()
 	RegisterRoutes(r, h)

 	body := `{"contents":[{"role":"user","parts":[{"text":"hello"}]}]}`
 	req := httptest.NewRequest(http.MethodPost, "/v1/models/gemini-2.5-pro:streamGenerateContent?alt=sse", strings.NewReader(body))
-	req.Header.Set("Authorization", "Bearer direct-token")
 	rec := httptest.NewRecorder()
 	r.ServeHTTP(rec, req)

 	if rec.Code != http.StatusOK {
 		t.Fatalf("expected 200, got %d body=%s", rec.Code, rec.Body.String())
 	}
-	if !strings.Contains(rec.Body.String(), "data: ") {
-		t.Fatalf("expected SSE data frames, got body=%s", rec.Body.String())
-	}
-	if !strings.Contains(rec.Body.String(), `"finishReason":"STOP"`) {
-		t.Fatalf("expected stream finish frame, got body=%s", rec.Body.String())
-	}

 	frames := extractGeminiSSEFrames(t, rec.Body.String())
 	if len(frames) == 0 {
-		t.Fatalf("expected non-empty sse frames, body=%s", rec.Body.String())
+		t.Fatalf("expected non-empty stream frames, body=%s", rec.Body.String())
 	}
 	last := frames[len(frames)-1]
 	candidates, _ := last["candidates"].([]any)
@@ -229,16 +257,94 @@ func TestStreamGenerateContentEmitsSSE(t *testing.T) {
 	}
 }

+// TestGeminiProxyTranslatesInlineImageToOpenAIDataURL posts a Gemini
+// generateContent request whose single user turn carries a text part and an
+// inlineData PNG part, then inspects the request captured in openAI.seenReq:
+// it must hold exactly one message with two content blocks, the second being
+// an image_url block whose URL starts with "data:image/png;base64,".
+func TestGeminiProxyTranslatesInlineImageToOpenAIDataURL(t *testing.T) {
+	openAI := &geminiOpenAISuccessStub{}
+	h := &Handler{Store: testGeminiConfig{}, OpenAI: openAI}
+	r := chi.NewRouter()
+	RegisterRoutes(r, h)
+
+	body := `{"contents":[{"role":"user","parts":[{"text":"hello"},{"inlineData":{"mimeType":"image/png","data":"QUJDRA=="}}]}]}`
+	req := httptest.NewRequest(http.MethodPost, "/v1beta/models/gemini-2.5-pro:generateContent", strings.NewReader(body))
+	rec := httptest.NewRecorder()
+	r.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d body=%s", rec.Code, rec.Body.String())
+	}
+	// Failed type assertions below yield zero values, so a wrongly shaped
+	// request surfaces through the length/prefix checks rather than a panic.
+	messages, _ := openAI.seenReq["messages"].([]any)
+	if len(messages) != 1 {
+		t.Fatalf("expected one translated message, got %#v", openAI.seenReq)
+	}
+	msg, _ := messages[0].(map[string]any)
+	content, _ := msg["content"].([]any)
+	if len(content) != 2 {
+		t.Fatalf("expected translated content blocks, got %#v", msg)
+	}
+	imageBlock, _ := content[1].(map[string]any)
+	if strings.TrimSpace(asString(imageBlock["type"])) != "image_url" {
+		t.Fatalf("expected image_url block, got %#v", imageBlock)
+	}
+	imageURL, _ := imageBlock["image_url"].(map[string]any)
+	if !strings.HasPrefix(strings.TrimSpace(asString(imageURL["url"])), "data:image/png;base64,") {
+		t.Fatalf("expected translated data url, got %#v", imageBlock)
+	}
+}
+
+// TestGenerateContentOpenAIProxyErrorUsesGeminiEnvelope configures the OpenAI
+// stub to answer 401 with an OpenAI-style error body plus three headers, and
+// checks that the Gemini route keeps the 401 status, returns an error object
+// with status "UNAUTHENTICATED" and the upstream message, and preserves the
+// WWW-Authenticate, Retry-After and X-RateLimit-Remaining headers.
+func TestGenerateContentOpenAIProxyErrorUsesGeminiEnvelope(t *testing.T) {
+	h := &Handler{
+		Store: testGeminiConfig{},
+		OpenAI: geminiOpenAIErrorStub{
+			status: http.StatusUnauthorized,
+			body:   `{"error":{"message":"invalid api key"}}`,
+			headers: map[string]string{
+				"WWW-Authenticate":      `Bearer realm="example"`,
+				"Retry-After":           "30",
+				"X-RateLimit-Remaining": "0",
+			},
+		},
+	}
+	r := chi.NewRouter()
+	RegisterRoutes(r, h)
+
+	req := httptest.NewRequest(http.MethodPost, "/v1/models/gemini-2.5-pro:generateContent", strings.NewReader(`{"contents":[{"role":"user","parts":[{"text":"hi"}]}]}`))
+	rec := httptest.NewRecorder()
+	r.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusUnauthorized {
+		t.Fatalf("expected 401, got %d body=%s", rec.Code, rec.Body.String())
+	}
+	var out map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &out); err != nil {
+		t.Fatalf("expected json body: %v", err)
+	}
+	// A missing "error" object leaves errObj nil; the comparisons below then
+	// fail with a readable message instead of panicking.
+	errObj, _ := out["error"].(map[string]any)
+	if errObj["status"] != "UNAUTHENTICATED" {
+		t.Fatalf("expected Gemini status UNAUTHENTICATED, got=%v", errObj["status"])
+	}
+	if errObj["message"] != "invalid api key" {
+		t.Fatalf("expected parsed error message, got=%v", errObj["message"])
+	}
+	if got := rec.Header().Get("WWW-Authenticate"); got == "" {
+		t.Fatalf("expected WWW-Authenticate header to be preserved")
+	}
+	if got := rec.Header().Get("Retry-After"); got != "30" {
+		t.Fatalf("expected Retry-After header 30, got=%q", got)
+	}
+	if got := rec.Header().Get("X-RateLimit-Remaining"); got != "0" {
+		t.Fatalf("expected X-RateLimit-Remaining header 0, got=%q", got)
+	}
+}
+
 func extractGeminiSSEFrames(t *testing.T, body string) []map[string]any {
 	t.Helper()
 	scanner := bufio.NewScanner(strings.NewReader(body))
 	out := make([]map[string]any, 0, 4)
 	for scanner.Scan() {
 		line := strings.TrimSpace(scanner.Text())
-		if !strings.HasPrefix(line, "data: ") {
-			continue
+		raw := line
+		if strings.HasPrefix(line, "data: ") {
+			raw = strings.TrimSpace(strings.TrimPrefix(line, "data: "))
 		}
-		raw := strings.TrimSpace(strings.TrimPrefix(line, "data: "))
 		if raw == "" {
 			continue
 		}
--- a/internal/adapter/gemini/output_clean.go
+++ b/internal/adapter/gemini/output_clean.go
@@ -0,0 +1,14 @@
+package gemini
+
+import textclean "ds2api/internal/textclean"
+
+// cleanVisibleOutput returns text after optional post-processing. When
+// stripReferenceMarkers is true the text is passed through
+// textclean.StripReferenceMarkers; otherwise, and for empty input, it is
+// returned unchanged.
+//
+//nolint:unused // retained for native Gemini output post-processing path.
+func cleanVisibleOutput(text string, stripReferenceMarkers bool) string {
+	if text == "" {
+		return text
+	}
+	if stripReferenceMarkers {
+		text = textclean.StripReferenceMarkers(text)
+	}
+	return text
+}
--- a/internal/adapter/gemini/proxy_vercel_test.go
+++ b/internal/adapter/gemini/proxy_vercel_test.go
@@ -0,0 +1,42 @@
+package gemini
+
+import (
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+)
+
+// openAIProxyStub is a test double whose ChatCompletions method replies with
+// a fixed status code and body.
+type openAIProxyStub struct {
+	status int    // HTTP status to write; 0 is treated as 200 by ChatCompletions
+	body   string // response body written verbatim
+}
+
+// ChatCompletions ignores the request and writes the stub's status and body
+// with a JSON content type. A zero status defaults to 200; the assignment
+// only touches the local copy because the receiver is a value.
+func (s openAIProxyStub) ChatCompletions(w http.ResponseWriter, _ *http.Request) {
+	if s.status == 0 {
+		s.status = http.StatusOK
+	}
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(s.status)
+	_, _ = w.Write([]byte(s.body))
+}
+
+// TestGeminiProxyViaOpenAIVercelReleasePassthrough calls StreamGenerateContent
+// directly with a "__stream_release=1" query and a lease_id body, and expects
+// the stub's `{"success":true}` JSON to come back with status 200.
+func TestGeminiProxyViaOpenAIVercelReleasePassthrough(t *testing.T) {
+	h := &Handler{OpenAI: openAIProxyStub{status: 200, body: `{"success":true}`}}
+	req := httptest.NewRequest(http.MethodPost, "/v1beta/models/gemini-2.5-pro:streamGenerateContent?__stream_release=1", strings.NewReader(`{"lease_id":"lease_123"}`))
+	rec := httptest.NewRecorder()
+
+	h.StreamGenerateContent(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("unexpected status: %d body=%s", rec.Code, rec.Body.String())
+	}
+	var out map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &out); err != nil {
+		t.Fatalf("expected json response, got err=%v body=%s", err, rec.Body.String())
+	}
+	if v, ok := out["success"].(bool); !ok || !v {
+		t.Fatalf("expected success=true passthrough, got=%v", out)
+	}
+}
--- a/internal/adapter/openai/chat_stream_runtime.go
+++ b/internal/adapter/openai/chat_stream_runtime.go
@@ -1,6 +1,7 @@
 package openai

 import (
+	"ds2api/internal/toolcall"
 	"encoding/json"
 	"net/http"
 	"strings"
@@ -8,7 +9,6 @@ import (
 	openaifmt "ds2api/internal/format/openai"
 	"ds2api/internal/sse"
 	streamengine "ds2api/internal/stream"
-	"ds2api/internal/util"
 )

 type chatStreamRuntime struct {
@@ -22,8 +22,9 @@ type chatStreamRuntime struct {
 	finalPrompt  string
 	toolNames    []string

-	thinkingEnabled bool
-	searchEnabled   bool
+	thinkingEnabled       bool
+	searchEnabled         bool
+	stripReferenceMarkers bool

 	firstChunkSent       bool
 	bufferToolContent    bool
@@ -48,25 +49,27 @@ func newChatStreamRuntime(
 	finalPrompt string,
 	thinkingEnabled bool,
 	searchEnabled bool,
+	stripReferenceMarkers bool,
 	toolNames []string,
 	bufferToolContent bool,
 	emitEarlyToolDeltas bool,
 ) *chatStreamRuntime {
 	return &chatStreamRuntime{
-		w:                   w,
-		rc:                  rc,
-		canFlush:            canFlush,
-		completionID:        completionID,
-		created:             created,
-		model:               model,
-		finalPrompt:         finalPrompt,
-		toolNames:           toolNames,
-		thinkingEnabled:     thinkingEnabled,
-		searchEnabled:       searchEnabled,
-		bufferToolContent:   bufferToolContent,
-		emitEarlyToolDeltas: emitEarlyToolDeltas,
-		streamToolCallIDs:   map[int]string{},
-		streamToolNames:     map[int]string{},
+		w:                     w,
+		rc:                    rc,
+		canFlush:              canFlush,
+		completionID:          completionID,
+		created:               created,
+		model:                 model,
+		finalPrompt:           finalPrompt,
+		toolNames:             toolNames,
+		thinkingEnabled:       thinkingEnabled,
+		searchEnabled:         searchEnabled,
+		stripReferenceMarkers: stripReferenceMarkers,
+		bufferToolContent:     bufferToolContent,
+		emitEarlyToolDeltas:   emitEarlyToolDeltas,
+		streamToolCallIDs:     map[int]string{},
+		streamToolNames:       map[int]string{},
 	}
 }

@@ -95,10 +98,23 @@ func (s *chatStreamRuntime) sendDone() {
 	}
 }

+// sendFailedChunk emits one chunk describing a failure and then terminates the
+// stream via sendDone. The chunk carries the numeric status under
+// "status_code" and an "error" object with the message, a type derived from
+// the status by openAIErrorType, the given code, and a nil "param".
+func (s *chatStreamRuntime) sendFailedChunk(status int, message, code string) {
+	s.sendChunk(map[string]any{
+		"status_code": status,
+		"error": map[string]any{
+			"message": message,
+			"type":    openAIErrorType(status),
+			"code":    code,
+			"param":   nil,
+		},
+	})
+	s.sendDone()
+}
+
 func (s *chatStreamRuntime) finalize(finishReason string) {
 	finalThinking := s.thinking.String()
-	finalText := sanitizeLeakedToolHistory(s.text.String())
-	detected := util.ParseStandaloneToolCallsDetailed(finalText, s.toolNames)
+	finalText := cleanVisibleOutput(s.text.String(), s.stripReferenceMarkers)
+	detected := toolcall.ParseStandaloneToolCallsDetailed(finalText, s.toolNames)
 	if len(detected.Calls) > 0 && !s.toolCallsDoneEmitted {
 		finishReason = "tool_calls"
 		delta := map[string]any{
@@ -141,7 +157,7 @@ func (s *chatStreamRuntime) finalize(finishReason string) {
 			if evt.Content == "" {
 				continue
 			}
-			cleaned := sanitizeLeakedToolHistory(evt.Content)
+			cleaned := cleanVisibleOutput(evt.Content, s.stripReferenceMarkers)
 			if cleaned == "" {
 				continue
 			}
@@ -165,12 +181,28 @@ func (s *chatStreamRuntime) finalize(finishReason string) {
 	if len(detected.Calls) > 0 || s.toolCallsEmitted {
 		finishReason = "tool_calls"
 	}
+	if len(detected.Calls) == 0 && !s.toolCallsEmitted && strings.TrimSpace(finalText) == "" {
+		status := http.StatusTooManyRequests
+		message := "Upstream model returned empty output."
+		code := "upstream_empty_output"
+		if strings.TrimSpace(finalThinking) != "" {
+			message = "Upstream model returned reasoning without visible output."
+		}
+		if finishReason == "content_filter" {
+			status = http.StatusBadRequest
+			message = "Upstream content filtered the response and returned no output."
+			code = "content_filter"
+		}
+		s.sendFailedChunk(status, message, code)
+		return
+	}
+	usage := openaifmt.BuildChatUsage(s.finalPrompt, finalThinking, finalText)
 	s.sendChunk(openaifmt.BuildChatStreamChunk(
 		s.completionID,
 		s.created,
 		s.model,
 		[]map[string]any{openaifmt.BuildChatStreamFinishChoice(0, finishReason)},
-		openaifmt.BuildChatUsage(s.finalPrompt, finalThinking, finalText),
+		usage,
 	))
 	s.sendDone()
 }
@@ -179,7 +211,13 @@ func (s *chatStreamRuntime) onParsed(parsed sse.LineResult) streamengine.ParsedD
 	if !parsed.Parsed {
 		return streamengine.ParsedDecision{}
 	}
-	if parsed.ContentFilter || parsed.ErrorMessage != "" {
+	if parsed.ContentFilter {
+		if strings.TrimSpace(s.text.String()) == "" {
+			return streamengine.ParsedDecision{Stop: true, StopReason: streamengine.StopReason("content_filter")}
+		}
+		return streamengine.ParsedDecision{Stop: true, StopReason: streamengine.StopReasonHandlerRequested}
+	}
+	if parsed.ErrorMessage != "" {
 		return streamengine.ParsedDecision{Stop: true, StopReason: streamengine.StopReason("content_filter")}
 	}
 	if parsed.Stop {
@@ -189,10 +227,11 @@ func (s *chatStreamRuntime) onParsed(parsed sse.LineResult) streamengine.ParsedD
 	newChoices := make([]map[string]any, 0, len(parsed.Parts))
 	contentSeen := false
 	for _, p := range parsed.Parts {
-		if s.searchEnabled && sse.IsCitation(p.Text) {
+		cleanedText := cleanVisibleOutput(p.Text, s.stripReferenceMarkers)
+		if s.searchEnabled && sse.IsCitation(cleanedText) {
 			continue
 		}
-		if p.Text == "" {
+		if cleanedText == "" {
 			continue
 		}
 		contentSeen = true
@@ -203,21 +242,29 @@ func (s *chatStreamRuntime) onParsed(parsed sse.LineResult) streamengine.ParsedD
 		}
 		if p.Type == "thinking" {
 			if s.thinkingEnabled {
-				s.thinking.WriteString(p.Text)
-				delta["reasoning_content"] = p.Text
+				trimmed := sse.TrimContinuationOverlap(s.thinking.String(), cleanedText)
+				if trimmed == "" {
+					continue
+				}
+				s.thinking.WriteString(trimmed)
+				delta["reasoning_content"] = trimmed
 			}
 		} else {
-			s.text.WriteString(p.Text)
+			trimmed := sse.TrimContinuationOverlap(s.text.String(), cleanedText)
+			if trimmed == "" {
+				continue
+			}
+			s.text.WriteString(trimmed)
 			if !s.bufferToolContent {
-				delta["content"] = p.Text
+				delta["content"] = trimmed
 			} else {
-				events := processToolSieveChunk(&s.toolSieve, p.Text, s.toolNames)
+				events := processToolSieveChunk(&s.toolSieve, trimmed, s.toolNames)
 				for _, evt := range events {
 					if len(evt.ToolCallDeltas) > 0 {
 						if !s.emitEarlyToolDeltas {
 							continue
 						}
-						filtered := filterIncrementalToolCallDeltasByAllowed(evt.ToolCallDeltas, s.toolNames, s.streamToolNames)
+						filtered := filterIncrementalToolCallDeltasByAllowed(evt.ToolCallDeltas, s.streamToolNames)
 						if len(filtered) == 0 {
 							continue
 						}
@@ -250,7 +297,7 @@ func (s *chatStreamRuntime) onParsed(parsed sse.LineResult) streamengine.ParsedD
 						continue
 					}
 					if evt.Content != "" {
-						cleaned := sanitizeLeakedToolHistory(evt.Content)
+						cleaned := cleanVisibleOutput(evt.Content, s.stripReferenceMarkers)
 						if cleaned == "" {
 							continue
 						}
--- a/internal/adapter/openai/citation_links.go
+++ b/internal/adapter/openai/citation_links.go
@@ -0,0 +1,31 @@
+package openai
+
+import (
+	"fmt"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+// citationMarkerPattern matches markers of the form "[citation:N]",
+// case-insensitively and with optional whitespace after the colon; the digits
+// N are captured as group 1.
+var citationMarkerPattern = regexp.MustCompile(`(?i)\[citation:\s*(\d+)\]`)
+
+// replaceCitationMarkersWithLinks rewrites each "[citation:N]" marker in text
+// as a Markdown link "[N](url)", where url is links[N] with surrounding
+// whitespace trimmed.
+//
+// A marker is left untouched when N does not parse as a positive integer or
+// when links has no non-blank URL for it. The text is returned unchanged when
+// it is blank or links is empty.
+func replaceCitationMarkersWithLinks(text string, links map[int]string) string {
+	if strings.TrimSpace(text) == "" || len(links) == 0 {
+		return text
+	}
+	return citationMarkerPattern.ReplaceAllStringFunc(text, func(match string) string {
+		// ReplaceAllStringFunc only passes the whole match, so re-run the
+		// pattern on it to recover the captured index.
+		sub := citationMarkerPattern.FindStringSubmatch(match)
+		if len(sub) < 2 {
+			return match
+		}
+		idx, err := strconv.Atoi(strings.TrimSpace(sub[1]))
+		if err != nil || idx <= 0 {
+			return match
+		}
+		url := strings.TrimSpace(links[idx])
+		if url == "" {
+			return match
+		}
+		return fmt.Sprintf("[%d](%s)", idx, url)
+	})
+}
--- a/internal/adapter/openai/citation_links_test.go
+++ b/internal/adapter/openai/citation_links_test.go
@@ -0,0 +1,28 @@
+package openai
+
+import "testing"
+
+// TestReplaceCitationMarkersWithLinks checks that two known citation markers
+// embedded in non-ASCII text are each replaced by a "[N](url)" link.
+func TestReplaceCitationMarkersWithLinks(t *testing.T) {
+	raw := "这是一条更新[citation:1]，更多信息见[citation:2]。"
+	links := map[int]string{
+		1: "https://example.com/news-1",
+		2: "https://example.com/news-2",
+	}
+
+	got := replaceCitationMarkersWithLinks(raw, links)
+	want := "这是一条更新[1](https://example.com/news-1)，更多信息见[2](https://example.com/news-2)。"
+	if got != want {
+		t.Fatalf("expected %q, got %q", want, got)
+	}
+}
+
+// TestReplaceCitationMarkersWithLinksKeepsUnknownIndex checks that a marker
+// whose index has no entry in links is left as-is while a known one is linked.
+func TestReplaceCitationMarkersWithLinksKeepsUnknownIndex(t *testing.T) {
+	raw := "只有一个来源[citation:1]，未知来源[citation:3]。"
+	links := map[int]string{1: "https://example.com/a"}
+
+	got := replaceCitationMarkersWithLinks(raw, links)
+	want := "只有一个来源[1](https://example.com/a)，未知来源[citation:3]。"
+	if got != want {
+		t.Fatalf("expected %q, got %q", want, got)
+	}
+}
--- a/internal/adapter/openai/deps.go
+++ b/internal/adapter/openai/deps.go
@@ -18,17 +18,21 @@ type AuthResolver interface {
 type DeepSeekCaller interface {
 	CreateSession(ctx context.Context, a *auth.RequestAuth, maxAttempts int) (string, error)
 	GetPow(ctx context.Context, a *auth.RequestAuth, maxAttempts int) (string, error)
+	UploadFile(ctx context.Context, a *auth.RequestAuth, req deepseek.UploadFileRequest, maxAttempts int) (*deepseek.UploadFileResult, error)
 	CallCompletion(ctx context.Context, a *auth.RequestAuth, payload map[string]any, powResp string, maxAttempts int) (*http.Response, error)
+	DeleteSessionForToken(ctx context.Context, token string, sessionID string) (*deepseek.DeleteSessionResult, error)
 	DeleteAllSessionsForToken(ctx context.Context, token string) error
 }

 type ConfigReader interface {
 	ModelAliases() map[string]string
 	CompatWideInputStrictOutput() bool
+	CompatStripReferenceMarkers() bool
 	ToolcallMode() string
 	ToolcallEarlyEmitConfidence() string
 	ResponsesStoreTTLSeconds() int
 	EmbeddingsProvider() string
+	AutoDeleteMode() string
 	AutoDeleteSessions() bool
 }

--- a/internal/adapter/openai/deps_injection_test.go
+++ b/internal/adapter/openai/deps_injection_test.go
@@ -3,23 +3,31 @@ package openai
 import "testing"

 type mockOpenAIConfig struct {
-	aliases      map[string]string
-	wideInput    bool
-	toolMode     string
-	earlyEmit    string
-	responsesTTL int
-	embedProv    string
+	aliases        map[string]string
+	wideInput      bool
+	autoDeleteMode string
+	toolMode       string
+	earlyEmit      string
+	responsesTTL   int
+	embedProv      string
 }

 func (m mockOpenAIConfig) ModelAliases() map[string]string { return m.aliases }
 func (m mockOpenAIConfig) CompatWideInputStrictOutput() bool {
 	return m.wideInput
 }
+func (m mockOpenAIConfig) CompatStripReferenceMarkers() bool   { return true }
 func (m mockOpenAIConfig) ToolcallMode() string                { return m.toolMode }
 func (m mockOpenAIConfig) ToolcallEarlyEmitConfidence() string { return m.earlyEmit }
 func (m mockOpenAIConfig) ResponsesStoreTTLSeconds() int       { return m.responsesTTL }
 func (m mockOpenAIConfig) EmbeddingsProvider() string          { return m.embedProv }
-func (m mockOpenAIConfig) AutoDeleteSessions() bool            { return false }
+// AutoDeleteMode returns the configured mock mode, falling back to "none"
+// when the field was left empty.
+func (m mockOpenAIConfig) AutoDeleteMode() string {
+	if m.autoDeleteMode == "" {
+		return "none"
+	}
+	return m.autoDeleteMode
+}
+func (m mockOpenAIConfig) AutoDeleteSessions() bool { return false }

 func TestNormalizeOpenAIChatRequestWithConfigInterface(t *testing.T) {
 	cfg := mockOpenAIConfig{
--- a/internal/adapter/openai/embeddings_handler.go
+++ b/internal/adapter/openai/embeddings_handler.go
@@ -26,8 +26,13 @@ func (h *Handler) Embeddings(w http.ResponseWriter, r *http.Request) {
 	}
 	defer h.Auth.Release(a)

+	r.Body = http.MaxBytesReader(w, r.Body, openAIGeneralMaxSize)
 	var req map[string]any
 	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		if strings.Contains(strings.ToLower(err.Error()), "too large") {
+			writeOpenAIError(w, http.StatusRequestEntityTooLarge, "request body too large")
+			return
+		}
 		writeOpenAIError(w, http.StatusBadRequest, "invalid json")
 		return
 	}
--- a/internal/adapter/openai/file_inline_upload.go
+++ b/internal/adapter/openai/file_inline_upload.go
@@ -0,0 +1,382 @@
+package openai
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/base64"
+	"fmt"
+	"mime"
+	"net/http"
+	"net/url"
+	"path/filepath"
+	"strings"
+
+	"ds2api/internal/auth"
+	"ds2api/internal/deepseek"
+)
+
+// maxInlineFilesPerRequest caps how many inline file blocks tryUploadBlock
+// will process for a single request before it returns an error.
+const maxInlineFilesPerRequest = 50
+
+// inlineFileUploadError describes a failure while handling an inline file and
+// carries what writeOpenAIInlineFileError needs to build the HTTP response.
+type inlineFileUploadError struct {
+	status  int    // HTTP status to report; 0 is mapped to 500 by the writer
+	message string // client-facing message; preferred by Error when non-blank
+	err     error  // underlying cause, used by Error when message is blank
+}
+
+// Error implements the error interface. It returns the message when that is
+// non-blank, otherwise the wrapped error's text, otherwise a generic fallback.
+// A nil receiver yields the empty string.
+func (e *inlineFileUploadError) Error() string {
+	if e == nil {
+		return ""
+	}
+	if strings.TrimSpace(e.message) != "" {
+		return e.message
+	}
+	if e.err != nil {
+		return e.err.Error()
+	}
+	return "inline file processing failed"
+}
+
+// inlineUploadState holds the per-request state used while walking a request
+// body and uploading the inline files found in it.
+type inlineUploadState struct {
+	ctx          context.Context   // passed to every UploadFile call
+	handler      *Handler          // provides the DS client used for uploads
+	auth         *auth.RequestAuth // credentials forwarded to UploadFile
+	uploadedByID map[string]string // content hash (see uploadInlineFile) -> uploaded file ID
+	uploadCount  int               // inline blocks replaced so far, checked against maxInlineFilesPerRequest
+}
+
+// inlineDecodedFile is the result of decoding one inline file block.
+type inlineDecodedFile struct {
+	Data            []byte // decoded payload bytes
+	ContentType     string // media type, explicit or detected; may be empty
+	Filename        string // name sent with the upload
+	ReplacementType string // "type" of the block that replaces the original ("input_image" or "input_file")
+}
+
+// preprocessInlineFileInputs walks the "messages", "input" and "attachments"
+// entries of req, uploads any inline file payloads found there and replaces
+// them with file references (see inlineUploadState.walk). Afterwards, if
+// collectOpenAIRefFileIDs finds any IDs in req, they are stored under
+// "ref_file_ids".
+//
+// It is a no-op when h or h.DS is nil or req is empty. The first error from
+// walking is returned as-is; req may already have been partially updated.
+func (h *Handler) preprocessInlineFileInputs(ctx context.Context, a *auth.RequestAuth, req map[string]any) error {
+	if h == nil || h.DS == nil || len(req) == 0 {
+		return nil
+	}
+	state := &inlineUploadState{
+		ctx:          ctx,
+		handler:      h,
+		auth:         a,
+		uploadedByID: map[string]string{},
+	}
+	for _, key := range []string{"messages", "input", "attachments"} {
+		if raw, ok := req[key]; ok {
+			updated, err := state.walk(raw)
+			if err != nil {
+				return err
+			}
+			req[key] = updated
+		}
+	}
+	if refIDs := collectOpenAIRefFileIDs(req); len(refIDs) > 0 {
+		req["ref_file_ids"] = stringsToAnySlice(refIDs)
+	}
+	return nil
+}
+
+// writeOpenAIInlineFileError writes err as an OpenAI-style error response.
+// For an *inlineFileUploadError its status and message are used, defaulting to
+// 500 and a generic message when unset. Any other error (the type assertion
+// does not unwrap) is reported as a generic 500.
+func writeOpenAIInlineFileError(w http.ResponseWriter, err error) {
+	inlineErr, ok := err.(*inlineFileUploadError)
+	if !ok || inlineErr == nil {
+		writeOpenAIError(w, http.StatusInternalServerError, "Failed to process file input.")
+		return
+	}
+	status := inlineErr.status
+	if status == 0 {
+		status = http.StatusInternalServerError
+	}
+	message := strings.TrimSpace(inlineErr.message)
+	if message == "" {
+		message = "Failed to process file input."
+	}
+	writeOpenAIError(w, status, message)
+}
+
+// walk recursively visits raw and returns it with inline file blocks replaced
+// by uploaded-file references.
+//
+// Slices are rebuilt into a new slice. Maps are first offered to
+// tryUploadBlock; if that does not replace them, only a fixed set of keys that
+// may hold nested content is descended into, and the map is updated in place.
+// Any other value is returned unchanged. The first error aborts the walk.
+func (s *inlineUploadState) walk(raw any) (any, error) {
+	switch x := raw.(type) {
+	case []any:
+		out := make([]any, len(x))
+		for i, item := range x {
+			updated, err := s.walk(item)
+			if err != nil {
+				return nil, err
+			}
+			out[i] = updated
+		}
+		return out, nil
+	case map[string]any:
+		// A replaced block (or a failure) ends the descent for this map.
+		if replacement, replaced, err := s.tryUploadBlock(x); replaced || err != nil {
+			return replacement, err
+		}
+		for _, key := range []string{"messages", "input", "attachments", "content", "files", "items", "data", "source", "file", "image_url"} {
+			if nested, ok := x[key]; ok {
+				updated, err := s.walk(nested)
+				if err != nil {
+					return nil, err
+				}
+				x[key] = updated
+			}
+		}
+		return x, nil
+	default:
+		return raw, nil
+	}
+}
+
+// tryUploadBlock uploads block when it decodes as an inline file and returns
+// the reference block that should replace it ("type", "file_id" and, when
+// known, "filename" and "mime_type").
+//
+// The boolean result reports whether block was recognised as an inline file,
+// in which case the caller must not descend into it; it is also true whenever
+// an error is returned. Every error is an *inlineFileUploadError so that
+// writeOpenAIInlineFileError can report a specific status and message:
+// an undecodable payload or exceeding maxInlineFilesPerRequest is a client
+// error (400), a failed upload is a server error (500).
+func (s *inlineUploadState) tryUploadBlock(block map[string]any) (map[string]any, bool, error) {
+	decoded, ok, err := decodeOpenAIInlineFileBlock(block)
+	if err != nil {
+		return nil, true, &inlineFileUploadError{status: http.StatusBadRequest, message: err.Error(), err: err}
+	}
+	if !ok {
+		return nil, false, nil
+	}
+	if s.uploadCount >= maxInlineFilesPerRequest {
+		// Wrap the limit violation like the other failures; a bare error would
+		// be reported to the client as a generic 500 without this message.
+		limitErr := fmt.Errorf("exceeded maximum of %d inline files per request", maxInlineFilesPerRequest)
+		return nil, true, &inlineFileUploadError{status: http.StatusBadRequest, message: limitErr.Error(), err: limitErr}
+	}
+	fileID, err := s.uploadInlineFile(decoded)
+	if err != nil {
+		return nil, true, &inlineFileUploadError{status: http.StatusInternalServerError, message: "Failed to upload inline file.", err: err}
+	}
+	// Counts every replaced block, including ones served from the per-request
+	// upload cache.
+	s.uploadCount++
+	replacement := map[string]any{
+		"type":    decoded.ReplacementType,
+		"file_id": fileID,
+	}
+	if decoded.Filename != "" {
+		replacement["filename"] = decoded.Filename
+	}
+	if decoded.ContentType != "" {
+		replacement["mime_type"] = decoded.ContentType
+	}
+	return replacement, true, nil
+}
+
+// uploadInlineFile uploads file through the handler's DS client and returns
+// the resulting file ID.
+//
+// Uploads are de-duplicated within one request: the SHA-256 of content type,
+// filename and data (NUL-separated) is the cache key, and a repeated key
+// returns the previously obtained ID without another upload. When the file has
+// no content type, one is sniffed from the data for the upload request.
+// An error is returned when the upload fails or yields no usable file ID.
+func (s *inlineUploadState) uploadInlineFile(file inlineDecodedFile) (string, error) {
+	sum := sha256.Sum256(append([]byte(file.ContentType+"\x00"+file.Filename+"\x00"), file.Data...))
+	cacheKey := fmt.Sprintf("%x", sum[:])
+	if fileID, ok := s.uploadedByID[cacheKey]; ok && strings.TrimSpace(fileID) != "" {
+		return fileID, nil
+	}
+	contentType := strings.TrimSpace(file.ContentType)
+	if contentType == "" {
+		contentType = http.DetectContentType(file.Data)
+	}
+	result, err := s.handler.DS.UploadFile(s.ctx, s.auth, deepseek.UploadFileRequest{
+		Filename:    file.Filename,
+		ContentType: contentType,
+		Data:        file.Data,
+	}, 3)
+	if err != nil {
+		return "", err
+	}
+	// Guard against an implementation returning (nil, nil): treat it like a
+	// missing ID instead of dereferencing a nil result.
+	if result == nil {
+		return "", fmt.Errorf("upload succeeded without file id")
+	}
+	fileID := strings.TrimSpace(result.ID)
+	if fileID == "" {
+		return "", fmt.Errorf("upload succeeded without file id")
+	}
+	s.uploadedByID[cacheKey] = fileID
+	return fileID, nil
+}
+
+// decodeOpenAIInlineFileBlock inspects block and, if it carries an inline file
+// payload, decodes it.
+//
+// The boolean result reports whether block was recognised as an inline file.
+// Blocks that are nil or already have a non-blank "file_id" are not matched.
+// A nested "file" map is decoded recursively, with the outer block supplying a
+// filename when the nested one produced none. Otherwise the block is tried as
+// an image data URL (ReplacementType "input_image") and then as a generic file
+// payload (ReplacementType "input_file"). A recognised block whose payload
+// cannot be decoded returns matched=true together with an error.
+func decodeOpenAIInlineFileBlock(block map[string]any) (inlineDecodedFile, bool, error) {
+	if block == nil {
+		return inlineDecodedFile{}, false, nil
+	}
+	if strings.TrimSpace(asString(block["file_id"])) != "" {
+		return inlineDecodedFile{}, false, nil
+	}
+	if nested, ok := block["file"].(map[string]any); ok {
+		// When a nested "file" map exists, its verdict is final: the outer
+		// block's own fields are not examined for a payload.
+		decoded, matched, err := decodeOpenAIInlineFileBlock(nested)
+		if err != nil || !matched {
+			return decoded, matched, err
+		}
+		if decoded.Filename == "" {
+			decoded.Filename = pickInlineFilename(block, decoded.ContentType, defaultInlinePrefix(decoded.ReplacementType))
+		}
+		return decoded, true, nil
+	}
+	blockType := strings.ToLower(strings.TrimSpace(asString(block["type"])))
+	if raw, matched := extractInlineImageDataURL(block); matched {
+		data, contentType, err := decodeInlinePayload(raw, contentTypeFromMap(block))
+		if err != nil {
+			return inlineDecodedFile{}, true, fmt.Errorf("invalid image input")
+		}
+		return inlineDecodedFile{
+			Data:            data,
+			ContentType:     contentType,
+			Filename:        pickInlineFilename(block, contentType, "image"),
+			ReplacementType: "input_image",
+		}, true, nil
+	}
+	if raw, matched := extractInlineFilePayload(block, blockType); matched {
+		data, contentType, err := decodeInlinePayload(raw, contentTypeFromMap(block))
+		if err != nil {
+			return inlineDecodedFile{}, true, fmt.Errorf("invalid file input")
+		}
+		return inlineDecodedFile{
+			Data:            data,
+			ContentType:     contentType,
+			Filename:        pickInlineFilename(block, contentType, defaultInlinePrefix(blockType)),
+			ReplacementType: "input_file",
+		}, true, nil
+	}
+	return inlineDecodedFile{}, false, nil
+}
+
+// extractInlineImageDataURL returns the trimmed data URL carried by block and
+// true, or "" and false if there is none. It looks, in order, at "image_url"
+// as a string, at "image_url" as a map with a "url" entry, and at a top-level
+// "url" entry; only values recognised by isDataURL count.
+func extractInlineImageDataURL(block map[string]any) (string, bool) {
+	imageURL := block["image_url"]
+	switch x := imageURL.(type) {
+	case string:
+		if isDataURL(x) {
+			return strings.TrimSpace(x), true
+		}
+	case map[string]any:
+		if raw := strings.TrimSpace(asString(x["url"])); isDataURL(raw) {
+			return raw, true
+		}
+	}
+	if raw := strings.TrimSpace(asString(block["url"])); isDataURL(raw) {
+		return raw, true
+	}
+	return "", false
+}
+
+// extractInlineFilePayload returns the first non-blank value among
+// "file_data", "base64" and "data", provided the block looks like a file:
+// blockType contains "file", or the block has a "file_data", "filename",
+// "file_name" or "name" entry. Otherwise it returns "" and false.
+func extractInlineFilePayload(block map[string]any, blockType string) (string, bool) {
+	for _, value := range []any{block["file_data"], block["base64"], block["data"]} {
+		if raw := strings.TrimSpace(asString(value)); raw != "" {
+			// The file-likeness test does not depend on which key supplied raw.
+			if strings.Contains(blockType, "file") || block["file_data"] != nil || block["filename"] != nil || block["file_name"] != nil || block["name"] != nil {
+				return raw, true
+			}
+		}
+	}
+	return "", false
+}
+
+// decodeInlinePayload decodes raw, which is either a data URL or a bare
+// base64 string, and returns the bytes together with a content type.
+//
+// Data URLs are delegated to decodeDataURL. For bare base64 the content type
+// is explicitContentType when non-blank, otherwise it is sniffed from the
+// decoded bytes (and stays empty when there are none). A blank payload or
+// undecodable base64 yields an error.
+func decodeInlinePayload(raw string, explicitContentType string) ([]byte, string, error) {
+	raw = strings.TrimSpace(raw)
+	if raw == "" {
+		return nil, "", fmt.Errorf("empty payload")
+	}
+	if isDataURL(raw) {
+		return decodeDataURL(raw, explicitContentType)
+	}
+	decoded, err := decodeBase64Flexible(raw)
+	if err != nil {
+		return nil, "", err
+	}
+	contentType := strings.TrimSpace(explicitContentType)
+	if contentType == "" && len(decoded) > 0 {
+		contentType = http.DetectContentType(decoded)
+	}
+	return decoded, contentType, nil
+}
+
+// decodeDataURL parses a "data:" URL and returns its payload bytes and content
+// type.
+//
+// The content type is explicitContentType when non-blank; otherwise it is the
+// first ';'-separated segment of the URL header, defaulting to
+// "application/octet-stream". The payload is base64-decoded when the header
+// contains ";base64" (case-insensitive) and percent-decoded otherwise.
+// An error is returned for a non-data URL, a missing comma, or a payload that
+// fails to decode.
+func decodeDataURL(raw string, explicitContentType string) ([]byte, string, error) {
+	raw = strings.TrimSpace(raw)
+	if !isDataURL(raw) {
+		return nil, "", fmt.Errorf("unsupported data url")
+	}
+	header, payload, ok := strings.Cut(raw, ",")
+	if !ok {
+		return nil, "", fmt.Errorf("invalid data url")
+	}
+	// NOTE(review): TrimPrefix is case-sensitive while isDataURL is not, so a
+	// "DATA:" scheme would stay in meta — confirm whether that matters.
+	meta := strings.TrimSpace(strings.TrimPrefix(header, "data:"))
+	contentType := strings.TrimSpace(explicitContentType)
+	if contentType == "" {
+		contentType = "application/octet-stream"
+		if meta != "" {
+			parts := strings.Split(meta, ";")
+			if len(parts) > 0 && strings.TrimSpace(parts[0]) != "" {
+				contentType = strings.TrimSpace(parts[0])
+			}
+		}
+	}
+	if strings.Contains(strings.ToLower(meta), ";base64") {
+		decoded, err := decodeBase64Flexible(payload)
+		if err != nil {
+			return nil, "", err
+		}
+		return decoded, contentType, nil
+	}
+	decoded, err := url.PathUnescape(payload)
+	if err != nil {
+		return nil, "", err
+	}
+	return []byte(decoded), contentType, nil
+}
+
+// decodeBase64Flexible decodes raw after trimming whitespace, trying the
+// standard, unpadded standard, URL-safe and unpadded URL-safe alphabets in
+// that order and returning the first successful result. It returns an error
+// when none of them accepts the input.
+func decodeBase64Flexible(raw string) ([]byte, error) {
+	raw = strings.TrimSpace(raw)
+	for _, enc := range []*base64.Encoding{base64.StdEncoding, base64.RawStdEncoding, base64.URLEncoding, base64.RawURLEncoding} {
+		decoded, err := enc.DecodeString(raw)
+		if err == nil {
+			return decoded, nil
+		}
+	}
+	return nil, fmt.Errorf("invalid base64 payload")
+}
+
+// contentTypeFromMap returns the first non-blank content type declared on
+// block under any of several snake_case/camelCase key spellings, falling back
+// to the same lookup inside a nested "image_url" map. It returns "" when none
+// is present.
+func contentTypeFromMap(block map[string]any) string {
+	for _, value := range []any{block["mime_type"], block["mimeType"], block["content_type"], block["contentType"], block["media_type"], block["mediaType"]} {
+		if contentType := strings.TrimSpace(asString(value)); contentType != "" {
+			return contentType
+		}
+	}
+	if imageURL, ok := block["image_url"].(map[string]any); ok {
+		for _, value := range []any{imageURL["mime_type"], imageURL["mimeType"], imageURL["content_type"], imageURL["contentType"]} {
+			if contentType := strings.TrimSpace(asString(value)); contentType != "" {
+				return contentType
+			}
+		}
+	}
+	return ""
+}
+
+// pickInlineFilename chooses a filename for an inline file.
+//
+// If block names the file under "filename", "file_name" or "name", the base
+// name of the first non-blank value is returned (directory components are
+// dropped). Otherwise the name is prefix plus an extension derived from
+// contentType via mime.ExtensionsByType, with prefix defaulting to "upload"
+// and the extension to ".bin".
+func pickInlineFilename(block map[string]any, contentType string, prefix string) string {
+	for _, value := range []any{block["filename"], block["file_name"], block["name"]} {
+		if name := strings.TrimSpace(asString(value)); name != "" {
+			return filepath.Base(name)
+		}
+	}
+	if prefix == "" {
+		prefix = "upload"
+	}
+	ext := ".bin"
+	if parsedType := strings.TrimSpace(contentType); parsedType != "" {
+		// Drop media type parameters: despite its name, comma holds the index
+		// of the first ';'.
+		if comma := strings.Index(parsedType, ";"); comma >= 0 {
+			parsedType = strings.TrimSpace(parsedType[:comma])
+		}
+		if exts, err := mime.ExtensionsByType(parsedType); err == nil && len(exts) > 0 && strings.TrimSpace(exts[0]) != "" {
+			ext = exts[0]
+		}
+	}
+	return prefix + ext
+}
+
+// defaultInlinePrefix returns the filename prefix for a block type: "image"
+// when the type contains "image" (case-insensitive), otherwise "upload".
+func defaultInlinePrefix(blockType string) string {
+	blockType = strings.ToLower(strings.TrimSpace(blockType))
+	if strings.Contains(blockType, "image") {
+		return "image"
+	}
+	return "upload"
+}
+
+// isDataURL reports whether raw, ignoring surrounding whitespace and letter
+// case, starts with the "data:" scheme.
+func isDataURL(raw string) bool {
+	return strings.HasPrefix(strings.ToLower(strings.TrimSpace(raw)), "data:")
+}
+
+// stringsToAnySlice converts items to a []any of trimmed strings, skipping
+// blank entries. It returns nil rather than an empty slice when nothing
+// remains.
+func stringsToAnySlice(items []string) []any {
+	out := make([]any, 0, len(items))
+	for _, item := range items {
+		trimmed := strings.TrimSpace(item)
+		if trimmed == "" {
+			continue
+		}
+		out = append(out, trimmed)
+	}
+	if len(out) == 0 {
+		return nil
+	}
+	return out
+}
--- a/internal/adapter/openai/file_inline_upload_test.go
+++ b/internal/adapter/openai/file_inline_upload_test.go
@@ -0,0 +1,274 @@
+package openai
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	"github.com/go-chi/chi/v5"
+
+	"ds2api/internal/auth"
+	"ds2api/internal/deepseek"
+)
+
+type inlineUploadDSStub struct {
+	uploadCalls    []deepseek.UploadFileRequest
+	lastCtx        context.Context
+	completionReq  map[string]any
+	createSession  string
+	uploadErr      error
+	completionResp *http.Response
+}
+
+func (m *inlineUploadDSStub) CreateSession(_ context.Context, _ *auth.RequestAuth, _ int) (string, error) {
+	if strings.TrimSpace(m.createSession) == "" {
+		return "session-id", nil
+	}
+	return m.createSession, nil
+}
+
+func (m *inlineUploadDSStub) GetPow(_ context.Context, _ *auth.RequestAuth, _ int) (string, error) {
+	return "pow", nil
+}
+
+func (m *inlineUploadDSStub) UploadFile(ctx context.Context, _ *auth.RequestAuth, req deepseek.UploadFileRequest, _ int) (*deepseek.UploadFileResult, error) {
+	m.lastCtx = ctx
+	m.uploadCalls = append(m.uploadCalls, req)
+	if m.uploadErr != nil {
+		return nil, m.uploadErr
+	}
+	return &deepseek.UploadFileResult{
+		ID:       "file-inline-1",
+		Filename: req.Filename,
+		Bytes:    int64(len(req.Data)),
+		Status:   "uploaded",
+		Purpose:  req.Purpose,
+	}, nil
+}
+
+func (m *inlineUploadDSStub) CallCompletion(_ context.Context, _ *auth.RequestAuth, payload map[string]any, _ string, _ int) (*http.Response, error) {
+	m.completionReq = payload
+	if m.completionResp != nil {
+		return m.completionResp, nil
+	}
+	return makeOpenAISSEHTTPResponse(
+		`data: {"p":"response/content","v":"ok"}`,
+		`data: [DONE]`,
+	), nil
+}
+
+func (m *inlineUploadDSStub) DeleteSessionForToken(_ context.Context, _ string, _ string) (*deepseek.DeleteSessionResult, error) {
+	return &deepseek.DeleteSessionResult{Success: true}, nil
+}
+
+func (m *inlineUploadDSStub) DeleteAllSessionsForToken(_ context.Context, _ string) error {
+	return nil
+}
+
+func TestPreprocessInlineFileInputsReplacesDataURLAndCollectsRefFileIDs(t *testing.T) {
+	ds := &inlineUploadDSStub{}
+	h := &Handler{DS: ds}
+	req := map[string]any{
+		"messages": []any{
+			map[string]any{
+				"role": "user",
+				"content": []any{
+					map[string]any{
+						"type":      "image_url",
+						"image_url": map[string]any{"url": "data:image/png;base64,QUJDRA=="},
+					},
+				},
+			},
+		},
+	}
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	if err := h.preprocessInlineFileInputs(ctx, &auth.RequestAuth{DeepSeekToken: "token"}, req); err != nil {
+		t.Fatalf("preprocess failed: %v", err)
+	}
+	if len(ds.uploadCalls) != 1 {
+		t.Fatalf("expected 1 upload, got %d", len(ds.uploadCalls))
+	}
+	if ds.lastCtx != ctx {
+		t.Fatalf("expected upload to use request context")
+	}
+	if ds.uploadCalls[0].ContentType != "image/png" {
+		t.Fatalf("expected image/png, got %q", ds.uploadCalls[0].ContentType)
+	}
+	if ds.uploadCalls[0].Filename != "image.png" {
+		t.Fatalf("expected inferred filename image.png, got %q", ds.uploadCalls[0].Filename)
+	}
+	messages, _ := req["messages"].([]any)
+	first, _ := messages[0].(map[string]any)
+	content, _ := first["content"].([]any)
+	block, _ := content[0].(map[string]any)
+	if block["type"] != "input_image" {
+		t.Fatalf("expected input_image replacement, got %#v", block)
+	}
+	if block["file_id"] != "file-inline-1" {
+		t.Fatalf("expected file-inline-1 replacement id, got %#v", block)
+	}
+	refIDs, _ := req["ref_file_ids"].([]any)
+	if len(refIDs) != 1 || refIDs[0] != "file-inline-1" {
+		t.Fatalf("unexpected ref_file_ids: %#v", req["ref_file_ids"])
+	}
+}
+
+func TestPreprocessInlineFileInputsDeduplicatesIdenticalPayloads(t *testing.T) {
+	ds := &inlineUploadDSStub{}
+	h := &Handler{DS: ds}
+	req := map[string]any{
+		"messages": []any{
+			map[string]any{
+				"role": "user",
+				"content": []any{
+					map[string]any{"type": "image_url", "image_url": map[string]any{"url": "data:image/png;base64,QUJDRA=="}},
+					map[string]any{"type": "image_url", "image_url": map[string]any{"url": "data:image/png;base64,QUJDRA=="}},
+				},
+			},
+		},
+	}
+
+	if err := h.preprocessInlineFileInputs(context.Background(), &auth.RequestAuth{DeepSeekToken: "token"}, req); err != nil {
+		t.Fatalf("preprocess failed: %v", err)
+	}
+	if len(ds.uploadCalls) != 1 {
+		t.Fatalf("expected deduplicated single upload, got %d", len(ds.uploadCalls))
+	}
+	refIDs, _ := req["ref_file_ids"].([]any)
+	if len(refIDs) != 1 || refIDs[0] != "file-inline-1" {
+		t.Fatalf("unexpected ref_file_ids after dedupe: %#v", req["ref_file_ids"])
+	}
+}
+
+func TestChatCompletionsUploadsInlineFilesBeforeCompletion(t *testing.T) {
+	ds := &inlineUploadDSStub{}
+	h := &Handler{Store: mockOpenAIConfig{wideInput: true}, Auth: streamStatusAuthStub{}, DS: ds}
+	reqBody := `{"model":"deepseek-chat","messages":[{"role":"user","content":[{"type":"input_text","text":"hi"},{"type":"image_url","image_url":{"url":"data:image/png;base64,QUJDRA=="}}]}],"stream":false}`
+	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(reqBody))
+	req.Header.Set("Authorization", "Bearer direct-token")
+	req.Header.Set("Content-Type", "application/json")
+	rec := httptest.NewRecorder()
+
+	h.ChatCompletions(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d body=%s", rec.Code, rec.Body.String())
+	}
+	if len(ds.uploadCalls) != 1 {
+		t.Fatalf("expected 1 upload call, got %d", len(ds.uploadCalls))
+	}
+	if ds.completionReq == nil {
+		t.Fatal("expected completion payload to be captured")
+	}
+	refIDs, _ := ds.completionReq["ref_file_ids"].([]any)
+	if len(refIDs) != 1 || refIDs[0] != "file-inline-1" {
+		t.Fatalf("unexpected completion ref_file_ids: %#v", ds.completionReq["ref_file_ids"])
+	}
+}
+
+func TestResponsesUploadsInlineFilesBeforeCompletion(t *testing.T) {
+	ds := &inlineUploadDSStub{}
+	h := &Handler{Store: mockOpenAIConfig{wideInput: true}, Auth: streamStatusAuthStub{}, DS: ds}
+	r := chi.NewRouter()
+	RegisterRoutes(r, h)
+	reqBody := `{"model":"deepseek-chat","input":[{"role":"user","content":[{"type":"input_text","text":"hi"},{"type":"input_image","image_url":{"url":"data:image/png;base64,QUJDRA=="}}]}],"stream":false}`
+	req := httptest.NewRequest(http.MethodPost, "/v1/responses", strings.NewReader(reqBody))
+	req.Header.Set("Authorization", "Bearer direct-token")
+	req.Header.Set("Content-Type", "application/json")
+	rec := httptest.NewRecorder()
+
+	r.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d body=%s", rec.Code, rec.Body.String())
+	}
+	if len(ds.uploadCalls) != 1 {
+		t.Fatalf("expected 1 upload call, got %d", len(ds.uploadCalls))
+	}
+	refIDs, _ := ds.completionReq["ref_file_ids"].([]any)
+	if len(refIDs) != 1 || refIDs[0] != "file-inline-1" {
+		t.Fatalf("unexpected completion ref_file_ids: %#v", ds.completionReq["ref_file_ids"])
+	}
+}
+
+func TestChatCompletionsInlineUploadFailureReturnsBadRequest(t *testing.T) {
+	ds := &inlineUploadDSStub{}
+	h := &Handler{Store: mockOpenAIConfig{wideInput: true}, Auth: streamStatusAuthStub{}, DS: ds}
+	reqBody := `{"model":"deepseek-chat","messages":[{"role":"user","content":[{"type":"image_url","image_url":{"url":"data:image/png;base64,%%%"}}]}],"stream":false}`
+	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(reqBody))
+	req.Header.Set("Authorization", "Bearer direct-token")
+	req.Header.Set("Content-Type", "application/json")
+	rec := httptest.NewRecorder()
+
+	h.ChatCompletions(rec, req)
+
+	if rec.Code != http.StatusBadRequest {
+		t.Fatalf("expected 400, got %d body=%s", rec.Code, rec.Body.String())
+	}
+	if ds.completionReq != nil {
+		t.Fatalf("did not expect completion call on upload decode error")
+	}
+}
+
+func TestResponsesInlineUploadFailureReturnsInternalServerError(t *testing.T) {
+	ds := &inlineUploadDSStub{uploadErr: errors.New("boom")}
+	h := &Handler{Store: mockOpenAIConfig{wideInput: true}, Auth: streamStatusAuthStub{}, DS: ds}
+	r := chi.NewRouter()
+	RegisterRoutes(r, h)
+	reqBody := `{"model":"deepseek-chat","input":[{"role":"user","content":[{"type":"image_url","image_url":{"url":"data:image/png;base64,QUJDRA=="}}]}],"stream":false}`
+	req := httptest.NewRequest(http.MethodPost, "/v1/responses", strings.NewReader(reqBody))
+	req.Header.Set("Authorization", "Bearer direct-token")
+	req.Header.Set("Content-Type", "application/json")
+	rec := httptest.NewRecorder()
+
+	r.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusInternalServerError {
+		t.Fatalf("expected 500, got %d body=%s", rec.Code, rec.Body.String())
+	}
+	if ds.completionReq != nil {
+		t.Fatalf("did not expect completion call after upload failure")
+	}
+}
+
+func TestVercelPrepareUploadsInlineFilesBeforeLeasePayload(t *testing.T) {
+	t.Setenv("VERCEL", "1")
+	t.Setenv("DS2API_VERCEL_INTERNAL_SECRET", "stream-secret")
+	ds := &inlineUploadDSStub{}
+	h := &Handler{Store: mockOpenAIConfig{wideInput: true}, Auth: streamStatusAuthStub{}, DS: ds}
+	r := chi.NewRouter()
+	RegisterRoutes(r, h)
+	reqBody := `{"model":"deepseek-chat","messages":[{"role":"user","content":[{"type":"input_text","text":"hi"},{"type":"image_url","image_url":{"url":"data:image/png;base64,QUJDRA=="}}]}],"stream":true}`
+	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions?__stream_prepare=1", strings.NewReader(reqBody))
+	req.Header.Set("Authorization", "Bearer direct-token")
+	req.Header.Set("X-Ds2-Internal-Token", "stream-secret")
+	req.Header.Set("Content-Type", "application/json")
+	rec := httptest.NewRecorder()
+
+	r.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d body=%s", rec.Code, rec.Body.String())
+	}
+	if len(ds.uploadCalls) != 1 {
+		t.Fatalf("expected 1 upload call, got %d", len(ds.uploadCalls))
+	}
+	var out map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &out); err != nil {
+		t.Fatalf("decode response failed: %v body=%s", err, rec.Body.String())
+	}
+	payload, _ := out["payload"].(map[string]any)
+	if payload == nil {
+		t.Fatalf("expected payload in prepare response, got %#v", out)
+	}
+	refIDs, _ := payload["ref_file_ids"].([]any)
+	if len(refIDs) != 1 || refIDs[0] != "file-inline-1" {
+		t.Fatalf("unexpected payload ref_file_ids: %#v", payload["ref_file_ids"])
+	}
+}
--- a/internal/adapter/openai/file_refs.go
+++ b/internal/adapter/openai/file_refs.go
@@ -0,0 +1,94 @@
+package openai
+
+import "strings"
+
+func collectOpenAIRefFileIDs(req map[string]any) []string {
+	if len(req) == 0 {
+		return nil
+	}
+	out := make([]string, 0, 4)
+	seen := map[string]struct{}{}
+	for _, key := range []string{
+		"ref_file_ids",
+		"file_ids",
+		"attachments",
+		"messages",
+		"input",
+	} {
+		raw := req[key]
+		if raw == nil {
+			continue
+		}
+		// Skip top-level strings for 'messages' and 'input' as they are likely plain text content,
+		// not file IDs. String file IDs are expected in 'ref_file_ids' or 'file_ids'.
+		if key == "messages" || key == "input" {
+			if _, ok := raw.(string); ok {
+				continue
+			}
+		}
+		appendOpenAIRefFileIDs(&out, seen, raw)
+	}
+	if len(out) == 0 {
+		return nil
+	}
+	return out
+}
+
+func appendOpenAIRefFileIDs(out *[]string, seen map[string]struct{}, raw any) {
+	switch x := raw.(type) {
+	case string:
+		addOpenAIRefFileID(out, seen, x)
+	case []string:
+		for _, item := range x {
+			addOpenAIRefFileID(out, seen, item)
+		}
+	case []any:
+		for _, item := range x {
+			appendOpenAIRefFileIDs(out, seen, item)
+		}
+	case map[string]any:
+		if fileID := strings.TrimSpace(asString(x["file_id"])); fileID != "" {
+			addOpenAIRefFileID(out, seen, fileID)
+		}
+		if strings.Contains(strings.ToLower(strings.TrimSpace(asString(x["type"]))), "file") {
+			if fileID := strings.TrimSpace(asString(x["id"])); fileID != "" {
+				addOpenAIRefFileID(out, seen, fileID)
+			}
+		}
+		if fileMap, ok := x["file"].(map[string]any); ok {
+			if fileID := strings.TrimSpace(asString(fileMap["file_id"])); fileID != "" {
+				addOpenAIRefFileID(out, seen, fileID)
+			}
+			if fileID := strings.TrimSpace(asString(fileMap["id"])); fileID != "" {
+				addOpenAIRefFileID(out, seen, fileID)
+			}
+		}
+		// Recurse only into the known container keys listed below. A plain-string
+		// 'content' or 'input' value is message text rather than a file ID, so the
+		// loop skips it instead of passing it to the string case of the type switch,
+		// which would record it as an ID.
+		for _, key := range []string{"ref_file_ids", "file_ids", "attachments", "messages", "input", "content", "files", "items", "data", "source"} {
+			if nested, ok := x[key]; ok {
+				// A string 'content' or 'input' value is message text; it must NOT be treated as an ID.
+				if key == "content" || key == "input" {
+					if _, ok := nested.(string); ok {
+						continue
+					}
+				}
+				appendOpenAIRefFileIDs(out, seen, nested)
+			}
+		}
+	}
+}
+
+func addOpenAIRefFileID(out *[]string, seen map[string]struct{}, fileID string) {
+	fileID = strings.TrimSpace(fileID)
+	if fileID == "" {
+		return
+	}
+	if _, ok := seen[fileID]; ok {
+		return
+	}
+	seen[fileID] = struct{}{}
+	*out = append(*out, fileID)
+}
--- a/internal/adapter/openai/files_route_test.go
+++ b/internal/adapter/openai/files_route_test.go
@@ -0,0 +1,202 @@
+package openai
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"mime/multipart"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"github.com/go-chi/chi/v5"
+
+	"ds2api/internal/auth"
+	"ds2api/internal/deepseek"
+)
+
+type managedFilesAuthStub struct{}
+
+func (managedFilesAuthStub) Determine(_ *http.Request) (*auth.RequestAuth, error) {
+	return &auth.RequestAuth{
+		UseConfigToken: true,
+		DeepSeekToken:  "managed-token",
+		CallerID:       "caller:test",
+		AccountID:      "acct-123",
+		TriedAccounts:  map[string]bool{},
+	}, nil
+}
+
+func (managedFilesAuthStub) DetermineCaller(_ *http.Request) (*auth.RequestAuth, error) {
+	return &auth.RequestAuth{
+		UseConfigToken: true,
+		DeepSeekToken:  "managed-token",
+		CallerID:       "caller:test",
+		AccountID:      "acct-123",
+		TriedAccounts:  map[string]bool{},
+	}, nil
+}
+
+func (managedFilesAuthStub) Release(_ *auth.RequestAuth) {}
+
+type filesRouteDSStub struct {
+	lastReq deepseek.UploadFileRequest
+	upload  *deepseek.UploadFileResult
+	err     error
+}
+
+func (m *filesRouteDSStub) CreateSession(_ context.Context, _ *auth.RequestAuth, _ int) (string, error) {
+	return "", nil
+}
+
+func (m *filesRouteDSStub) GetPow(_ context.Context, _ *auth.RequestAuth, _ int) (string, error) {
+	return "", nil
+}
+
+func (m *filesRouteDSStub) UploadFile(_ context.Context, _ *auth.RequestAuth, req deepseek.UploadFileRequest, _ int) (*deepseek.UploadFileResult, error) {
+	m.lastReq = req
+	if m.err != nil {
+		return nil, m.err
+	}
+	if m.upload != nil {
+		return m.upload, nil
+	}
+	return &deepseek.UploadFileResult{ID: "file-123", Filename: req.Filename, Bytes: int64(len(req.Data)), Purpose: req.Purpose, Status: "uploaded"}, nil
+}
+
+func (m *filesRouteDSStub) CallCompletion(_ context.Context, _ *auth.RequestAuth, _ map[string]any, _ string, _ int) (*http.Response, error) {
+	return nil, errors.New("not implemented")
+}
+
+func (m *filesRouteDSStub) DeleteSessionForToken(_ context.Context, _ string, _ string) (*deepseek.DeleteSessionResult, error) {
+	return &deepseek.DeleteSessionResult{Success: true}, nil
+}
+
+func (m *filesRouteDSStub) DeleteAllSessionsForToken(_ context.Context, _ string) error {
+	return nil
+}
+
+func newMultipartUploadRequest(t *testing.T, purpose string, filename string, data []byte) *http.Request {
+	t.Helper()
+	var body bytes.Buffer
+	writer := multipart.NewWriter(&body)
+	if purpose != "" {
+		if err := writer.WriteField("purpose", purpose); err != nil {
+			t.Fatalf("write purpose failed: %v", err)
+		}
+	}
+	part, err := writer.CreateFormFile("file", filename)
+	if err != nil {
+		t.Fatalf("create form file failed: %v", err)
+	}
+	if _, err := part.Write(data); err != nil {
+		t.Fatalf("write file failed: %v", err)
+	}
+	if err := writer.Close(); err != nil {
+		t.Fatalf("close writer failed: %v", err)
+	}
+	req := httptest.NewRequest(http.MethodPost, "/v1/files", &body)
+	req.Header.Set("Authorization", "Bearer direct-token")
+	req.Header.Set("Content-Type", writer.FormDataContentType())
+	return req
+}
+
+func TestFilesRouteUploadSuccess(t *testing.T) {
+	ds := &filesRouteDSStub{}
+	h := &Handler{Store: mockOpenAIConfig{wideInput: true}, Auth: streamStatusAuthStub{}, DS: ds}
+	r := chi.NewRouter()
+	RegisterRoutes(r, h)
+
+	req := newMultipartUploadRequest(t, "assistants", "notes.txt", []byte("hello world"))
+	rec := httptest.NewRecorder()
+	r.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d body=%s", rec.Code, rec.Body.String())
+	}
+	if ds.lastReq.Filename != "notes.txt" {
+		t.Fatalf("expected filename notes.txt, got %q", ds.lastReq.Filename)
+	}
+	if ds.lastReq.Purpose != "assistants" {
+		t.Fatalf("expected purpose assistants, got %q", ds.lastReq.Purpose)
+	}
+	if string(ds.lastReq.Data) != "hello world" {
+		t.Fatalf("unexpected uploaded data: %q", string(ds.lastReq.Data))
+	}
+	var out map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &out); err != nil {
+		t.Fatalf("decode response failed: %v body=%s", err, rec.Body.String())
+	}
+	if out["object"] != "file" {
+		t.Fatalf("expected file object, got %#v", out)
+	}
+	if out["id"] != "file-123" {
+		t.Fatalf("expected file id file-123, got %#v", out["id"])
+	}
+	if out["filename"] != "notes.txt" {
+		t.Fatalf("expected filename notes.txt, got %#v", out["filename"])
+	}
+}
+
+func TestFilesRouteUploadIncludesAccountIDForManagedAccount(t *testing.T) {
+	ds := &filesRouteDSStub{}
+	h := &Handler{Store: mockOpenAIConfig{wideInput: true}, Auth: managedFilesAuthStub{}, DS: ds}
+	r := chi.NewRouter()
+	RegisterRoutes(r, h)
+
+	req := newMultipartUploadRequest(t, "assistants", "notes.txt", []byte("hello world"))
+	rec := httptest.NewRecorder()
+	r.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d body=%s", rec.Code, rec.Body.String())
+	}
+	var out map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &out); err != nil {
+		t.Fatalf("decode response failed: %v body=%s", err, rec.Body.String())
+	}
+	if out["account_id"] != "acct-123" {
+		t.Fatalf("expected account_id acct-123, got %#v", out["account_id"])
+	}
+}
+
+func TestFilesRouteRejectsNonMultipart(t *testing.T) {
+	h := &Handler{Store: mockOpenAIConfig{wideInput: true}, Auth: streamStatusAuthStub{}, DS: &filesRouteDSStub{}}
+	r := chi.NewRouter()
+	RegisterRoutes(r, h)
+
+	req := httptest.NewRequest(http.MethodPost, "/v1/files", bytes.NewBufferString(`{"purpose":"assistants"}`))
+	req.Header.Set("Authorization", "Bearer direct-token")
+	req.Header.Set("Content-Type", "application/json")
+	rec := httptest.NewRecorder()
+	r.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusBadRequest {
+		t.Fatalf("expected 400, got %d body=%s", rec.Code, rec.Body.String())
+	}
+}
+
+func TestFilesRouteRequiresFileField(t *testing.T) {
+	h := &Handler{Store: mockOpenAIConfig{wideInput: true}, Auth: streamStatusAuthStub{}, DS: &filesRouteDSStub{}}
+	r := chi.NewRouter()
+	RegisterRoutes(r, h)
+
+	var body bytes.Buffer
+	writer := multipart.NewWriter(&body)
+	if err := writer.WriteField("purpose", "assistants"); err != nil {
+		t.Fatalf("write field failed: %v", err)
+	}
+	if err := writer.Close(); err != nil {
+		t.Fatalf("close writer failed: %v", err)
+	}
+	req := httptest.NewRequest(http.MethodPost, "/v1/files", &body)
+	req.Header.Set("Authorization", "Bearer direct-token")
+	req.Header.Set("Content-Type", writer.FormDataContentType())
+	rec := httptest.NewRecorder()
+	r.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusBadRequest {
+		t.Fatalf("expected 400, got %d body=%s", rec.Code, rec.Body.String())
+	}
+}
--- a/internal/adapter/openai/handler_chat.go
+++ b/internal/adapter/openai/handler_chat.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"io"
 	"net/http"
+	"strings"
 	"time"

 	"ds2api/internal/auth"
@@ -35,39 +36,35 @@ func (h *Handler) ChatCompletions(w http.ResponseWriter, r *http.Request) {
 		writeOpenAIError(w, status, detail)
 		return
 	}
+	var sessionID string
 	defer func() {
-		// 自动删除会话（同步）
-		// 必须在 Release 之前同步删除，否则：
-		// 1. 异步删除时账号已被 Release
-		// 2. 新请求可能获取到同一账号并开始使用
-		// 3. 异步删除仍在进行，会截断新请求正在使用的会话
-		if h.Store.AutoDeleteSessions() && a.DeepSeekToken != "" {
-			deleteCtx, cancel := context.WithTimeout(r.Context(), 10*time.Second)
-			defer cancel()
-			err := h.DS.DeleteAllSessionsForToken(deleteCtx, a.DeepSeekToken)
-			if err != nil {
-				config.Logger.Warn("[auto_delete_sessions] failed", "account", a.AccountID, "error", err)
-			} else {
-				config.Logger.Debug("[auto_delete_sessions] success", "account", a.AccountID)
-			}
-		}
+		h.autoDeleteRemoteSession(r.Context(), a, sessionID)
 		h.Auth.Release(a)
 	}()

 	r = r.WithContext(auth.WithAuth(r.Context(), a))

+	r.Body = http.MaxBytesReader(w, r.Body, openAIGeneralMaxSize)
 	var req map[string]any
 	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		if strings.Contains(strings.ToLower(err.Error()), "too large") {
+			writeOpenAIError(w, http.StatusRequestEntityTooLarge, "request body too large")
+			return
+		}
 		writeOpenAIError(w, http.StatusBadRequest, "invalid json")
 		return
 	}
+	if err := h.preprocessInlineFileInputs(r.Context(), a, req); err != nil {
+		writeOpenAIInlineFileError(w, err)
+		return
+	}
 	stdReq, err := normalizeOpenAIChatRequest(h.Store, req, requestTraceID(r))
 	if err != nil {
 		writeOpenAIError(w, http.StatusBadRequest, err.Error())
 		return
 	}

-	sessionID, err := h.DS.CreateSession(r.Context(), a, 3)
+	sessionID, err = h.DS.CreateSession(r.Context(), a, 3)
 	if err != nil {
 		if a.UseConfigToken {
 			writeOpenAIError(w, http.StatusUnauthorized, "Account token is invalid. Please re-login the account in admin.")
@@ -91,12 +88,45 @@ func (h *Handler) ChatCompletions(w http.ResponseWriter, r *http.Request) {
 		h.handleStream(w, r, resp, sessionID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames)
 		return
 	}
-	h.handleNonStream(w, r.Context(), resp, sessionID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.ToolNames)
+	h.handleNonStream(w, r.Context(), resp, sessionID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames)
 }

-func (h *Handler) handleNonStream(w http.ResponseWriter, ctx context.Context, resp *http.Response, completionID, model, finalPrompt string, thinkingEnabled bool, toolNames []string) {
+func (h *Handler) autoDeleteRemoteSession(ctx context.Context, a *auth.RequestAuth, sessionID string) {
+	mode := h.Store.AutoDeleteMode()
+	if mode == "none" || a.DeepSeekToken == "" {
+		return
+	}
+
+	deleteBaseCtx := context.WithoutCancel(ctx)
+	deleteCtx, cancel := context.WithTimeout(deleteBaseCtx, 10*time.Second)
+	defer cancel()
+
+	switch mode {
+	case "single":
+		if sessionID == "" {
+			config.Logger.Warn("[auto_delete_sessions] skipped single-session delete because session_id is empty", "account", a.AccountID)
+			return
+		}
+		_, err := h.DS.DeleteSessionForToken(deleteCtx, a.DeepSeekToken, sessionID)
+		if err != nil {
+			config.Logger.Warn("[auto_delete_sessions] failed", "account", a.AccountID, "mode", mode, "session_id", sessionID, "error", err)
+			return
+		}
+		config.Logger.Debug("[auto_delete_sessions] success", "account", a.AccountID, "mode", mode, "session_id", sessionID)
+	case "all":
+		if err := h.DS.DeleteAllSessionsForToken(deleteCtx, a.DeepSeekToken); err != nil {
+			config.Logger.Warn("[auto_delete_sessions] failed", "account", a.AccountID, "mode", mode, "error", err)
+			return
+		}
+		config.Logger.Debug("[auto_delete_sessions] success", "account", a.AccountID, "mode", mode)
+	default:
+		config.Logger.Warn("[auto_delete_sessions] unknown mode", "account", a.AccountID, "mode", mode)
+	}
+}
+
+func (h *Handler) handleNonStream(w http.ResponseWriter, ctx context.Context, resp *http.Response, completionID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string) {
 	if resp.StatusCode != http.StatusOK {
-		defer resp.Body.Close()
+		defer func() { _ = resp.Body.Close() }()
 		body, _ := io.ReadAll(resp.Body)
 		writeOpenAIError(w, resp.StatusCode, string(body))
 		return
@@ -104,14 +134,21 @@ func (h *Handler) handleNonStream(w http.ResponseWriter, ctx context.Context, re
 	_ = ctx
 	result := sse.CollectStream(resp, thinkingEnabled, true)

-	finalThinking := result.Thinking
-	finalText := sanitizeLeakedToolHistory(result.Text)
+	stripReferenceMarkers := h.compatStripReferenceMarkers()
+	finalThinking := cleanVisibleOutput(result.Thinking, stripReferenceMarkers)
+	finalText := cleanVisibleOutput(result.Text, stripReferenceMarkers)
+	if searchEnabled {
+		finalText = replaceCitationMarkersWithLinks(finalText, result.CitationLinks)
+	}
+	if writeUpstreamEmptyOutputError(w, finalText, result.ContentFilter) {
+		return
+	}
 	respBody := openaifmt.BuildChatCompletion(completionID, model, finalPrompt, finalThinking, finalText, toolNames)
 	writeJSON(w, http.StatusOK, respBody)
 }

 func (h *Handler) handleStream(w http.ResponseWriter, r *http.Request, resp *http.Response, completionID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string) {
-	defer resp.Body.Close()
+	defer func() { _ = resp.Body.Close() }()
 	if resp.StatusCode != http.StatusOK {
 		body, _ := io.ReadAll(resp.Body)
 		writeOpenAIError(w, resp.StatusCode, string(body))
@@ -130,6 +167,7 @@ func (h *Handler) handleStream(w http.ResponseWriter, r *http.Request, resp *htt
 	created := time.Now().Unix()
 	bufferToolContent := len(toolNames) > 0
 	emitEarlyToolDeltas := h.toolcallFeatureMatchEnabled() && h.toolcallEarlyEmitHighConfidence()
+	stripReferenceMarkers := h.compatStripReferenceMarkers()
 	initialType := "text"
 	if thinkingEnabled {
 		initialType = "thinking"
@@ -145,6 +183,7 @@ func (h *Handler) handleStream(w http.ResponseWriter, r *http.Request, resp *htt
 		finalPrompt,
 		thinkingEnabled,
 		searchEnabled,
+		stripReferenceMarkers,
 		toolNames,
 		bufferToolContent,
 		emitEarlyToolDeltas,
--- a/internal/adapter/openai/handler_chat_auto_delete_test.go
+++ b/internal/adapter/openai/handler_chat_auto_delete_test.go
@@ -0,0 +1,143 @@
+package openai
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	"ds2api/internal/auth"
+	"ds2api/internal/deepseek"
+)
+
+type autoDeleteModeDSStub struct {
+	resp          *http.Response
+	singleCalls   int
+	allCalls      int
+	lastSessionID string
+	lastCtxErr    error
+}
+
+func (m *autoDeleteModeDSStub) CreateSession(_ context.Context, _ *auth.RequestAuth, _ int) (string, error) {
+	return "session-id", nil
+}
+
+func (m *autoDeleteModeDSStub) GetPow(_ context.Context, _ *auth.RequestAuth, _ int) (string, error) {
+	return "pow", nil
+}
+
+func (m *autoDeleteModeDSStub) UploadFile(_ context.Context, _ *auth.RequestAuth, _ deepseek.UploadFileRequest, _ int) (*deepseek.UploadFileResult, error) {
+	return &deepseek.UploadFileResult{ID: "file-id", Filename: "file.txt", Bytes: 1, Status: "uploaded"}, nil
+}
+
+func (m *autoDeleteModeDSStub) CallCompletion(_ context.Context, _ *auth.RequestAuth, _ map[string]any, _ string, _ int) (*http.Response, error) {
+	return m.resp, nil
+}
+
+func (m *autoDeleteModeDSStub) DeleteSessionForToken(_ context.Context, _ string, sessionID string) (*deepseek.DeleteSessionResult, error) {
+	m.singleCalls++
+	m.lastSessionID = sessionID
+	return &deepseek.DeleteSessionResult{SessionID: sessionID, Success: true}, nil
+}
+
+func (m *autoDeleteModeDSStub) DeleteAllSessionsForToken(_ context.Context, _ string) error {
+	m.allCalls++
+	return nil
+}
+
+func (m *autoDeleteModeDSStub) DeleteSessionForTokenCtx(ctx context.Context, _ string, sessionID string) (*deepseek.DeleteSessionResult, error) {
+	m.singleCalls++
+	m.lastSessionID = sessionID
+	m.lastCtxErr = ctx.Err()
+	return &deepseek.DeleteSessionResult{SessionID: sessionID, Success: true}, nil
+}
+
+func TestChatCompletionsAutoDeleteModes(t *testing.T) {
+	tests := []struct {
+		name       string
+		mode       string
+		wantSingle int
+		wantAll    int
+	}{
+		{name: "none", mode: "none"},
+		{name: "single", mode: "single", wantSingle: 1},
+		{name: "all", mode: "all", wantAll: 1},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			ds := &autoDeleteModeDSStub{
+				resp: makeOpenAISSEHTTPResponse(
+					`data: {"p":"response/content","v":"hello"}`,
+					"data: [DONE]",
+				),
+			}
+			h := &Handler{
+				Store: mockOpenAIConfig{
+					wideInput:      true,
+					autoDeleteMode: tc.mode,
+				},
+				Auth: streamStatusAuthStub{},
+				DS:   ds,
+			}
+
+			reqBody := `{"model":"deepseek-chat","messages":[{"role":"user","content":"hi"}],"stream":false}`
+			req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(reqBody))
+			req.Header.Set("Authorization", "Bearer direct-token")
+			req.Header.Set("Content-Type", "application/json")
+			rec := httptest.NewRecorder()
+
+			h.ChatCompletions(rec, req)
+
+			if rec.Code != http.StatusOK {
+				t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+			}
+			if ds.singleCalls != tc.wantSingle {
+				t.Fatalf("single delete calls=%d want=%d", ds.singleCalls, tc.wantSingle)
+			}
+			if ds.allCalls != tc.wantAll {
+				t.Fatalf("all delete calls=%d want=%d", ds.allCalls, tc.wantAll)
+			}
+			if tc.wantSingle > 0 && ds.lastSessionID != "session-id" {
+				t.Fatalf("expected single delete for session-id, got %q", ds.lastSessionID)
+			}
+		})
+	}
+}
+
+type autoDeleteCtxDSStub struct {
+	autoDeleteModeDSStub
+}
+
+func (m *autoDeleteCtxDSStub) DeleteSessionForToken(ctx context.Context, token string, sessionID string) (*deepseek.DeleteSessionResult, error) {
+	return m.DeleteSessionForTokenCtx(ctx, token, sessionID)
+}
+
+func (m *autoDeleteCtxDSStub) DeleteAllSessionsForToken(_ context.Context, _ string) error {
+	m.allCalls++
+	return nil
+}
+
+func TestAutoDeleteRemoteSessionIgnoresCanceledParentContext(t *testing.T) {
+	ds := &autoDeleteCtxDSStub{}
+	h := &Handler{
+		Store: mockOpenAIConfig{
+			wideInput:      true,
+			autoDeleteMode: "single",
+		},
+		DS: ds,
+	}
+	a := &auth.RequestAuth{DeepSeekToken: "token", AccountID: "acct"}
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	h.autoDeleteRemoteSession(ctx, a, "session-id")
+
+	if ds.singleCalls != 1 {
+		t.Fatalf("single delete calls=%d want=1", ds.singleCalls)
+	}
+	if ds.lastCtxErr != nil {
+		t.Fatalf("delete ctx should not inherit cancellation, got %v", ds.lastCtxErr)
+	}
+}
--- a/internal/adapter/openai/handler_files.go
+++ b/internal/adapter/openai/handler_files.go
@@ -0,0 +1,104 @@
+package openai
+
+import (
+	"io"
+	"net/http"
+	"strings"
+	"time"
+
+	"ds2api/internal/auth"
+	"ds2api/internal/deepseek"
+)
+
+const openAIUploadMaxMemory = 32 << 20 // 32 MiB; passed to ParseMultipartForm as its maxMemory argument
+
+// UploadFile handles the OpenAI-style file upload: it resolves the caller's
+// auth, parses the multipart body and forwards the "file" part to h.DS.
+func (h *Handler) UploadFile(w http.ResponseWriter, r *http.Request) {
+	a, err := h.Auth.Determine(r)
+	if err != nil {
+		status := http.StatusUnauthorized
+		if err == auth.ErrNoAccount {
+			status = http.StatusTooManyRequests
+		}
+		writeOpenAIError(w, status, err.Error())
+		return
+	}
+	defer h.Auth.Release(a)
+	if !strings.HasPrefix(strings.ToLower(strings.TrimSpace(r.Header.Get("Content-Type"))), "multipart/form-data") {
+		writeOpenAIError(w, http.StatusBadRequest, "content-type must be multipart/form-data")
+		return
+	}
+	r.Body = http.MaxBytesReader(w, r.Body, openAIUploadMaxSize) // hard cap on the whole body to prevent OOM
+	if err := r.ParseMultipartForm(openAIUploadMaxMemory); err != nil {
+		status, msg := http.StatusBadRequest, "invalid multipart form"
+		if strings.Contains(strings.ToLower(err.Error()), "too large") {
+			status, msg = http.StatusRequestEntityTooLarge, "file size exceeds limit"
+		}
+		writeOpenAIError(w, status, msg)
+		return
+	}
+	if r.MultipartForm != nil {
+		defer func() { _ = r.MultipartForm.RemoveAll() }()
+	}
+	r = r.WithContext(auth.WithAuth(r.Context(), a))
+	part, fh, err := r.FormFile("file")
+	if err != nil {
+		writeOpenAIError(w, http.StatusBadRequest, "file is required")
+		return
+	}
+	defer func() { _ = part.Close() }()
+	payload, err := io.ReadAll(part)
+	if err != nil {
+		writeOpenAIError(w, http.StatusBadRequest, "failed to read uploaded file")
+		return
+	}
+	mimeType := strings.TrimSpace(fh.Header.Get("Content-Type"))
+	if mimeType == "" && len(payload) > 0 {
+		mimeType = http.DetectContentType(payload) // sniff when the part carries no type
+	}
+	uploaded, err := h.DS.UploadFile(r.Context(), a, deepseek.UploadFileRequest{
+		Filename:    fh.Filename,
+		ContentType: mimeType,
+		Purpose:     strings.TrimSpace(r.FormValue("purpose")),
+		Data:        payload,
+	}, 3)
+	if err != nil {
+		writeOpenAIError(w, http.StatusInternalServerError, "Failed to upload file.")
+		return
+	}
+	if uploaded != nil && uploaded.AccountID == "" {
+		uploaded.AccountID = a.AccountID // attribute the upload to the resolved account
+	}
+	writeJSON(w, http.StatusOK, buildOpenAIFileObject(uploaded))
+}
+
+// buildOpenAIFileObject renders an upload result as an OpenAI-style "file"
+// object. A nil result yields placeholder values (empty id/filename/purpose,
+// zero bytes, status "uploaded"); otherwise the result's fields are copied
+// in. created_at is the current Unix time, and account_id is set only when
+// the result carries a non-empty AccountID.
+func buildOpenAIFileObject(result *deepseek.UploadFileResult) map[string]any {
+	obj := map[string]any{
+		"id":             "",
+		"object":         "file",
+		"bytes":          0,
+		"created_at":     time.Now().Unix(),
+		"filename":       "",
+		"purpose":        "",
+		"status":         "uploaded",
+		"status_details": nil,
+	}
+	if result == nil {
+		return obj
+	}
+	obj["id"] = result.ID
+	obj["bytes"] = result.Bytes
+	obj["filename"] = result.Filename
+	obj["purpose"] = result.Purpose
+	obj["status"] = result.Status
+	if result.AccountID != "" {
+		obj["account_id"] = result.AccountID
+	}
+	return obj
+}
--- a/internal/adapter/openai/handler_routes.go
+++ b/internal/adapter/openai/handler_routes.go
@@ -13,6 +13,13 @@ import (
 	"ds2api/internal/util"
 )

+const (
+	// openAIUploadMaxSize caps the total multipart request body at 100 MiB.
+	openAIUploadMaxSize = 100 << 20
+	// openAIGeneralMaxSize caps the total JSON request body at 100 MiB.
+	openAIGeneralMaxSize = 100 << 20
+)
+
 // writeJSON is a package-internal alias kept to avoid mass-renaming across
 // every call-site in this package.
 var writeJSON = util.WriteJSON
@@ -28,6 +35,13 @@ type Handler struct {
 	responses    *responseStore
 }

+// compatStripReferenceMarkers reports the Store's CompatStripReferenceMarkers
+// setting, defaulting to true when the handler or its Store is nil.
+func (h *Handler) compatStripReferenceMarkers() bool {
+	// Short-circuit keeps the Store call from running on a nil handler/Store.
+	return h == nil || h.Store == nil || h.Store.CompatStripReferenceMarkers()
+}
+
 type streamLease struct {
 	Auth      *auth.RequestAuth
 	ExpiresAt time.Time
@@ -39,6 +53,7 @@ func RegisterRoutes(r chi.Router, h *Handler) {
 	r.Post("/v1/chat/completions", h.ChatCompletions)
 	r.Post("/v1/responses", h.Responses)
 	r.Get("/v1/responses/{response_id}", h.GetResponseByID)
+	r.Post("/v1/files", h.UploadFile)
 	r.Post("/v1/embeddings", h.Embeddings)
 }

--- a/internal/adapter/openai/handler_toolcall_format.go
+++ b/internal/adapter/openai/handler_toolcall_format.go
@@ -1,6 +1,7 @@
 package openai

 import (
+	"ds2api/internal/toolcall"
 	"encoding/json"
 	"fmt"
 	"strings"
@@ -53,7 +54,7 @@ func injectToolPrompt(messages []map[string]any, tools []any, policy util.ToolCh
 	if len(toolSchemas) == 0 {
 		return messages, names
 	}
-	toolPrompt := "You have access to these tools:\n\n" + strings.Join(toolSchemas, "\n\n") + "\n\nWhen you need to use tools, output ONLY this JSON object format:\n{\"tool_calls\": [{\"name\": \"tool_name\", \"input\": {\"param\": \"value\"}}]}\n\n【EXAMPLE】\nUser: Please check the weather in Beijing and Shanghai, and update my todo list.\nAssistant:\n{\"tool_calls\": [\n  {\"name\": \"get_weather\", \"input\": {\"city\": \"Beijing\"}},\n  {\"name\": \"get_weather\", \"input\": {\"city\": \"Shanghai\"}},\n  {\"name\": \"update_todo\", \"input\": {\"todos\": [{\"content\": \"Buy milk\"}, {\"content\": \"Write report\"}]}}\n]}\n\nIMPORTANT:\n1) If calling tools, output ONLY the JSON object above. Do NOT include any extra text.\n2) Do NOT wrap tool-call JSON in markdown/code fences (for example, do not use triple backticks).\n3) After receiving a tool result, you MUST use it to produce the final answer.\n4) Only call another tool when the previous result is missing required data or returned an error.\n5) JSON SYNTAX STRICTLY REQUIRED: All property names MUST be enclosed in double quotes (e.g., \"name\", not name).\n6) ARRAY FORMAT: If providing a list of items, you MUST enclose them in square brackets `[]` (e.g., \"todos\": [{\"item\": \"a\"}, {\"item\": \"b\"}]). DO NOT output comma-separated objects without brackets."
+	toolPrompt := "You have access to these tools:\n\n" + strings.Join(toolSchemas, "\n\n") + "\n\n" + buildToolCallInstructions(names)
 	if policy.Mode == util.ToolChoiceRequired {
 		toolPrompt += "\n7) For this response, you MUST call at least one tool from the allowed list."
 	}
@@ -73,6 +74,11 @@ func injectToolPrompt(messages []map[string]any, tools []any, policy util.ToolCh
 	return messages, names
 }

+// buildToolCallInstructions is a package-local wrapper around toolcall.BuildToolCallInstructions.
+func buildToolCallInstructions(toolNames []string) string {
+	return toolcall.BuildToolCallInstructions(toolNames)
+}
+
 func formatIncrementalStreamToolCallDeltas(deltas []toolCallDelta, ids map[int]string) []map[string]any {
 	if len(deltas) == 0 {
 		return nil
@@ -107,7 +113,7 @@ func formatIncrementalStreamToolCallDeltas(deltas []toolCallDelta, ids map[int]s
 	return out
 }

-func filterIncrementalToolCallDeltasByAllowed(deltas []toolCallDelta, allowedNames []string, seenNames map[int]string) []toolCallDelta {
+func filterIncrementalToolCallDeltasByAllowed(deltas []toolCallDelta, seenNames map[int]string) []toolCallDelta {
 	if len(deltas) == 0 {
 		return nil
 	}
@@ -133,7 +139,7 @@ func filterIncrementalToolCallDeltasByAllowed(deltas []toolCallDelta, allowedNam
 	return out
 }

-func formatFinalStreamToolCallsWithStableIDs(calls []util.ParsedToolCall, ids map[int]string) []map[string]any {
+func formatFinalStreamToolCallsWithStableIDs(calls []toolcall.ParsedToolCall, ids map[int]string) []map[string]any {
 	if len(calls) == 0 {
 		return nil
 	}
--- a/internal/adapter/openai/handler_toolcall_policy.go
+++ b/internal/adapter/openai/handler_toolcall_policy.go
@@ -1,19 +1,9 @@
 package openai

-import "strings"
-
 func (h *Handler) toolcallFeatureMatchEnabled() bool {
-	if h == nil || h.Store == nil {
-		return true
-	}
-	mode := strings.TrimSpace(strings.ToLower(h.Store.ToolcallMode()))
-	return mode == "" || mode == "feature_match"
+	return true
 }

 func (h *Handler) toolcallEarlyEmitHighConfidence() bool {
-	if h == nil || h.Store == nil {
-		return true
-	}
-	level := strings.TrimSpace(strings.ToLower(h.Store.ToolcallEarlyEmitConfidence()))
-	return level == "" || level == "high"
+	return true
 }
--- a/internal/adapter/openai/handler_toolcall_test.go
+++ b/internal/adapter/openai/handler_toolcall_test.go
@@ -3,7 +3,6 @@ package openai
 import (
 	"context"
 	"encoding/json"
-	"fmt"
 	"io"
 	"net/http"
 	"net/http/httptest"
@@ -59,21 +58,6 @@ func parseSSEDataFrames(t *testing.T, body string) ([]map[string]any, bool) {
 	return frames, done
 }

-func streamHasRawToolJSONContent(frames []map[string]any) bool {
-	for _, frame := range frames {
-		choices, _ := frame["choices"].([]any)
-		for _, item := range choices {
-			choice, _ := item.(map[string]any)
-			delta, _ := choice["delta"].(map[string]any)
-			content, _ := delta["content"].(string)
-			if strings.Contains(content, `"tool_calls"`) {
-				return true
-			}
-		}
-	}
-	return false
-}
-
 func streamHasToolCallsDelta(frames []map[string]any) bool {
 	for _, frame := range frames {
 		choices, _ := frame["choices"].([]any)
@@ -101,364 +85,61 @@ func streamFinishReason(frames []map[string]any) string {
 	return ""
 }

-func streamToolCallArgumentChunks(frames []map[string]any) []string {
-	out := make([]string, 0, 4)
-	for _, frame := range frames {
-		choices, _ := frame["choices"].([]any)
-		for _, item := range choices {
-			choice, _ := item.(map[string]any)
-			delta, _ := choice["delta"].(map[string]any)
-			toolCalls, _ := delta["tool_calls"].([]any)
-			for _, tc := range toolCalls {
-				tcm, _ := tc.(map[string]any)
-				fn, _ := tcm["function"].(map[string]any)
-				if args, ok := fn["arguments"].(string); ok && args != "" {
-					out = append(out, args)
-				}
-			}
-		}
-	}
-	return out
-}
-
-func TestHandleNonStreamToolCallInterceptsChatModel(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"search\",\"input\":{\"q\":\"go\"}}]}"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-
-	h.handleNonStream(rec, context.Background(), resp, "cid1", "deepseek-chat", "prompt", false, []string{"search"})
-	if rec.Code != http.StatusOK {
-		t.Fatalf("unexpected status: %d", rec.Code)
-	}
-
-	out := decodeJSONBody(t, rec.Body.String())
-	choices, _ := out["choices"].([]any)
-	if len(choices) != 1 {
-		t.Fatalf("unexpected choices: %#v", out["choices"])
-	}
-	choice, _ := choices[0].(map[string]any)
-	if choice["finish_reason"] != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, got %#v", choice["finish_reason"])
-	}
-	msg, _ := choice["message"].(map[string]any)
-	if msg["content"] != nil {
-		t.Fatalf("expected content nil, got %#v", msg["content"])
-	}
-	toolCalls, _ := msg["tool_calls"].([]any)
-	if len(toolCalls) != 1 {
-		t.Fatalf("expected 1 tool call, got %#v", msg["tool_calls"])
-	}
-}
-
-func TestHandleNonStreamToolCallInterceptsReasonerModel(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/thinking_content","v":"先想一下"}`,
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"search\",\"input\":{\"q\":\"go\"}}]}"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-
-	h.handleNonStream(rec, context.Background(), resp, "cid2", "deepseek-reasoner", "prompt", true, []string{"search"})
-	if rec.Code != http.StatusOK {
-		t.Fatalf("unexpected status: %d", rec.Code)
-	}
-
-	out := decodeJSONBody(t, rec.Body.String())
-	choices, _ := out["choices"].([]any)
-	choice, _ := choices[0].(map[string]any)
-	msg, _ := choice["message"].(map[string]any)
-	if msg["reasoning_content"] != "先想一下" {
-		t.Fatalf("expected reasoning_content, got %#v", msg["reasoning_content"])
-	}
-	if msg["content"] != nil {
-		t.Fatalf("expected content nil, got %#v", msg["content"])
-	}
-	if choice["finish_reason"] != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, got %#v", choice["finish_reason"])
-	}
-}
-
-func TestHandleNonStreamUnknownToolIntercepted(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"not_in_schema\",\"input\":{\"q\":\"go\"}}]}"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-
-	h.handleNonStream(rec, context.Background(), resp, "cid2b", "deepseek-chat", "prompt", false, []string{"search"})
-	if rec.Code != http.StatusOK {
-		t.Fatalf("unexpected status: %d", rec.Code)
-	}
-
-	out := decodeJSONBody(t, rec.Body.String())
-	choices, _ := out["choices"].([]any)
-	choice, _ := choices[0].(map[string]any)
-	if choice["finish_reason"] != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, got %#v", choice["finish_reason"])
-	}
-	msg, _ := choice["message"].(map[string]any)
-	toolCalls, _ := msg["tool_calls"].([]any)
-	if len(toolCalls) != 1 {
-		t.Fatalf("expected tool_calls for unknown schema name, got %#v", msg["tool_calls"])
-	}
-}
-
-func TestHandleNonStreamEmbeddedToolCallExamplePromotesToolCall(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/content","v":"下面是示例："}`,
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"search\",\"input\":{\"q\":\"go\"}}]}"}`,
-		`data: {"p":"response/content","v":"请勿执行。"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-
-	h.handleNonStream(rec, context.Background(), resp, "cid2c", "deepseek-chat", "prompt", false, []string{"search"})
-	if rec.Code != http.StatusOK {
-		t.Fatalf("unexpected status: %d", rec.Code)
-	}
-
-	out := decodeJSONBody(t, rec.Body.String())
-	choices, _ := out["choices"].([]any)
-	choice, _ := choices[0].(map[string]any)
-	if choice["finish_reason"] != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, got %#v", choice["finish_reason"])
-	}
-	msg, _ := choice["message"].(map[string]any)
-	toolCalls, _ := msg["tool_calls"].([]any)
-	if len(toolCalls) != 1 {
-		t.Fatalf("expected one tool_call field for embedded example: %#v", msg["tool_calls"])
-	}
-	content, _ := msg["content"].(string)
-	if strings.Contains(content, `"tool_calls"`) {
-		t.Fatalf("expected raw tool_calls json stripped from content, got %#v", content)
-	}
-}
-
-func TestHandleNonStreamFencedToolCallExampleDoesNotPromoteToolCall(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		"data: {\"p\":\"response/content\",\"v\":\"```json\\n{\\\"tool_calls\\\":[{\\\"name\\\":\\\"search\\\",\\\"input\\\":{\\\"q\\\":\\\"go\\\"}}]}\\n```\"}",
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-
-	h.handleNonStream(rec, context.Background(), resp, "cid2d", "deepseek-chat", "prompt", false, []string{"search"})
-	if rec.Code != http.StatusOK {
-		t.Fatalf("unexpected status: %d", rec.Code)
-	}
-
-	out := decodeJSONBody(t, rec.Body.String())
-	choices, _ := out["choices"].([]any)
-	choice, _ := choices[0].(map[string]any)
-	if choice["finish_reason"] == "tool_calls" {
-		t.Fatalf("expected fenced example to remain content-only, got finish_reason=%#v", choice["finish_reason"])
-	}
-	msg, _ := choice["message"].(map[string]any)
-	toolCalls, _ := msg["tool_calls"].([]any)
-	if len(toolCalls) != 0 {
-		t.Fatalf("expected no tool_call field for fenced example: %#v", msg["tool_calls"])
-	}
-	content, _ := msg["content"].(string)
-	if !strings.Contains(content, `"tool_calls"`) {
-		t.Fatalf("expected fenced example content preserved, got %q", content)
-	}
-}
-
 // Backward-compatible alias for historical test name used in CI logs.
-func TestHandleNonStreamFencedToolCallExamplePromotesToolCall(t *testing.T) {
-	TestHandleNonStreamFencedToolCallExampleDoesNotPromoteToolCall(t)
-}
-
-func TestHandleStreamToolCallInterceptsWithoutRawContentLeak(t *testing.T) {
+func TestHandleNonStreamReturns429WhenUpstreamOutputEmpty(t *testing.T) {
 	h := &Handler{}
 	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"search\""}`,
-		`data: {"p":"response/content","v":",\"input\":{\"q\":\"go\"}}]}"}`,
+		`data: {"p":"response/content","v":""}`,
 		`data: [DONE]`,
 	)
 	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)

-	h.handleStream(rec, req, resp, "cid3", "deepseek-chat", "prompt", false, false, []string{"search"})
-
-	frames, done := parseSSEDataFrames(t, rec.Body.String())
-	if !done {
-		t.Fatalf("expected [DONE], body=%s", rec.Body.String())
+	h.handleNonStream(rec, context.Background(), resp, "cid-empty", "deepseek-chat", "prompt", false, false, nil)
+	if rec.Code != http.StatusTooManyRequests {
+		t.Fatalf("expected status 429 for empty upstream output, got %d body=%s", rec.Code, rec.Body.String())
 	}
-	if !streamHasToolCallsDelta(frames) {
-		t.Fatalf("expected tool_calls delta, body=%s", rec.Body.String())
-	}
-	foundToolIndex := false
-	for _, frame := range frames {
-		choices, _ := frame["choices"].([]any)
-		for _, item := range choices {
-			choice, _ := item.(map[string]any)
-			delta, _ := choice["delta"].(map[string]any)
-			toolCalls, _ := delta["tool_calls"].([]any)
-			for _, tc := range toolCalls {
-				tcm, _ := tc.(map[string]any)
-				if _, ok := tcm["index"].(float64); ok {
-					foundToolIndex = true
-				}
-			}
-		}
-	}
-	if !foundToolIndex {
-		t.Fatalf("expected stream tool_calls item with index, body=%s", rec.Body.String())
-	}
-	if streamHasRawToolJSONContent(frames) {
-		t.Fatalf("raw tool_calls JSON leaked in content delta: %s", rec.Body.String())
-	}
-	if streamFinishReason(frames) != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, body=%s", rec.Body.String())
+	out := decodeJSONBody(t, rec.Body.String())
+	errObj, _ := out["error"].(map[string]any)
+	if asString(errObj["code"]) != "upstream_empty_output" {
+		t.Fatalf("expected code=upstream_empty_output, got %#v", out)
 	}
 }

-func TestHandleStreamToolCallLargeArgumentsStillIntercepted(t *testing.T) {
+func TestHandleNonStreamReturnsContentFilterErrorWhenUpstreamFilteredWithoutOutput(t *testing.T) {
 	h := &Handler{}
-	large := strings.Repeat("a", 9000)
-	payload := fmt.Sprintf(`{"tool_calls":[{"name":"search","input":{"q":"%s"}}]}`, large)
-	splitAt := len(payload) / 2
 	resp := makeSSEHTTPResponse(
-		fmt.Sprintf(`data: {"p":"response/content","v":%q}`, payload[:splitAt]),
-		fmt.Sprintf(`data: {"p":"response/content","v":%q}`, payload[splitAt:]),
+		`data: {"code":"content_filter"}`,
 		`data: [DONE]`,
 	)
 	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)

-	h.handleStream(rec, req, resp, "cid3-large", "deepseek-chat", "prompt", false, false, []string{"search"})
-
-	frames, done := parseSSEDataFrames(t, rec.Body.String())
-	if !done {
-		t.Fatalf("expected [DONE], body=%s", rec.Body.String())
+	h.handleNonStream(rec, context.Background(), resp, "cid-empty-filtered", "deepseek-chat", "prompt", false, false, nil)
+	if rec.Code != http.StatusBadRequest {
+		t.Fatalf("expected status 400 for filtered upstream output, got %d body=%s", rec.Code, rec.Body.String())
 	}
-	if !streamHasToolCallsDelta(frames) {
-		t.Fatalf("expected tool_calls delta, body=%s", rec.Body.String())
-	}
-	if streamHasRawToolJSONContent(frames) {
-		t.Fatalf("raw tool_calls JSON leaked in content delta: %s", rec.Body.String())
-	}
-	if streamFinishReason(frames) != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, body=%s", rec.Body.String())
+	out := decodeJSONBody(t, rec.Body.String())
+	errObj, _ := out["error"].(map[string]any)
+	if asString(errObj["code"]) != "content_filter" {
+		t.Fatalf("expected code=content_filter, got %#v", out)
 	}
 }

-func TestHandleStreamReasonerToolCallInterceptsWithoutRawContentLeak(t *testing.T) {
+func TestHandleNonStreamReturns429WhenUpstreamHasOnlyThinking(t *testing.T) {
 	h := &Handler{}
 	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/thinking_content","v":"思考中"}`,
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"search\",\"input\":{\"q\":\"go\"}}]}"}`,
+		`data: {"p":"response/thinking_content","v":"Only thinking"}`,
 		`data: [DONE]`,
 	)
 	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)

-	h.handleStream(rec, req, resp, "cid4", "deepseek-reasoner", "prompt", true, false, []string{"search"})
-
-	frames, done := parseSSEDataFrames(t, rec.Body.String())
-	if !done {
-		t.Fatalf("expected [DONE], body=%s", rec.Body.String())
+	h.handleNonStream(rec, context.Background(), resp, "cid-thinking-only", "deepseek-reasoner", "prompt", true, false, nil)
+	if rec.Code != http.StatusTooManyRequests {
+		t.Fatalf("expected status 429 for thinking-only upstream output, got %d body=%s", rec.Code, rec.Body.String())
 	}
-	if !streamHasToolCallsDelta(frames) {
-		t.Fatalf("expected tool_calls delta, body=%s", rec.Body.String())
-	}
-	foundToolIndex := false
-	for _, frame := range frames {
-		choices, _ := frame["choices"].([]any)
-		for _, item := range choices {
-			choice, _ := item.(map[string]any)
-			delta, _ := choice["delta"].(map[string]any)
-			toolCalls, _ := delta["tool_calls"].([]any)
-			for _, tc := range toolCalls {
-				tcm, _ := tc.(map[string]any)
-				if _, ok := tcm["index"].(float64); ok {
-					foundToolIndex = true
-				}
-			}
-		}
-	}
-	if !foundToolIndex {
-		t.Fatalf("expected stream tool_calls item with index, body=%s", rec.Body.String())
-	}
-	if streamHasRawToolJSONContent(frames) {
-		t.Fatalf("raw tool_calls JSON leaked in content delta: %s", rec.Body.String())
-	}
-	if streamFinishReason(frames) != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, body=%s", rec.Body.String())
-	}
-
-	hasThinkingDelta := false
-	for _, frame := range frames {
-		choices, _ := frame["choices"].([]any)
-		for _, item := range choices {
-			choice, _ := item.(map[string]any)
-			delta, _ := choice["delta"].(map[string]any)
-			if _, ok := delta["reasoning_content"]; ok {
-				hasThinkingDelta = true
-			}
-		}
-	}
-	if !hasThinkingDelta {
-		t.Fatalf("expected reasoning_content delta in reasoner stream: %s", rec.Body.String())
-	}
-}
-
-func TestHandleStreamUnknownToolEmitsToolCall(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"not_in_schema\",\"input\":{\"q\":\"go\"}}]}"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
-
-	h.handleStream(rec, req, resp, "cid5", "deepseek-chat", "prompt", false, false, []string{"search"})
-
-	frames, done := parseSSEDataFrames(t, rec.Body.String())
-	if !done {
-		t.Fatalf("expected [DONE], body=%s", rec.Body.String())
-	}
-	if !streamHasToolCallsDelta(frames) {
-		t.Fatalf("expected tool_calls delta for unknown schema name, body=%s", rec.Body.String())
-	}
-	if streamHasRawToolJSONContent(frames) {
-		t.Fatalf("did not expect raw tool_calls json leak for unknown schema name: %s", rec.Body.String())
-	}
-	if streamFinishReason(frames) != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, body=%s", rec.Body.String())
-	}
-}
-
-func TestHandleStreamUnknownToolNoArgsEmitsToolCall(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"not_in_schema\"}]}"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
-
-	h.handleStream(rec, req, resp, "cid5b", "deepseek-chat", "prompt", false, false, []string{"search"})
-
-	frames, done := parseSSEDataFrames(t, rec.Body.String())
-	if !done {
-		t.Fatalf("expected [DONE], body=%s", rec.Body.String())
-	}
-	if !streamHasToolCallsDelta(frames) {
-		t.Fatalf("expected tool_calls delta for unknown schema name (no args), body=%s", rec.Body.String())
-	}
-	if streamHasRawToolJSONContent(frames) {
-		t.Fatalf("did not expect raw tool_calls json leak for unknown schema name (no args): %s", rec.Body.String())
-	}
-	if streamFinishReason(frames) != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, body=%s", rec.Body.String())
+	out := decodeJSONBody(t, rec.Body.String())
+	errObj, _ := out["error"].(map[string]any)
+	if asString(errObj["code"]) != "upstream_empty_output" {
+		t.Fatalf("expected code=upstream_empty_output, got %#v", out)
 	}
 }

@@ -500,287 +181,6 @@ func TestHandleStreamToolsPlainTextStreamsBeforeFinish(t *testing.T) {
 	}
 }

-func TestHandleStreamToolCallMixedWithPlainTextSegments(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/content","v":"下面是示例："}`,
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"search\",\"input\":{\"q\":\"go\"}}]}"}`,
-		`data: {"p":"response/content","v":"请勿执行。"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
-
-	h.handleStream(rec, req, resp, "cid7", "deepseek-chat", "prompt", false, false, []string{"search"})
-
-	frames, done := parseSSEDataFrames(t, rec.Body.String())
-	if !done {
-		t.Fatalf("expected [DONE], body=%s", rec.Body.String())
-	}
-	if !streamHasToolCallsDelta(frames) {
-		t.Fatalf("expected tool_calls delta in mixed prose stream, body=%s", rec.Body.String())
-	}
-	content := strings.Builder{}
-	for _, frame := range frames {
-		choices, _ := frame["choices"].([]any)
-		for _, item := range choices {
-			choice, _ := item.(map[string]any)
-			delta, _ := choice["delta"].(map[string]any)
-			if c, ok := delta["content"].(string); ok {
-				content.WriteString(c)
-			}
-		}
-	}
-	got := content.String()
-	if !strings.Contains(got, "下面是示例：") || !strings.Contains(got, "请勿执行。") {
-		t.Fatalf("expected pre/post plain text to pass sieve, got=%q", got)
-	}
-	if streamFinishReason(frames) != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls for mixed prose, body=%s", rec.Body.String())
-	}
-}
-
-func TestHandleStreamToolCallAfterLeadingTextRemainsText(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/content","v":"我将调用工具。"}`,
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"search\",\"input\":{\"q\":\"go\"}}]}"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
-
-	h.handleStream(rec, req, resp, "cid7b", "deepseek-chat", "prompt", false, false, []string{"search"})
-
-	frames, done := parseSSEDataFrames(t, rec.Body.String())
-	if !done {
-		t.Fatalf("expected [DONE], body=%s", rec.Body.String())
-	}
-	if !streamHasToolCallsDelta(frames) {
-		t.Fatalf("expected tool_calls delta, body=%s", rec.Body.String())
-	}
-	content := strings.Builder{}
-	for _, frame := range frames {
-		choices, _ := frame["choices"].([]any)
-		for _, item := range choices {
-			choice, _ := item.(map[string]any)
-			delta, _ := choice["delta"].(map[string]any)
-			if c, ok := delta["content"].(string); ok {
-				content.WriteString(c)
-			}
-		}
-	}
-	got := content.String()
-	if !strings.Contains(got, "我将调用工具。") {
-		t.Fatalf("expected leading text to keep streaming, got=%q", got)
-	}
-
-	if streamFinishReason(frames) != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, body=%s", rec.Body.String())
-	}
-}
-
-func TestHandleStreamToolCallWithSameChunkTrailingTextRemainsText(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"search\",\"input\":{\"q\":\"go\"}}]}接下来我会继续说明。"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
-
-	h.handleStream(rec, req, resp, "cid7c", "deepseek-chat", "prompt", false, false, []string{"search"})
-
-	frames, done := parseSSEDataFrames(t, rec.Body.String())
-	if !done {
-		t.Fatalf("expected [DONE], body=%s", rec.Body.String())
-	}
-	if !streamHasToolCallsDelta(frames) {
-		t.Fatalf("expected tool_calls delta, body=%s", rec.Body.String())
-	}
-	content := strings.Builder{}
-	for _, frame := range frames {
-		choices, _ := frame["choices"].([]any)
-		for _, item := range choices {
-			choice, _ := item.(map[string]any)
-			delta, _ := choice["delta"].(map[string]any)
-			if c, ok := delta["content"].(string); ok {
-				content.WriteString(c)
-			}
-		}
-	}
-	got := content.String()
-	if !strings.Contains(got, "接下来我会继续说明。") {
-		t.Fatalf("expected trailing plain text to be preserved, got=%q", got)
-	}
-
-	if streamFinishReason(frames) != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, body=%s", rec.Body.String())
-	}
-}
-
-func TestHandleStreamFencedToolCallSnippetPromotesToolCall(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		fmt.Sprintf(`data: {"p":"response/content","v":%q}`, "下面是调用示例：\n```json\n"),
-		fmt.Sprintf(`data: {"p":"response/content","v":%q}`, "{\"tool_calls\":[{\"name\":\"search\",\"input\":{\"q\":\"go\"}}]}\n```\n仅示例，不要执行。"),
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
-
-	h.handleStream(rec, req, resp, "cid7f", "deepseek-chat", "prompt", false, false, []string{"search"})
-
-	frames, done := parseSSEDataFrames(t, rec.Body.String())
-	if !done {
-		t.Fatalf("expected [DONE], body=%s", rec.Body.String())
-	}
-	if !streamHasToolCallsDelta(frames) {
-		t.Fatalf("expected tool_calls delta for fenced snippet, body=%s", rec.Body.String())
-	}
-	content := strings.Builder{}
-	for _, frame := range frames {
-		choices, _ := frame["choices"].([]any)
-		for _, item := range choices {
-			choice, _ := item.(map[string]any)
-			delta, _ := choice["delta"].(map[string]any)
-			if c, ok := delta["content"].(string); ok {
-				content.WriteString(c)
-			}
-		}
-	}
-	got := content.String()
-	if strings.Contains(strings.ToLower(got), "tool_calls") {
-		t.Fatalf("expected raw fenced tool_calls snippet stripped from content, got=%q", got)
-	}
-	if strings.Contains(strings.ToLower(got), "```json") || strings.Contains(got, "\n```\n") {
-		t.Fatalf("expected consumed fenced tool payload to not leave empty code fence, got=%q", got)
-	}
-	if streamFinishReason(frames) != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, body=%s", rec.Body.String())
-	}
-}
-
-func TestHandleStreamStandaloneToolCallAfterClosedFenceKeepsFence(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		fmt.Sprintf(`data: {"p":"response/content","v":%q}`, "先给一个代码示例：\n```text\nhello\n```\n"),
-		fmt.Sprintf(`data: {"p":"response/content","v":%q}`, "{\"tool_calls\":[{\"name\":\"search\",\"input\":{\"q\":\"go\"}}]}"),
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
-
-	h.handleStream(rec, req, resp, "cid7g", "deepseek-chat", "prompt", false, false, []string{"search"})
-
-	frames, done := parseSSEDataFrames(t, rec.Body.String())
-	if !done {
-		t.Fatalf("expected [DONE], body=%s", rec.Body.String())
-	}
-	if !streamHasToolCallsDelta(frames) {
-		t.Fatalf("expected tool_calls delta for standalone payload, body=%s", rec.Body.String())
-	}
-	content := strings.Builder{}
-	for _, frame := range frames {
-		choices, _ := frame["choices"].([]any)
-		for _, item := range choices {
-			choice, _ := item.(map[string]any)
-			delta, _ := choice["delta"].(map[string]any)
-			if c, ok := delta["content"].(string); ok {
-				content.WriteString(c)
-			}
-		}
-	}
-	got := content.String()
-	if !strings.Contains(got, "```") {
-		t.Fatalf("expected closed fence before standalone tool json to be preserved, got=%q", got)
-	}
-	if streamFinishReason(frames) != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, body=%s", rec.Body.String())
-	}
-}
-
-func TestHandleStreamToolCallKeyAppearsLateRemainsText(t *testing.T) {
-	h := &Handler{}
-	spaces := strings.Repeat(" ", 200)
-	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/content","v":"{`+spaces+`"}`,
-		`data: {"p":"response/content","v":"\"tool_calls\":[{\"name\":\"search\",\"input\":{\"q\":\"go\"}}]}"}`,
-		`data: {"p":"response/content","v":"后置正文C。"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
-
-	h.handleStream(rec, req, resp, "cid8", "deepseek-chat", "prompt", false, false, []string{"search"})
-
-	frames, done := parseSSEDataFrames(t, rec.Body.String())
-	if !done {
-		t.Fatalf("expected [DONE], body=%s", rec.Body.String())
-	}
-	if !streamHasToolCallsDelta(frames) {
-		t.Fatalf("expected tool_calls delta, body=%s", rec.Body.String())
-	}
-	content := strings.Builder{}
-	for _, frame := range frames {
-		choices, _ := frame["choices"].([]any)
-		for _, item := range choices {
-			choice, _ := item.(map[string]any)
-			delta, _ := choice["delta"].(map[string]any)
-			if c, ok := delta["content"].(string); ok {
-				content.WriteString(c)
-			}
-		}
-	}
-	got := content.String()
-	if !strings.Contains(got, "后置正文C。") {
-		t.Fatalf("expected stream to continue after tool json convergence, got=%q", got)
-	}
-	if streamFinishReason(frames) != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, body=%s", rec.Body.String())
-	}
-}
-
-func TestHandleStreamInvalidToolJSONDoesNotLeakRawObject(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/content","v":"前置正文D。"}`,
-		`data: {"p":"response/content","v":"{'tool_calls':[{'name':'search','input':{'q':'go'}}]}"}`,
-		`data: {"p":"response/content","v":"后置正文E。"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
-
-	h.handleStream(rec, req, resp, "cid9", "deepseek-chat", "prompt", false, false, []string{"search"})
-
-	frames, done := parseSSEDataFrames(t, rec.Body.String())
-	if !done {
-		t.Fatalf("expected [DONE], body=%s", rec.Body.String())
-	}
-	if streamHasToolCallsDelta(frames) {
-		t.Fatalf("did not expect tool_calls delta for invalid json, body=%s", rec.Body.String())
-	}
-	content := strings.Builder{}
-	for _, frame := range frames {
-		choices, _ := frame["choices"].([]any)
-		for _, item := range choices {
-			choice, _ := item.(map[string]any)
-			delta, _ := choice["delta"].(map[string]any)
-			if c, ok := delta["content"].(string); ok {
-				content.WriteString(c)
-			}
-		}
-	}
-	got := content.String()
-	if !strings.Contains(got, "前置正文D。") || !strings.Contains(got, "后置正文E。") {
-		t.Fatalf("expected pre/post plain text to remain, got=%q", content.String())
-	}
-	if !strings.Contains(strings.ToLower(got), "tool_calls") {
-		t.Fatalf("expected invalid embedded tool-like json to pass through as text, got=%q", got)
-	}
-}
-
 func TestHandleStreamIncompleteCapturedToolJSONFlushesAsTextOnFinalize(t *testing.T) {
 	h := &Handler{}
 	resp := makeSSEHTTPResponse(
@@ -814,108 +214,3 @@ func TestHandleStreamIncompleteCapturedToolJSONFlushesAsTextOnFinalize(t *testin
 		t.Fatalf("expected incomplete capture to flush as plain text instead of stalling, got=%q", content.String())
 	}
 }
-
-func TestHandleStreamToolCallArgumentsEmitAsSingleCompletedChunk(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"search\",\"input\":{\"q\":\"go"}`,
-		`data: {"p":"response/content","v":"lang\",\"page\":1}}]}"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
-
-	h.handleStream(rec, req, resp, "cid11", "deepseek-chat", "prompt", false, false, []string{"search"})
-
-	frames, done := parseSSEDataFrames(t, rec.Body.String())
-	if !done {
-		t.Fatalf("expected [DONE], body=%s", rec.Body.String())
-	}
-	if !streamHasToolCallsDelta(frames) {
-		t.Fatalf("expected tool_calls delta, body=%s", rec.Body.String())
-	}
-	if streamHasRawToolJSONContent(frames) {
-		t.Fatalf("raw tool_calls JSON leaked in content delta: %s", rec.Body.String())
-	}
-	argChunks := streamToolCallArgumentChunks(frames)
-	if len(argChunks) == 0 {
-		t.Fatalf("expected tool call arguments chunk, got=%v body=%s", argChunks, rec.Body.String())
-	}
-	joined := strings.Join(argChunks, "")
-	if !strings.Contains(joined, `"q":"golang"`) || !strings.Contains(joined, `"page":1`) {
-		t.Fatalf("unexpected merged arguments stream: %q", joined)
-	}
-	if streamFinishReason(frames) != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, body=%s", rec.Body.String())
-	}
-}
-
-func TestHandleStreamMultiToolCallDoesNotMergeNamesOrArguments(t *testing.T) {
-	h := &Handler{}
-	resp := makeSSEHTTPResponse(
-		`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"search_web\",\"input\":{\"query\":\"latest ai news\"}},{"}`,
-		`data: {"p":"response/content","v":"\"name\":\"eval_javascript\",\"input\":{\"code\":\"1+1\"}}]}"}`,
-		`data: [DONE]`,
-	)
-	rec := httptest.NewRecorder()
-	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
-
-	h.handleStream(rec, req, resp, "cid12", "deepseek-chat", "prompt", false, false, []string{"search_web", "eval_javascript"})
-
-	frames, done := parseSSEDataFrames(t, rec.Body.String())
-	if !done {
-		t.Fatalf("expected [DONE], body=%s", rec.Body.String())
-	}
-	if !streamHasToolCallsDelta(frames) {
-		t.Fatalf("expected tool_calls delta, body=%s", rec.Body.String())
-	}
-
-	foundSearch := false
-	foundEval := false
-	foundIndex1 := false
-	toolCallsDeltaLens := make([]int, 0, 2)
-	for _, frame := range frames {
-		choices, _ := frame["choices"].([]any)
-		for _, item := range choices {
-			choice, _ := item.(map[string]any)
-			delta, _ := choice["delta"].(map[string]any)
-			rawToolCalls, hasToolCalls := delta["tool_calls"]
-			if !hasToolCalls {
-				continue
-			}
-			toolCalls, _ := rawToolCalls.([]any)
-			toolCallsDeltaLens = append(toolCallsDeltaLens, len(toolCalls))
-			for _, tc := range toolCalls {
-				tcm, _ := tc.(map[string]any)
-				if idx, ok := tcm["index"].(float64); ok && int(idx) == 1 {
-					foundIndex1 = true
-				}
-				fn, _ := tcm["function"].(map[string]any)
-				name, _ := fn["name"].(string)
-				switch name {
-				case "search_web":
-					foundSearch = true
-				case "eval_javascript":
-					foundEval = true
-				case "search_webeval_javascript":
-					t.Fatalf("unexpected merged tool name: %s, body=%s", name, rec.Body.String())
-				}
-				if args, ok := fn["arguments"].(string); ok && strings.Contains(args, `}{"`) {
-					t.Fatalf("unexpected concatenated tool arguments: %q, body=%s", args, rec.Body.String())
-				}
-			}
-		}
-	}
-	if !foundSearch || !foundEval {
-		t.Fatalf("expected both tool names in stream deltas, foundSearch=%v foundEval=%v body=%s", foundSearch, foundEval, rec.Body.String())
-	}
-	if len(toolCallsDeltaLens) != 1 || toolCallsDeltaLens[0] != 2 {
-		t.Fatalf("expected exactly one tool_calls delta with two calls, got lens=%v body=%s", toolCallsDeltaLens, rec.Body.String())
-	}
-	if !foundIndex1 {
-		t.Fatalf("expected second tool call index in stream deltas, body=%s", rec.Body.String())
-	}
-	if streamFinishReason(frames) != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, body=%s", rec.Body.String())
-	}
-}
--- a/internal/adapter/openai/leaked_output_sanitize.go
+++ b/internal/adapter/openai/leaked_output_sanitize.go
@@ -0,0 +1,115 @@
+package openai
+
+import (
+	"regexp"
+	"strings"
+)
+
+var emptyJSONFencePattern = regexp.MustCompile("(?is)```json\\s*```")
+var leakedToolCallArrayPattern = regexp.MustCompile(`(?is)\[\{\s*"function"\s*:\s*\{[\s\S]*?\}\s*,\s*"id"\s*:\s*"call[^"]*"\s*,\s*"type"\s*:\s*"function"\s*}\]`)
+var leakedToolResultBlobPattern = regexp.MustCompile(`(?is)<\s*\|\s*tool\s*\|\s*>\s*\{[\s\S]*?"tool_call_id"\s*:\s*"call[^"]*"\s*}`)
+
+var leakedThinkTagPattern = regexp.MustCompile(`(?is)</?\s*think\s*>`)
+
+// leakedBOSMarkerPattern matches DeepSeek BOS markers in BOTH forms:
+//   - ASCII underscore: <｜begin_of_sentence｜>
+//   - U+2581 variant:   <｜begin▁of▁sentence｜>
+var leakedBOSMarkerPattern = regexp.MustCompile(`(?i)<[｜\|]\s*begin[_▁]of[_▁]sentence\s*[｜\|]>`)
+
+// leakedMetaMarkerPattern matches the remaining DeepSeek special tokens (assistant, tool, and the end_of_* markers) in BOTH forms:
+//   - ASCII underscore: <｜end_of_sentence｜>, <｜end_of_thinking｜>, <｜end_of_toolresults｜>, <｜end_of_instructions｜>
+//   - U+2581 variant:   <｜end▁of▁sentence｜>, <｜end▁of▁thinking｜>, <｜end▁of▁toolresults｜>, <｜end▁of▁instructions｜>
+var leakedMetaMarkerPattern = regexp.MustCompile(`(?i)<[｜\|]\s*(?:assistant|tool|end[_▁]of[_▁]sentence|end[_▁]of[_▁]thinking|end[_▁]of[_▁]toolresults|end[_▁]of[_▁]instructions)\s*[｜\|]>`)
+
+// leakedAgentXMLBlockPatterns catch agent-style XML blocks that leak through
+// when the sieve fails to capture them. These are applied only to complete
+// wrapper blocks so standalone "<result>" examples in normal output remain
+// untouched.
+var leakedAgentXMLBlockPatterns = []*regexp.Regexp{
+	regexp.MustCompile(`(?is)<attempt_completion\b[^>]*>(.*?)</attempt_completion>`),
+	regexp.MustCompile(`(?is)<ask_followup_question\b[^>]*>(.*?)</ask_followup_question>`),
+	regexp.MustCompile(`(?is)<new_task\b[^>]*>(.*?)</new_task>`),
+}
+
+var leakedAgentWrapperTagPattern = regexp.MustCompile(`(?is)</?(?:attempt_completion|ask_followup_question|new_task)\b[^>]*>`)
+var leakedAgentWrapperPlusResultOpenPattern = regexp.MustCompile(`(?is)<(?:attempt_completion|ask_followup_question|new_task)\b[^>]*>\s*<result>`)
+var leakedAgentResultPlusWrapperClosePattern = regexp.MustCompile(`(?is)</result>\s*</(?:attempt_completion|ask_followup_question|new_task)\b[^>]*>`)
+var leakedAgentResultTagPattern = regexp.MustCompile(`(?is)</?result>`)
+
+func sanitizeLeakedOutput(text string) string { // strips leaked fences, tool-call blobs, think tags, special-token markers and agent XML wrappers
+	if text == "" {
+		return text
+	}
+	out := emptyJSONFencePattern.ReplaceAllString(text, "")
+	out = leakedToolCallArrayPattern.ReplaceAllString(out, "")
+	out = leakedToolResultBlobPattern.ReplaceAllString(out, "")
+	out = stripDanglingThinkSuffix(out) // runs first: it locates the unclosed tag via leakedThinkTagPattern, whose matches the next line deletes
+	out = leakedThinkTagPattern.ReplaceAllString(out, "")
+	out = leakedBOSMarkerPattern.ReplaceAllString(out, "")
+	out = leakedMetaMarkerPattern.ReplaceAllString(out, "")
+	out = sanitizeLeakedAgentXMLBlocks(out)
+	return out
+}
+
+func stripDanglingThinkSuffix(text string) string { // cuts text at the outermost think tag that is opened but never closed
+	matches := leakedThinkTagPattern.FindAllStringIndex(text, -1)
+	if len(matches) == 0 {
+		return text
+	}
+	depth := 0
+	lastOpen := -1
+	for _, loc := range matches {
+		tag := strings.ToLower(text[loc[0]:loc[1]])
+		compact := strings.ReplaceAll(strings.ReplaceAll(strings.TrimSpace(tag), " ", ""), "\t", "") // space/tab-free copy, used only to tell closing tags from opening ones
+		if strings.HasPrefix(compact, "</") {
+			if depth > 0 { // a closing tag with nothing open is ignored
+				depth--
+				if depth == 0 {
+					lastOpen = -1
+				}
+			}
+			continue
+		}
+		if depth == 0 {
+			lastOpen = loc[0] // byte offset where the outermost currently-open tag starts
+		}
+		depth++
+	}
+	if depth == 0 || lastOpen < 0 { // every opening tag was closed: nothing to cut
+		return text
+	}
+	prefix := text[:lastOpen]
+	if strings.TrimSpace(prefix) == "" { // whitespace-only prefix collapses to ""; any other prefix is returned untrimmed
+		return ""
+	}
+	return prefix
+}
+
+func sanitizeLeakedAgentXMLBlocks(text string) string { // unwraps leaked agent wrapper blocks, keeping their inner text
+	out := text
+	for _, pattern := range leakedAgentXMLBlockPatterns {
+		out = pattern.ReplaceAllStringFunc(out, func(match string) string {
+			submatches := pattern.FindStringSubmatch(match) // re-match the single block to read capture group 1 (its inner text)
+			if len(submatches) < 2 {
+				return match
+			}
+			// Preserve the inner text so leaked agent instructions do not erase
+			// the actual answer, but strip the wrapper/result markup itself.
+			return leakedAgentResultTagPattern.ReplaceAllString(submatches[1], "")
+		})
+	}
+	// Fallback for truncated output streams: strip any dangling wrapper tags
+	// that were not part of a complete block replacement. If we detect leaked
+	// wrapper tags, strip only adjacent <result> tags to avoid exposing agent
+	// markup without altering unrelated user-visible <result> examples.
+	if leakedAgentWrapperTagPattern.MatchString(out) { // a wrapper tag is still present after the complete-block pass
+		out = leakedAgentWrapperPlusResultOpenPattern.ReplaceAllStringFunc(out, func(match string) string {
+			return leakedAgentResultTagPattern.ReplaceAllString(match, "")
+		})
+		out = leakedAgentResultPlusWrapperClosePattern.ReplaceAllStringFunc(out, func(match string) string {
+			return leakedAgentResultTagPattern.ReplaceAllString(match, "")
+		})
+		out = leakedAgentWrapperTagPattern.ReplaceAllString(out, "") // last step: delete the wrapper tags themselves
+	}
+	return out
+}
--- a/internal/adapter/openai/leaked_output_sanitize_test.go
+++ b/internal/adapter/openai/leaked_output_sanitize_test.go
@@ -0,0 +1,84 @@
+package openai
+
+import "testing"
+
+func TestSanitizeLeakedOutputRemovesEmptyJSONFence(t *testing.T) {
+	raw := "before\n```json\n```\nafter"
+	got := sanitizeLeakedOutput(raw)
+	if got != "before\n\nafter" {
+		t.Fatalf("unexpected sanitized empty json fence: %q", got)
+	}
+}
+
+func TestSanitizeLeakedOutputRemovesLeakedWireToolCallAndResult(t *testing.T) {
+	raw := "开始\n[{\"function\":{\"arguments\":\"{\\\"command\\\":\\\"java -version\\\"}\",\"name\":\"exec\"},\"id\":\"callb9a321\",\"type\":\"function\"}]< | Tool | >{\"content\":\"openjdk version 21\",\"tool_call_id\":\"callb9a321\"}\n结束"
+	got := sanitizeLeakedOutput(raw)
+	if got != "开始\n\n结束" {
+		t.Fatalf("unexpected sanitize result for leaked wire format: %q", got)
+	}
+}
+
+func TestSanitizeLeakedOutputRemovesStandaloneMetaMarkers(t *testing.T) {
+	raw := "A<| end_of_sentence |><| Assistant |>B<| end_of_thinking |>C<｜end▁of▁thinking｜>D<｜end▁of▁sentence｜>E<| end_of_toolresults |>F<｜end▁of▁instructions｜>G"
+	got := sanitizeLeakedOutput(raw)
+	if got != "ABCDEFG" {
+		t.Fatalf("unexpected sanitize result for meta markers: %q", got)
+	}
+}
+
+func TestSanitizeLeakedOutputRemovesThinkAndBosMarkers(t *testing.T) {
+	raw := "A<think>B</think>C<｜begin▁of▁sentence｜>D<| begin_of_sentence |>E<｜begin_of_sentence｜>F"
+	got := sanitizeLeakedOutput(raw)
+	if got != "ABCDEF" {
+		t.Fatalf("unexpected sanitize result for think/BOS markers: %q", got)
+	}
+}
+
+func TestSanitizeLeakedOutputRemovesDanglingThinkBlock(t *testing.T) {
+	raw := "Answer prefix<think>internal reasoning that never closes"
+	got := sanitizeLeakedOutput(raw)
+	if got != "Answer prefix" {
+		t.Fatalf("unexpected sanitize result for dangling think block: %q", got)
+	}
+}
+
+func TestSanitizeLeakedOutputRemovesAgentXMLLeaks(t *testing.T) {
+	raw := "Done.<attempt_completion><result>Some final answer</result></attempt_completion>"
+	got := sanitizeLeakedOutput(raw)
+	if got != "Done.Some final answer" {
+		t.Fatalf("unexpected sanitize result for agent XML leak: %q", got)
+	}
+}
+
+func TestSanitizeLeakedOutputPreservesStandaloneResultTags(t *testing.T) {
+	raw := "Example XML: <result>value</result>"
+	got := sanitizeLeakedOutput(raw)
+	if got != raw {
+		t.Fatalf("unexpected sanitize result for standalone result tag: %q", got)
+	}
+}
+
+func TestSanitizeLeakedOutputRemovesDanglingAgentXMLOpeningTags(t *testing.T) {
+	raw := "Done.<attempt_completion><result>Some final answer"
+	got := sanitizeLeakedOutput(raw)
+	if got != "Done.Some final answer" {
+		t.Fatalf("unexpected sanitize result for dangling opening tags: %q", got)
+	}
+}
+
+func TestSanitizeLeakedOutputRemovesDanglingAgentXMLClosingTags(t *testing.T) {
+	raw := "Done.Some final answer</result></attempt_completion>"
+	got := sanitizeLeakedOutput(raw)
+	if got != "Done.Some final answer" {
+		t.Fatalf("unexpected sanitize result for dangling closing tags: %q", got)
+	}
+}
+
+func TestSanitizeLeakedOutputPreservesUnrelatedResultTagsWhenWrapperLeaks(t *testing.T) {
+	raw := "Done.<attempt_completion><result>Some final answer\nExample XML: <result>value</result>"
+	got := sanitizeLeakedOutput(raw)
+	want := "Done.Some final answer\nExample XML: <result>value</result>"
+	if got != want {
+		t.Fatalf("unexpected sanitize result for mixed leaked wrapper + xml example: %q", got)
+	}
+}
--- a/internal/adapter/openai/message_normalize.go
+++ b/internal/adapter/openai/message_normalize.go
@@ -1,7 +1,6 @@
 package openai

 import (
-	"encoding/json"
 	"strings"

 	"ds2api/internal/prompt"
@@ -55,7 +54,18 @@ func normalizeOpenAIMessagesForPrompt(raw []any, traceID string) []map[string]an
 }

 func buildAssistantContentForPrompt(msg map[string]any) string {
-	return strings.TrimSpace(normalizeOpenAIContentForPrompt(msg["content"]))
+	content := strings.TrimSpace(normalizeOpenAIContentForPrompt(msg["content"]))
+	toolHistory := prompt.FormatToolCallsForPrompt(msg["tool_calls"])
+	switch {
+	case content == "" && toolHistory == "":
+		return ""
+	case content == "":
+		return toolHistory
+	case toolHistory == "":
+		return content
+	default:
+		return content + "\n\n" + toolHistory
+	}
 }

 func buildToolContentForPrompt(msg map[string]any) string {
@@ -70,18 +80,6 @@ func normalizeOpenAIContentForPrompt(v any) string {
 	return prompt.NormalizeContent(v)
 }

-func normalizeToolArgumentString(raw string) string {
-	trimmed := strings.TrimSpace(raw)
-	if trimmed == "" {
-		return ""
-	}
-	if looksLikeConcatenatedJSON(trimmed) {
-		// Keep original payload to avoid silent argument rewrites.
-		return raw
-	}
-	return trimmed
-}
-
 func normalizeOpenAIRoleForPrompt(role string) string {
 	role = strings.ToLower(strings.TrimSpace(role))
 	if role == "developer" {
@@ -96,20 +94,3 @@ func asString(v any) string {
 	}
 	return ""
 }
-
-func looksLikeConcatenatedJSON(raw string) bool {
-	trimmed := strings.TrimSpace(raw)
-	if trimmed == "" {
-		return false
-	}
-	if strings.Contains(trimmed, "}{") || strings.Contains(trimmed, "][") {
-		return true
-	}
-	dec := json.NewDecoder(strings.NewReader(trimmed))
-	var first any
-	if err := dec.Decode(&first); err != nil {
-		return false
-	}
-	var second any
-	return dec.Decode(&second) == nil
-}
--- a/internal/adapter/openai/message_normalize_test.go
+++ b/internal/adapter/openai/message_normalize_test.go
@@ -34,20 +34,23 @@ func TestNormalizeOpenAIMessagesForPrompt_AssistantToolCallsAndToolResult(t *tes
 	}

 	normalized := normalizeOpenAIMessagesForPrompt(raw, "")
-	if len(normalized) != 3 {
-		t.Fatalf("expected 3 normalized messages with tool-call-only assistant turn omitted, got %d", len(normalized))
+	if len(normalized) != 4 {
+		t.Fatalf("expected 4 normalized messages with assistant tool history preserved, got %d", len(normalized))
 	}
-	toolContent, _ := normalized[2]["content"].(string)
-	if !strings.Contains(toolContent, `"temp":18`) {
-		t.Fatalf("tool result should be transparently forwarded, got %q", toolContent)
+	assistantContent, _ := normalized[2]["content"].(string)
+	if !strings.Contains(assistantContent, "<tool_calls>") {
+		t.Fatalf("assistant tool history should be preserved in XML form, got %q", assistantContent)
 	}
-	if strings.Contains(toolContent, "[TOOL_RESULT_HISTORY]") {
-		t.Fatalf("tool history marker should not be injected: %q", toolContent)
+	if !strings.Contains(assistantContent, "<tool_name>get_weather</tool_name>") {
+		t.Fatalf("expected tool name in preserved history, got %q", assistantContent)
+	}
+	if !strings.Contains(normalized[3]["content"].(string), `"temp":18`) {
+		t.Fatalf("tool result should be transparently forwarded, got %#v", normalized[3]["content"])
 	}

 	prompt := util.MessagesPrepare(normalized)
-	if strings.Contains(prompt, "[TOOL_CALL_HISTORY]") || strings.Contains(prompt, "[TOOL_RESULT_HISTORY]") {
-		t.Fatalf("expected no synthetic history markers in prompt: %q", prompt)
+	if !strings.Contains(prompt, "<tool_calls>") {
+		t.Fatalf("expected preserved assistant tool history in prompt: %q", prompt)
 	}
 }

@@ -170,8 +173,15 @@ func TestNormalizeOpenAIMessagesForPrompt_AssistantMultipleToolCallsRemainSepara
 	}

 	normalized := normalizeOpenAIMessagesForPrompt(raw, "")
-	if len(normalized) != 0 {
-		t.Fatalf("expected assistant tool_call-only message omitted, got %#v", normalized)
+	if len(normalized) != 1 {
+		t.Fatalf("expected assistant tool_call-only message preserved, got %#v", normalized)
+	}
+	content, _ := normalized[0]["content"].(string)
+	if strings.Count(content, "<tool_call>") != 2 {
+		t.Fatalf("expected two preserved tool call blocks, got %q", content)
+	}
+	if !strings.Contains(content, "<tool_name>search_web</tool_name>") || !strings.Contains(content, "<tool_name>eval_javascript</tool_name>") {
+		t.Fatalf("expected both tool names in preserved history, got %q", content)
 	}
 }

@@ -192,8 +202,12 @@ func TestNormalizeOpenAIMessagesForPrompt_PreservesConcatenatedToolArguments(t *
 	}

 	normalized := normalizeOpenAIMessagesForPrompt(raw, "")
-	if len(normalized) != 0 {
-		t.Fatalf("expected assistant tool_call-only content omitted, got %#v", normalized)
+	if len(normalized) != 1 {
+		t.Fatalf("expected assistant tool_call-only content preserved, got %#v", normalized)
+	}
+	content, _ := normalized[0]["content"].(string)
+	if !strings.Contains(content, `{}{"query":"测试工具调用"}`) {
+		t.Fatalf("expected concatenated tool arguments preserved, got %q", content)
 	}
 }

@@ -215,7 +229,7 @@ func TestNormalizeOpenAIMessagesForPrompt_AssistantToolCallsMissingNameAreDroppe

 	normalized := normalizeOpenAIMessagesForPrompt(raw, "")
 	if len(normalized) != 0 {
-		t.Fatalf("expected assistant tool_calls without text omitted, got %#v", normalized)
+		t.Fatalf("expected assistant tool_calls without text to be dropped when name is missing, got %#v", normalized)
 	}
 }

@@ -237,8 +251,15 @@ func TestNormalizeOpenAIMessagesForPrompt_AssistantNilContentDoesNotInjectNullLi
 	}

 	normalized := normalizeOpenAIMessagesForPrompt(raw, "")
-	if len(normalized) != 0 {
-		t.Fatalf("expected nil-content assistant tool_call-only message omitted, got %#v", normalized)
+	if len(normalized) != 1 {
+		t.Fatalf("expected nil-content assistant tool_call-only message preserved, got %#v", normalized)
+	}
+	content, _ := normalized[0]["content"].(string)
+	if strings.Contains(content, "null") {
+		t.Fatalf("expected no null literal injection, got %q", content)
+	}
+	if !strings.Contains(content, "<tool_calls>") {
+		t.Fatalf("expected assistant tool history in normalized content, got %q", content)
 	}
 }

--- a/internal/adapter/openai/models_route_test.go
+++ b/internal/adapter/openai/models_route_test.go
@@ -22,6 +22,24 @@ func TestGetModelRouteDirectAndAlias(t *testing.T) {
 		}
 	})

+	t.Run("direct_expert", func(t *testing.T) {
+		req := httptest.NewRequest(http.MethodGet, "/v1/models/deepseek-expert-chat", nil)
+		rec := httptest.NewRecorder()
+		r.ServeHTTP(rec, req)
+		if rec.Code != http.StatusOK {
+			t.Fatalf("expected 200, got %d body=%s", rec.Code, rec.Body.String())
+		}
+	})
+
+	t.Run("direct_vision", func(t *testing.T) {
+		req := httptest.NewRequest(http.MethodGet, "/v1/models/deepseek-vision-chat", nil)
+		rec := httptest.NewRecorder()
+		r.ServeHTTP(rec, req)
+		if rec.Code != http.StatusOK {
+			t.Fatalf("expected 200, got %d body=%s", rec.Code, rec.Body.String())
+		}
+	})
+
 	t.Run("alias", func(t *testing.T) {
 		req := httptest.NewRequest(http.MethodGet, "/v1/models/gpt-4.1", nil)
 		rec := httptest.NewRecorder()
--- a/internal/adapter/openai/output_clean.go
+++ b/internal/adapter/openai/output_clean.go
@@ -0,0 +1,13 @@
+package openai
+
+import textclean "ds2api/internal/textclean"
+
+func cleanVisibleOutput(text string, stripReferenceMarkers bool) string { // optional reference-marker stripping followed by leaked-output sanitizing
+	if text == "" {
+		return text
+	}
+	if stripReferenceMarkers {
+		text = textclean.StripReferenceMarkers(text)
+	}
+	return sanitizeLeakedOutput(text) // applied whether or not reference markers were stripped
+}
--- a/internal/adapter/openai/prompt_build.go
+++ b/internal/adapter/openai/prompt_build.go
@@ -5,22 +5,22 @@ import (
 	"ds2api/internal/util"
 )

-func buildOpenAIFinalPrompt(messagesRaw []any, toolsRaw any, traceID string) (string, []string) {
-	return buildOpenAIFinalPromptWithPolicy(messagesRaw, toolsRaw, traceID, util.DefaultToolChoicePolicy())
+func buildOpenAIFinalPrompt(messagesRaw []any, toolsRaw any, traceID string, thinkingEnabled bool) (string, []string) {
+	return buildOpenAIFinalPromptWithPolicy(messagesRaw, toolsRaw, traceID, util.DefaultToolChoicePolicy(), thinkingEnabled)
 }

-func buildOpenAIFinalPromptWithPolicy(messagesRaw []any, toolsRaw any, traceID string, toolPolicy util.ToolChoicePolicy) (string, []string) {
+func buildOpenAIFinalPromptWithPolicy(messagesRaw []any, toolsRaw any, traceID string, toolPolicy util.ToolChoicePolicy, thinkingEnabled bool) (string, []string) {
 	messages := normalizeOpenAIMessagesForPrompt(messagesRaw, traceID)
 	toolNames := []string{}
 	if tools, ok := toolsRaw.([]any); ok && len(tools) > 0 {
 		messages, toolNames = injectToolPrompt(messages, tools, toolPolicy)
 	}
-	return deepseek.MessagesPrepare(messages), toolNames
+	return deepseek.MessagesPrepareWithThinking(messages, thinkingEnabled), toolNames
 }

 // BuildPromptForAdapter exposes the OpenAI-compatible prompt building flow so
 // other protocol adapters (for example Gemini) can reuse the same tool/history
 // normalization logic and remain behavior-compatible with chat/completions.
-func BuildPromptForAdapter(messagesRaw []any, toolsRaw any, traceID string) (string, []string) {
-	return buildOpenAIFinalPrompt(messagesRaw, toolsRaw, traceID)
+func BuildPromptForAdapter(messagesRaw []any, toolsRaw any, traceID string, thinkingEnabled bool) (string, []string) {
+	return buildOpenAIFinalPrompt(messagesRaw, toolsRaw, traceID, thinkingEnabled)
 }
--- a/internal/adapter/openai/prompt_build_test.go
+++ b/internal/adapter/openai/prompt_build_test.go
@@ -40,15 +40,18 @@ func TestBuildOpenAIFinalPrompt_HandlerPathIncludesToolRoundtripSemantics(t *tes
 		},
 	}

-	finalPrompt, toolNames := buildOpenAIFinalPrompt(messages, tools, "")
+	finalPrompt, toolNames := buildOpenAIFinalPrompt(messages, tools, "", false)
 	if len(toolNames) != 1 || toolNames[0] != "get_weather" {
 		t.Fatalf("unexpected tool names: %#v", toolNames)
 	}
 	if !strings.Contains(finalPrompt, `"condition":"sunny"`) {
 		t.Fatalf("handler finalPrompt should preserve tool output content: %q", finalPrompt)
 	}
-	if strings.Contains(finalPrompt, "[TOOL_CALL_HISTORY]") || strings.Contains(finalPrompt, "[TOOL_RESULT_HISTORY]") {
-		t.Fatalf("handler finalPrompt should not include synthetic history markers: %q", finalPrompt)
+	if !strings.Contains(finalPrompt, "<tool_calls>") {
+		t.Fatalf("handler finalPrompt should preserve assistant tool history: %q", finalPrompt)
+	}
+	if !strings.Contains(finalPrompt, "<tool_name>get_weather</tool_name>") {
+		t.Fatalf("handler finalPrompt should include tool name history: %q", finalPrompt)
 	}
 }

@@ -70,17 +73,31 @@ func TestBuildOpenAIFinalPrompt_VercelPreparePathKeepsFinalAnswerInstruction(t *
 		},
 	}

-	finalPrompt, _ := buildOpenAIFinalPrompt(messages, tools, "")
-	if !strings.Contains(finalPrompt, "After receiving a tool result, you MUST use it to produce the final answer.") {
-		t.Fatalf("vercel prepare finalPrompt missing final-answer instruction: %q", finalPrompt)
+	finalPrompt, _ := buildOpenAIFinalPrompt(messages, tools, "", false)
+	if !strings.Contains(finalPrompt, "Remember: The ONLY valid way to use tools is the <tool_calls> XML block at the end of your response.") {
+		t.Fatalf("vercel prepare finalPrompt missing final tool-call anchor instruction: %q", finalPrompt)
 	}
-	if !strings.Contains(finalPrompt, "Only call another tool when the previous result is missing required data or returned an error.") {
-		t.Fatalf("vercel prepare finalPrompt missing retry guard instruction: %q", finalPrompt)
+	if !strings.Contains(finalPrompt, "TOOL CALL FORMAT") {
+		t.Fatalf("vercel prepare finalPrompt missing xml format instruction: %q", finalPrompt)
 	}
-	if !strings.Contains(finalPrompt, "Do NOT wrap tool-call JSON in markdown/code fences") {
-		t.Fatalf("vercel prepare finalPrompt missing no-fence instruction: %q", finalPrompt)
+	if !strings.Contains(finalPrompt, "Do NOT wrap XML in markdown fences") {
+		t.Fatalf("vercel prepare finalPrompt missing no-fence xml instruction: %q", finalPrompt)
 	}
 	if strings.Contains(finalPrompt, "```json") {
-		t.Fatalf("vercel prepare finalPrompt should not require fenced json tool calls: %q", finalPrompt)
+		t.Fatalf("vercel prepare finalPrompt should not require fenced tool calls: %q", finalPrompt)
+	}
+}
+
+func TestBuildOpenAIFinalPromptWithThinkingAddsContinuationContract(t *testing.T) {
+	messages := []any{
+		map[string]any{"role": "user", "content": "继续回答上一个问题"},
+	}
+
+	finalPrompt, _ := buildOpenAIFinalPrompt(messages, nil, "", true)
+	if !strings.Contains(finalPrompt, "Continue the conversation from the full prior context") {
+		t.Fatalf("expected continuation contract in thinking prompt, got=%q", finalPrompt)
+	}
+	if !strings.Contains(finalPrompt, "final user-facing answer only in reasoning") {
+		t.Fatalf("expected visible-answer contract in thinking prompt, got=%q", finalPrompt)
 	}
 }
--- a/internal/adapter/openai/responses_embeddings_test.go
+++ b/internal/adapter/openai/responses_embeddings_test.go
@@ -156,6 +156,33 @@ func TestNormalizeResponsesInputAsMessagesFunctionCallItemPreservesConcatenatedA
 	}
 }

+func TestCollectOpenAIRefFileIDs(t *testing.T) {
+	got := collectOpenAIRefFileIDs(map[string]any{
+		"ref_file_ids": []any{"file-top", "file-dup"},
+		"attachments": []any{
+			map[string]any{"file_id": "file-attachment"},
+		},
+		"input": []any{
+			map[string]any{
+				"type": "message",
+				"content": []any{
+					map[string]any{"type": "input_file", "file_id": "file-input"},
+					map[string]any{"type": "input_file", "id": "file-dup"},
+				},
+			},
+		},
+	})
+	want := []string{"file-top", "file-dup", "file-attachment", "file-input"}
+	if len(got) != len(want) {
+		t.Fatalf("expected %d file ids, got %#v", len(want), got)
+	}
+	for i, id := range want {
+		if got[i] != id {
+			t.Fatalf("unexpected file ids at %d: got=%#v want=%#v", i, got, want)
+		}
+	}
+}
+
 func TestExtractEmbeddingInputs(t *testing.T) {
 	got := extractEmbeddingInputs([]any{"a", "b"})
 	if len(got) != 2 || got[0] != "a" || got[1] != "b" {
--- a/internal/adapter/openai/responses_handler.go
+++ b/internal/adapter/openai/responses_handler.go
@@ -1,6 +1,7 @@
 package openai

 import (
+	"ds2api/internal/toolcall"
 	"encoding/json"
 	"io"
 	"net/http"
@@ -64,11 +65,20 @@ func (h *Handler) Responses(w http.ResponseWriter, r *http.Request) {
 		return
 	}

+	r.Body = http.MaxBytesReader(w, r.Body, openAIGeneralMaxSize)
 	var req map[string]any
 	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		if strings.Contains(strings.ToLower(err.Error()), "too large") {
+			writeOpenAIError(w, http.StatusRequestEntityTooLarge, "request body too large")
+			return
+		}
 		writeOpenAIError(w, http.StatusBadRequest, "invalid json")
 		return
 	}
+	if err := h.preprocessInlineFileInputs(r.Context(), a, req); err != nil {
+		writeOpenAIInlineFileError(w, err)
+		return
+	}
 	traceID := requestTraceID(r)
 	stdReq, err := normalizeOpenAIResponsesRequest(h.Store, req, traceID)
 	if err != nil {
@@ -102,19 +112,27 @@ func (h *Handler) Responses(w http.ResponseWriter, r *http.Request) {
 		h.handleResponsesStream(w, r, resp, owner, responseID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolChoice, traceID)
 		return
 	}
-	h.handleResponsesNonStream(w, resp, owner, responseID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.ToolNames, stdReq.ToolChoice, traceID)
+	h.handleResponsesNonStream(w, resp, owner, responseID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolChoice, traceID)
 }

-func (h *Handler) handleResponsesNonStream(w http.ResponseWriter, resp *http.Response, owner, responseID, model, finalPrompt string, thinkingEnabled bool, toolNames []string, toolChoice util.ToolChoicePolicy, traceID string) {
-	defer resp.Body.Close()
+func (h *Handler) handleResponsesNonStream(w http.ResponseWriter, resp *http.Response, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolChoice util.ToolChoicePolicy, traceID string) {
+	defer func() { _ = resp.Body.Close() }()
 	if resp.StatusCode != http.StatusOK {
 		body, _ := io.ReadAll(resp.Body)
 		writeOpenAIError(w, resp.StatusCode, strings.TrimSpace(string(body)))
 		return
 	}
 	result := sse.CollectStream(resp, thinkingEnabled, true)
-	sanitizedText := sanitizeLeakedToolHistory(result.Text)
-	textParsed := util.ParseStandaloneToolCallsDetailed(sanitizedText, toolNames)
+	stripReferenceMarkers := h.compatStripReferenceMarkers()
+	sanitizedThinking := cleanVisibleOutput(result.Thinking, stripReferenceMarkers)
+	sanitizedText := cleanVisibleOutput(result.Text, stripReferenceMarkers)
+	if searchEnabled {
+		sanitizedText = replaceCitationMarkersWithLinks(sanitizedText, result.CitationLinks)
+	}
+	if writeUpstreamEmptyOutputError(w, sanitizedText, result.ContentFilter) {
+		return
+	}
+	textParsed := toolcall.ParseStandaloneToolCallsDetailed(sanitizedText, toolNames)
 	logResponsesToolPolicyRejection(traceID, toolChoice, textParsed, "text")

 	callCount := len(textParsed.Calls)
@@ -123,13 +141,13 @@ func (h *Handler) handleResponsesNonStream(w http.ResponseWriter, resp *http.Res
 		return
 	}

-	responseObj := openaifmt.BuildResponseObject(responseID, model, finalPrompt, result.Thinking, sanitizedText, toolNames)
+	responseObj := openaifmt.BuildResponseObject(responseID, model, finalPrompt, sanitizedThinking, sanitizedText, toolNames)
 	h.getResponseStore().put(owner, responseID, responseObj)
 	writeJSON(w, http.StatusOK, responseObj)
 }

 func (h *Handler) handleResponsesStream(w http.ResponseWriter, r *http.Request, resp *http.Response, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolChoice util.ToolChoicePolicy, traceID string) {
-	defer resp.Body.Close()
+	defer func() { _ = resp.Body.Close() }()
 	if resp.StatusCode != http.StatusOK {
 		body, _ := io.ReadAll(resp.Body)
 		writeOpenAIError(w, resp.StatusCode, strings.TrimSpace(string(body)))
@@ -148,6 +166,7 @@ func (h *Handler) handleResponsesStream(w http.ResponseWriter, r *http.Request,
 	}
 	bufferToolContent := len(toolNames) > 0
 	emitEarlyToolDeltas := h.toolcallFeatureMatchEnabled() && h.toolcallEarlyEmitHighConfidence()
+	stripReferenceMarkers := h.compatStripReferenceMarkers()

 	streamRuntime := newResponsesStreamRuntime(
 		w,
@@ -158,6 +177,7 @@ func (h *Handler) handleResponsesStream(w http.ResponseWriter, r *http.Request,
 		finalPrompt,
 		thinkingEnabled,
 		searchEnabled,
+		stripReferenceMarkers,
 		toolNames,
 		bufferToolContent,
 		emitEarlyToolDeltas,
@@ -185,7 +205,7 @@ func (h *Handler) handleResponsesStream(w http.ResponseWriter, r *http.Request,
 	})
 }

-func logResponsesToolPolicyRejection(traceID string, policy util.ToolChoicePolicy, parsed util.ToolCallParseResult, channel string) {
+func logResponsesToolPolicyRejection(traceID string, policy util.ToolChoicePolicy, parsed toolcall.ToolCallParseResult, channel string) {
 	rejected := filteredRejectedToolNamesForLog(parsed.RejectedToolNames)
 	if !parsed.RejectedByPolicy || len(rejected) == 0 {
 		return
--- a/internal/adapter/openai/responses_input_items.go
+++ b/internal/adapter/openai/responses_input_items.go
@@ -1,11 +1,11 @@
 package openai

 import (
-	"encoding/json"
 	"fmt"
 	"strings"

 	"ds2api/internal/config"
+	"ds2api/internal/prompt"
 )

 func normalizeResponsesInputItem(m map[string]any) map[string]any {
@@ -148,7 +148,7 @@ func normalizeResponsesInputItemWithState(m map[string]any, callNameByID map[str

 		functionPayload := map[string]any{
 			"name":      name,
-			"arguments": stringifyToolCallArguments(argsRaw),
+			"arguments": prompt.StringifyToolCallArguments(argsRaw),
 		}
 		call := map[string]any{
 			"type":     "function",
@@ -211,26 +211,3 @@ func normalizeResponsesFallbackPart(m map[string]any) string {
 	}
 	return strings.TrimSpace(fmt.Sprintf("%v", m))
 }
-
-func stringifyToolCallArguments(v any) string {
-	switch x := v.(type) {
-	case nil:
-		return "{}"
-	case string:
-		s := strings.TrimSpace(x)
-		if s == "" {
-			return "{}"
-		}
-		s = normalizeToolArgumentString(s)
-		if s == "" {
-			return "{}"
-		}
-		return s
-	default:
-		b, err := json.Marshal(x)
-		if err != nil || len(b) == 0 {
-			return "{}"
-		}
-		return string(b)
-	}
-}
--- a/internal/adapter/openai/responses_stream_runtime_core.go
+++ b/internal/adapter/openai/responses_stream_runtime_core.go
@@ -1,6 +1,7 @@
 package openai

 import (
+	"ds2api/internal/toolcall"
 	"net/http"
 	"strings"

@@ -23,8 +24,9 @@ type responsesStreamRuntime struct {
 	traceID     string
 	toolChoice  util.ToolChoicePolicy

-	thinkingEnabled bool
-	searchEnabled   bool
+	thinkingEnabled       bool
+	searchEnabled         bool
+	stripReferenceMarkers bool

 	bufferToolContent    bool
 	emitEarlyToolDeltas  bool
@@ -62,6 +64,7 @@ func newResponsesStreamRuntime(
 	finalPrompt string,
 	thinkingEnabled bool,
 	searchEnabled bool,
+	stripReferenceMarkers bool,
 	toolNames []string,
 	bufferToolContent bool,
 	emitEarlyToolDeltas bool,
@@ -70,40 +73,65 @@ func newResponsesStreamRuntime(
 	persistResponse func(obj map[string]any),
 ) *responsesStreamRuntime {
 	return &responsesStreamRuntime{
-		w:                   w,
-		rc:                  rc,
-		canFlush:            canFlush,
-		responseID:          responseID,
-		model:               model,
-		finalPrompt:         finalPrompt,
-		thinkingEnabled:     thinkingEnabled,
-		searchEnabled:       searchEnabled,
-		toolNames:           toolNames,
-		bufferToolContent:   bufferToolContent,
-		emitEarlyToolDeltas: emitEarlyToolDeltas,
-		streamToolCallIDs:   map[int]string{},
-		functionItemIDs:     map[int]string{},
-		functionOutputIDs:   map[int]int{},
-		functionArgs:        map[int]string{},
-		functionDone:        map[int]bool{},
-		functionAdded:       map[int]bool{},
-		functionNames:       map[int]string{},
-		messageOutputID:     -1,
-		toolChoice:          toolChoice,
-		traceID:             traceID,
-		persistResponse:     persistResponse,
+		w:                     w,
+		rc:                    rc,
+		canFlush:              canFlush,
+		responseID:            responseID,
+		model:                 model,
+		finalPrompt:           finalPrompt,
+		thinkingEnabled:       thinkingEnabled,
+		searchEnabled:         searchEnabled,
+		stripReferenceMarkers: stripReferenceMarkers,
+		toolNames:             toolNames,
+		bufferToolContent:     bufferToolContent,
+		emitEarlyToolDeltas:   emitEarlyToolDeltas,
+		streamToolCallIDs:     map[int]string{},
+		functionItemIDs:       map[int]string{},
+		functionOutputIDs:     map[int]int{},
+		functionArgs:          map[int]string{},
+		functionDone:          map[int]bool{},
+		functionAdded:         map[int]bool{},
+		functionNames:         map[int]string{},
+		messageOutputID:       -1,
+		toolChoice:            toolChoice,
+		traceID:               traceID,
+		persistResponse:       persistResponse,
 	}
 }

+func (s *responsesStreamRuntime) failResponse(message, code string) {
+	s.failed = true
+	failedResp := map[string]any{
+		"id":          s.responseID,
+		"type":        "response",
+		"object":      "response",
+		"model":       s.model,
+		"status":      "failed",
+		"output":      []any{},
+		"output_text": "",
+		"error": map[string]any{
+			"message": message,
+			"type":    "invalid_request_error",
+			"code":    code,
+			"param":   nil,
+		},
+	}
+	if s.persistResponse != nil {
+		s.persistResponse(failedResp)
+	}
+	s.sendEvent("response.failed", openaifmt.BuildResponsesFailedPayload(s.responseID, s.model, message, code))
+	s.sendDone()
+}
+
 func (s *responsesStreamRuntime) finalize() {
 	finalThinking := s.thinking.String()
-	finalText := sanitizeLeakedToolHistory(s.text.String())
+	finalText := cleanVisibleOutput(s.text.String(), s.stripReferenceMarkers)

 	if s.bufferToolContent {
 		s.processToolStreamEvents(flushToolSieve(&s.sieve, s.toolNames), true)
 	}

-	textParsed := util.ParseStandaloneToolCallsDetailed(finalText, s.toolNames)
+	textParsed := toolcall.ParseStandaloneToolCallsDetailed(finalText, s.toolNames)
 	detected := textParsed.Calls
 	s.logToolPolicyRejections(textParsed)

@@ -117,28 +145,16 @@ func (s *responsesStreamRuntime) finalize() {
 	s.closeMessageItem()

 	if s.toolChoice.IsRequired() && len(detected) == 0 {
-		s.failed = true
-		message := "tool_choice requires at least one valid tool call."
-		failedResp := map[string]any{
-			"id":          s.responseID,
-			"type":        "response",
-			"object":      "response",
-			"model":       s.model,
-			"status":      "failed",
-			"output":      []any{},
-			"output_text": "",
-			"error": map[string]any{
-				"message": message,
-				"type":    "invalid_request_error",
-				"code":    "tool_choice_violation",
-				"param":   nil,
-			},
+		s.failResponse("tool_choice requires at least one valid tool call.", "tool_choice_violation")
+		return
+	}
+	if len(detected) == 0 && strings.TrimSpace(finalText) == "" {
+		code := "upstream_empty_output"
+		message := "Upstream model returned empty output."
+		if finalThinking != "" {
+			message = "Upstream model returned reasoning without visible output."
 		}
-		if s.persistResponse != nil {
-			s.persistResponse(failedResp)
-		}
-		s.sendEvent("response.failed", openaifmt.BuildResponsesFailedPayload(s.responseID, s.model, message, "tool_choice_violation"))
-		s.sendDone()
+		s.failResponse(message, code)
 		return
 	}
 	s.closeIncompleteFunctionItems()
@@ -151,8 +167,8 @@ func (s *responsesStreamRuntime) finalize() {
 	s.sendDone()
 }

-func (s *responsesStreamRuntime) logToolPolicyRejections(textParsed util.ToolCallParseResult) {
-	logRejected := func(parsed util.ToolCallParseResult, channel string) {
+func (s *responsesStreamRuntime) logToolPolicyRejections(textParsed toolcall.ToolCallParseResult) {
+	logRejected := func(parsed toolcall.ToolCallParseResult, channel string) {
 		rejected := filteredRejectedToolNamesForLog(parsed.RejectedToolNames)
 		if !parsed.RejectedByPolicy || len(rejected) == 0 {
 			return
@@ -178,10 +194,11 @@ func (s *responsesStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Pa

 	contentSeen := false
 	for _, p := range parsed.Parts {
-		if p.Text == "" {
+		cleanedText := cleanVisibleOutput(p.Text, s.stripReferenceMarkers)
+		if cleanedText == "" {
 			continue
 		}
-		if p.Type != "thinking" && s.searchEnabled && sse.IsCitation(p.Text) {
+		if p.Type != "thinking" && s.searchEnabled && sse.IsCitation(cleanedText) {
 			continue
 		}
 		contentSeen = true
@@ -189,21 +206,25 @@ func (s *responsesStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Pa
 			if !s.thinkingEnabled {
 				continue
 			}
-			s.thinking.WriteString(p.Text)
-			s.sendEvent("response.reasoning.delta", openaifmt.BuildResponsesReasoningDeltaPayload(s.responseID, p.Text))
+			trimmed := sse.TrimContinuationOverlap(s.thinking.String(), cleanedText)
+			if trimmed == "" {
+				continue
+			}
+			s.thinking.WriteString(trimmed)
+			s.sendEvent("response.reasoning.delta", openaifmt.BuildResponsesReasoningDeltaPayload(s.responseID, trimmed))
 			continue
 		}

-		cleanedText := sanitizeLeakedToolHistory(p.Text)
-		if cleanedText == "" {
+		trimmed := sse.TrimContinuationOverlap(s.text.String(), cleanedText)
+		if trimmed == "" {
 			continue
 		}
-		s.text.WriteString(cleanedText)
+		s.text.WriteString(trimmed)
 		if !s.bufferToolContent {
-			s.emitTextDelta(cleanedText)
+			s.emitTextDelta(trimmed)
 			continue
 		}
-		s.processToolStreamEvents(processToolSieveChunk(&s.sieve, cleanedText, s.toolNames), true)
+		s.processToolStreamEvents(processToolSieveChunk(&s.sieve, trimmed, s.toolNames), true)
 	}

 	return streamengine.ParsedDecision{ContentSeen: contentSeen}
--- a/internal/adapter/openai/responses_stream_runtime_events.go
+++ b/internal/adapter/openai/responses_stream_runtime_events.go
@@ -48,7 +48,7 @@ func (s *responsesStreamRuntime) processToolStreamEvents(events []toolStreamEven
 			if !s.emitEarlyToolDeltas {
 				continue
 			}
-			filtered := filterIncrementalToolCallDeltasByAllowed(evt.ToolCallDeltas, s.toolNames, s.functionNames)
+			filtered := filterIncrementalToolCallDeltasByAllowed(evt.ToolCallDeltas, s.functionNames)
 			if len(filtered) == 0 {
 				continue
 			}
--- a/internal/adapter/openai/responses_stream_runtime_toolcalls.go
+++ b/internal/adapter/openai/responses_stream_runtime_toolcalls.go
@@ -1,11 +1,11 @@
 package openai

 import (
+	"ds2api/internal/toolcall"
 	"encoding/json"
 	"strings"

 	openaifmt "ds2api/internal/format/openai"
-	"ds2api/internal/util"

 	"github.com/google/uuid"
 )
@@ -69,7 +69,7 @@ func (s *responsesStreamRuntime) ensureMessageContentPartAdded() {
 }

 func (s *responsesStreamRuntime) emitTextDelta(content string) {
-	if strings.TrimSpace(content) == "" {
+	if content == "" {
 		return
 	}
 	s.ensureMessageContentPartAdded()
@@ -208,7 +208,7 @@ func (s *responsesStreamRuntime) emitFunctionCallDeltaEvents(deltas []toolCallDe
 	}
 }

-func (s *responsesStreamRuntime) emitFunctionCallDoneEvents(calls []util.ParsedToolCall) {
+func (s *responsesStreamRuntime) emitFunctionCallDoneEvents(calls []toolcall.ParsedToolCall) {
 	for idx, tc := range calls {
 		if strings.TrimSpace(tc.Name) == "" {
 			continue
--- a/internal/adapter/openai/responses_stream_runtime_toolcalls_finalize.go
+++ b/internal/adapter/openai/responses_stream_runtime_toolcalls_finalize.go
@@ -1,12 +1,12 @@
 package openai

 import (
+	"ds2api/internal/toolcall"
 	"encoding/json"
 	"sort"
 	"strings"

 	openaifmt "ds2api/internal/format/openai"
-	"ds2api/internal/util"
 )

 func (s *responsesStreamRuntime) closeIncompleteFunctionItems() {
@@ -57,7 +57,7 @@ func (s *responsesStreamRuntime) closeIncompleteFunctionItems() {
 	}
 }

-func (s *responsesStreamRuntime) buildCompletedResponseObject(finalThinking, finalText string, calls []util.ParsedToolCall) map[string]any {
+func (s *responsesStreamRuntime) buildCompletedResponseObject(finalThinking, finalText string, calls []toolcall.ParsedToolCall) map[string]any {
 	type indexedItem struct {
 		index int
 		item  map[string]any
@@ -83,13 +83,13 @@ func (s *responsesStreamRuntime) buildCompletedResponseObject(finalThinking, fin
 		})
 	} else if len(calls) == 0 {
 		content := make([]map[string]any, 0, 2)
-		if strings.TrimSpace(finalThinking) != "" {
+		if finalThinking != "" {
 			content = append(content, map[string]any{
 				"type": "reasoning",
 				"text": finalThinking,
 			})
 		}
-		if strings.TrimSpace(finalText) != "" {
+		if finalText != "" {
 			content = append(content, map[string]any{
 				"type": "output_text",
 				"text": finalText,
@@ -136,10 +136,10 @@ func (s *responsesStreamRuntime) buildCompletedResponseObject(finalThinking, fin
 	}

 	outputText := s.visibleText.String()
-	if strings.TrimSpace(outputText) == "" && len(calls) == 0 {
-		if strings.TrimSpace(finalText) != "" {
+	if outputText == "" && len(calls) == 0 {
+		if finalText != "" {
 			outputText = finalText
-		} else if strings.TrimSpace(finalThinking) != "" {
+		} else if finalThinking != "" {
 			outputText = finalThinking
 		}
 	}
--- a/internal/adapter/openai/responses_stream_test.go
+++ b/internal/adapter/openai/responses_stream_test.go
@@ -12,149 +12,6 @@ import (
 	"ds2api/internal/util"
 )

-func TestHandleResponsesStreamToolCallsHideRawOutputTextInCompleted(t *testing.T) {
-	h := &Handler{}
-	req := httptest.NewRequest(http.MethodPost, "/v1/responses", nil)
-	rec := httptest.NewRecorder()
-
-	sseLine := func(v string) string {
-		b, _ := json.Marshal(map[string]any{
-			"p": "response/content",
-			"v": v,
-		})
-		return "data: " + string(b) + "\n"
-	}
-
-	rawToolJSON := `{"tool_calls":[{"name":"read_file","input":{"path":"README.MD"}}]}`
-	streamBody := sseLine(rawToolJSON) + "data: [DONE]\n"
-	resp := &http.Response{
-		StatusCode: http.StatusOK,
-		Body:       io.NopCloser(strings.NewReader(streamBody)),
-	}
-
-	h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", false, false, []string{"read_file"}, util.DefaultToolChoicePolicy(), "")
-
-	completed, ok := extractSSEEventPayload(rec.Body.String(), "response.completed")
-	if !ok {
-		t.Fatalf("expected response.completed event, body=%s", rec.Body.String())
-	}
-	responseObj, _ := completed["response"].(map[string]any)
-	outputText, _ := responseObj["output_text"].(string)
-	if outputText != "" {
-		t.Fatalf("expected empty output_text for tool_calls response, got output_text=%q", outputText)
-	}
-	output, _ := responseObj["output"].([]any)
-	if len(output) == 0 {
-		t.Fatalf("expected structured output entries, got %#v", responseObj["output"])
-	}
-	hasFunctionCall := false
-	hasLegacyWrapper := false
-	for _, item := range output {
-		m, _ := item.(map[string]any)
-		if m == nil {
-			continue
-		}
-		if m["type"] == "function_call" {
-			hasFunctionCall = true
-		}
-		if m["type"] == "tool_calls" {
-			hasLegacyWrapper = true
-		}
-	}
-	if !hasFunctionCall {
-		t.Fatalf("expected function_call item, got %#v", responseObj["output"])
-	}
-	if hasLegacyWrapper {
-		t.Fatalf("did not expect legacy tool_calls wrapper, got %#v", responseObj["output"])
-	}
-	if strings.Contains(outputText, `"tool_calls"`) {
-		t.Fatalf("raw tool_calls JSON leaked in output_text: %q", outputText)
-	}
-}
-
-func TestHandleResponsesStreamUsesOfficialOutputItemEvents(t *testing.T) {
-	h := &Handler{}
-	req := httptest.NewRequest(http.MethodPost, "/v1/responses", nil)
-	rec := httptest.NewRecorder()
-
-	sseLine := func(v string) string {
-		b, _ := json.Marshal(map[string]any{
-			"p": "response/content",
-			"v": v,
-		})
-		return "data: " + string(b) + "\n"
-	}
-
-	streamBody := sseLine(`{"tool_calls":[{"name":"read_file","input":{"path":"README.MD"}}]}`) + "data: [DONE]\n"
-	resp := &http.Response{
-		StatusCode: http.StatusOK,
-		Body:       io.NopCloser(strings.NewReader(streamBody)),
-	}
-
-	h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", false, false, []string{"read_file"}, util.DefaultToolChoicePolicy(), "")
-	body := rec.Body.String()
-	if !strings.Contains(body, "event: response.output_item.added") {
-		t.Fatalf("expected response.output_item.added event, body=%s", body)
-	}
-	if !strings.Contains(body, "event: response.output_item.done") {
-		t.Fatalf("expected response.output_item.done event, body=%s", body)
-	}
-	if !strings.Contains(body, "event: response.function_call_arguments.done") {
-		t.Fatalf("expected response.function_call_arguments.done event, body=%s", body)
-	}
-	if strings.Contains(body, "event: response.output_tool_call.delta") || strings.Contains(body, "event: response.output_tool_call.done") {
-		t.Fatalf("legacy response.output_tool_call.* event must not appear, body=%s", body)
-	}
-
-	addedPayloads := extractAllSSEEventPayloads(body, "response.output_item.added")
-	hasFunctionCallAdded := false
-	for _, payload := range addedPayloads {
-		item, _ := payload["item"].(map[string]any)
-		if item == nil || asString(item["type"]) != "function_call" {
-			continue
-		}
-		hasFunctionCallAdded = true
-		if asString(item["arguments"]) != "" {
-			t.Fatalf("expected in-progress function_call.arguments to start empty string, got %#v", item["arguments"])
-		}
-	}
-	if !hasFunctionCallAdded {
-		t.Fatalf("expected function_call output_item.added payload, body=%s", body)
-	}
-
-	donePayload, ok := extractSSEEventPayload(body, "response.function_call_arguments.done")
-	if !ok {
-		t.Fatalf("expected to parse response.function_call_arguments.done payload, body=%s", body)
-	}
-	doneCallID := strings.TrimSpace(asString(donePayload["call_id"]))
-	if doneCallID == "" {
-		t.Fatalf("expected non-empty call_id in done payload, payload=%#v", donePayload)
-	}
-	completed, ok := extractSSEEventPayload(body, "response.completed")
-	if !ok {
-		t.Fatalf("expected response.completed payload, body=%s", body)
-	}
-	responseObj, _ := completed["response"].(map[string]any)
-	output, _ := responseObj["output"].([]any)
-	var completedCallID string
-	for _, item := range output {
-		m, _ := item.(map[string]any)
-		if m == nil || m["type"] != "function_call" {
-			continue
-		}
-		completedCallID = strings.TrimSpace(asString(m["call_id"]))
-		if completedCallID != "" {
-			break
-		}
-	}
-	if completedCallID == "" {
-		t.Fatalf("expected function_call.call_id in completed output, output=%#v", output)
-	}
-	if completedCallID != doneCallID {
-		t.Fatalf("expected completed call_id to match stream done call_id, done=%q completed=%q", doneCallID, completedCallID)
-	}
-}
-
 func TestHandleResponsesStreamDoesNotEmitReasoningTextCompatEvents(t *testing.T) {
 	h := &Handler{}
 	req := httptest.NewRequest(http.MethodPost, "/v1/responses", nil)
@@ -181,51 +38,6 @@ func TestHandleResponsesStreamDoesNotEmitReasoningTextCompatEvents(t *testing.T)
 	}
 }

-func TestHandleResponsesStreamMultiToolCallKeepsNameAndCallIDAligned(t *testing.T) {
-	h := &Handler{}
-	req := httptest.NewRequest(http.MethodPost, "/v1/responses", nil)
-	rec := httptest.NewRecorder()
-
-	sseLine := func(v string) string {
-		b, _ := json.Marshal(map[string]any{
-			"p": "response/content",
-			"v": v,
-		})
-		return "data: " + string(b) + "\n"
-	}
-
-	streamBody := sseLine(`{"tool_calls":[{"name":"search_web","input":{"query":"latest ai news"}},`) +
-		sseLine(`{"name":"eval_javascript","input":{"code":"1+1"}}]}`) +
-		"data: [DONE]\n"
-	resp := &http.Response{
-		StatusCode: http.StatusOK,
-		Body:       io.NopCloser(strings.NewReader(streamBody)),
-	}
-
-	h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", false, false, []string{"search_web", "eval_javascript"}, util.DefaultToolChoicePolicy(), "")
-
-	body := rec.Body.String()
-	donePayloads := extractAllSSEEventPayloads(body, "response.function_call_arguments.done")
-	if len(donePayloads) != 2 {
-		t.Fatalf("expected two response.function_call_arguments.done events, got %d body=%s", len(donePayloads), body)
-	}
-	seenNames := map[string]string{}
-	for _, payload := range donePayloads {
-		name := strings.TrimSpace(asString(payload["name"]))
-		callID := strings.TrimSpace(asString(payload["call_id"]))
-		if name != "search_web" && name != "eval_javascript" {
-			t.Fatalf("unexpected tool name in done payload: %#v", payload)
-		}
-		if callID == "" {
-			t.Fatalf("expected non-empty call_id in done payload: %#v", payload)
-		}
-		seenNames[name] = callID
-	}
-	if seenNames["search_web"] == seenNames["eval_javascript"] {
-		t.Fatalf("expected distinct call_id per tool, got %#v", seenNames)
-	}
-}
-
 func TestHandleResponsesStreamEmitsOutputTextDoneBeforeContentPartDone(t *testing.T) {
 	h := &Handler{}
 	req := httptest.NewRequest(http.MethodPost, "/v1/responses", nil)
@@ -297,123 +109,6 @@ func TestHandleResponsesStreamOutputTextDeltaCarriesItemIndexes(t *testing.T) {
 	}
 }

-func TestHandleResponsesStreamThinkingAndMixedToolExampleEmitsFunctionCall(t *testing.T) {
-	h := &Handler{}
-	req := httptest.NewRequest(http.MethodPost, "/v1/responses", nil)
-	rec := httptest.NewRecorder()
-
-	sseLine := func(path, value string) string {
-		b, _ := json.Marshal(map[string]any{
-			"p": path,
-			"v": value,
-		})
-		return "data: " + string(b) + "\n"
-	}
-
-	streamBody := sseLine("response/thinking_content", "thinking...") +
-		sseLine("response/content", "先读取文件。") +
-		sseLine("response/content", `{"tool_calls":[{"name":"read_file","input":{"path":"README.MD"}}]}`) +
-		"data: [DONE]\n"
-	resp := &http.Response{
-		StatusCode: http.StatusOK,
-		Body:       io.NopCloser(strings.NewReader(streamBody)),
-	}
-
-	h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-reasoner", "prompt", true, false, []string{"read_file"}, util.DefaultToolChoicePolicy(), "")
-
-	addedPayloads := extractAllSSEEventPayloads(rec.Body.String(), "response.output_item.added")
-	if len(addedPayloads) < 1 {
-		t.Fatalf("expected at least one output_item.added event, got %d body=%s", len(addedPayloads), rec.Body.String())
-	}
-
-	completedPayload, ok := extractSSEEventPayload(rec.Body.String(), "response.completed")
-	if !ok {
-		t.Fatalf("expected response.completed payload, body=%s", rec.Body.String())
-	}
-	responseObj, _ := completedPayload["response"].(map[string]any)
-	output, _ := responseObj["output"].([]any)
-	hasMessage := false
-	hasFunctionCall := false
-	for _, item := range output {
-		m, _ := item.(map[string]any)
-		if m == nil {
-			continue
-		}
-		if asString(m["type"]) == "message" {
-			hasMessage = true
-		}
-		if asString(m["type"]) == "function_call" {
-			hasFunctionCall = true
-		}
-	}
-	if !hasMessage {
-		t.Fatalf("expected message output for mixed prose tool example, output=%#v", output)
-	}
-	if !hasFunctionCall {
-		t.Fatalf("expected function_call output for mixed prose tool example, output=%#v", output)
-	}
-}
-
-func TestHandleResponsesStreamToolChoiceNoneStillAllowsFunctionCall(t *testing.T) {
-	h := &Handler{}
-	req := httptest.NewRequest(http.MethodPost, "/v1/responses", nil)
-	rec := httptest.NewRecorder()
-
-	sseLine := func(v string) string {
-		b, _ := json.Marshal(map[string]any{
-			"p": "response/content",
-			"v": v,
-		})
-		return "data: " + string(b) + "\n"
-	}
-
-	streamBody := sseLine(`{"tool_calls":[{"name":"read_file","input":{"path":"README.MD"}}]}`) + "data: [DONE]\n"
-	resp := &http.Response{
-		StatusCode: http.StatusOK,
-		Body:       io.NopCloser(strings.NewReader(streamBody)),
-	}
-	policy := util.ToolChoicePolicy{Mode: util.ToolChoiceNone}
-
-	h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", false, false, nil, policy, "")
-	body := rec.Body.String()
-	if !strings.Contains(body, "event: response.function_call_arguments.done") {
-		t.Fatalf("expected function_call events for tool_choice=none, body=%s", body)
-	}
-}
-
-func TestHandleResponsesStreamMalformedToolJSONFallsBackToText(t *testing.T) {
-	h := &Handler{}
-	req := httptest.NewRequest(http.MethodPost, "/v1/responses", nil)
-	rec := httptest.NewRecorder()
-
-	sseLine := func(v string) string {
-		b, _ := json.Marshal(map[string]any{
-			"p": "response/content",
-			"v": v,
-		})
-		return "data: " + string(b) + "\n"
-	}
-
-	// invalid JSON (NaN) should remain plain text in strict mode.
-	streamBody := sseLine(`{"tool_calls":[{"name":"read_file","input":{"path":"README.MD"},"x":NaN}]}`) + "data: [DONE]\n"
-	resp := &http.Response{
-		StatusCode: http.StatusOK,
-		Body:       io.NopCloser(strings.NewReader(streamBody)),
-	}
-
-	h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", false, false, []string{"read_file"}, util.DefaultToolChoicePolicy(), "")
-	body := rec.Body.String()
-	if strings.Contains(body, "event: response.function_call_arguments.delta") || strings.Contains(body, "event: response.function_call_arguments.done") {
-		t.Fatalf("did not expect function_call events for malformed payload in strict mode, body=%s", body)
-	}
-	if !strings.Contains(body, "event: response.output_text.delta") {
-		t.Fatalf("expected response.output_text.delta for malformed payload, body=%s", body)
-	}
-	if !strings.Contains(body, "event: response.completed") {
-		t.Fatalf("expected response.completed event, body=%s", body)
-	}
-}
-
 func TestHandleResponsesStreamRequiredToolChoiceFailure(t *testing.T) {
 	h := &Handler{}
 	req := httptest.NewRequest(http.MethodPost, "/v1/responses", nil)
@@ -448,7 +143,7 @@ func TestHandleResponsesStreamRequiredToolChoiceFailure(t *testing.T) {
 	}
 }

-func TestHandleResponsesStreamRequiredToolChoiceIgnoresThinkingToolPayload(t *testing.T) {
+func TestHandleResponsesStreamFailsWhenUpstreamHasOnlyThinking(t *testing.T) {
 	h := &Handler{}
 	req := httptest.NewRequest(http.MethodPost, "/v1/responses", nil)
 	rec := httptest.NewRecorder()
@@ -461,53 +156,13 @@ func TestHandleResponsesStreamRequiredToolChoiceIgnoresThinkingToolPayload(t *te
 		return "data: " + string(b) + "\n"
 	}

-	streamBody := sseLine("response/thinking_content", `{"tool_calls":[{"name":"read_file","input":{"path":"README.MD"}}]}`) +
-		sseLine("response/content", "plain text only") +
-		"data: [DONE]\n"
+	streamBody := sseLine("response/thinking_content", "Only thinking") + "data: [DONE]\n"
 	resp := &http.Response{
 		StatusCode: http.StatusOK,
 		Body:       io.NopCloser(strings.NewReader(streamBody)),
 	}

-	policy := util.ToolChoicePolicy{
-		Mode:    util.ToolChoiceRequired,
-		Allowed: map[string]struct{}{"read_file": {}},
-	}
-
-	h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", true, false, []string{"read_file"}, policy, "")
-	body := rec.Body.String()
-	if !strings.Contains(body, "event: response.failed") {
-		t.Fatalf("expected response.failed event for required tool_choice violation, body=%s", body)
-	}
-	if strings.Contains(body, "event: response.completed") {
-		t.Fatalf("did not expect response.completed after failure, body=%s", body)
-	}
-}
-
-func TestHandleResponsesStreamRequiredMalformedToolPayloadFails(t *testing.T) {
-	h := &Handler{}
-	req := httptest.NewRequest(http.MethodPost, "/v1/responses", nil)
-	rec := httptest.NewRecorder()
-
-	sseLine := func(v string) string {
-		b, _ := json.Marshal(map[string]any{
-			"p": "response/content",
-			"v": v,
-		})
-		return "data: " + string(b) + "\n"
-	}
-
-	streamBody := sseLine(`{"tool_calls":[{"name":"read_file","input":{"path":"README.MD"},"x":NaN}]}`) + "data: [DONE]\n"
-	resp := &http.Response{
-		StatusCode: http.StatusOK,
-		Body:       io.NopCloser(strings.NewReader(streamBody)),
-	}
-	policy := util.ToolChoicePolicy{
-		Mode:    util.ToolChoiceRequired,
-		Allowed: map[string]struct{}{"read_file": {}},
-	}
-
-	h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", false, false, []string{"read_file"}, policy, "")
+	h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-reasoner", "prompt", true, false, nil, util.DefaultToolChoicePolicy(), "")

 	body := rec.Body.String()
 	if !strings.Contains(body, "event: response.failed") {
@@ -516,31 +171,13 @@ func TestHandleResponsesStreamRequiredMalformedToolPayloadFails(t *testing.T) {
 	if strings.Contains(body, "event: response.completed") {
 		t.Fatalf("did not expect response.completed, body=%s", body)
 	}
-}
-
-func TestHandleResponsesStreamAllowsUnknownToolName(t *testing.T) {
-	h := &Handler{}
-	req := httptest.NewRequest(http.MethodPost, "/v1/responses", nil)
-	rec := httptest.NewRecorder()
-
-	sseLine := func(v string) string {
-		b, _ := json.Marshal(map[string]any{
-			"p": "response/content",
-			"v": v,
-		})
-		return "data: " + string(b) + "\n"
+	payload, ok := extractSSEEventPayload(body, "response.failed")
+	if !ok {
+		t.Fatalf("expected response.failed payload, body=%s", body)
 	}
-
-	streamBody := sseLine(`{"tool_calls":[{"name":"not_in_schema","input":{"q":"go"}}]}`) + "data: [DONE]\n"
-	resp := &http.Response{
-		StatusCode: http.StatusOK,
-		Body:       io.NopCloser(strings.NewReader(streamBody)),
-	}
-
-	h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", false, false, []string{"read_file"}, util.DefaultToolChoicePolicy(), "")
-	body := rec.Body.String()
-	if !strings.Contains(body, "event: response.function_call_arguments.done") {
-		t.Fatalf("expected function_call events for unknown tool, body=%s", body)
+	errObj, _ := payload["error"].(map[string]any)
+	if asString(errObj["code"]) != "upstream_empty_output" {
+		t.Fatalf("expected code=upstream_empty_output, got %#v", payload)
 	}
 }

@@ -559,7 +196,7 @@ func TestHandleResponsesNonStreamRequiredToolChoiceViolation(t *testing.T) {
 		Allowed: map[string]struct{}{"read_file": {}},
 	}

-	h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", false, []string{"read_file"}, policy, "")
+	h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", false, false, []string{"read_file"}, policy, "")
 	if rec.Code != http.StatusUnprocessableEntity {
 		t.Fatalf("expected 422 for required tool_choice violation, got %d body=%s", rec.Code, rec.Body.String())
 	}
@@ -586,7 +223,7 @@ func TestHandleResponsesNonStreamRequiredToolChoiceIgnoresThinkingToolPayload(t
 		Allowed: map[string]struct{}{"read_file": {}},
 	}

-	h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", true, []string{"read_file"}, policy, "")
+	h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", true, false, []string{"read_file"}, policy, "")
 	if rec.Code != http.StatusUnprocessableEntity {
 		t.Fatalf("expected 422 for required tool_choice violation, got %d body=%s", rec.Code, rec.Body.String())
 	}
@@ -597,33 +234,69 @@ func TestHandleResponsesNonStreamRequiredToolChoiceIgnoresThinkingToolPayload(t
 	}
 }

-func TestHandleResponsesNonStreamToolChoiceNoneStillAllowsFunctionCall(t *testing.T) {
+func TestHandleResponsesNonStreamReturns429WhenUpstreamOutputEmpty(t *testing.T) {
 	h := &Handler{}
 	rec := httptest.NewRecorder()
 	resp := &http.Response{
 		StatusCode: http.StatusOK,
 		Body: io.NopCloser(strings.NewReader(
-			`data: {"p":"response/content","v":"{\"tool_calls\":[{\"name\":\"read_file\",\"input\":{\"path\":\"README.MD\"}}]}"}` + "\n" +
+			`data: {"p":"response/content","v":""}` + "\n" +
 				`data: [DONE]` + "\n",
 		)),
 	}
-	policy := util.ToolChoicePolicy{Mode: util.ToolChoiceNone}

-	h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", false, nil, policy, "")
-	if rec.Code != http.StatusOK {
-		t.Fatalf("expected 200 for tool_choice=none handling, got %d body=%s", rec.Code, rec.Body.String())
+	h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", false, false, nil, util.DefaultToolChoicePolicy(), "")
+	if rec.Code != http.StatusTooManyRequests {
+		t.Fatalf("expected 429 for empty upstream output, got %d body=%s", rec.Code, rec.Body.String())
 	}
 	out := decodeJSONBody(t, rec.Body.String())
-	output, _ := out["output"].([]any)
-	foundFunctionCall := false
-	for _, item := range output {
-		m, _ := item.(map[string]any)
-		if m != nil && m["type"] == "function_call" {
-			foundFunctionCall = true
-		}
+	errObj, _ := out["error"].(map[string]any)
+	if asString(errObj["code"]) != "upstream_empty_output" {
+		t.Fatalf("expected code=upstream_empty_output, got %#v", out)
 	}
-	if !foundFunctionCall {
-		t.Fatalf("expected function_call output item for tool_choice=none, got %#v", output)
+}
+
+func TestHandleResponsesNonStreamReturnsContentFilterErrorWhenUpstreamFilteredWithoutOutput(t *testing.T) {
+	h := &Handler{}
+	rec := httptest.NewRecorder()
+	resp := &http.Response{
+		StatusCode: http.StatusOK,
+		Body: io.NopCloser(strings.NewReader(
+			`data: {"code":"content_filter"}` + "\n" +
+				`data: [DONE]` + "\n",
+		)),
+	}
+
+	h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-chat", "prompt", false, false, nil, util.DefaultToolChoicePolicy(), "")
+	if rec.Code != http.StatusBadRequest {
+		t.Fatalf("expected 400 for filtered empty upstream output, got %d body=%s", rec.Code, rec.Body.String())
+	}
+	out := decodeJSONBody(t, rec.Body.String())
+	errObj, _ := out["error"].(map[string]any)
+	if asString(errObj["code"]) != "content_filter" {
+		t.Fatalf("expected code=content_filter, got %#v", out)
+	}
+}
+
+func TestHandleResponsesNonStreamReturns429WhenUpstreamHasOnlyThinking(t *testing.T) {
+	h := &Handler{}
+	rec := httptest.NewRecorder()
+	resp := &http.Response{
+		StatusCode: http.StatusOK,
+		Body: io.NopCloser(strings.NewReader(
+			`data: {"p":"response/thinking_content","v":"Only thinking"}` + "\n" +
+				`data: [DONE]` + "\n",
+		)),
+	}
+
+	h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-reasoner", "prompt", true, false, nil, util.DefaultToolChoicePolicy(), "")
+	if rec.Code != http.StatusTooManyRequests {
+		t.Fatalf("expected 429 for thinking-only upstream output, got %d body=%s", rec.Code, rec.Body.String())
+	}
+	out := decodeJSONBody(t, rec.Body.String())
+	errObj, _ := out["error"].(map[string]any)
+	if asString(errObj["code"]) != "upstream_empty_output" {
+		t.Fatalf("expected code=upstream_empty_output, got %#v", out)
 	}
 }

@@ -652,30 +325,3 @@ func extractSSEEventPayload(body, targetEvent string) (map[string]any, bool) {
 	}
 	return nil, false
 }
-
-func extractAllSSEEventPayloads(body, targetEvent string) []map[string]any {
-	scanner := bufio.NewScanner(strings.NewReader(body))
-	matched := false
-	out := make([]map[string]any, 0, 2)
-	for scanner.Scan() {
-		line := strings.TrimSpace(scanner.Text())
-		if strings.HasPrefix(line, "event: ") {
-			evt := strings.TrimSpace(strings.TrimPrefix(line, "event: "))
-			matched = evt == targetEvent
-			continue
-		}
-		if !matched || !strings.HasPrefix(line, "data: ") {
-			continue
-		}
-		raw := strings.TrimSpace(strings.TrimPrefix(line, "data: "))
-		if raw == "" || raw == "[DONE]" {
-			continue
-		}
-		var payload map[string]any
-		if err := json.Unmarshal([]byte(raw), &payload); err != nil {
-			continue
-		}
-		out = append(out, payload)
-	}
-	return out
-}
--- a/internal/adapter/openai/standard_request.go
+++ b/internal/adapter/openai/standard_request.go
@@ -12,11 +12,11 @@ func normalizeOpenAIChatRequest(store ConfigReader, req map[string]any, traceID
 	model, _ := req["model"].(string)
 	messagesRaw, _ := req["messages"].([]any)
 	if strings.TrimSpace(model) == "" || len(messagesRaw) == 0 {
-		return util.StandardRequest{}, fmt.Errorf("Request must include 'model' and 'messages'.")
+		return util.StandardRequest{}, fmt.Errorf("request must include 'model' and 'messages'")
 	}
 	resolvedModel, ok := config.ResolveModel(store, model)
 	if !ok {
-		return util.StandardRequest{}, fmt.Errorf("Model '%s' is not available.", model)
+		return util.StandardRequest{}, fmt.Errorf("model %q is not available", model)
 	}
 	thinkingEnabled, searchEnabled, _ := config.GetModelConfig(resolvedModel)
 	responseModel := strings.TrimSpace(model)
@@ -24,9 +24,10 @@ func normalizeOpenAIChatRequest(store ConfigReader, req map[string]any, traceID
 		responseModel = resolvedModel
 	}
 	toolPolicy := util.DefaultToolChoicePolicy()
-	finalPrompt, toolNames := buildOpenAIFinalPromptWithPolicy(messagesRaw, req["tools"], traceID, toolPolicy)
+	finalPrompt, toolNames := buildOpenAIFinalPromptWithPolicy(messagesRaw, req["tools"], traceID, toolPolicy, thinkingEnabled)
 	toolNames = ensureToolDetectionEnabled(toolNames, req["tools"])
 	passThrough := collectOpenAIChatPassThrough(req)
+	refFileIDs := collectOpenAIRefFileIDs(req)

 	return util.StandardRequest{
 		Surface:        "openai_chat",
@@ -40,6 +41,7 @@ func normalizeOpenAIChatRequest(store ConfigReader, req map[string]any, traceID
 		Stream:         util.ToBool(req["stream"]),
 		Thinking:       thinkingEnabled,
 		Search:         searchEnabled,
+		RefFileIDs:     refFileIDs,
 		PassThrough:    passThrough,
 	}, nil
 }
@@ -48,11 +50,11 @@ func normalizeOpenAIResponsesRequest(store ConfigReader, req map[string]any, tra
 	model, _ := req["model"].(string)
 	model = strings.TrimSpace(model)
 	if model == "" {
-		return util.StandardRequest{}, fmt.Errorf("Request must include 'model'.")
+		return util.StandardRequest{}, fmt.Errorf("request must include 'model'")
 	}
 	resolvedModel, ok := config.ResolveModel(store, model)
 	if !ok {
-		return util.StandardRequest{}, fmt.Errorf("Model '%s' is not available.", model)
+		return util.StandardRequest{}, fmt.Errorf("model %q is not available", model)
 	}
 	thinkingEnabled, searchEnabled, _ := config.GetModelConfig(resolvedModel)

@@ -68,18 +70,19 @@ func normalizeOpenAIResponsesRequest(store ConfigReader, req map[string]any, tra
 		messagesRaw = msgs
 	}
 	if len(messagesRaw) == 0 {
-		return util.StandardRequest{}, fmt.Errorf("Request must include 'input' or 'messages'.")
+		return util.StandardRequest{}, fmt.Errorf("request must include 'input' or 'messages'")
 	}
 	toolPolicy, err := parseToolChoicePolicy(req["tool_choice"], req["tools"])
 	if err != nil {
 		return util.StandardRequest{}, err
 	}
-	finalPrompt, toolNames := buildOpenAIFinalPromptWithPolicy(messagesRaw, req["tools"], traceID, toolPolicy)
+	finalPrompt, toolNames := buildOpenAIFinalPromptWithPolicy(messagesRaw, req["tools"], traceID, toolPolicy, thinkingEnabled)
 	toolNames = ensureToolDetectionEnabled(toolNames, req["tools"])
 	if !toolPolicy.IsNone() {
 		toolPolicy.Allowed = namesToSet(toolNames)
 	}
 	passThrough := collectOpenAIChatPassThrough(req)
+	refFileIDs := collectOpenAIRefFileIDs(req)

 	return util.StandardRequest{
 		Surface:        "openai_responses",
@@ -93,6 +96,7 @@ func normalizeOpenAIResponsesRequest(store ConfigReader, req map[string]any, tra
 		Stream:         util.ToBool(req["stream"]),
 		Thinking:       thinkingEnabled,
 		Search:         searchEnabled,
+		RefFileIDs:     refFileIDs,
 		PassThrough:    passThrough,
 	}, nil
 }
@@ -152,7 +156,7 @@ func parseToolChoicePolicy(toolChoiceRaw any, toolsRaw any) (util.ToolChoicePoli
 		case "required":
 			policy.Mode = util.ToolChoiceRequired
 		default:
-			return util.ToolChoicePolicy{}, fmt.Errorf("Unsupported tool_choice: %q", v)
+			return util.ToolChoicePolicy{}, fmt.Errorf("unsupported tool_choice: %q", v)
 		}
 	case map[string]any:
 		allowedOverride, hasAllowedOverride, err := parseAllowedToolNames(v["allowed_tools"])
@@ -198,7 +202,7 @@ func parseToolChoicePolicy(toolChoiceRaw any, toolsRaw any) (util.ToolChoicePoli
 			policy.ForcedName = name
 			policy.Allowed = namesToSet([]string{name})
 		default:
-			return util.ToolChoicePolicy{}, fmt.Errorf("Unsupported tool_choice.type: %q", typ)
+			return util.ToolChoicePolicy{}, fmt.Errorf("unsupported tool_choice.type: %q", typ)
 		}
 	default:
 		return util.ToolChoicePolicy{}, fmt.Errorf("tool_choice must be a string or object")
@@ -206,7 +210,7 @@ func parseToolChoicePolicy(toolChoiceRaw any, toolsRaw any) (util.ToolChoicePoli

 	if policy.Mode == util.ToolChoiceRequired || policy.Mode == util.ToolChoiceForced {
 		if len(declaredNames) == 0 {
-			return util.ToolChoicePolicy{}, fmt.Errorf("tool_choice=%s requires non-empty tools.", policy.Mode)
+			return util.ToolChoicePolicy{}, fmt.Errorf("tool_choice=%s requires non-empty tools", policy.Mode)
 		}
 	}
 	if policy.Mode == util.ToolChoiceForced {
--- a/internal/adapter/openai/standard_request_test.go
+++ b/internal/adapter/openai/standard_request_test.go
@@ -41,6 +41,36 @@ func TestNormalizeOpenAIChatRequest(t *testing.T) {
 	}
 }

+func TestNormalizeOpenAIChatRequestCollectsRefFileIDs(t *testing.T) { // The normalized chat request must expose the file IDs from the request, deduplicated, in a fixed order.
+	store := newEmptyStoreForNormalizeTest(t)
+	req := map[string]any{ // The request references files in three places: message content, attachments, and ref_file_ids.
+		"model": "gpt-5-codex",
+		"messages": []any{
+			map[string]any{
+				"role": "user",
+				"content": []any{
+					map[string]any{"type": "input_text", "text": "hello"},
+					map[string]any{"type": "input_file", "file_id": "file-msg"}, // This ID appears only inside the message content.
+				},
+			},
+		},
+		"attachments": []any{
+			map[string]any{"file_id": "file-attachment"}, // This ID is repeated in ref_file_ids below.
+		},
+		"ref_file_ids": []any{"file-top", "file-attachment"}, // "file-attachment" duplicates the attachments entry.
+	}
+	n, err := normalizeOpenAIChatRequest(store, req, "")
+	if err != nil {
+		t.Fatalf("normalize failed: %v", err)
+	}
+	if len(n.RefFileIDs) != 3 { // Four references are supplied, but only three are distinct.
+		t.Fatalf("expected 3 distinct file ids, got %#v", n.RefFileIDs)
+	}
+	if n.RefFileIDs[0] != "file-top" || n.RefFileIDs[1] != "file-attachment" || n.RefFileIDs[2] != "file-msg" { // The order is pinned: ref_file_ids entries first, the message-content ID last.
+		t.Fatalf("unexpected file ids: %#v", n.RefFileIDs)
+	}
+}
+
 func TestNormalizeOpenAIResponsesRequestInput(t *testing.T) {
 	store := newEmptyStoreForNormalizeTest(t)
 	req := map[string]any{
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .4.1
 .5.2