mirror of
https://github.com/CJackHwang/ds2api.git
synced 2026-05-05 08:55:28 +08:00
feat: add full-context token regression coverage and docs
Lock in the current_input_file regression with API-level tests and document that returned context token counts now track full prompt semantics with conservative sizing.
This commit is contained in:
@@ -98,6 +98,7 @@ DS2API 当前的核心思路,不是把客户端传来的 `messages`、`tools`
- `prompt` 才是对话上下文主载体。
- `ref_file_ids` 只承载文件引用,不承载普通文本消息。
- `tools` 不会作为“原生工具 schema”直接下发给下游,而是被改写进 `prompt`。
- 对外返回给客户端的 `prompt_tokens` / `input_tokens` / `promptTokenCount` 不再按“最后一条消息”或字符粗估近似返回,而是基于**完整上下文 prompt**做 tokenizer 计数;为了避免上下文实际超限但客户端误以为还能塞下,请求侧上下文 token 会额外保守上浮一点,宁可略大也不低估。
- 当前 `/v1/chat/completions` 业务路径仍是“每次请求新建一个远端 `chat_session_id`,并默认发送 `parent_message_id: null`”;因此 DS2API 对外默认表现为“新会话 + prompt 拼历史”,而不是复用 DeepSeek 原生会话树。
- 但 DeepSeek 远端本身支持同一 `chat_session_id` 的跨轮次持续对话。2026-04-27 已用项目内现有 DeepSeek client 做过一次不改业务代码的双轮实测:同一 `chat_session_id` 下,第 1 轮返回 `request_message_id=1` / `response_message_id=2` / 文本 `SESSION_TEST_ONE`;第 2 轮重新获取一次 PoW,并发送 `parent_message_id=2` 后,成功返回 `request_message_id=3` / `response_message_id=4` / 文本 `SESSION_TEST_TWO`。这说明“同远端会话持续聊天”能力存在,且每轮需要携带正确的 parent/message 链接信息,同时重新获取对应轮次可用的 PoW。
- OpenAI Chat / Responses 原生走统一 OpenAI 标准化与 DeepSeek payload 组装;Claude / Gemini 会尽量复用 OpenAI prompt/tool 语义,其中 Gemini 直接复用 `promptcompat.BuildOpenAIPromptForAdapter`,Claude 消息接口在可代理场景会转换为 OpenAI chat 形态再执行。
@@ -247,6 +248,7 @@ OpenAI 文件相关实现:
- `current_input_file` 默认开启;它用于把“完整上下文”合并进隐藏上下文文件。当最新 user turn 的纯文本长度达到 `current_input_file.min_chars`(默认 `0`)时,兼容层会上传一个文件名为 `IGNORE.txt` 的上下文文件,并在 live prompt 中只保留一个中性的 user 消息要求模型直接回答最新请求,不再暴露文件名或要求模型读取本地文件。
- 如果 `current_input_file.enabled=false`,请求会直接透传,不上传任何拆分上下文文件。
- 旧的 `history_split.enabled` / `history_split.trigger_after_turns` 会被读取进配置对象以保持兼容,但不会触发拆分上传,也不会影响 `current_input_file` 的默认开启。
- 即使触发 `current_input_file` 后 live prompt 被缩短,对客户端回包里的上下文 token 统计,仍会沿用**拆分前的完整 prompt 语义**做计数,而不是按缩短后的占位 prompt 计算;否则会把真实上下文显著算小。

相关实现:
@@ -14,6 +14,7 @@ import (
	"ds2api/internal/auth"
	dsclient "ds2api/internal/deepseek/client"
	"ds2api/internal/promptcompat"
	"ds2api/internal/util"
)

func historySplitTestMessages() []any {
@@ -302,6 +303,48 @@ func TestApplyCurrentInputFileUploadsFirstTurnWithInjectedWrapper(t *testing.T)
	if len(out.RefFileIDs) != 1 || out.RefFileIDs[0] != "file-inline-1" {
		t.Fatalf("expected current input file id in ref_file_ids, got %#v", out.RefFileIDs)
	}
	if !strings.Contains(out.PromptTokenText, "first turn content that is long enough") {
		t.Fatalf("expected prompt token text to preserve original full context, got %q", out.PromptTokenText)
	}
}
func TestApplyCurrentInputFilePreservesFullContextPromptForTokenCounting(t *testing.T) {
|
||||
ds := &inlineUploadDSStub{}
|
||||
h := &openAITestSurface{
|
||||
Store: mockOpenAIConfig{
|
||||
wideInput: true,
|
||||
currentInputEnabled: true,
|
||||
currentInputMin: 0,
|
||||
thinkingInjection: boolPtr(true),
|
||||
},
|
||||
DS: ds,
|
||||
}
|
||||
req := map[string]any{
|
||||
"model": "deepseek-v4-flash",
|
||||
"messages": historySplitTestMessages(),
|
||||
}
|
||||
stdReq, err := promptcompat.NormalizeOpenAIChatRequest(h.Store, req, "")
|
||||
if err != nil {
|
||||
t.Fatalf("normalize failed: %v", err)
|
||||
}
|
||||
originalPrompt := stdReq.FinalPrompt
|
||||
|
||||
out, err := h.applyCurrentInputFile(context.Background(), &auth.RequestAuth{DeepSeekToken: "token"}, stdReq)
|
||||
if err != nil {
|
||||
t.Fatalf("apply current input file failed: %v", err)
|
||||
}
|
||||
if out.PromptTokenText != originalPrompt {
|
||||
t.Fatalf("expected prompt token text to preserve original prompt, got %q want %q", out.PromptTokenText, originalPrompt)
|
||||
}
|
||||
if out.FinalPrompt == originalPrompt {
|
||||
t.Fatalf("expected live prompt to be rewritten after current input file")
|
||||
}
|
||||
if !strings.Contains(out.PromptTokenText, "first user turn") || !strings.Contains(out.PromptTokenText, "latest user turn") {
|
||||
t.Fatalf("expected prompt token text to retain full context, got %q", out.PromptTokenText)
|
||||
}
|
||||
if strings.Contains(out.FinalPrompt, "first user turn") || strings.Contains(out.FinalPrompt, "latest user turn") {
|
||||
t.Fatalf("expected live prompt to hide original turns, got %q", out.FinalPrompt)
|
||||
}
|
||||
}

func TestApplyCurrentInputFileUploadsFullContextFile(t *testing.T) {
@@ -438,6 +481,22 @@ func TestChatCompletionsCurrentInputFileUploadsContextAndKeepsNeutralPrompt(t *t
	if len(refIDs) == 0 || refIDs[0] != "file-inline-1" {
		t.Fatalf("expected uploaded current input file to be first ref_file_id, got %#v", ds.completionReq["ref_file_ids"])
	}
	var body map[string]any
	if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
		t.Fatalf("decode response failed: %v", err)
	}
	usage, _ := body["usage"].(map[string]any)
	promptTokens := int(usage["prompt_tokens"].(float64))
	originalPrompt, _ := promptcompat.BuildOpenAIPrompt(historySplitTestMessages(), nil, "", defaultToolChoicePolicy(), true)
	expectedMin := util.CountPromptTokens(originalPrompt, "deepseek-v4-flash")
	neutralPrompt := promptText
	neutralCount := util.CountPromptTokens(neutralPrompt, "deepseek-v4-flash")
	if promptTokens != expectedMin {
		t.Fatalf("expected prompt_tokens from full original context, got=%d want=%d", promptTokens, expectedMin)
	}
	if promptTokens <= neutralCount {
		t.Fatalf("expected prompt_tokens to exceed neutral live prompt count, got=%d neutral=%d", promptTokens, neutralCount)
	}
}

func TestResponsesCurrentInputFileUploadsContextAndKeepsNeutralPrompt(t *testing.T) {
@@ -480,6 +539,21 @@ func TestResponsesCurrentInputFileUploadsContextAndKeepsNeutralPrompt(t *testing
	if strings.Contains(promptText, "first user turn") || strings.Contains(promptText, "latest user turn") {
		t.Fatalf("expected prompt to hide original turns, got %s", promptText)
	}
	var body map[string]any
	if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
		t.Fatalf("decode response failed: %v", err)
	}
	usage, _ := body["usage"].(map[string]any)
	inputTokens := int(usage["input_tokens"].(float64))
	originalPrompt, _ := promptcompat.BuildOpenAIPrompt(historySplitTestMessages(), nil, "", defaultToolChoicePolicy(), true)
	expectedMin := util.CountPromptTokens(originalPrompt, "deepseek-v4-flash")
	neutralCount := util.CountPromptTokens(promptText, "deepseek-v4-flash")
	if inputTokens != expectedMin {
		t.Fatalf("expected input_tokens from full original context, got=%d want=%d", inputTokens, expectedMin)
	}
	if inputTokens <= neutralCount {
		t.Fatalf("expected input_tokens to exceed neutral live prompt count, got=%d neutral=%d", inputTokens, neutralCount)
	}
}

func TestChatCompletionsCurrentInputFileMapsManagedAuthFailureTo401(t *testing.T) {
@@ -23,9 +23,9 @@ func BuildOpenAIChatCompletion(completionID, model, finalPrompt, finalThinking,
 		messageObj["tool_calls"] = toolcall.FormatOpenAIToolCalls(detected, nil)
 		messageObj["content"] = nil
 	}
-	promptTokens := EstimateTokens(finalPrompt)
-	reasoningTokens := EstimateTokens(finalThinking)
-	completionTokens := EstimateTokens(finalText)
+	promptTokens := CountPromptTokens(finalPrompt, model)
+	reasoningTokens := CountOutputTokens(finalThinking, model)
+	completionTokens := CountOutputTokens(finalText, model)

 	return map[string]any{
 		"id": completionID,
@@ -86,9 +86,9 @@ func BuildOpenAIResponseObject(responseID, model, finalPrompt, finalThinking, fi
 			"content": content,
 		})
 	}
-	promptTokens := EstimateTokens(finalPrompt)
-	reasoningTokens := EstimateTokens(finalThinking)
-	completionTokens := EstimateTokens(finalText)
+	promptTokens := CountPromptTokens(finalPrompt, model)
+	reasoningTokens := CountOutputTokens(finalThinking, model)
+	completionTokens := CountOutputTokens(finalText, model)
 	return map[string]any{
 		"id":   responseID,
 		"type": "response",
@@ -140,8 +140,8 @@ func BuildClaudeMessageResponse(messageID, model string, normalizedMessages []an
 		"stop_reason":   stopReason,
 		"stop_sequence": nil,
 		"usage": map[string]any{
-			"input_tokens":  EstimateTokens(fmt.Sprintf("%v", normalizedMessages)),
-			"output_tokens": EstimateTokens(finalThinking) + EstimateTokens(finalText),
+			"input_tokens":  CountPromptTokens(fmt.Sprintf("%v", normalizedMessages), model),
+			"output_tokens": CountOutputTokens(finalThinking, model) + CountOutputTokens(finalText, model),
 		},
 	}
 }
Reference in New Issue
Block a user