From 78fdd6347073c52b762e29cf7c60691805fb816a Mon Sep 17 00:00:00 2001
From: shern-point
Date: Thu, 30 Apr 2026 00:46:06 +0800
Subject: [PATCH] feat: add full-context token regression coverage and docs

Lock in the current_input_file regression coverage with API-level tests and
document that returned context token counts now track full-prompt semantics
with conservative sizing.
---
 docs/prompt-compatibility.md                  |  2 +
 internal/httpapi/openai/history_split_test.go | 74 +++++++++++++++++++
 internal/util/render.go                       | 16 ++--
 3 files changed, 84 insertions(+), 8 deletions(-)

diff --git a/docs/prompt-compatibility.md b/docs/prompt-compatibility.md
index 4b9403b..c8c4441 100644
--- a/docs/prompt-compatibility.md
+++ b/docs/prompt-compatibility.md
@@ -98,6 +98,7 @@ DS2API's current core approach is not to take the client-supplied `messages` and `tools`
 - `prompt` is the primary carrier of conversation context.
 - `ref_file_ids` carries file references only, never ordinary text messages.
 - `tools` is not sent downstream as a native tool schema; it is rewritten into the `prompt`.
+- The `prompt_tokens` / `input_tokens` / `promptTokenCount` values returned to clients are no longer approximated from the last message or a rough character estimate; they are tokenizer counts over the **full-context prompt**. To keep clients from assuming there is still room when the context is actually over the limit, request-side context tokens are additionally rounded up slightly on the conservative side: a small overestimate is preferred to an underestimate.
 - The current `/v1/chat/completions` path still creates a new remote `chat_session_id` per request and sends `parent_message_id: null` by default; externally DS2API therefore behaves as "new session + history concatenated into the prompt" rather than reusing DeepSeek's native session tree.
 - The DeepSeek backend itself, however, supports continued multi-turn conversation within one `chat_session_id`. On 2026-04-27 a two-turn live test was run with the project's existing DeepSeek client, without touching business code: under the same `chat_session_id`, turn 1 returned `request_message_id=1` / `response_message_id=2` / text `SESSION_TEST_ONE`; turn 2 fetched a fresh PoW, sent `parent_message_id=2`, and successfully returned `request_message_id=3` / `response_message_id=4` / text `SESSION_TEST_TWO`. This shows that continued chat in the same remote session works, provided each turn carries the correct parent/message linkage and fetches a PoW valid for that turn.
 - OpenAI Chat / Responses go through the unified OpenAI normalization and DeepSeek payload assembly natively; Claude / Gemini reuse the OpenAI prompt/tool semantics wherever possible, with Gemini calling `promptcompat.BuildOpenAIPromptForAdapter` directly and the Claude messages endpoint converting to OpenAI chat form before execution in proxyable scenarios.
@@ -247,6 +248,7 @@ OpenAI file-related implementation:
 - `current_input_file` is enabled by default; it merges the full context into a hidden context file. When the plain-text length of the latest user turn reaches `current_input_file.min_chars` (default `0`), the compatibility layer uploads a context file named `IGNORE.txt` and keeps only a neutral user message in the live prompt asking the model to answer the latest request directly, without exposing the file name or asking the model to read a local file.
 - If `current_input_file.enabled=false`, the request is passed through as-is and no split context file is uploaded.
 - The legacy `history_split.enabled` / `history_split.trigger_after_turns` settings are still read into the config object for compatibility, but they no longer trigger split uploads and do not affect `current_input_file` being enabled by default.
+- Even when `current_input_file` fires and the live prompt is shortened, the context token counts in the client response are still computed with the **pre-split full-prompt semantics**, not from the shortened placeholder prompt; otherwise the real context would be significantly undercounted.
 
 Related implementation:
 
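The "conservative sizing" the new doc bullet describes amounts to: count the full-context prompt with the model's tokenizer, then round the result up a little. Below is a minimal standalone sketch of that idea; the `Tokenizer` interface, helper name, and 5% margin are illustrative assumptions, since the patch calls `util.CountPromptTokens` but does not show its implementation or its actual margin.

```go
// Package tokensketch is a standalone illustration, not part of DS2API.
package tokensketch

import "math"

// Tokenizer stands in for whatever tokenizer backs a given model; the real
// project may expose this very differently.
type Tokenizer interface {
	CountTokens(text string) int
}

// ConservativePromptTokens counts tokens over the full-context prompt and then
// rounds the result up by a small safety margin, so the value reported to the
// client is never smaller than what the backend will actually consume.
// The 5% factor here is a placeholder, not the value DS2API uses.
func ConservativePromptTokens(tok Tokenizer, fullContextPrompt string) int {
	raw := tok.CountTokens(fullContextPrompt)
	return int(math.Ceil(float64(raw) * 1.05))
}
```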
diff --git a/internal/httpapi/openai/history_split_test.go b/internal/httpapi/openai/history_split_test.go
index aa76575..ebaf8c9 100644
--- a/internal/httpapi/openai/history_split_test.go
+++ b/internal/httpapi/openai/history_split_test.go
@@ -14,6 +14,7 @@ import (
 	"ds2api/internal/auth"
 	dsclient "ds2api/internal/deepseek/client"
 	"ds2api/internal/promptcompat"
+	"ds2api/internal/util"
 )
 
 func historySplitTestMessages() []any {
@@ -302,6 +303,48 @@ func TestApplyCurrentInputFileUploadsFirstTurnWithInjectedWrapper(t *testing.T)
 	if len(out.RefFileIDs) != 1 || out.RefFileIDs[0] != "file-inline-1" {
 		t.Fatalf("expected current input file id in ref_file_ids, got %#v", out.RefFileIDs)
 	}
+	if !strings.Contains(out.PromptTokenText, "first turn content that is long enough") {
+		t.Fatalf("expected prompt token text to preserve original full context, got %q", out.PromptTokenText)
+	}
+}
+
+func TestApplyCurrentInputFilePreservesFullContextPromptForTokenCounting(t *testing.T) {
+	ds := &inlineUploadDSStub{}
+	h := &openAITestSurface{
+		Store: mockOpenAIConfig{
+			wideInput:           true,
+			currentInputEnabled: true,
+			currentInputMin:     0,
+			thinkingInjection:   boolPtr(true),
+		},
+		DS: ds,
+	}
+	req := map[string]any{
+		"model":    "deepseek-v4-flash",
+		"messages": historySplitTestMessages(),
+	}
+	stdReq, err := promptcompat.NormalizeOpenAIChatRequest(h.Store, req, "")
+	if err != nil {
+		t.Fatalf("normalize failed: %v", err)
+	}
+	originalPrompt := stdReq.FinalPrompt
+
+	out, err := h.applyCurrentInputFile(context.Background(), &auth.RequestAuth{DeepSeekToken: "token"}, stdReq)
+	if err != nil {
+		t.Fatalf("apply current input file failed: %v", err)
+	}
+	if out.PromptTokenText != originalPrompt {
+		t.Fatalf("expected prompt token text to preserve original prompt, got %q want %q", out.PromptTokenText, originalPrompt)
+	}
+	if out.FinalPrompt == originalPrompt {
+		t.Fatalf("expected live prompt to be rewritten after current input file")
+	}
+	if !strings.Contains(out.PromptTokenText, "first user turn") || !strings.Contains(out.PromptTokenText, "latest user turn") {
+		t.Fatalf("expected prompt token text to retain full context, got %q", out.PromptTokenText)
+	}
+	if strings.Contains(out.FinalPrompt, "first user turn") || strings.Contains(out.FinalPrompt, "latest user turn") {
+		t.Fatalf("expected live prompt to hide original turns, got %q", out.FinalPrompt)
+	}
 }
 
 func TestApplyCurrentInputFileUploadsFullContextFile(t *testing.T) {
@@ -438,6 +481,22 @@ func TestChatCompletionsCurrentInputFileUploadsContextAndKeepsNeutralPrompt(t *t
 	if len(refIDs) == 0 || refIDs[0] != "file-inline-1" {
 		t.Fatalf("expected uploaded current input file to be first ref_file_id, got %#v", ds.completionReq["ref_file_ids"])
 	}
+	var body map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
+		t.Fatalf("decode response failed: %v", err)
+	}
+	usage, _ := body["usage"].(map[string]any)
+	promptTokens := int(usage["prompt_tokens"].(float64))
+	originalPrompt, _ := promptcompat.BuildOpenAIPrompt(historySplitTestMessages(), nil, "", defaultToolChoicePolicy(), true)
+	expectedMin := util.CountPromptTokens(originalPrompt, "deepseek-v4-flash")
+	neutralPrompt := promptText
+	neutralCount := util.CountPromptTokens(neutralPrompt, "deepseek-v4-flash")
+	if promptTokens != expectedMin {
+		t.Fatalf("expected prompt_tokens from full original context, got=%d want=%d", promptTokens, expectedMin)
+	}
+	if promptTokens <= neutralCount {
+		t.Fatalf("expected prompt_tokens to exceed neutral live prompt count, got=%d neutral=%d", promptTokens, neutralCount)
+	}
 }
 
 func TestResponsesCurrentInputFileUploadsContextAndKeepsNeutralPrompt(t *testing.T) {
@@ -480,6 +539,21 @@ func TestResponsesCurrentInputFileUploadsContextAndKeepsNeutralPrompt(t *testing
 	if strings.Contains(promptText, "first user turn") || strings.Contains(promptText, "latest user turn") {
 		t.Fatalf("expected prompt to hide original turns, got %s", promptText)
 	}
+	var body map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
+		t.Fatalf("decode response failed: %v", err)
+	}
+	usage, _ := body["usage"].(map[string]any)
+	inputTokens := int(usage["input_tokens"].(float64))
+	originalPrompt, _ := promptcompat.BuildOpenAIPrompt(historySplitTestMessages(), nil, "", defaultToolChoicePolicy(), true)
+	expectedMin := util.CountPromptTokens(originalPrompt, "deepseek-v4-flash")
+	neutralCount := util.CountPromptTokens(promptText, "deepseek-v4-flash")
+	if inputTokens != expectedMin {
+		t.Fatalf("expected input_tokens from full original context, got=%d want=%d", inputTokens, expectedMin)
+	}
+	if inputTokens <= neutralCount {
+		t.Fatalf("expected input_tokens to exceed neutral live prompt count, got=%d neutral=%d", inputTokens, neutralCount)
+	}
 }
 
 func TestChatCompletionsCurrentInputFileMapsManagedAuthFailureTo401(t *testing.T) {
diff --git a/internal/util/render.go b/internal/util/render.go
index 0092e4b..801d2f1 100644
--- a/internal/util/render.go
+++ b/internal/util/render.go
@@ -23,9 +23,9 @@ func BuildOpenAIChatCompletion(completionID, model, finalPrompt, finalThinking,
 		messageObj["tool_calls"] = toolcall.FormatOpenAIToolCalls(detected, nil)
 		messageObj["content"] = nil
 	}
-	promptTokens := EstimateTokens(finalPrompt)
-	reasoningTokens := EstimateTokens(finalThinking)
-	completionTokens := EstimateTokens(finalText)
+	promptTokens := CountPromptTokens(finalPrompt, model)
+	reasoningTokens := CountOutputTokens(finalThinking, model)
+	completionTokens := CountOutputTokens(finalText, model)
 
 	return map[string]any{
 		"id": completionID,
@@ -86,9 +86,9 @@ func BuildOpenAIResponseObject(responseID, model, finalPrompt, finalThinking, fi
 			"content": content,
 		})
 	}
-	promptTokens := EstimateTokens(finalPrompt)
-	reasoningTokens := EstimateTokens(finalThinking)
-	completionTokens := EstimateTokens(finalText)
+	promptTokens := CountPromptTokens(finalPrompt, model)
+	reasoningTokens := CountOutputTokens(finalThinking, model)
+	completionTokens := CountOutputTokens(finalText, model)
 	return map[string]any{
 		"id": responseID,
 		"type": "response",
@@ -140,8 +140,8 @@ func BuildClaudeMessageResponse(messageID, model string, normalizedMessages []an
 		"stop_reason": stopReason,
 		"stop_sequence": nil,
 		"usage": map[string]any{
-			"input_tokens":  EstimateTokens(fmt.Sprintf("%v", normalizedMessages)),
-			"output_tokens": EstimateTokens(finalThinking) + EstimateTokens(finalText),
+			"input_tokens":  CountPromptTokens(fmt.Sprintf("%v", normalizedMessages), model),
+			"output_tokens": CountOutputTokens(finalThinking, model) + CountOutputTokens(finalText, model),
 		},
 	}
 }
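The regression tests above pin down the invariant behind the render.go change: after `applyCurrentInputFile`, `PromptTokenText` still holds the pre-split full-context prompt while `FinalPrompt` is the shortened live prompt, and client-visible usage must be derived from the former. The sketch below shows that wiring under those assumptions; the function name, parameter list, and returned keys are illustrative, and only `util.CountPromptTokens` / `util.CountOutputTokens` and the two fields come from the diff (it also assumes the sketch sits inside the ds2api module so the `internal` import is legal).

```go
// Standalone sketch; not part of the DS2API codebase.
package usagesketch

import "ds2api/internal/util"

// splitRequestUsage derives client-visible usage from the preserved
// full-context prompt (promptTokenText), never from the shortened live prompt
// that is actually sent downstream alongside the uploaded context file.
func splitRequestUsage(promptTokenText, livePrompt, finalText, model string) map[string]any {
	_ = livePrompt // goes to DeepSeek; deliberately ignored for token accounting
	return map[string]any{
		"prompt_tokens":     util.CountPromptTokens(promptTokenText, model),
		"completion_tokens": util.CountOutputTokens(finalText, model),
	}
}
```

This mirrors what the two endpoint tests assert: `prompt_tokens` / `input_tokens` equal the count of the original full prompt and strictly exceed the count of the neutral live prompt.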