From 6a778e0d3577733b067900fcca37b20743fcc132 Mon Sep 17 00:00:00 2001 From: shern-point Date: Thu, 30 Apr 2026 01:42:51 +0800 Subject: [PATCH] feat: include inline-uploaded file tokens in context token accounting Track byte sizes of inline-uploaded files during PreprocessInlineFileInputs and convert them to conservative token estimates (bytes/3). RefFileTokens is threaded through StandardRequest into all OpenAI chat/responses usage builders so returned prompt_tokens/input_tokens reflect the full upstream context cost including attached files. --- internal/format/openai/render_chat.go | 2 +- internal/format/openai/render_responses.go | 2 +- internal/format/openai/render_test.go | 2 +- internal/format/openai/render_usage.go | 12 +++---- .../httpapi/openai/chat/chat_history_test.go | 2 +- .../openai/chat/chat_stream_runtime.go | 15 +++++---- .../openai/chat/empty_retry_runtime.go | 15 +++++---- internal/httpapi/openai/chat/handler_chat.go | 15 ++++++--- .../openai/chat/handler_toolcall_test.go | 22 ++++++------- .../httpapi/openai/chat/ref_file_tokens.go | 26 +++++++++++++++ .../openai/files/file_inline_upload.go | 15 ++++++--- .../openai/responses/empty_retry_runtime.go | 12 ++++--- .../openai/responses/ref_file_tokens.go | 26 +++++++++++++++ .../openai/responses/responses_handler.go | 13 +++++--- .../responses_stream_runtime_core.go | 15 +++++---- ...onses_stream_runtime_toolcalls_finalize.go | 6 +++- .../openai/responses/responses_stream_test.go | 32 +++++++++---------- internal/promptcompat/request_normalize.go | 29 +++++++++++++++++ internal/promptcompat/standard_request.go | 1 + 19 files changed, 185 insertions(+), 77 deletions(-) create mode 100644 internal/httpapi/openai/chat/ref_file_tokens.go create mode 100644 internal/httpapi/openai/responses/ref_file_tokens.go diff --git a/internal/format/openai/render_chat.go b/internal/format/openai/render_chat.go index 14a9d1f..3fa6e53 100644 --- a/internal/format/openai/render_chat.go +++ b/internal/format/openai/render_chat.go @@ -29,7 +29,7 @@ func BuildChatCompletionWithToolCalls(completionID, model, finalPrompt, finalThi "created": time.Now().Unix(), "model": model, "choices": []map[string]any{{"index": 0, "message": messageObj, "finish_reason": finishReason}}, - "usage": BuildChatUsageForModel(model, finalPrompt, finalThinking, finalText), + "usage": BuildChatUsageForModel(model, finalPrompt, finalThinking, finalText, 0), } } diff --git a/internal/format/openai/render_responses.go b/internal/format/openai/render_responses.go index 2d3c9dd..0c7dfbe 100644 --- a/internal/format/openai/render_responses.go +++ b/internal/format/openai/render_responses.go @@ -70,7 +70,7 @@ func BuildResponseObjectFromItems(responseID, model, finalPrompt, finalThinking, "model": model, "output": output, "output_text": outputText, - "usage": BuildResponsesUsageForModel(model, finalPrompt, finalThinking, finalText), + "usage": BuildResponsesUsageForModel(model, finalPrompt, finalThinking, finalText, 0), } } diff --git a/internal/format/openai/render_test.go b/internal/format/openai/render_test.go index 61cdb3f..e174bd6 100644 --- a/internal/format/openai/render_test.go +++ b/internal/format/openai/render_test.go @@ -181,7 +181,7 @@ func TestBuildResponseObjectWithToolCallsCoercesSchemaDeclaredStringArguments(t func TestBuildChatUsageForModelUsesConservativePromptCount(t *testing.T) { prompt := strings.Repeat("上下文token ", 40) - usage := BuildChatUsageForModel("deepseek-v4-flash", prompt, "", "ok") + usage := BuildChatUsageForModel("deepseek-v4-flash", prompt, 
"", "ok", 0) promptTokens, _ := usage["prompt_tokens"].(int) if promptTokens <= util.EstimateTokens(prompt) { t.Fatalf("expected conservative prompt token count > rough estimate, got=%d estimate=%d", promptTokens, util.EstimateTokens(prompt)) diff --git a/internal/format/openai/render_usage.go b/internal/format/openai/render_usage.go index ad1f380..08541a4 100644 --- a/internal/format/openai/render_usage.go +++ b/internal/format/openai/render_usage.go @@ -2,8 +2,8 @@ package openai import "ds2api/internal/util" -func BuildChatUsageForModel(model, finalPrompt, finalThinking, finalText string) map[string]any { - promptTokens := util.CountPromptTokens(finalPrompt, model) +func BuildChatUsageForModel(model, finalPrompt, finalThinking, finalText string, refFileTokens int) map[string]any { + promptTokens := util.CountPromptTokens(finalPrompt, model) + refFileTokens reasoningTokens := util.CountOutputTokens(finalThinking, model) completionTokens := util.CountOutputTokens(finalText, model) return map[string]any{ @@ -17,11 +17,11 @@ func BuildChatUsageForModel(model, finalPrompt, finalThinking, finalText string) } func BuildChatUsage(finalPrompt, finalThinking, finalText string) map[string]any { - return BuildChatUsageForModel("", finalPrompt, finalThinking, finalText) + return BuildChatUsageForModel("", finalPrompt, finalThinking, finalText, 0) } -func BuildResponsesUsageForModel(model, finalPrompt, finalThinking, finalText string) map[string]any { - promptTokens := util.CountPromptTokens(finalPrompt, model) +func BuildResponsesUsageForModel(model, finalPrompt, finalThinking, finalText string, refFileTokens int) map[string]any { + promptTokens := util.CountPromptTokens(finalPrompt, model) + refFileTokens reasoningTokens := util.CountOutputTokens(finalThinking, model) completionTokens := util.CountOutputTokens(finalText, model) return map[string]any{ @@ -32,5 +32,5 @@ func BuildResponsesUsageForModel(model, finalPrompt, finalThinking, finalText st } func BuildResponsesUsage(finalPrompt, finalThinking, finalText string) map[string]any { - return BuildResponsesUsageForModel("", finalPrompt, finalThinking, finalText) + return BuildResponsesUsageForModel("", finalPrompt, finalThinking, finalText, 0) } diff --git a/internal/httpapi/openai/chat/chat_history_test.go b/internal/httpapi/openai/chat/chat_history_test.go index ec28d8a..4d75bf9 100644 --- a/internal/httpapi/openai/chat/chat_history_test.go +++ b/internal/httpapi/openai/chat/chat_history_test.go @@ -194,7 +194,7 @@ func TestHandleStreamContextCancelledMarksHistoryStopped(t *testing.T) { rec := httptest.NewRecorder() resp := makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"hello"}`, `data: [DONE]`) - h.handleStream(rec, req, resp, "cid-stop", "deepseek-v4-flash", "prompt", false, false, nil, nil, session) + h.handleStream(rec, req, resp, "cid-stop", "deepseek-v4-flash", "prompt", 0, false, false, nil, nil, session) snapshot, err := historyStore.Snapshot() if err != nil { diff --git a/internal/httpapi/openai/chat/chat_stream_runtime.go b/internal/httpapi/openai/chat/chat_stream_runtime.go index 939aa79..e44ffcc 100644 --- a/internal/httpapi/openai/chat/chat_stream_runtime.go +++ b/internal/httpapi/openai/chat/chat_stream_runtime.go @@ -16,12 +16,13 @@ type chatStreamRuntime struct { rc *http.ResponseController canFlush bool - completionID string - created int64 - model string - finalPrompt string - toolNames []string - toolsRaw any + completionID string + created int64 + model string + finalPrompt string + refFileTokens int + 
toolNames []string + toolsRaw any thinkingEnabled bool searchEnabled bool @@ -220,7 +221,7 @@ func (s *chatStreamRuntime) finalize(finishReason string, deferEmptyOutput bool) s.sendFailedChunk(status, message, code) return true } - usage := openaifmt.BuildChatUsageForModel(s.model, s.finalPrompt, finalThinking, finalText) + usage := openaifmt.BuildChatUsageForModel(s.model, s.finalPrompt, finalThinking, finalText, s.refFileTokens) s.finalFinishReason = finishReason s.finalUsage = usage s.sendChunk(openaifmt.BuildChatStreamChunk( diff --git a/internal/httpapi/openai/chat/empty_retry_runtime.go b/internal/httpapi/openai/chat/empty_retry_runtime.go index 3052548..848d69b 100644 --- a/internal/httpapi/openai/chat/empty_retry_runtime.go +++ b/internal/httpapi/openai/chat/empty_retry_runtime.go @@ -26,7 +26,7 @@ type chatNonStreamResult struct { responseMessageID int } -func (h *Handler) handleNonStreamWithRetry(w http.ResponseWriter, ctx context.Context, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, completionID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, historySession *chatHistorySession) { +func (h *Handler) handleNonStreamWithRetry(w http.ResponseWriter, ctx context.Context, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, completionID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, historySession *chatHistorySession) { attempts := 0 currentResp := resp usagePrompt := finalPrompt @@ -46,7 +46,7 @@ func (h *Handler) handleNonStreamWithRetry(w http.ResponseWriter, ctx context.Co result.body = openaifmt.BuildChatCompletionWithToolCalls(completionID, model, usagePrompt, result.thinking, result.text, detected.Calls, toolsRaw) result.finishReason = chatFinishReason(result.body) if !shouldRetryChatNonStream(result, attempts) { - h.finishChatNonStreamResult(w, result, attempts, usagePrompt, historySession) + h.finishChatNonStreamResult(w, result, attempts, usagePrompt, refFileTokens, historySession) return } @@ -101,7 +101,7 @@ func (h *Handler) collectChatNonStreamAttempt(w http.ResponseWriter, resp *http. 
}, true } -func (h *Handler) finishChatNonStreamResult(w http.ResponseWriter, result chatNonStreamResult, attempts int, usagePrompt string, historySession *chatHistorySession) { +func (h *Handler) finishChatNonStreamResult(w http.ResponseWriter, result chatNonStreamResult, attempts int, usagePrompt string, refFileTokens int, historySession *chatHistorySession) { if result.detectedCalls == 0 && shouldWriteUpstreamEmptyOutputError(result.text) { status, message, code := upstreamEmptyOutputDetail(result.contentFilter, result.text, result.thinking) if historySession != nil { @@ -112,7 +112,7 @@ func (h *Handler) finishChatNonStreamResult(w http.ResponseWriter, result chatNo return } if historySession != nil { - historySession.success(http.StatusOK, result.thinking, result.text, result.finishReason, openaifmt.BuildChatUsage(usagePrompt, result.thinking, result.text)) + historySession.success(http.StatusOK, result.thinking, result.text, result.finishReason, openaifmt.BuildChatUsageForModel("", usagePrompt, result.thinking, result.text, refFileTokens)) } writeJSON(w, http.StatusOK, result.body) source := "first_attempt" @@ -139,8 +139,8 @@ func shouldRetryChatNonStream(result chatNonStreamResult, attempts int) bool { strings.TrimSpace(result.text) == "" } -func (h *Handler) handleStreamWithRetry(w http.ResponseWriter, r *http.Request, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, completionID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, historySession *chatHistorySession) { - streamRuntime, initialType, ok := h.prepareChatStreamRuntime(w, resp, completionID, model, finalPrompt, thinkingEnabled, searchEnabled, toolNames, toolsRaw, historySession) +func (h *Handler) handleStreamWithRetry(w http.ResponseWriter, r *http.Request, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, completionID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, historySession *chatHistorySession) { + streamRuntime, initialType, ok := h.prepareChatStreamRuntime(w, resp, completionID, model, finalPrompt, refFileTokens, thinkingEnabled, searchEnabled, toolNames, toolsRaw, historySession) if !ok { return } @@ -182,7 +182,7 @@ func (h *Handler) handleStreamWithRetry(w http.ResponseWriter, r *http.Request, } } -func (h *Handler) prepareChatStreamRuntime(w http.ResponseWriter, resp *http.Response, completionID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, historySession *chatHistorySession) (*chatStreamRuntime, string, bool) { +func (h *Handler) prepareChatStreamRuntime(w http.ResponseWriter, resp *http.Response, completionID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, historySession *chatHistorySession) (*chatStreamRuntime, string, bool) { if resp.StatusCode != http.StatusOK { defer func() { _ = resp.Body.Close() }() body, _ := io.ReadAll(resp.Body) @@ -210,6 +210,7 @@ func (h *Handler) prepareChatStreamRuntime(w http.ResponseWriter, resp *http.Res thinkingEnabled, searchEnabled, h.compatStripReferenceMarkers(), toolNames, toolsRaw, len(toolNames) > 0, h.toolcallFeatureMatchEnabled() && h.toolcallEarlyEmitHighConfidence(), ) + streamRuntime.refFileTokens = refFileTokens return streamRuntime, initialType, true } diff --git a/internal/httpapi/openai/chat/handler_chat.go b/internal/httpapi/openai/chat/handler_chat.go index 0b970b5..d42a29d 
100644 --- a/internal/httpapi/openai/chat/handler_chat.go +++ b/internal/httpapi/openai/chat/handler_chat.go @@ -108,11 +108,12 @@ func (h *Handler) ChatCompletions(w http.ResponseWriter, r *http.Request) { writeOpenAIError(w, http.StatusInternalServerError, "Failed to get completion.") return } + refFileTokens := stdReq.RefFileTokens if stdReq.Stream { - h.handleStreamWithRetry(w, r, a, resp, payload, pow, sessionID, stdReq.ResponseModel, stdReq.PromptTokenText, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolsRaw, historySession) + h.handleStreamWithRetry(w, r, a, resp, payload, pow, sessionID, stdReq.ResponseModel, stdReq.PromptTokenText, refFileTokens, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolsRaw, historySession) return } - h.handleNonStreamWithRetry(w, r.Context(), a, resp, payload, pow, sessionID, stdReq.ResponseModel, stdReq.PromptTokenText, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolsRaw, historySession) + h.handleNonStreamWithRetry(w, r.Context(), a, resp, payload, pow, sessionID, stdReq.ResponseModel, stdReq.PromptTokenText, refFileTokens, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolsRaw, historySession) } func (h *Handler) autoDeleteRemoteSession(ctx context.Context, a *auth.RequestAuth, sessionID string) { @@ -148,7 +149,7 @@ func (h *Handler) autoDeleteRemoteSession(ctx context.Context, a *auth.RequestAu } } -func (h *Handler) handleNonStream(w http.ResponseWriter, resp *http.Response, completionID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, historySession *chatHistorySession) { +func (h *Handler) handleNonStream(w http.ResponseWriter, resp *http.Response, completionID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, historySession *chatHistorySession) { if resp.StatusCode != http.StatusOK { defer func() { _ = resp.Body.Close() }() body, _ := io.ReadAll(resp.Body) @@ -177,6 +178,9 @@ func (h *Handler) handleNonStream(w http.ResponseWriter, resp *http.Response, co return } respBody := openaifmt.BuildChatCompletionWithToolCalls(completionID, model, finalPrompt, finalThinking, finalText, detected.Calls, toolsRaw) + if refFileTokens > 0 { + addRefFileTokensToUsage(respBody, refFileTokens) + } finishReason := "stop" if choices, ok := respBody["choices"].([]map[string]any); ok && len(choices) > 0 { if fr, _ := choices[0]["finish_reason"].(string); strings.TrimSpace(fr) != "" { @@ -184,12 +188,12 @@ func (h *Handler) handleNonStream(w http.ResponseWriter, resp *http.Response, co } } if historySession != nil { - historySession.success(http.StatusOK, finalThinking, finalText, finishReason, openaifmt.BuildChatUsageForModel(model, finalPrompt, finalThinking, finalText)) + historySession.success(http.StatusOK, finalThinking, finalText, finishReason, openaifmt.BuildChatUsageForModel(model, finalPrompt, finalThinking, finalText, refFileTokens)) } writeJSON(w, http.StatusOK, respBody) } -func (h *Handler) handleStream(w http.ResponseWriter, r *http.Request, resp *http.Response, completionID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, historySession *chatHistorySession) { +func (h *Handler) handleStream(w http.ResponseWriter, r *http.Request, resp *http.Response, completionID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, historySession *chatHistorySession) { defer func() { _ = 
resp.Body.Close() }() if resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(resp.Body) @@ -234,6 +238,7 @@ func (h *Handler) handleStream(w http.ResponseWriter, r *http.Request, resp *htt bufferToolContent, emitEarlyToolDeltas, ) + streamRuntime.refFileTokens = refFileTokens streamengine.ConsumeSSE(streamengine.ConsumeConfig{ Context: r.Context(), diff --git a/internal/httpapi/openai/chat/handler_toolcall_test.go b/internal/httpapi/openai/chat/handler_toolcall_test.go index 3184c15..9266698 100644 --- a/internal/httpapi/openai/chat/handler_toolcall_test.go +++ b/internal/httpapi/openai/chat/handler_toolcall_test.go @@ -93,7 +93,7 @@ func TestHandleNonStreamReturns429WhenUpstreamOutputEmpty(t *testing.T) { ) rec := httptest.NewRecorder() - h.handleNonStream(rec, resp, "cid-empty", "deepseek-v4-flash", "prompt", false, false, nil, nil, nil) + h.handleNonStream(rec, resp, "cid-empty", "deepseek-v4-flash", "prompt", 0, false, false, nil, nil, nil) if rec.Code != http.StatusTooManyRequests { t.Fatalf("expected status 429 for empty upstream output, got %d body=%s", rec.Code, rec.Body.String()) } @@ -112,7 +112,7 @@ func TestHandleNonStreamReturnsContentFilterErrorWhenUpstreamFilteredWithoutOutp ) rec := httptest.NewRecorder() - h.handleNonStream(rec, resp, "cid-empty-filtered", "deepseek-v4-flash", "prompt", false, false, nil, nil, nil) + h.handleNonStream(rec, resp, "cid-empty-filtered", "deepseek-v4-flash", "prompt", 0, false, false, nil, nil, nil) if rec.Code != http.StatusBadRequest { t.Fatalf("expected status 400 for filtered upstream output, got %d body=%s", rec.Code, rec.Body.String()) } @@ -131,7 +131,7 @@ func TestHandleNonStreamReturns429WhenUpstreamHasOnlyThinking(t *testing.T) { ) rec := httptest.NewRecorder() - h.handleNonStream(rec, resp, "cid-thinking-only", "deepseek-v4-pro", "prompt", true, false, nil, nil, nil) + h.handleNonStream(rec, resp, "cid-thinking-only", "deepseek-v4-pro", "prompt", 0, true, false, nil, nil, nil) if rec.Code != http.StatusTooManyRequests { t.Fatalf("expected status 429 for thinking-only upstream output, got %d body=%s", rec.Code, rec.Body.String()) } @@ -150,7 +150,7 @@ func TestHandleNonStreamPromotesThinkingToolCallsWhenTextEmpty(t *testing.T) { ) rec := httptest.NewRecorder() - h.handleNonStream(rec, resp, "cid-thinking-tool", "deepseek-v4-pro", "prompt", true, false, []string{"search"}, nil, nil) + h.handleNonStream(rec, resp, "cid-thinking-tool", "deepseek-v4-pro", "prompt", 0, true, false, []string{"search"}, nil, nil) if rec.Code != http.StatusOK { t.Fatalf("expected 200 for thinking tool calls, got %d body=%s", rec.Code, rec.Body.String()) } @@ -181,7 +181,7 @@ func TestHandleNonStreamPromotesHiddenThinkingDSMLToolCallsWhenTextEmpty(t *test ) rec := httptest.NewRecorder() - h.handleNonStream(rec, resp, "cid-hidden-thinking-tool", "deepseek-v4-pro", "prompt", false, false, []string{"search"}, nil, nil) + h.handleNonStream(rec, resp, "cid-hidden-thinking-tool", "deepseek-v4-pro", "prompt", 0, false, false, []string{"search"}, nil, nil) if rec.Code != http.StatusOK { t.Fatalf("expected 200 for hidden thinking tool calls, got %d body=%s", rec.Code, rec.Body.String()) } @@ -211,7 +211,7 @@ func TestHandleStreamToolsPlainTextStreamsBeforeFinish(t *testing.T) { rec := httptest.NewRecorder() req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil) - h.handleStream(rec, req, resp, "cid6", "deepseek-v4-flash", "prompt", false, false, []string{"search"}, nil, nil) + h.handleStream(rec, req, resp, "cid6", "deepseek-v4-flash", "prompt", 
0, false, false, []string{"search"}, nil, nil) frames, done := parseSSEDataFrames(t, rec.Body.String()) if !done { @@ -248,7 +248,7 @@ func TestHandleStreamIncompleteCapturedToolJSONFlushesAsTextOnFinalize(t *testin rec := httptest.NewRecorder() req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil) - h.handleStream(rec, req, resp, "cid10", "deepseek-v4-flash", "prompt", false, false, []string{"search"}, nil, nil) + h.handleStream(rec, req, resp, "cid10", "deepseek-v4-flash", "prompt", 0, false, false, []string{"search"}, nil, nil) frames, done := parseSSEDataFrames(t, rec.Body.String()) if !done { @@ -282,7 +282,7 @@ func TestHandleStreamPromotesThinkingToolCallsOnFinalizeWithoutMidstreamIntercep rec := httptest.NewRecorder() req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil) - h.handleStream(rec, req, resp, "cid-thinking-stream", "deepseek-v4-pro", "prompt", true, false, []string{"search"}, nil, nil) + h.handleStream(rec, req, resp, "cid-thinking-stream", "deepseek-v4-pro", "prompt", 0, true, false, []string{"search"}, nil, nil) frames, done := parseSSEDataFrames(t, rec.Body.String()) if !done { @@ -319,7 +319,7 @@ func TestHandleStreamPromotesHiddenThinkingDSMLToolCallsOnFinalize(t *testing.T) rec := httptest.NewRecorder() req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil) - h.handleStream(rec, req, resp, "cid-hidden-thinking-stream", "deepseek-v4-pro", "prompt", false, false, []string{"search"}, nil, nil) + h.handleStream(rec, req, resp, "cid-hidden-thinking-stream", "deepseek-v4-pro", "prompt", 0, false, false, []string{"search"}, nil, nil) frames, done := parseSSEDataFrames(t, rec.Body.String()) if !done { @@ -353,7 +353,7 @@ func TestHandleStreamEmitsDistinctToolCallIDsAcrossSeparateToolBlocks(t *testing rec := httptest.NewRecorder() req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil) - h.handleStream(rec, req, resp, "cid-multi", "deepseek-v4-flash", "prompt", false, false, []string{"read_file", "search"}, nil, nil) + h.handleStream(rec, req, resp, "cid-multi", "deepseek-v4-flash", "prompt", 0, false, false, []string{"read_file", "search"}, nil, nil) frames, done := parseSSEDataFrames(t, rec.Body.String()) if !done { @@ -419,7 +419,7 @@ func TestHandleStreamCoercesSchemaDeclaredStringArgumentsOnFinalize(t *testing.T }, } - h.handleStream(rec, req, resp, "cid-string-protect", "deepseek-v4-flash", "prompt", false, false, []string{"Write"}, toolsRaw, nil) + h.handleStream(rec, req, resp, "cid-string-protect", "deepseek-v4-flash", "prompt", 0, false, false, []string{"Write"}, toolsRaw, nil) frames, done := parseSSEDataFrames(t, rec.Body.String()) if !done { diff --git a/internal/httpapi/openai/chat/ref_file_tokens.go b/internal/httpapi/openai/chat/ref_file_tokens.go new file mode 100644 index 0000000..e5da36a --- /dev/null +++ b/internal/httpapi/openai/chat/ref_file_tokens.go @@ -0,0 +1,26 @@ +package chat + +// addRefFileTokensToUsage adds inline-uploaded file token estimates to an existing +// usage map inside a response object. This keeps the token accounting aware of file +// content that the upstream model processes but that is not part of the prompt text. 
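To make the adjustment concrete before the implementation, here is a standalone sketch of the intended behavior (all values hypothetical; the real helper follows in the diff):

    package main

    import "fmt"

    func main() {
        // A chat response body whose usage was built before file accounting.
        obj := map[string]any{
            "usage": map[string]any{
                "prompt_tokens":     120,
                "completion_tokens": 40,
                "total_tokens":      160,
            },
        }
        refFileTokens := 300 // conservative estimate for an attached file

        // The same additive rule the helper applies: bump the prompt/input
        // side and the total, and leave completion counts untouched.
        usage := obj["usage"].(map[string]any)
        for _, key := range []string{"input_tokens", "prompt_tokens", "total_tokens"} {
            if n, ok := usage[key].(int); ok {
                usage[key] = n + refFileTokens
            }
        }
        fmt.Println(usage) // map[completion_tokens:40 prompt_tokens:420 total_tokens:460]
    }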
+func addRefFileTokensToUsage(obj map[string]any, refFileTokens int) { + if refFileTokens <= 0 || obj == nil { + return + } + usage, ok := obj["usage"].(map[string]any) + if !ok || usage == nil { + return + } + for _, key := range []string{"input_tokens", "prompt_tokens"} { + if v, ok := usage[key]; ok { + if n, ok := v.(int); ok { + usage[key] = n + refFileTokens + } + } + } + if v, ok := usage["total_tokens"]; ok { + if n, ok := v.(int); ok { + usage["total_tokens"] = n + refFileTokens + } + } +} diff --git a/internal/httpapi/openai/files/file_inline_upload.go b/internal/httpapi/openai/files/file_inline_upload.go index c8d59a9..5c2c1f5 100644 --- a/internal/httpapi/openai/files/file_inline_upload.go +++ b/internal/httpapi/openai/files/file_inline_upload.go @@ -39,11 +39,12 @@ func (e *inlineFileUploadError) Error() string { } type inlineUploadState struct { - ctx context.Context - handler *Handler - auth *auth.RequestAuth - uploadedByID map[string]string - uploadCount int + ctx context.Context + handler *Handler + auth *auth.RequestAuth + uploadedByID map[string]string + uploadCount int + inlineFileBytes int } type inlineDecodedFile struct { @@ -75,6 +76,9 @@ func (h *Handler) PreprocessInlineFileInputs(ctx context.Context, a *auth.Reques if refIDs := promptcompat.CollectOpenAIRefFileIDs(req); len(refIDs) > 0 { req["ref_file_ids"] = stringsToAnySlice(refIDs) } + if state.inlineFileBytes > 0 { + req["_inline_file_bytes"] = state.inlineFileBytes + } return nil } @@ -142,6 +146,7 @@ func (s *inlineUploadState) tryUploadBlock(block map[string]any) (map[string]any return nil, true, &inlineFileUploadError{status: http.StatusInternalServerError, message: "Failed to upload inline file.", err: err} } s.uploadCount++ + s.inlineFileBytes += len(decoded.Data) replacement := map[string]any{ "type": decoded.ReplacementType, "file_id": fileID, diff --git a/internal/httpapi/openai/responses/empty_retry_runtime.go b/internal/httpapi/openai/responses/empty_retry_runtime.go index fe2145d..4d0aa43 100644 --- a/internal/httpapi/openai/responses/empty_retry_runtime.go +++ b/internal/httpapi/openai/responses/empty_retry_runtime.go @@ -27,7 +27,7 @@ type responsesNonStreamResult struct { responseMessageID int } -func (h *Handler) handleResponsesNonStreamWithRetry(w http.ResponseWriter, ctx context.Context, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, toolChoice promptcompat.ToolChoicePolicy, traceID string) { +func (h *Handler) handleResponsesNonStreamWithRetry(w http.ResponseWriter, ctx context.Context, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, owner, responseID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, toolChoice promptcompat.ToolChoicePolicy, traceID string) { attempts := 0 currentResp := resp usagePrompt := finalPrompt @@ -44,6 +44,9 @@ func (h *Handler) handleResponsesNonStreamWithRetry(w http.ResponseWriter, ctx c result.toolDetectionThinking = accumulatedToolDetectionThinking result.parsed = detectAssistantToolCalls(result.text, result.thinking, result.toolDetectionThinking, toolNames) result.body = openaifmt.BuildResponseObjectWithToolCalls(responseID, model, usagePrompt, result.thinking, result.text, result.parsed.Calls, toolsRaw) + if refFileTokens > 0 { + addRefFileTokensToUsage(result.body, refFileTokens) + } if !shouldRetryResponsesNonStream(result, 
attempts) { h.finishResponsesNonStreamResult(w, result, attempts, owner, responseID, toolChoice, traceID) @@ -123,8 +126,8 @@ func shouldRetryResponsesNonStream(result responsesNonStreamResult, attempts int strings.TrimSpace(result.text) == "" } -func (h *Handler) handleResponsesStreamWithRetry(w http.ResponseWriter, r *http.Request, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, toolChoice promptcompat.ToolChoicePolicy, traceID string) { - streamRuntime, initialType, ok := h.prepareResponsesStreamRuntime(w, resp, owner, responseID, model, finalPrompt, thinkingEnabled, searchEnabled, toolNames, toolsRaw, toolChoice, traceID) +func (h *Handler) handleResponsesStreamWithRetry(w http.ResponseWriter, r *http.Request, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, owner, responseID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, toolChoice promptcompat.ToolChoicePolicy, traceID string) { + streamRuntime, initialType, ok := h.prepareResponsesStreamRuntime(w, resp, owner, responseID, model, finalPrompt, refFileTokens, thinkingEnabled, searchEnabled, toolNames, toolsRaw, toolChoice, traceID) if !ok { return } @@ -165,7 +168,7 @@ func (h *Handler) handleResponsesStreamWithRetry(w http.ResponseWriter, r *http. } } -func (h *Handler) prepareResponsesStreamRuntime(w http.ResponseWriter, resp *http.Response, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, toolChoice promptcompat.ToolChoicePolicy, traceID string) (*responsesStreamRuntime, string, bool) { +func (h *Handler) prepareResponsesStreamRuntime(w http.ResponseWriter, resp *http.Response, owner, responseID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, toolChoice promptcompat.ToolChoicePolicy, traceID string) (*responsesStreamRuntime, string, bool) { if resp.StatusCode != http.StatusOK { defer func() { _ = resp.Body.Close() }() body, _ := io.ReadAll(resp.Body) @@ -190,6 +193,7 @@ func (h *Handler) prepareResponsesStreamRuntime(w http.ResponseWriter, resp *htt h.getResponseStore().put(owner, responseID, obj) }, ) + streamRuntime.refFileTokens = refFileTokens streamRuntime.sendCreated() return streamRuntime, initialType, true } diff --git a/internal/httpapi/openai/responses/ref_file_tokens.go b/internal/httpapi/openai/responses/ref_file_tokens.go new file mode 100644 index 0000000..a530340 --- /dev/null +++ b/internal/httpapi/openai/responses/ref_file_tokens.go @@ -0,0 +1,26 @@ +package responses + +// addRefFileTokensToUsage adds inline-uploaded file token estimates to an existing +// usage map inside a response object. This keeps the token accounting aware of file +// content that the upstream model processes but that is not part of the prompt text. 
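Since this duplicates the chat package's copy of the helper, a short test sketch (hypothetical, not part of this patch; assumes it lives in the responses package with the standard testing import) covering both the additive path and the guard paths:

    func TestAddRefFileTokensToUsageSketch(t *testing.T) {
        obj := map[string]any{
            "usage": map[string]any{"input_tokens": 10, "total_tokens": 15},
        }
        addRefFileTokensToUsage(obj, 5)
        usage := obj["usage"].(map[string]any)
        if usage["input_tokens"] != 15 || usage["total_tokens"] != 20 {
            t.Fatalf("expected additive adjustment, got %v", usage)
        }
        // Guard paths: non-positive estimates, a missing usage map, and a
        // nil object must all be no-ops.
        addRefFileTokensToUsage(obj, 0)
        addRefFileTokensToUsage(map[string]any{}, 5)
        addRefFileTokensToUsage(nil, 5)
        if usage["input_tokens"] != 15 {
            t.Fatalf("guard paths should not mutate usage, got %v", usage)
        }
    }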
+func addRefFileTokensToUsage(obj map[string]any, refFileTokens int) { + if refFileTokens <= 0 || obj == nil { + return + } + usage, ok := obj["usage"].(map[string]any) + if !ok || usage == nil { + return + } + for _, key := range []string{"input_tokens", "prompt_tokens"} { + if v, ok := usage[key]; ok { + if n, ok := v.(int); ok { + usage[key] = n + refFileTokens + } + } + } + if v, ok := usage["total_tokens"]; ok { + if n, ok := v.(int); ok { + usage["total_tokens"] = n + refFileTokens + } + } +} diff --git a/internal/httpapi/openai/responses/responses_handler.go b/internal/httpapi/openai/responses/responses_handler.go index 2d4ff70..76a3b1c 100644 --- a/internal/httpapi/openai/responses/responses_handler.go +++ b/internal/httpapi/openai/responses/responses_handler.go @@ -114,14 +114,15 @@ func (h *Handler) Responses(w http.ResponseWriter, r *http.Request) { } responseID := "resp_" + strings.ReplaceAll(uuid.NewString(), "-", "") + refFileTokens := stdReq.RefFileTokens if stdReq.Stream { - h.handleResponsesStreamWithRetry(w, r, a, resp, payload, pow, owner, responseID, stdReq.ResponseModel, stdReq.PromptTokenText, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolsRaw, stdReq.ToolChoice, traceID) + h.handleResponsesStreamWithRetry(w, r, a, resp, payload, pow, owner, responseID, stdReq.ResponseModel, stdReq.PromptTokenText, refFileTokens, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolsRaw, stdReq.ToolChoice, traceID) return } - h.handleResponsesNonStreamWithRetry(w, r.Context(), a, resp, payload, pow, owner, responseID, stdReq.ResponseModel, stdReq.PromptTokenText, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolsRaw, stdReq.ToolChoice, traceID) + h.handleResponsesNonStreamWithRetry(w, r.Context(), a, resp, payload, pow, owner, responseID, stdReq.ResponseModel, stdReq.PromptTokenText, refFileTokens, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolsRaw, stdReq.ToolChoice, traceID) } -func (h *Handler) handleResponsesNonStream(w http.ResponseWriter, resp *http.Response, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, toolChoice promptcompat.ToolChoicePolicy, traceID string) { +func (h *Handler) handleResponsesNonStream(w http.ResponseWriter, resp *http.Response, owner, responseID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, toolChoice promptcompat.ToolChoicePolicy, traceID string) { defer func() { _ = resp.Body.Close() }() if resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(resp.Body) @@ -149,11 +150,14 @@ func (h *Handler) handleResponsesNonStream(w http.ResponseWriter, resp *http.Res } responseObj := openaifmt.BuildResponseObjectWithToolCalls(responseID, model, finalPrompt, sanitizedThinking, sanitizedText, textParsed.Calls, toolsRaw) + if refFileTokens > 0 { + addRefFileTokensToUsage(responseObj, refFileTokens) + } h.getResponseStore().put(owner, responseID, responseObj) writeJSON(w, http.StatusOK, responseObj) } -func (h *Handler) handleResponsesStream(w http.ResponseWriter, r *http.Request, resp *http.Response, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, toolChoice promptcompat.ToolChoicePolicy, traceID string) { +func (h *Handler) handleResponsesStream(w http.ResponseWriter, r *http.Request, resp *http.Response, owner, responseID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, 
toolNames []string, toolsRaw any, toolChoice promptcompat.ToolChoicePolicy, traceID string) { defer func() { _ = resp.Body.Close() }() if resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(resp.Body) @@ -195,6 +199,7 @@ func (h *Handler) handleResponsesStream(w http.ResponseWriter, r *http.Request, h.getResponseStore().put(owner, responseID, obj) }, ) + streamRuntime.refFileTokens = refFileTokens streamRuntime.sendCreated() streamengine.ConsumeSSE(streamengine.ConsumeConfig{ diff --git a/internal/httpapi/openai/responses/responses_stream_runtime_core.go b/internal/httpapi/openai/responses/responses_stream_runtime_core.go index 077b723..a6e238e 100644 --- a/internal/httpapi/openai/responses/responses_stream_runtime_core.go +++ b/internal/httpapi/openai/responses/responses_stream_runtime_core.go @@ -18,13 +18,14 @@ type responsesStreamRuntime struct { rc *http.ResponseController canFlush bool - responseID string - model string - finalPrompt string - toolNames []string - toolsRaw any - traceID string - toolChoice promptcompat.ToolChoicePolicy + responseID string + model string + finalPrompt string + refFileTokens int + toolNames []string + toolsRaw any + traceID string + toolChoice promptcompat.ToolChoicePolicy thinkingEnabled bool searchEnabled bool diff --git a/internal/httpapi/openai/responses/responses_stream_runtime_toolcalls_finalize.go b/internal/httpapi/openai/responses/responses_stream_runtime_toolcalls_finalize.go index 34e35e5..2f03dd3 100644 --- a/internal/httpapi/openai/responses/responses_stream_runtime_toolcalls_finalize.go +++ b/internal/httpapi/openai/responses/responses_stream_runtime_toolcalls_finalize.go @@ -145,7 +145,7 @@ func (s *responsesStreamRuntime) buildCompletedResponseObject(finalThinking, fin } } - return openaifmt.BuildResponseObjectFromItems( + obj := openaifmt.BuildResponseObjectFromItems( s.responseID, s.model, s.finalPrompt, @@ -154,4 +154,8 @@ func (s *responsesStreamRuntime) buildCompletedResponseObject(finalThinking, fin output, outputText, ) + if s.refFileTokens > 0 { + addRefFileTokensToUsage(obj, s.refFileTokens) + } + return obj } diff --git a/internal/httpapi/openai/responses/responses_stream_test.go b/internal/httpapi/openai/responses/responses_stream_test.go index ef0b202..b695ad6 100644 --- a/internal/httpapi/openai/responses/responses_stream_test.go +++ b/internal/httpapi/openai/responses/responses_stream_test.go @@ -27,7 +27,7 @@ func TestHandleResponsesStreamDoesNotEmitReasoningTextCompatEvents(t *testing.T) Body: io.NopCloser(strings.NewReader(streamBody)), } - h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-v4-pro", "prompt", true, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "") + h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-v4-pro", "prompt", 0, true, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "") body := rec.Body.String() if !strings.Contains(body, "event: response.reasoning.delta") { @@ -57,7 +57,7 @@ func TestHandleResponsesStreamEmitsOutputTextDoneBeforeContentPartDone(t *testin Body: io.NopCloser(strings.NewReader(streamBody)), } - h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", false, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "") + h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", 0, false, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "") body := rec.Body.String() if !strings.Contains(body, "event: response.output_text.done") { 
t.Fatalf("expected response.output_text.done payload, body=%s", body) @@ -91,7 +91,7 @@ func TestHandleResponsesStreamOutputTextDeltaCarriesItemIndexes(t *testing.T) { Body: io.NopCloser(strings.NewReader(streamBody)), } - h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", false, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "") + h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", 0, false, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "") body := rec.Body.String() deltaPayload, ok := extractSSEEventPayload(body, "response.output_text.delta") @@ -130,7 +130,7 @@ func TestHandleResponsesStreamEmitsDistinctToolCallIDsAcrossSeparateToolBlocks(t Body: io.NopCloser(strings.NewReader(streamBody)), } - h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", false, false, []string{"read_file", "search"}, nil, promptcompat.DefaultToolChoicePolicy(), "") + h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", 0, false, false, []string{"read_file", "search"}, nil, promptcompat.DefaultToolChoicePolicy(), "") body := rec.Body.String() doneEvents := extractSSEEventPayloads(body, "response.function_call_arguments.done") @@ -183,7 +183,7 @@ func TestHandleResponsesStreamRequiredToolChoiceFailure(t *testing.T) { Mode: promptcompat.ToolChoiceRequired, Allowed: map[string]struct{}{"read_file": {}}, } - h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", false, false, []string{"read_file"}, nil, policy, "") + h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", 0, false, false, []string{"read_file"}, nil, policy, "") body := rec.Body.String() if !strings.Contains(body, "event: response.failed") { @@ -213,7 +213,7 @@ func TestHandleResponsesStreamFailsWhenUpstreamHasOnlyThinking(t *testing.T) { Body: io.NopCloser(strings.NewReader(streamBody)), } - h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-v4-pro", "prompt", true, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "") + h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-v4-pro", "prompt", 0, true, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "") body := rec.Body.String() if !strings.Contains(body, "event: response.failed") { @@ -251,7 +251,7 @@ func TestHandleResponsesStreamPromotesThinkingToolCallsOnFinalizeWithoutMidstrea Body: io.NopCloser(strings.NewReader(streamBody)), } - h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-v4-pro", "prompt", true, false, []string{"read_file"}, nil, promptcompat.DefaultToolChoicePolicy(), "") + h.handleResponsesStream(rec, req, resp, "owner-a", "resp_test", "deepseek-v4-pro", "prompt", 0, true, false, []string{"read_file"}, nil, promptcompat.DefaultToolChoicePolicy(), "") body := rec.Body.String() if !strings.Contains(body, "event: response.reasoning.delta") { @@ -288,7 +288,7 @@ func TestHandleResponsesStreamPromotesHiddenThinkingDSMLToolCallsOnFinalize(t *t Mode: promptcompat.ToolChoiceRequired, Allowed: map[string]struct{}{"read_file": {}}, } - h.handleResponsesStream(rec, req, resp, "owner-a", "resp_hidden", "deepseek-v4-pro", "prompt", false, false, []string{"read_file"}, nil, policy, "") + h.handleResponsesStream(rec, req, resp, "owner-a", "resp_hidden", "deepseek-v4-pro", "prompt", 0, false, false, []string{"read_file"}, nil, policy, "") body := 
rec.Body.String() if strings.Contains(body, "event: response.reasoning.delta") { @@ -317,7 +317,7 @@ func TestHandleResponsesNonStreamRequiredToolChoiceViolation(t *testing.T) { Allowed: map[string]struct{}{"read_file": {}}, } - h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", false, false, []string{"read_file"}, nil, policy, "") + h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", 0, false, false, []string{"read_file"}, nil, policy, "") if rec.Code != http.StatusUnprocessableEntity { t.Fatalf("expected 422 for required tool_choice violation, got %d body=%s", rec.Code, rec.Body.String()) } @@ -344,7 +344,7 @@ func TestHandleResponsesNonStreamRequiredToolChoiceIgnoresThinkingToolPayloadWhe Allowed: map[string]struct{}{"read_file": {}}, } - h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", true, false, []string{"read_file"}, nil, policy, "") + h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", 0, true, false, []string{"read_file"}, nil, policy, "") if rec.Code != http.StatusUnprocessableEntity { t.Fatalf("expected 422 for required tool_choice violation, got %d body=%s", rec.Code, rec.Body.String()) } @@ -366,7 +366,7 @@ func TestHandleResponsesNonStreamReturns429WhenUpstreamOutputEmpty(t *testing.T) )), } - h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", false, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "") + h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", 0, false, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "") if rec.Code != http.StatusTooManyRequests { t.Fatalf("expected 429 for empty upstream output, got %d body=%s", rec.Code, rec.Body.String()) } @@ -388,7 +388,7 @@ func TestHandleResponsesNonStreamReturnsContentFilterErrorWhenUpstreamFilteredWi )), } - h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", false, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "") + h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", 0, false, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "") if rec.Code != http.StatusBadRequest { t.Fatalf("expected 400 for filtered empty upstream output, got %d body=%s", rec.Code, rec.Body.String()) } @@ -410,7 +410,7 @@ func TestHandleResponsesNonStreamReturns429WhenUpstreamHasOnlyThinking(t *testin )), } - h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-v4-pro", "prompt", true, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "") + h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-v4-pro", "prompt", 0, true, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "") if rec.Code != http.StatusTooManyRequests { t.Fatalf("expected 429 for thinking-only upstream output, got %d body=%s", rec.Code, rec.Body.String()) } @@ -432,7 +432,7 @@ func TestHandleResponsesNonStreamPromotesThinkingToolCallsWhenTextEmpty(t *testi )), } - h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-v4-pro", "prompt", true, false, []string{"read_file"}, nil, promptcompat.DefaultToolChoicePolicy(), "") + h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-v4-pro", "prompt", 0, true, false, []string{"read_file"}, nil, promptcompat.DefaultToolChoicePolicy(), "") if rec.Code != http.StatusOK { t.Fatalf("expected 200 for 
thinking tool calls, got %d body=%s", rec.Code, rec.Body.String()) } @@ -462,7 +462,7 @@ func TestHandleResponsesNonStreamPromotesHiddenThinkingDSMLToolCallsWhenTextEmpt Mode: promptcompat.ToolChoiceRequired, Allowed: map[string]struct{}{"read_file": {}}, } - h.handleResponsesNonStream(rec, resp, "owner-a", "resp_hidden", "deepseek-v4-pro", "prompt", false, false, []string{"read_file"}, nil, policy, "") + h.handleResponsesNonStream(rec, resp, "owner-a", "resp_hidden", "deepseek-v4-pro", "prompt", 0, false, false, []string{"read_file"}, nil, policy, "") if rec.Code != http.StatusOK { t.Fatalf("expected 200 for hidden thinking tool calls, got %d body=%s", rec.Code, rec.Body.String()) } @@ -509,7 +509,7 @@ func TestHandleResponsesStreamCoercesSchemaDeclaredStringArguments(t *testing.T) Body: io.NopCloser(strings.NewReader(streamBody)), } - h.handleResponsesStream(rec, req, resp, "owner-a", "resp_string_protect", "deepseek-v4-flash", "prompt", false, false, []string{"Write"}, toolsRaw, promptcompat.DefaultToolChoicePolicy(), "") + h.handleResponsesStream(rec, req, resp, "owner-a", "resp_string_protect", "deepseek-v4-flash", "prompt", 0, false, false, []string{"Write"}, toolsRaw, promptcompat.DefaultToolChoicePolicy(), "") payload, ok := extractSSEEventPayload(rec.Body.String(), "response.function_call_arguments.done") if !ok { diff --git a/internal/promptcompat/request_normalize.go b/internal/promptcompat/request_normalize.go index fbb9d4c..833a54e 100644 --- a/internal/promptcompat/request_normalize.go +++ b/internal/promptcompat/request_normalize.go @@ -53,6 +53,7 @@ func NormalizeOpenAIChatRequest(store ConfigReader, req map[string]any, traceID Thinking: thinkingEnabled, Search: searchEnabled, RefFileIDs: refFileIDs, + RefFileTokens: estimateInlineFileTokens(req), PassThrough: passThrough, }, nil } @@ -114,6 +115,7 @@ func NormalizeOpenAIResponsesRequest(store ConfigReader, req map[string]any, tra Thinking: thinkingEnabled, Search: searchEnabled, RefFileIDs: refFileIDs, + RefFileTokens: estimateInlineFileTokens(req), PassThrough: passThrough, }, nil } @@ -358,3 +360,30 @@ func namesToSet(names []string) map[string]struct{} { } return out } + +// estimateInlineFileTokens extracts the byte count stashed by PreprocessInlineFileInputs +// and converts it to a conservative token estimate. Inline files are typically images or +// documents that the upstream model will process; we use bytes/3 (rather than bytes/4) +// as a slightly pessimistic approximation so the returned context token count stays +// safely above the real value. +func estimateInlineFileTokens(req map[string]any) int { + raw, ok := req["_inline_file_bytes"] + if !ok { + return 0 + } + var bytes int + switch v := raw.(type) { + case int: + bytes = v + case int64: + bytes = int(v) + case float64: + bytes = int(v) + default: + return 0 + } + if bytes <= 0 { + return 0 + } + return bytes / 3 +} diff --git a/internal/promptcompat/standard_request.go b/internal/promptcompat/standard_request.go index 1f4c48f..76b812d 100644 --- a/internal/promptcompat/standard_request.go +++ b/internal/promptcompat/standard_request.go @@ -19,6 +19,7 @@ type StandardRequest struct { Thinking bool Search bool RefFileIDs []string + RefFileTokens int PassThrough map[string]any }
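Taken together, the flow is: PreprocessInlineFileInputs stashes the decoded byte count under _inline_file_bytes, NormalizeOpenAIChatRequest/NormalizeOpenAIResponsesRequest convert it with the bytes/3 rule into StandardRequest.RefFileTokens, and the usage builders add the result to the prompt side of every returned usage object. A standalone sketch of that arithmetic (file size hypothetical):

    package main

    import "fmt"

    func main() {
        // What PreprocessInlineFileInputs stashes after uploading a ~9 KiB file.
        req := map[string]any{"_inline_file_bytes": 9216}

        // Mirrors estimateInlineFileTokens: bytes/3 is deliberately more
        // pessimistic than the common ~4-bytes-per-token heuristic, so the
        // reported context cost stays at or above the real one.
        tokens := 0
        if b, ok := req["_inline_file_bytes"].(int); ok && b > 0 {
            tokens = b / 3
        }
        fmt.Println(tokens) // 3072 extra prompt tokens attributed to the file
    }

The float64 case in estimateInlineFileTokens exists for the same reason this sketch type-asserts int: a request map mutated in-process keeps the original int, but once it has round-tripped through encoding/json the number arrives as float64.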