feat: add empty-output retry and Vercel auto-continue support

- Auto-retry Chat/Responses requests (stream and non-stream) once when upstream output is empty but not content-filtered, reusing the session/token/PoW and appending a regeneration suffix to the prompt
- Wire the DeepSeek continue API into Vercel streams so multi-round thinking output can resume when a round's output is exhausted
- Defer empty-output errors in stream finalizers to enable synthetic retry; only surface failure when the retry budget is exhausted
- Track content_filter stops to avoid retry on filtered outputs
- Add comprehensive tests for stream/non-stream retry, Responses retry, and content_filter no-retry
- Update prompt-compatibility.md documentation
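
The retry flow shared by both surfaces reduces to the following shape (a condensed, self-contained sketch: callUpstream and attemptOutcome are stand-ins for the real DeepSeek client and stream runtimes in the diff below):

package retrysketch

import "strings"

const retrySuffix = "Previous reply had no visible output. Please regenerate the visible final answer or tool call now."

// attemptOutcome stands in for one upstream round: the visible text it
// produced and whether the stop was a content filter (never retried).
type attemptOutcome struct {
	text          string
	contentFilter bool
}

// runWithEmptyOutputRetry mirrors the loop added in this commit: retry on the
// same session while visible output is empty and not filtered, appending the
// regeneration suffix to the prompt for each extra round.
func runWithEmptyOutputRetry(prompt string, callUpstream func(string) attemptOutcome, maxRetries int) attemptOutcome {
	out := callUpstream(prompt)
	for attempts := 0; attempts < maxRetries; attempts++ {
		if out.contentFilter || strings.TrimSpace(out.text) != "" {
			break // terminal: filtered stops and non-empty output never retry
		}
		prompt = strings.TrimRight(prompt, "\r\n\t ") + "\n\n" + retrySuffix
		out = callUpstream(prompt)
	}
	return out
}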

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
CJACK
2026-04-27 17:56:33 +08:00
parent 2d5d211a7a
commit 0378d8c0a9
13 changed files with 1220 additions and 138 deletions

View File

@@ -128,7 +128,10 @@ func (s *chatStreamRuntime) resetStreamToolCallState() {
s.streamToolNames = map[int]string{}
}
-func (s *chatStreamRuntime) finalize(finishReason string) {
+func (s *chatStreamRuntime) finalize(finishReason string, deferEmptyOutput bool) bool {
+s.finalErrorStatus = 0
+s.finalErrorMessage = ""
+s.finalErrorCode = ""
finalThinking := s.thinking.String()
finalToolDetectionThinking := s.toolDetectionThinking.String()
finalText := cleanVisibleOutput(s.text.String(), s.stripReferenceMarkers)
@@ -204,8 +207,14 @@ func (s *chatStreamRuntime) finalize(finishReason string) {
}
if len(detected.Calls) == 0 && !s.toolCallsEmitted && strings.TrimSpace(finalText) == "" {
status, message, code := upstreamEmptyOutputDetail(finishReason == "content_filter", finalText, finalThinking)
+if deferEmptyOutput {
+s.finalErrorStatus = status
+s.finalErrorMessage = message
+s.finalErrorCode = code
+return false
+}
s.sendFailedChunk(status, message, code)
-return
+return true
}
usage := openaifmt.BuildChatUsage(s.finalPrompt, finalThinking, finalText)
s.finalFinishReason = finishReason
@@ -218,6 +227,7 @@ func (s *chatStreamRuntime) finalize(finishReason string) {
usage,
))
s.sendDone()
return true
}
func (s *chatStreamRuntime) onParsed(parsed sse.LineResult) streamengine.ParsedDecision {
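
For reference, upstreamEmptyOutputDetail is defined elsewhere in the package. A hypothetical reconstruction of its contract, inferred only from the call sites in this diff (the 400 for filtered output matches the content_filter test added below; the 502, the messages, and the code strings are assumptions):

package chat // hypothetical sketch, not the package's actual implementation

import (
	"net/http"
	"strings"
)

// upstreamEmptyOutputDetailSketch classifies an empty visible output into an
// HTTP status, a user-facing message, and an error code (assumed contract).
func upstreamEmptyOutputDetailSketch(contentFilter bool, text, thinking string) (int, string, string) {
	if contentFilter {
		// TestChatCompletionsContentFilterDoesNotRetry expects a 400 on this path.
		return http.StatusBadRequest, "Upstream stopped the reply via content filter.", "content_filter"
	}
	if strings.TrimSpace(thinking) != "" {
		// Assumed: thinking-only rounds get a distinct diagnostic message.
		return http.StatusBadGateway, "Upstream produced thinking but no visible output.", "upstream_empty_output"
	}
	return http.StatusBadGateway, "Upstream produced no visible output.", "upstream_empty_output"
}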

View File

@@ -0,0 +1,271 @@
package chat
import (
"context"
"io"
"net/http"
"strings"
"time"
"ds2api/internal/auth"
"ds2api/internal/config"
dsprotocol "ds2api/internal/deepseek/protocol"
openaifmt "ds2api/internal/format/openai"
"ds2api/internal/sse"
streamengine "ds2api/internal/stream"
)
type chatNonStreamResult struct {
thinking string
toolDetectionThinking string
text string
contentFilter bool
detectedCalls int
body map[string]any
finishReason string
}
func (h *Handler) handleNonStreamWithRetry(w http.ResponseWriter, ctx context.Context, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, completionID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, historySession *chatHistorySession) {
attempts := 0
currentResp := resp
usagePrompt := finalPrompt
accumulatedThinking := ""
accumulatedToolDetectionThinking := ""
for {
result, ok := h.collectChatNonStreamAttempt(w, currentResp, completionID, model, usagePrompt, thinkingEnabled, searchEnabled, toolNames)
if !ok {
return
}
accumulatedThinking += sse.TrimContinuationOverlap(accumulatedThinking, result.thinking)
accumulatedToolDetectionThinking += sse.TrimContinuationOverlap(accumulatedToolDetectionThinking, result.toolDetectionThinking)
result.thinking = accumulatedThinking
result.toolDetectionThinking = accumulatedToolDetectionThinking
detected := detectAssistantToolCalls(result.text, result.thinking, result.toolDetectionThinking, toolNames)
result.detectedCalls = len(detected.Calls)
result.body = openaifmt.BuildChatCompletionWithToolCalls(completionID, model, usagePrompt, result.thinking, result.text, detected.Calls)
result.finishReason = chatFinishReason(result.body)
if !shouldRetryChatNonStream(result, attempts) {
h.finishChatNonStreamResult(w, result, attempts, usagePrompt, historySession)
return
}
attempts++
config.Logger.Info("[openai_empty_retry] attempting synthetic retry", "surface", "chat.completions", "stream", false, "retry_attempt", attempts)
retryPayload := clonePayloadWithEmptyOutputRetryPrompt(payload)
nextResp, err := h.DS.CallCompletion(ctx, a, retryPayload, pow, 3)
if err != nil {
if historySession != nil {
historySession.error(http.StatusInternalServerError, "Failed to get completion.", "error", result.thinking, result.text)
}
writeOpenAIError(w, http.StatusInternalServerError, "Failed to get completion.")
config.Logger.Warn("[openai_empty_retry] retry request failed", "surface", "chat.completions", "stream", false, "retry_attempt", attempts, "error", err)
return
}
usagePrompt = usagePromptWithEmptyOutputRetry(finalPrompt, attempts)
currentResp = nextResp
}
}
func (h *Handler) collectChatNonStreamAttempt(w http.ResponseWriter, resp *http.Response, completionID, model, usagePrompt string, thinkingEnabled, searchEnabled bool, toolNames []string) (chatNonStreamResult, bool) {
if resp.StatusCode != http.StatusOK {
defer func() { _ = resp.Body.Close() }()
body, _ := io.ReadAll(resp.Body)
writeOpenAIError(w, resp.StatusCode, string(body))
return chatNonStreamResult{}, false
}
result := sse.CollectStream(resp, thinkingEnabled, true)
stripReferenceMarkers := h.compatStripReferenceMarkers()
finalThinking := cleanVisibleOutput(result.Thinking, stripReferenceMarkers)
finalToolDetectionThinking := cleanVisibleOutput(result.ToolDetectionThinking, stripReferenceMarkers)
finalText := cleanVisibleOutput(result.Text, stripReferenceMarkers)
if searchEnabled {
finalText = replaceCitationMarkersWithLinks(finalText, result.CitationLinks)
}
detected := detectAssistantToolCalls(finalText, finalThinking, finalToolDetectionThinking, toolNames)
respBody := openaifmt.BuildChatCompletionWithToolCalls(completionID, model, usagePrompt, finalThinking, finalText, detected.Calls)
return chatNonStreamResult{
thinking: finalThinking,
toolDetectionThinking: finalToolDetectionThinking,
text: finalText,
contentFilter: result.ContentFilter,
detectedCalls: len(detected.Calls),
body: respBody,
finishReason: chatFinishReason(respBody),
}, true
}
func (h *Handler) finishChatNonStreamResult(w http.ResponseWriter, result chatNonStreamResult, attempts int, usagePrompt string, historySession *chatHistorySession) {
if result.detectedCalls == 0 && shouldWriteUpstreamEmptyOutputError(result.text) {
status, message, code := upstreamEmptyOutputDetail(result.contentFilter, result.text, result.thinking)
if historySession != nil {
historySession.error(status, message, code, result.thinking, result.text)
}
writeUpstreamEmptyOutputError(w, result.text, result.thinking, result.contentFilter)
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "chat.completions", "stream", false, "retry_attempts", attempts, "success_source", "none", "content_filter", result.contentFilter)
return
}
if historySession != nil {
historySession.success(http.StatusOK, result.thinking, result.text, result.finishReason, openaifmt.BuildChatUsage(usagePrompt, result.thinking, result.text))
}
writeJSON(w, http.StatusOK, result.body)
source := "first_attempt"
if attempts > 0 {
source = "synthetic_retry"
}
config.Logger.Info("[openai_empty_retry] completed", "surface", "chat.completions", "stream", false, "retry_attempts", attempts, "success_source", source)
}
func chatFinishReason(respBody map[string]any) string {
if choices, ok := respBody["choices"].([]map[string]any); ok && len(choices) > 0 {
if fr, _ := choices[0]["finish_reason"].(string); strings.TrimSpace(fr) != "" {
return fr
}
}
return "stop"
}
func shouldRetryChatNonStream(result chatNonStreamResult, attempts int) bool {
return emptyOutputRetryEnabled() &&
attempts < emptyOutputRetryMaxAttempts() &&
!result.contentFilter &&
result.detectedCalls == 0 &&
strings.TrimSpace(result.text) == ""
}
func (h *Handler) handleStreamWithRetry(w http.ResponseWriter, r *http.Request, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, completionID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, historySession *chatHistorySession) {
streamRuntime, initialType, ok := h.prepareChatStreamRuntime(w, resp, completionID, model, finalPrompt, thinkingEnabled, searchEnabled, toolNames, historySession)
if !ok {
return
}
attempts := 0
currentResp := resp
for {
terminalWritten, retryable := h.consumeChatStreamAttempt(r, currentResp, streamRuntime, initialType, thinkingEnabled, historySession, attempts < emptyOutputRetryMaxAttempts())
if terminalWritten {
logChatStreamTerminal(streamRuntime, attempts)
return
}
if !retryable || !emptyOutputRetryEnabled() || attempts >= emptyOutputRetryMaxAttempts() {
streamRuntime.finalize("stop", false)
recordChatStreamHistory(streamRuntime, historySession)
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "chat.completions", "stream", true, "retry_attempts", attempts, "success_source", "none")
return
}
attempts++
config.Logger.Info("[openai_empty_retry] attempting synthetic retry", "surface", "chat.completions", "stream", true, "retry_attempt", attempts)
nextResp, err := h.DS.CallCompletion(r.Context(), a, clonePayloadWithEmptyOutputRetryPrompt(payload), pow, 3)
if err != nil {
failChatStreamRetry(streamRuntime, historySession, http.StatusInternalServerError, "Failed to get completion.", "error")
config.Logger.Warn("[openai_empty_retry] retry request failed", "surface", "chat.completions", "stream", true, "retry_attempt", attempts, "error", err)
return
}
if nextResp.StatusCode != http.StatusOK {
defer func() { _ = nextResp.Body.Close() }()
body, _ := io.ReadAll(nextResp.Body)
failChatStreamRetry(streamRuntime, historySession, nextResp.StatusCode, string(body), "error")
return
}
streamRuntime.finalPrompt = usagePromptWithEmptyOutputRetry(finalPrompt, attempts)
currentResp = nextResp
}
}
func (h *Handler) prepareChatStreamRuntime(w http.ResponseWriter, resp *http.Response, completionID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, historySession *chatHistorySession) (*chatStreamRuntime, string, bool) {
if resp.StatusCode != http.StatusOK {
defer func() { _ = resp.Body.Close() }()
body, _ := io.ReadAll(resp.Body)
if historySession != nil {
historySession.error(resp.StatusCode, string(body), "error", "", "")
}
writeOpenAIError(w, resp.StatusCode, string(body))
return nil, "", false
}
w.Header().Set("Content-Type", "text/event-stream")
w.Header().Set("Cache-Control", "no-cache, no-transform")
w.Header().Set("Connection", "keep-alive")
w.Header().Set("X-Accel-Buffering", "no")
rc := http.NewResponseController(w)
_, canFlush := w.(http.Flusher)
if !canFlush {
config.Logger.Warn("[stream] response writer does not support flush; streaming may be buffered")
}
initialType := "text"
if thinkingEnabled {
initialType = "thinking"
}
streamRuntime := newChatStreamRuntime(
w, rc, canFlush, completionID, time.Now().Unix(), model, finalPrompt,
thinkingEnabled, searchEnabled, h.compatStripReferenceMarkers(), toolNames,
len(toolNames) > 0, h.toolcallFeatureMatchEnabled() && h.toolcallEarlyEmitHighConfidence(),
)
return streamRuntime, initialType, true
}
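// consumeChatStreamAttempt drains one upstream SSE round and reports
// (terminalWritten, retryable): either the terminal frames were written to
// the client, or the empty-output error was deferred so the caller may retry.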
func (h *Handler) consumeChatStreamAttempt(r *http.Request, resp *http.Response, streamRuntime *chatStreamRuntime, initialType string, thinkingEnabled bool, historySession *chatHistorySession, allowDeferEmpty bool) (bool, bool) {
defer func() { _ = resp.Body.Close() }()
finalReason := "stop"
streamengine.ConsumeSSE(streamengine.ConsumeConfig{
Context: r.Context(),
Body: resp.Body,
ThinkingEnabled: thinkingEnabled,
InitialType: initialType,
KeepAliveInterval: time.Duration(dsprotocol.KeepAliveTimeout) * time.Second,
IdleTimeout: time.Duration(dsprotocol.StreamIdleTimeout) * time.Second,
MaxKeepAliveNoInput: dsprotocol.MaxKeepaliveCount,
}, streamengine.ConsumeHooks{
OnKeepAlive: streamRuntime.sendKeepAlive,
OnParsed: func(parsed sse.LineResult) streamengine.ParsedDecision {
decision := streamRuntime.onParsed(parsed)
if historySession != nil {
historySession.progress(streamRuntime.thinking.String(), streamRuntime.text.String())
}
return decision
},
OnFinalize: func(reason streamengine.StopReason, _ error) {
if string(reason) == "content_filter" {
finalReason = "content_filter"
}
},
OnContextDone: func() {
if historySession != nil {
historySession.stopped(streamRuntime.thinking.String(), streamRuntime.text.String(), string(streamengine.StopReasonContextCancelled))
}
},
})
terminalWritten := streamRuntime.finalize(finalReason, allowDeferEmpty && finalReason != "content_filter")
if terminalWritten {
recordChatStreamHistory(streamRuntime, historySession)
return true, false
}
return false, true
}
func recordChatStreamHistory(streamRuntime *chatStreamRuntime, historySession *chatHistorySession) {
if historySession == nil {
return
}
if streamRuntime.finalErrorMessage != "" {
historySession.error(streamRuntime.finalErrorStatus, streamRuntime.finalErrorMessage, streamRuntime.finalErrorCode, streamRuntime.thinking.String(), streamRuntime.text.String())
return
}
historySession.success(http.StatusOK, streamRuntime.finalThinking, streamRuntime.finalText, streamRuntime.finalFinishReason, streamRuntime.finalUsage)
}
func failChatStreamRetry(streamRuntime *chatStreamRuntime, historySession *chatHistorySession, status int, message, code string) {
streamRuntime.sendFailedChunk(status, message, code)
if historySession != nil {
historySession.error(status, message, code, streamRuntime.thinking.String(), streamRuntime.text.String())
}
}
func logChatStreamTerminal(streamRuntime *chatStreamRuntime, attempts int) {
source := "first_attempt"
if attempts > 0 {
source = "synthetic_retry"
}
if streamRuntime.finalErrorMessage != "" {
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "chat.completions", "stream", true, "retry_attempts", attempts, "success_source", "none", "error_code", streamRuntime.finalErrorCode)
return
}
config.Logger.Info("[openai_empty_retry] completed", "surface", "chat.completions", "stream", true, "retry_attempts", attempts, "success_source", source)
}
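
Both retry paths accumulate thinking across attempts via sse.TrimContinuationOverlap, whose implementation is outside this diff. A minimal sketch of the assumed contract (hypothetical, not the package's code): drop from the new chunk the longest prefix that the accumulated text already ends with, so concatenation never repeats text across rounds.

package sse // hypothetical sketch only

import "strings"

// TrimContinuationOverlap returns the portion of next that does not already
// trail accumulated, so accumulated + result avoids duplicated continuation
// text (assumed behavior).
func TrimContinuationOverlap(accumulated, next string) string {
	limit := len(next)
	if len(accumulated) < limit {
		limit = len(accumulated)
	}
	for n := limit; n > 0; n-- {
		if strings.HasSuffix(accumulated, next[:n]) {
			return next[n:]
		}
	}
	return next
}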

View File

@@ -123,6 +123,22 @@ func writeUpstreamEmptyOutputError(w http.ResponseWriter, text, thinking string,
return shared.WriteUpstreamEmptyOutputError(w, text, thinking, contentFilter)
}
func emptyOutputRetryEnabled() bool {
return shared.EmptyOutputRetryEnabled()
}
func emptyOutputRetryMaxAttempts() int {
return shared.EmptyOutputRetryMaxAttempts()
}
func clonePayloadWithEmptyOutputRetryPrompt(payload map[string]any) map[string]any {
return shared.ClonePayloadWithEmptyOutputRetryPrompt(payload)
}
func usagePromptWithEmptyOutputRetry(originalPrompt string, retryAttempts int) string {
return shared.UsagePromptWithEmptyOutputRetry(originalPrompt, retryAttempts)
}
func formatIncrementalStreamToolCallDeltas(deltas []toolstream.ToolCallDelta, ids map[int]string) []map[string]any {
return shared.FormatIncrementalStreamToolCallDeltas(deltas, ids)
}

View File

@@ -105,10 +105,10 @@ func (h *Handler) ChatCompletions(w http.ResponseWriter, r *http.Request) {
return
}
if stdReq.Stream {
-h.handleStream(w, r, resp, sessionID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, historySession)
+h.handleStreamWithRetry(w, r, a, resp, payload, pow, sessionID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, historySession)
return
}
-h.handleNonStream(w, resp, sessionID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, historySession)
+h.handleNonStreamWithRetry(w, r.Context(), a, resp, payload, pow, sessionID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, historySession)
}
func (h *Handler) autoDeleteRemoteSession(ctx context.Context, a *auth.RequestAuth, sessionID string) {
@@ -251,9 +251,9 @@ func (h *Handler) handleStream(w http.ResponseWriter, r *http.Request, resp *htt
},
OnFinalize: func(reason streamengine.StopReason, _ error) {
if string(reason) == "content_filter" {
-streamRuntime.finalize("content_filter")
+streamRuntime.finalize("content_filter", false)
} else {
-streamRuntime.finalize("stop")
+streamRuntime.finalize("stop", false)
}
if historySession == nil {
return

View File

@@ -0,0 +1,221 @@
package responses
import (
"context"
"io"
"net/http"
"strings"
"time"
"ds2api/internal/auth"
"ds2api/internal/config"
dsprotocol "ds2api/internal/deepseek/protocol"
openaifmt "ds2api/internal/format/openai"
"ds2api/internal/promptcompat"
"ds2api/internal/sse"
streamengine "ds2api/internal/stream"
"ds2api/internal/toolcall"
)
type responsesNonStreamResult struct {
thinking string
toolDetectionThinking string
text string
contentFilter bool
parsed toolcall.ToolCallParseResult
body map[string]any
}
func (h *Handler) handleResponsesNonStreamWithRetry(w http.ResponseWriter, ctx context.Context, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolChoice promptcompat.ToolChoicePolicy, traceID string) {
attempts := 0
currentResp := resp
usagePrompt := finalPrompt
accumulatedThinking := ""
accumulatedToolDetectionThinking := ""
for {
result, ok := h.collectResponsesNonStreamAttempt(w, currentResp, responseID, model, usagePrompt, thinkingEnabled, searchEnabled, toolNames)
if !ok {
return
}
accumulatedThinking += sse.TrimContinuationOverlap(accumulatedThinking, result.thinking)
accumulatedToolDetectionThinking += sse.TrimContinuationOverlap(accumulatedToolDetectionThinking, result.toolDetectionThinking)
result.thinking = accumulatedThinking
result.toolDetectionThinking = accumulatedToolDetectionThinking
result.parsed = detectAssistantToolCalls(result.text, result.thinking, result.toolDetectionThinking, toolNames)
result.body = openaifmt.BuildResponseObjectWithToolCalls(responseID, model, usagePrompt, result.thinking, result.text, result.parsed.Calls)
if !shouldRetryResponsesNonStream(result, attempts) {
h.finishResponsesNonStreamResult(w, result, attempts, owner, responseID, toolChoice, traceID)
return
}
attempts++
config.Logger.Info("[openai_empty_retry] attempting synthetic retry", "surface", "responses", "stream", false, "retry_attempt", attempts)
nextResp, err := h.DS.CallCompletion(ctx, a, clonePayloadWithEmptyOutputRetryPrompt(payload), pow, 3)
if err != nil {
writeOpenAIError(w, http.StatusInternalServerError, "Failed to get completion.")
config.Logger.Warn("[openai_empty_retry] retry request failed", "surface", "responses", "stream", false, "retry_attempt", attempts, "error", err)
return
}
usagePrompt = usagePromptWithEmptyOutputRetry(finalPrompt, attempts)
currentResp = nextResp
}
}
func (h *Handler) collectResponsesNonStreamAttempt(w http.ResponseWriter, resp *http.Response, responseID, model, usagePrompt string, thinkingEnabled, searchEnabled bool, toolNames []string) (responsesNonStreamResult, bool) {
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(resp.Body)
writeOpenAIError(w, resp.StatusCode, strings.TrimSpace(string(body)))
return responsesNonStreamResult{}, false
}
result := sse.CollectStream(resp, thinkingEnabled, false)
stripReferenceMarkers := h.compatStripReferenceMarkers()
sanitizedThinking := cleanVisibleOutput(result.Thinking, stripReferenceMarkers)
toolDetectionThinking := cleanVisibleOutput(result.ToolDetectionThinking, stripReferenceMarkers)
sanitizedText := cleanVisibleOutput(result.Text, stripReferenceMarkers)
if searchEnabled {
sanitizedText = replaceCitationMarkersWithLinks(sanitizedText, result.CitationLinks)
}
textParsed := detectAssistantToolCalls(sanitizedText, sanitizedThinking, toolDetectionThinking, toolNames)
responseObj := openaifmt.BuildResponseObjectWithToolCalls(responseID, model, usagePrompt, sanitizedThinking, sanitizedText, textParsed.Calls)
return responsesNonStreamResult{
thinking: sanitizedThinking,
toolDetectionThinking: toolDetectionThinking,
text: sanitizedText,
contentFilter: result.ContentFilter,
parsed: textParsed,
body: responseObj,
}, true
}
func (h *Handler) finishResponsesNonStreamResult(w http.ResponseWriter, result responsesNonStreamResult, attempts int, owner, responseID string, toolChoice promptcompat.ToolChoicePolicy, traceID string) {
if len(result.parsed.Calls) == 0 && writeUpstreamEmptyOutputError(w, result.text, result.thinking, result.contentFilter) {
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "responses", "stream", false, "retry_attempts", attempts, "success_source", "none", "content_filter", result.contentFilter)
return
}
logResponsesToolPolicyRejection(traceID, toolChoice, result.parsed, "text")
if toolChoice.IsRequired() && len(result.parsed.Calls) == 0 {
writeOpenAIErrorWithCode(w, http.StatusUnprocessableEntity, "tool_choice requires at least one valid tool call.", "tool_choice_violation")
return
}
h.getResponseStore().put(owner, responseID, result.body)
writeJSON(w, http.StatusOK, result.body)
source := "first_attempt"
if attempts > 0 {
source = "synthetic_retry"
}
config.Logger.Info("[openai_empty_retry] completed", "surface", "responses", "stream", false, "retry_attempts", attempts, "success_source", source)
}
func shouldRetryResponsesNonStream(result responsesNonStreamResult, attempts int) bool {
return emptyOutputRetryEnabled() &&
attempts < emptyOutputRetryMaxAttempts() &&
!result.contentFilter &&
len(result.parsed.Calls) == 0 &&
strings.TrimSpace(result.text) == ""
}
func (h *Handler) handleResponsesStreamWithRetry(w http.ResponseWriter, r *http.Request, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolChoice promptcompat.ToolChoicePolicy, traceID string) {
streamRuntime, initialType, ok := h.prepareResponsesStreamRuntime(w, resp, owner, responseID, model, finalPrompt, thinkingEnabled, searchEnabled, toolNames, toolChoice, traceID)
if !ok {
return
}
attempts := 0
currentResp := resp
for {
terminalWritten, retryable := h.consumeResponsesStreamAttempt(r, currentResp, streamRuntime, initialType, thinkingEnabled, attempts < emptyOutputRetryMaxAttempts())
if terminalWritten {
logResponsesStreamTerminal(streamRuntime, attempts)
return
}
if !retryable || !emptyOutputRetryEnabled() || attempts >= emptyOutputRetryMaxAttempts() {
streamRuntime.finalize("stop", false)
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "responses", "stream", true, "retry_attempts", attempts, "success_source", "none", "error_code", streamRuntime.finalErrorCode)
return
}
attempts++
config.Logger.Info("[openai_empty_retry] attempting synthetic retry", "surface", "responses", "stream", true, "retry_attempt", attempts)
nextResp, err := h.DS.CallCompletion(r.Context(), a, clonePayloadWithEmptyOutputRetryPrompt(payload), pow, 3)
if err != nil {
streamRuntime.failResponse(http.StatusInternalServerError, "Failed to get completion.", "error")
config.Logger.Warn("[openai_empty_retry] retry request failed", "surface", "responses", "stream", true, "retry_attempt", attempts, "error", err)
return
}
if nextResp.StatusCode != http.StatusOK {
defer func() { _ = nextResp.Body.Close() }()
body, _ := io.ReadAll(nextResp.Body)
streamRuntime.failResponse(nextResp.StatusCode, strings.TrimSpace(string(body)), "error")
return
}
streamRuntime.finalPrompt = usagePromptWithEmptyOutputRetry(finalPrompt, attempts)
currentResp = nextResp
}
}
func (h *Handler) prepareResponsesStreamRuntime(w http.ResponseWriter, resp *http.Response, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolChoice promptcompat.ToolChoicePolicy, traceID string) (*responsesStreamRuntime, string, bool) {
if resp.StatusCode != http.StatusOK {
defer func() { _ = resp.Body.Close() }()
body, _ := io.ReadAll(resp.Body)
writeOpenAIError(w, resp.StatusCode, strings.TrimSpace(string(body)))
return nil, "", false
}
w.Header().Set("Content-Type", "text/event-stream")
w.Header().Set("Cache-Control", "no-cache, no-transform")
w.Header().Set("Connection", "keep-alive")
w.Header().Set("X-Accel-Buffering", "no")
rc := http.NewResponseController(w)
_, canFlush := w.(http.Flusher)
initialType := "text"
if thinkingEnabled {
initialType = "thinking"
}
streamRuntime := newResponsesStreamRuntime(
w, rc, canFlush, responseID, model, finalPrompt, thinkingEnabled, searchEnabled,
h.compatStripReferenceMarkers(), toolNames, len(toolNames) > 0,
h.toolcallFeatureMatchEnabled() && h.toolcallEarlyEmitHighConfidence(),
toolChoice, traceID, func(obj map[string]any) {
h.getResponseStore().put(owner, responseID, obj)
},
)
streamRuntime.sendCreated()
return streamRuntime, initialType, true
}
func (h *Handler) consumeResponsesStreamAttempt(r *http.Request, resp *http.Response, streamRuntime *responsesStreamRuntime, initialType string, thinkingEnabled bool, allowDeferEmpty bool) (bool, bool) {
defer func() { _ = resp.Body.Close() }()
finalReason := "stop"
streamengine.ConsumeSSE(streamengine.ConsumeConfig{
Context: r.Context(),
Body: resp.Body,
ThinkingEnabled: thinkingEnabled,
InitialType: initialType,
KeepAliveInterval: time.Duration(dsprotocol.KeepAliveTimeout) * time.Second,
IdleTimeout: time.Duration(dsprotocol.StreamIdleTimeout) * time.Second,
MaxKeepAliveNoInput: dsprotocol.MaxKeepaliveCount,
}, streamengine.ConsumeHooks{
OnParsed: streamRuntime.onParsed,
OnFinalize: func(reason streamengine.StopReason, _ error) {
if string(reason) == "content_filter" {
finalReason = "content_filter"
}
},
})
terminalWritten := streamRuntime.finalize(finalReason, allowDeferEmpty && finalReason != "content_filter")
if terminalWritten {
return true, false
}
return false, true
}
func logResponsesStreamTerminal(streamRuntime *responsesStreamRuntime, attempts int) {
source := "first_attempt"
if attempts > 0 {
source = "synthetic_retry"
}
if streamRuntime.failed {
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "responses", "stream", true, "retry_attempts", attempts, "success_source", "none", "error_code", streamRuntime.finalErrorCode)
return
}
config.Logger.Info("[openai_empty_retry] completed", "surface", "responses", "stream", true, "retry_attempts", attempts, "success_source", source)
}

View File

@@ -113,6 +113,22 @@ func writeUpstreamEmptyOutputError(w http.ResponseWriter, text, thinking string,
return shared.WriteUpstreamEmptyOutputError(w, text, thinking, contentFilter)
}
func emptyOutputRetryEnabled() bool {
return shared.EmptyOutputRetryEnabled()
}
func emptyOutputRetryMaxAttempts() int {
return shared.EmptyOutputRetryMaxAttempts()
}
func clonePayloadWithEmptyOutputRetryPrompt(payload map[string]any) map[string]any {
return shared.ClonePayloadWithEmptyOutputRetryPrompt(payload)
}
func usagePromptWithEmptyOutputRetry(originalPrompt string, retryAttempts int) string {
return shared.UsagePromptWithEmptyOutputRetry(originalPrompt, retryAttempts)
}
func filterIncrementalToolCallDeltasByAllowed(deltas []toolstream.ToolCallDelta, seenNames map[int]string) []toolstream.ToolCallDelta {
return shared.FilterIncrementalToolCallDeltasByAllowed(deltas, seenNames)
}

View File

@@ -115,10 +115,10 @@ func (h *Handler) Responses(w http.ResponseWriter, r *http.Request) {
responseID := "resp_" + strings.ReplaceAll(uuid.NewString(), "-", "")
if stdReq.Stream {
-h.handleResponsesStream(w, r, resp, owner, responseID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolChoice, traceID)
+h.handleResponsesStreamWithRetry(w, r, a, resp, payload, pow, owner, responseID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolChoice, traceID)
return
}
-h.handleResponsesNonStream(w, resp, owner, responseID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolChoice, traceID)
+h.handleResponsesNonStreamWithRetry(w, r.Context(), a, resp, payload, pow, owner, responseID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolChoice, traceID)
}
func (h *Handler) handleResponsesNonStream(w http.ResponseWriter, resp *http.Response, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolChoice promptcompat.ToolChoicePolicy, traceID string) {
@@ -206,8 +206,12 @@ func (h *Handler) handleResponsesStream(w http.ResponseWriter, r *http.Request,
MaxKeepAliveNoInput: dsprotocol.MaxKeepaliveCount,
}, streamengine.ConsumeHooks{
OnParsed: streamRuntime.onParsed,
-OnFinalize: func(_ streamengine.StopReason, _ error) {
-streamRuntime.finalize()
+OnFinalize: func(reason streamengine.StopReason, _ error) {
+if string(reason) == "content_filter" {
+streamRuntime.finalize("content_filter", false)
+return
+}
+streamRuntime.finalize("stop", false)
},
})
}

View File

@@ -53,6 +53,9 @@ type responsesStreamRuntime struct {
messagePartAdded bool
sequence int
failed bool
finalErrorStatus int
finalErrorMessage string
finalErrorCode string
persistResponse func(obj map[string]any)
}
@@ -103,6 +106,9 @@ func newResponsesStreamRuntime(
func (s *responsesStreamRuntime) failResponse(status int, message, code string) {
s.failed = true
s.finalErrorStatus = status
s.finalErrorMessage = message
s.finalErrorCode = code
failedResp := map[string]any{
"id": s.responseID,
"type": "response",
@@ -126,7 +132,11 @@ func (s *responsesStreamRuntime) failResponse(status int, message, code string)
s.sendDone()
}
-func (s *responsesStreamRuntime) finalize() {
+func (s *responsesStreamRuntime) finalize(finishReason string, deferEmptyOutput bool) bool {
+s.failed = false
+s.finalErrorStatus = 0
+s.finalErrorMessage = ""
+s.finalErrorCode = ""
finalThinking := s.thinking.String()
finalToolDetectionThinking := s.toolDetectionThinking.String()
finalText := cleanVisibleOutput(s.text.String(), s.stripReferenceMarkers)
@@ -150,12 +160,18 @@ func (s *responsesStreamRuntime) finalize() {
if s.toolChoice.IsRequired() && len(detected) == 0 {
s.failResponse(http.StatusUnprocessableEntity, "tool_choice requires at least one valid tool call.", "tool_choice_violation")
-return
+return true
}
if len(detected) == 0 && strings.TrimSpace(finalText) == "" {
-status, message, code := upstreamEmptyOutputDetail(false, finalText, finalThinking)
+status, message, code := upstreamEmptyOutputDetail(finishReason == "content_filter", finalText, finalThinking)
+if deferEmptyOutput {
+s.finalErrorStatus = status
+s.finalErrorMessage = message
+s.finalErrorCode = code
+return false
+}
s.failResponse(status, message, code)
-return
+return true
}
s.closeIncompleteFunctionItems()
@@ -165,6 +181,7 @@ func (s *responsesStreamRuntime) finalize() {
}
s.sendEvent("response.completed", openaifmt.BuildResponsesCompletedPayload(obj))
s.sendDone()
return true
}
func (s *responsesStreamRuntime) logToolPolicyRejections(textParsed toolcall.ToolCallParseResult) {
@@ -188,7 +205,10 @@ func (s *responsesStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Pa
if !parsed.Parsed {
return streamengine.ParsedDecision{}
}
-if parsed.ContentFilter || parsed.ErrorMessage != "" || parsed.Stop {
+if parsed.ContentFilter || parsed.ErrorMessage != "" {
+return streamengine.ParsedDecision{Stop: true, StopReason: streamengine.StopReason("content_filter")}
+}
+if parsed.Stop {
return streamengine.ParsedDecision{Stop: true}
}

View File

@@ -0,0 +1,45 @@
package shared
import "strings"
const EmptyOutputRetrySuffix = "Previous reply had no visible output. Please regenerate the visible final answer or tool call now."
func EmptyOutputRetryEnabled() bool {
return true
}
func EmptyOutputRetryMaxAttempts() int {
return 1
}
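// ClonePayloadWithEmptyOutputRetryPrompt shallow-copies the payload and swaps
// only the prompt, so a retry reuses the session/token/PoW fields of the
// original request; nested values are shared, not deep-copied.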
func ClonePayloadWithEmptyOutputRetryPrompt(payload map[string]any) map[string]any {
clone := make(map[string]any, len(payload))
for k, v := range payload {
clone[k] = v
}
original, _ := payload["prompt"].(string)
clone["prompt"] = AppendEmptyOutputRetrySuffix(original)
return clone
}
func AppendEmptyOutputRetrySuffix(prompt string) string {
prompt = strings.TrimRight(prompt, "\r\n\t ")
if prompt == "" {
return EmptyOutputRetrySuffix
}
return prompt + "\n\n" + EmptyOutputRetrySuffix
}
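// UsagePromptWithEmptyOutputRetry rebuilds the prompt text as actually sent
// across attempts (the original plus each suffixed regeneration round) so
// usage accounting reflects the retried request.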
func UsagePromptWithEmptyOutputRetry(originalPrompt string, retryAttempts int) string {
if retryAttempts <= 0 {
return originalPrompt
}
parts := make([]string, 0, retryAttempts+1)
parts = append(parts, originalPrompt)
next := originalPrompt
for i := 0; i < retryAttempts; i++ {
next = AppendEmptyOutputRetrySuffix(next)
parts = append(parts, next)
}
return strings.Join(parts, "\n")
}
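
A worked example of the usage-prompt expansion for a single retry (an Example-style test; assumes it sits in this shared package and imports fmt):

func ExampleUsagePromptWithEmptyOutputRetry() {
	fmt.Println(UsagePromptWithEmptyOutputRetry("hi", 1))
	// Output:
	// hi
	// hi
	//
	// Previous reply had no visible output. Please regenerate the visible final answer or tool call now.
}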

View File

@@ -66,6 +66,44 @@ func (m streamStatusDSStub) DeleteAllSessionsForToken(_ context.Context, _ strin
return nil
}
type streamStatusDSSeqStub struct {
resps []*http.Response
payloads []map[string]any
}
func (m *streamStatusDSSeqStub) CreateSession(_ context.Context, _ *auth.RequestAuth, _ int) (string, error) {
return "session-id", nil
}
func (m *streamStatusDSSeqStub) GetPow(_ context.Context, _ *auth.RequestAuth, _ int) (string, error) {
return "pow", nil
}
func (m *streamStatusDSSeqStub) UploadFile(_ context.Context, _ *auth.RequestAuth, _ dsclient.UploadFileRequest, _ int) (*dsclient.UploadFileResult, error) {
return &dsclient.UploadFileResult{ID: "file-id", Filename: "file.txt", Bytes: 1, Status: "uploaded"}, nil
}
func (m *streamStatusDSSeqStub) CallCompletion(_ context.Context, _ *auth.RequestAuth, payload map[string]any, _ string, _ int) (*http.Response, error) {
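// Record a shallow copy of every payload, then replay the canned responses in
// order, repeating the last one once the sequence is exhausted.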
clone := make(map[string]any, len(payload))
for k, v := range payload {
clone[k] = v
}
m.payloads = append(m.payloads, clone)
idx := len(m.payloads) - 1
if idx >= len(m.resps) {
idx = len(m.resps) - 1
}
return m.resps[idx], nil
}
func (m *streamStatusDSSeqStub) DeleteSessionForToken(_ context.Context, _ string, _ string) (*dsclient.DeleteSessionResult, error) {
return &dsclient.DeleteSessionResult{Success: true}, nil
}
func (m *streamStatusDSSeqStub) DeleteAllSessionsForToken(_ context.Context, _ string) error {
return nil
}
func makeOpenAISSEHTTPResponse(lines ...string) *http.Response {
body := strings.Join(lines, "\n")
if !strings.HasSuffix(body, "\n") {
@@ -78,6 +116,12 @@ func makeOpenAISSEHTTPResponse(lines ...string) *http.Response {
}
}
func newOpenAITestRouter(h *openAITestSurface) http.Handler {
r := chi.NewRouter()
registerOpenAITestRoutes(r, h)
return r
}
func captureStatusMiddleware(statuses *[]int) func(http.Handler) http.Handler {
return func(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -239,6 +283,125 @@ func TestChatCompletionsStreamEmitsFailureFrameWhenUpstreamOutputEmpty(t *testin
}
}
func TestChatCompletionsStreamRetriesEmptyOutputOnSameSession(t *testing.T) {
ds := &streamStatusDSSeqStub{resps: []*http.Response{
makeOpenAISSEHTTPResponse(`data: {"p":"response/thinking_content","v":"plan"}`, "data: [DONE]"),
makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"visible"}`, "data: [DONE]"),
}}
h := &openAITestSurface{
Store: mockOpenAIConfig{wideInput: true},
Auth: streamStatusAuthStub{},
DS: ds,
}
reqBody := `{"model":"deepseek-v4-pro","messages":[{"role":"user","content":"hi"}],"stream":true}`
req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(reqBody))
req.Header.Set("Authorization", "Bearer direct-token")
req.Header.Set("Content-Type", "application/json")
rec := httptest.NewRecorder()
newOpenAITestRouter(h).ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("expected 200, got %d body=%s", rec.Code, rec.Body.String())
}
if len(ds.payloads) != 2 {
t.Fatalf("expected one synthetic retry call, got %d", len(ds.payloads))
}
if ds.payloads[0]["chat_session_id"] != ds.payloads[1]["chat_session_id"] {
t.Fatalf("expected retry to reuse session, payloads=%#v", ds.payloads)
}
retryPrompt := asString(ds.payloads[1]["prompt"])
if !strings.Contains(retryPrompt, "Previous reply had no visible output. Please regenerate the visible final answer or tool call now.") {
t.Fatalf("expected retry suffix in prompt, got %q", retryPrompt)
}
frames, done := parseSSEDataFrames(t, rec.Body.String())
if !done {
t.Fatalf("expected [DONE], body=%s", rec.Body.String())
}
doneCount := strings.Count(rec.Body.String(), "data: [DONE]")
if doneCount != 1 {
t.Fatalf("expected one [DONE], got %d body=%s", doneCount, rec.Body.String())
}
if len(frames) != 3 {
t.Fatalf("expected reasoning, content, finish frames, got %#v body=%s", frames, rec.Body.String())
}
id := asString(frames[0]["id"])
for _, frame := range frames[1:] {
if asString(frame["id"]) != id {
t.Fatalf("expected same completion id across retry stream, frames=%#v", frames)
}
}
choices, _ := frames[1]["choices"].([]any)
choice, _ := choices[0].(map[string]any)
delta, _ := choice["delta"].(map[string]any)
if asString(delta["content"]) != "visible" {
t.Fatalf("expected retry content delta, got %#v body=%s", delta, rec.Body.String())
}
}
func TestChatCompletionsNonStreamRetriesThinkingOnlyOutput(t *testing.T) {
ds := &streamStatusDSSeqStub{resps: []*http.Response{
makeOpenAISSEHTTPResponse(`data: {"p":"response/thinking_content","v":"plan"}`, "data: [DONE]"),
makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"visible"}`, "data: [DONE]"),
}}
h := &openAITestSurface{
Store: mockOpenAIConfig{wideInput: true},
Auth: streamStatusAuthStub{},
DS: ds,
}
reqBody := `{"model":"deepseek-v4-pro","messages":[{"role":"user","content":"hi"}],"stream":false}`
req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(reqBody))
req.Header.Set("Authorization", "Bearer direct-token")
req.Header.Set("Content-Type", "application/json")
rec := httptest.NewRecorder()
newOpenAITestRouter(h).ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("expected 200 after retry, got %d body=%s", rec.Code, rec.Body.String())
}
if len(ds.payloads) != 2 {
t.Fatalf("expected one synthetic retry call, got %d", len(ds.payloads))
}
var out map[string]any
if err := json.Unmarshal(rec.Body.Bytes(), &out); err != nil {
t.Fatalf("decode response failed: %v body=%s", err, rec.Body.String())
}
choices, _ := out["choices"].([]any)
choice, _ := choices[0].(map[string]any)
message, _ := choice["message"].(map[string]any)
if asString(message["content"]) != "visible" {
t.Fatalf("expected retry visible content, got %#v", message)
}
if !strings.Contains(asString(message["reasoning_content"]), "plan") {
t.Fatalf("expected first-attempt reasoning to be preserved, got %#v", message)
}
}
func TestChatCompletionsContentFilterDoesNotRetry(t *testing.T) {
ds := &streamStatusDSSeqStub{resps: []*http.Response{
makeOpenAISSEHTTPResponse(`data: {"code":"content_filter"}`),
makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"visible"}`, "data: [DONE]"),
}}
h := &openAITestSurface{
Store: mockOpenAIConfig{wideInput: true},
Auth: streamStatusAuthStub{},
DS: ds,
}
reqBody := `{"model":"deepseek-v4-flash","messages":[{"role":"user","content":"hi"}],"stream":false}`
req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(reqBody))
req.Header.Set("Authorization", "Bearer direct-token")
req.Header.Set("Content-Type", "application/json")
rec := httptest.NewRecorder()
newOpenAITestRouter(h).ServeHTTP(rec, req)
if rec.Code != http.StatusBadRequest {
t.Fatalf("expected content_filter 400, got %d body=%s", rec.Code, rec.Body.String())
}
if len(ds.payloads) != 1 {
t.Fatalf("expected no retry on content_filter, got %d calls", len(ds.payloads))
}
}
func TestResponsesStreamUsageIgnoresBatchAccumulatedTokenUsage(t *testing.T) {
statuses := make([]int, 0, 1)
h := &openAITestSurface{
@@ -287,6 +450,86 @@ func TestResponsesStreamUsageIgnoresBatchAccumulatedTokenUsage(t *testing.T) {
}
}
func TestResponsesStreamRetriesThinkingOnlyOutput(t *testing.T) {
ds := &streamStatusDSSeqStub{resps: []*http.Response{
makeOpenAISSEHTTPResponse(`data: {"p":"response/thinking_content","v":"plan"}`, "data: [DONE]"),
makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"visible"}`, "data: [DONE]"),
}}
h := &openAITestSurface{
Store: mockOpenAIConfig{wideInput: true},
Auth: streamStatusAuthStub{},
DS: ds,
}
reqBody := `{"model":"deepseek-v4-pro","input":"hi","stream":true}`
req := httptest.NewRequest(http.MethodPost, "/v1/responses", strings.NewReader(reqBody))
req.Header.Set("Authorization", "Bearer direct-token")
req.Header.Set("Content-Type", "application/json")
rec := httptest.NewRecorder()
newOpenAITestRouter(h).ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("expected 200, got %d body=%s", rec.Code, rec.Body.String())
}
if len(ds.payloads) != 2 {
t.Fatalf("expected one synthetic retry call, got %d", len(ds.payloads))
}
body := rec.Body.String()
if strings.Contains(body, "response.failed") {
t.Fatalf("did not expect premature response.failed, body=%s", body)
}
if !strings.Contains(body, "response.reasoning.delta") || !strings.Contains(body, "response.output_text.delta") || !strings.Contains(body, "response.completed") {
t.Fatalf("expected reasoning, text delta, and completed events, body=%s", body)
}
if strings.Count(body, "data: [DONE]") != 1 {
t.Fatalf("expected one [DONE], body=%s", body)
}
}
func TestResponsesNonStreamRetriesThinkingOnlyOutput(t *testing.T) {
ds := &streamStatusDSSeqStub{resps: []*http.Response{
makeOpenAISSEHTTPResponse(`data: {"p":"response/thinking_content","v":"plan"}`, "data: [DONE]"),
makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"visible"}`, "data: [DONE]"),
}}
h := &openAITestSurface{
Store: mockOpenAIConfig{wideInput: true},
Auth: streamStatusAuthStub{},
DS: ds,
}
reqBody := `{"model":"deepseek-v4-pro","input":"hi","stream":false}`
req := httptest.NewRequest(http.MethodPost, "/v1/responses", strings.NewReader(reqBody))
req.Header.Set("Authorization", "Bearer direct-token")
req.Header.Set("Content-Type", "application/json")
rec := httptest.NewRecorder()
newOpenAITestRouter(h).ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("expected 200 after retry, got %d body=%s", rec.Code, rec.Body.String())
}
if len(ds.payloads) != 2 {
t.Fatalf("expected one synthetic retry call, got %d", len(ds.payloads))
}
var out map[string]any
if err := json.Unmarshal(rec.Body.Bytes(), &out); err != nil {
t.Fatalf("decode response failed: %v body=%s", err, rec.Body.String())
}
if asString(out["output_text"]) != "visible" {
t.Fatalf("expected retry visible output_text, got %#v", out["output_text"])
}
output, _ := out["output"].([]any)
if len(output) == 0 {
t.Fatalf("expected output items, got %#v", out)
}
item, _ := output[0].(map[string]any)
content, _ := item["content"].([]any)
if len(content) == 0 {
t.Fatalf("expected content entries, got %#v", item)
}
reasoning, _ := content[0].(map[string]any)
if asString(reasoning["type"]) != "reasoning" || !strings.Contains(asString(reasoning["text"]), "plan") {
t.Fatalf("expected preserved reasoning entry, got %#v", content)
}
}
func TestResponsesNonStreamUsageIgnoresPromptAndOutputTokenUsage(t *testing.T) {
statuses := make([]int, 0, 1)
h := &openAITestSurface{