feat: add empty-output retry and Vercel auto-continue support

- Auto-retry Chat/Responses requests (stream and non-stream) once when upstream output is empty but not content-filtered, reusing the session/token/PoW and appending a regeneration suffix to the prompt
- Wire the DeepSeek continue API into Vercel streams so multi-round thinking output can resume when a round's output is exhausted
- Defer empty-output errors in stream finalizers to enable synthetic retry; only surface failure when the retry budget is exhausted
- Track content_filter stops to avoid retry on filtered outputs
- Add comprehensive tests for stream/non-stream retry, Responses retry, and content_filter no-retry
- Update prompt-compatibility.md documentation
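
The retry flow shared by both surfaces reduces to the following shape (a condensed, self-contained sketch: callUpstream and attemptOutcome are stand-ins for the real DeepSeek client and stream runtimes in the diff below):

package retrysketch

import "strings"

const retrySuffix = "Previous reply had no visible output. Please regenerate the visible final answer or tool call now."

// attemptOutcome stands in for one upstream round: the visible text it
// produced and whether the stop was a content filter (never retried).
type attemptOutcome struct {
	text          string
	contentFilter bool
}

// runWithEmptyOutputRetry mirrors the loop added in this commit: retry on the
// same session while visible output is empty and not filtered, appending the
// regeneration suffix to the prompt for each extra round.
func runWithEmptyOutputRetry(prompt string, callUpstream func(string) attemptOutcome, maxRetries int) attemptOutcome {
	out := callUpstream(prompt)
	for attempts := 0; attempts < maxRetries; attempts++ {
		if out.contentFilter || strings.TrimSpace(out.text) != "" {
			break // terminal: filtered stops and non-empty output never retry
		}
		prompt = strings.TrimRight(prompt, "\r\n\t ") + "\n\n" + retrySuffix
		out = callUpstream(prompt)
	}
	return out
}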

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
CJACK
2026-04-27 17:56:33 +08:00
parent 2d5d211a7a
commit 0378d8c0a9
13 changed files with 1220 additions and 138 deletions

View File

@@ -128,7 +128,10 @@ func (s *chatStreamRuntime) resetStreamToolCallState() {
s.streamToolNames = map[int]string{}
}
-func (s *chatStreamRuntime) finalize(finishReason string) {
+func (s *chatStreamRuntime) finalize(finishReason string, deferEmptyOutput bool) bool {
+s.finalErrorStatus = 0
+s.finalErrorMessage = ""
+s.finalErrorCode = ""
finalThinking := s.thinking.String()
finalToolDetectionThinking := s.toolDetectionThinking.String()
finalText := cleanVisibleOutput(s.text.String(), s.stripReferenceMarkers)
@@ -204,8 +207,14 @@ func (s *chatStreamRuntime) finalize(finishReason string) {
}
if len(detected.Calls) == 0 && !s.toolCallsEmitted && strings.TrimSpace(finalText) == "" {
status, message, code := upstreamEmptyOutputDetail(finishReason == "content_filter", finalText, finalThinking)
+if deferEmptyOutput {
+s.finalErrorStatus = status
+s.finalErrorMessage = message
+s.finalErrorCode = code
+return false
+}
s.sendFailedChunk(status, message, code)
-return
+return true
}
usage := openaifmt.BuildChatUsage(s.finalPrompt, finalThinking, finalText)
s.finalFinishReason = finishReason
@@ -218,6 +227,7 @@ func (s *chatStreamRuntime) finalize(finishReason string) {
usage,
))
s.sendDone()
return true
}
func (s *chatStreamRuntime) onParsed(parsed sse.LineResult) streamengine.ParsedDecision {
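
For reference, upstreamEmptyOutputDetail is defined elsewhere in the package. A hypothetical reconstruction of its contract, inferred only from the call sites in this diff (the 400 for filtered output matches the content_filter test added below; the 502, the messages, and the code strings are assumptions):

package chat // hypothetical sketch, not the package's actual implementation

import (
	"net/http"
	"strings"
)

// upstreamEmptyOutputDetailSketch classifies an empty visible output into an
// HTTP status, a user-facing message, and an error code (assumed contract).
func upstreamEmptyOutputDetailSketch(contentFilter bool, text, thinking string) (int, string, string) {
	if contentFilter {
		// TestChatCompletionsContentFilterDoesNotRetry expects a 400 on this path.
		return http.StatusBadRequest, "Upstream stopped the reply via content filter.", "content_filter"
	}
	if strings.TrimSpace(thinking) != "" {
		// Assumed: thinking-only rounds get a distinct diagnostic message.
		return http.StatusBadGateway, "Upstream produced thinking but no visible output.", "upstream_empty_output"
	}
	return http.StatusBadGateway, "Upstream produced no visible output.", "upstream_empty_output"
}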

View File

@@ -0,0 +1,271 @@
package chat
import (
"context"
"io"
"net/http"
"strings"
"time"
"ds2api/internal/auth"
"ds2api/internal/config"
dsprotocol "ds2api/internal/deepseek/protocol"
openaifmt "ds2api/internal/format/openai"
"ds2api/internal/sse"
streamengine "ds2api/internal/stream"
)
type chatNonStreamResult struct {
thinking string
toolDetectionThinking string
text string
contentFilter bool
detectedCalls int
body map[string]any
finishReason string
}
func (h *Handler) handleNonStreamWithRetry(w http.ResponseWriter, ctx context.Context, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, completionID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, historySession *chatHistorySession) {
attempts := 0
currentResp := resp
usagePrompt := finalPrompt
accumulatedThinking := ""
accumulatedToolDetectionThinking := ""
for {
result, ok := h.collectChatNonStreamAttempt(w, currentResp, completionID, model, usagePrompt, thinkingEnabled, searchEnabled, toolNames)
if !ok {
return
}
accumulatedThinking += sse.TrimContinuationOverlap(accumulatedThinking, result.thinking)
accumulatedToolDetectionThinking += sse.TrimContinuationOverlap(accumulatedToolDetectionThinking, result.toolDetectionThinking)
result.thinking = accumulatedThinking
result.toolDetectionThinking = accumulatedToolDetectionThinking
detected := detectAssistantToolCalls(result.text, result.thinking, result.toolDetectionThinking, toolNames)
result.detectedCalls = len(detected.Calls)
result.body = openaifmt.BuildChatCompletionWithToolCalls(completionID, model, usagePrompt, result.thinking, result.text, detected.Calls)
result.finishReason = chatFinishReason(result.body)
if !shouldRetryChatNonStream(result, attempts) {
h.finishChatNonStreamResult(w, result, attempts, usagePrompt, historySession)
return
}
attempts++
config.Logger.Info("[openai_empty_retry] attempting synthetic retry", "surface", "chat.completions", "stream", false, "retry_attempt", attempts)
retryPayload := clonePayloadWithEmptyOutputRetryPrompt(payload)
nextResp, err := h.DS.CallCompletion(ctx, a, retryPayload, pow, 3)
if err != nil {
if historySession != nil {
historySession.error(http.StatusInternalServerError, "Failed to get completion.", "error", result.thinking, result.text)
}
writeOpenAIError(w, http.StatusInternalServerError, "Failed to get completion.")
config.Logger.Warn("[openai_empty_retry] retry request failed", "surface", "chat.completions", "stream", false, "retry_attempt", attempts, "error", err)
return
}
usagePrompt = usagePromptWithEmptyOutputRetry(finalPrompt, attempts)
currentResp = nextResp
}
}
func (h *Handler) collectChatNonStreamAttempt(w http.ResponseWriter, resp *http.Response, completionID, model, usagePrompt string, thinkingEnabled, searchEnabled bool, toolNames []string) (chatNonStreamResult, bool) {
if resp.StatusCode != http.StatusOK {
defer func() { _ = resp.Body.Close() }()
body, _ := io.ReadAll(resp.Body)
writeOpenAIError(w, resp.StatusCode, string(body))
return chatNonStreamResult{}, false
}
result := sse.CollectStream(resp, thinkingEnabled, true)
stripReferenceMarkers := h.compatStripReferenceMarkers()
finalThinking := cleanVisibleOutput(result.Thinking, stripReferenceMarkers)
finalToolDetectionThinking := cleanVisibleOutput(result.ToolDetectionThinking, stripReferenceMarkers)
finalText := cleanVisibleOutput(result.Text, stripReferenceMarkers)
if searchEnabled {
finalText = replaceCitationMarkersWithLinks(finalText, result.CitationLinks)
}
detected := detectAssistantToolCalls(finalText, finalThinking, finalToolDetectionThinking, toolNames)
respBody := openaifmt.BuildChatCompletionWithToolCalls(completionID, model, usagePrompt, finalThinking, finalText, detected.Calls)
return chatNonStreamResult{
thinking: finalThinking,
toolDetectionThinking: finalToolDetectionThinking,
text: finalText,
contentFilter: result.ContentFilter,
detectedCalls: len(detected.Calls),
body: respBody,
finishReason: chatFinishReason(respBody),
}, true
}
func (h *Handler) finishChatNonStreamResult(w http.ResponseWriter, result chatNonStreamResult, attempts int, usagePrompt string, historySession *chatHistorySession) {
if result.detectedCalls == 0 && shouldWriteUpstreamEmptyOutputError(result.text) {
status, message, code := upstreamEmptyOutputDetail(result.contentFilter, result.text, result.thinking)
if historySession != nil {
historySession.error(status, message, code, result.thinking, result.text)
}
writeUpstreamEmptyOutputError(w, result.text, result.thinking, result.contentFilter)
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "chat.completions", "stream", false, "retry_attempts", attempts, "success_source", "none", "content_filter", result.contentFilter)
return
}
if historySession != nil {
historySession.success(http.StatusOK, result.thinking, result.text, result.finishReason, openaifmt.BuildChatUsage(usagePrompt, result.thinking, result.text))
}
writeJSON(w, http.StatusOK, result.body)
source := "first_attempt"
if attempts > 0 {
source = "synthetic_retry"
}
config.Logger.Info("[openai_empty_retry] completed", "surface", "chat.completions", "stream", false, "retry_attempts", attempts, "success_source", source)
}
func chatFinishReason(respBody map[string]any) string {
if choices, ok := respBody["choices"].([]map[string]any); ok && len(choices) > 0 {
if fr, _ := choices[0]["finish_reason"].(string); strings.TrimSpace(fr) != "" {
return fr
}
}
return "stop"
}
func shouldRetryChatNonStream(result chatNonStreamResult, attempts int) bool {
return emptyOutputRetryEnabled() &&
attempts < emptyOutputRetryMaxAttempts() &&
!result.contentFilter &&
result.detectedCalls == 0 &&
strings.TrimSpace(result.text) == ""
}
func (h *Handler) handleStreamWithRetry(w http.ResponseWriter, r *http.Request, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, completionID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, historySession *chatHistorySession) {
streamRuntime, initialType, ok := h.prepareChatStreamRuntime(w, resp, completionID, model, finalPrompt, thinkingEnabled, searchEnabled, toolNames, historySession)
if !ok {
return
}
attempts := 0
currentResp := resp
for {
terminalWritten, retryable := h.consumeChatStreamAttempt(r, currentResp, streamRuntime, initialType, thinkingEnabled, historySession, attempts < emptyOutputRetryMaxAttempts())
if terminalWritten {
logChatStreamTerminal(streamRuntime, attempts)
return
}
if !retryable || !emptyOutputRetryEnabled() || attempts >= emptyOutputRetryMaxAttempts() {
streamRuntime.finalize("stop", false)
recordChatStreamHistory(streamRuntime, historySession)
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "chat.completions", "stream", true, "retry_attempts", attempts, "success_source", "none")
return
}
attempts++
config.Logger.Info("[openai_empty_retry] attempting synthetic retry", "surface", "chat.completions", "stream", true, "retry_attempt", attempts)
nextResp, err := h.DS.CallCompletion(r.Context(), a, clonePayloadWithEmptyOutputRetryPrompt(payload), pow, 3)
if err != nil {
failChatStreamRetry(streamRuntime, historySession, http.StatusInternalServerError, "Failed to get completion.", "error")
config.Logger.Warn("[openai_empty_retry] retry request failed", "surface", "chat.completions", "stream", true, "retry_attempt", attempts, "error", err)
return
}
if nextResp.StatusCode != http.StatusOK {
defer func() { _ = nextResp.Body.Close() }()
body, _ := io.ReadAll(nextResp.Body)
failChatStreamRetry(streamRuntime, historySession, nextResp.StatusCode, string(body), "error")
return
}
streamRuntime.finalPrompt = usagePromptWithEmptyOutputRetry(finalPrompt, attempts)
currentResp = nextResp
}
}
func (h *Handler) prepareChatStreamRuntime(w http.ResponseWriter, resp *http.Response, completionID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, historySession *chatHistorySession) (*chatStreamRuntime, string, bool) {
if resp.StatusCode != http.StatusOK {
defer func() { _ = resp.Body.Close() }()
body, _ := io.ReadAll(resp.Body)
if historySession != nil {
historySession.error(resp.StatusCode, string(body), "error", "", "")
}
writeOpenAIError(w, resp.StatusCode, string(body))
return nil, "", false
}
w.Header().Set("Content-Type", "text/event-stream")
w.Header().Set("Cache-Control", "no-cache, no-transform")
w.Header().Set("Connection", "keep-alive")
w.Header().Set("X-Accel-Buffering", "no")
rc := http.NewResponseController(w)
_, canFlush := w.(http.Flusher)
if !canFlush {
config.Logger.Warn("[stream] response writer does not support flush; streaming may be buffered")
}
initialType := "text"
if thinkingEnabled {
initialType = "thinking"
}
streamRuntime := newChatStreamRuntime(
w, rc, canFlush, completionID, time.Now().Unix(), model, finalPrompt,
thinkingEnabled, searchEnabled, h.compatStripReferenceMarkers(), toolNames,
len(toolNames) > 0, h.toolcallFeatureMatchEnabled() && h.toolcallEarlyEmitHighConfidence(),
)
return streamRuntime, initialType, true
}
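// consumeChatStreamAttempt drains one upstream SSE round and reports
// (terminalWritten, retryable): either the terminal frames were written to
// the client, or the empty-output error was deferred so the caller may retry.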
func (h *Handler) consumeChatStreamAttempt(r *http.Request, resp *http.Response, streamRuntime *chatStreamRuntime, initialType string, thinkingEnabled bool, historySession *chatHistorySession, allowDeferEmpty bool) (bool, bool) {
defer func() { _ = resp.Body.Close() }()
finalReason := "stop"
streamengine.ConsumeSSE(streamengine.ConsumeConfig{
Context: r.Context(),
Body: resp.Body,
ThinkingEnabled: thinkingEnabled,
InitialType: initialType,
KeepAliveInterval: time.Duration(dsprotocol.KeepAliveTimeout) * time.Second,
IdleTimeout: time.Duration(dsprotocol.StreamIdleTimeout) * time.Second,
MaxKeepAliveNoInput: dsprotocol.MaxKeepaliveCount,
}, streamengine.ConsumeHooks{
OnKeepAlive: streamRuntime.sendKeepAlive,
OnParsed: func(parsed sse.LineResult) streamengine.ParsedDecision {
decision := streamRuntime.onParsed(parsed)
if historySession != nil {
historySession.progress(streamRuntime.thinking.String(), streamRuntime.text.String())
}
return decision
},
OnFinalize: func(reason streamengine.StopReason, _ error) {
if string(reason) == "content_filter" {
finalReason = "content_filter"
}
},
OnContextDone: func() {
if historySession != nil {
historySession.stopped(streamRuntime.thinking.String(), streamRuntime.text.String(), string(streamengine.StopReasonContextCancelled))
}
},
})
terminalWritten := streamRuntime.finalize(finalReason, allowDeferEmpty && finalReason != "content_filter")
if terminalWritten {
recordChatStreamHistory(streamRuntime, historySession)
return true, false
}
return false, true
}
func recordChatStreamHistory(streamRuntime *chatStreamRuntime, historySession *chatHistorySession) {
if historySession == nil {
return
}
if streamRuntime.finalErrorMessage != "" {
historySession.error(streamRuntime.finalErrorStatus, streamRuntime.finalErrorMessage, streamRuntime.finalErrorCode, streamRuntime.thinking.String(), streamRuntime.text.String())
return
}
historySession.success(http.StatusOK, streamRuntime.finalThinking, streamRuntime.finalText, streamRuntime.finalFinishReason, streamRuntime.finalUsage)
}
func failChatStreamRetry(streamRuntime *chatStreamRuntime, historySession *chatHistorySession, status int, message, code string) {
streamRuntime.sendFailedChunk(status, message, code)
if historySession != nil {
historySession.error(status, message, code, streamRuntime.thinking.String(), streamRuntime.text.String())
}
}
func logChatStreamTerminal(streamRuntime *chatStreamRuntime, attempts int) {
source := "first_attempt"
if attempts > 0 {
source = "synthetic_retry"
}
if streamRuntime.finalErrorMessage != "" {
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "chat.completions", "stream", true, "retry_attempts", attempts, "success_source", "none", "error_code", streamRuntime.finalErrorCode)
return
}
config.Logger.Info("[openai_empty_retry] completed", "surface", "chat.completions", "stream", true, "retry_attempts", attempts, "success_source", source)
}
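
Both retry paths accumulate thinking across attempts via sse.TrimContinuationOverlap, whose implementation is outside this diff. A minimal sketch of the assumed contract (hypothetical, not the package's code): drop from the new chunk the longest prefix that the accumulated text already ends with, so concatenation never repeats text across rounds.

package sse // hypothetical sketch only

import "strings"

// TrimContinuationOverlap returns the portion of next that does not already
// trail accumulated, so accumulated + result avoids duplicated continuation
// text (assumed behavior).
func TrimContinuationOverlap(accumulated, next string) string {
	limit := len(next)
	if len(accumulated) < limit {
		limit = len(accumulated)
	}
	for n := limit; n > 0; n-- {
		if strings.HasSuffix(accumulated, next[:n]) {
			return next[n:]
		}
	}
	return next
}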

View File

@@ -123,6 +123,22 @@ func writeUpstreamEmptyOutputError(w http.ResponseWriter, text, thinking string,
return shared.WriteUpstreamEmptyOutputError(w, text, thinking, contentFilter)
}
func emptyOutputRetryEnabled() bool {
return shared.EmptyOutputRetryEnabled()
}
func emptyOutputRetryMaxAttempts() int {
return shared.EmptyOutputRetryMaxAttempts()
}
func clonePayloadWithEmptyOutputRetryPrompt(payload map[string]any) map[string]any {
return shared.ClonePayloadWithEmptyOutputRetryPrompt(payload)
}
func usagePromptWithEmptyOutputRetry(originalPrompt string, retryAttempts int) string {
return shared.UsagePromptWithEmptyOutputRetry(originalPrompt, retryAttempts)
}
func formatIncrementalStreamToolCallDeltas(deltas []toolstream.ToolCallDelta, ids map[int]string) []map[string]any {
return shared.FormatIncrementalStreamToolCallDeltas(deltas, ids)
}

View File

@@ -105,10 +105,10 @@ func (h *Handler) ChatCompletions(w http.ResponseWriter, r *http.Request) {
return
}
if stdReq.Stream {
-h.handleStream(w, r, resp, sessionID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, historySession)
+h.handleStreamWithRetry(w, r, a, resp, payload, pow, sessionID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, historySession)
return
}
-h.handleNonStream(w, resp, sessionID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, historySession)
+h.handleNonStreamWithRetry(w, r.Context(), a, resp, payload, pow, sessionID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, historySession)
}
func (h *Handler) autoDeleteRemoteSession(ctx context.Context, a *auth.RequestAuth, sessionID string) {
@@ -251,9 +251,9 @@ func (h *Handler) handleStream(w http.ResponseWriter, r *http.Request, resp *htt
},
OnFinalize: func(reason streamengine.StopReason, _ error) {
if string(reason) == "content_filter" {
-streamRuntime.finalize("content_filter")
+streamRuntime.finalize("content_filter", false)
} else {
-streamRuntime.finalize("stop")
+streamRuntime.finalize("stop", false)
}
if historySession == nil {
return

View File

@@ -0,0 +1,221 @@
package responses
import (
"context"
"io"
"net/http"
"strings"
"time"
"ds2api/internal/auth"
"ds2api/internal/config"
dsprotocol "ds2api/internal/deepseek/protocol"
openaifmt "ds2api/internal/format/openai"
"ds2api/internal/promptcompat"
"ds2api/internal/sse"
streamengine "ds2api/internal/stream"
"ds2api/internal/toolcall"
)
type responsesNonStreamResult struct {
thinking string
toolDetectionThinking string
text string
contentFilter bool
parsed toolcall.ToolCallParseResult
body map[string]any
}
func (h *Handler) handleResponsesNonStreamWithRetry(w http.ResponseWriter, ctx context.Context, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolChoice promptcompat.ToolChoicePolicy, traceID string) {
attempts := 0
currentResp := resp
usagePrompt := finalPrompt
accumulatedThinking := ""
accumulatedToolDetectionThinking := ""
for {
result, ok := h.collectResponsesNonStreamAttempt(w, currentResp, responseID, model, usagePrompt, thinkingEnabled, searchEnabled, toolNames)
if !ok {
return
}
accumulatedThinking += sse.TrimContinuationOverlap(accumulatedThinking, result.thinking)
accumulatedToolDetectionThinking += sse.TrimContinuationOverlap(accumulatedToolDetectionThinking, result.toolDetectionThinking)
result.thinking = accumulatedThinking
result.toolDetectionThinking = accumulatedToolDetectionThinking
result.parsed = detectAssistantToolCalls(result.text, result.thinking, result.toolDetectionThinking, toolNames)
result.body = openaifmt.BuildResponseObjectWithToolCalls(responseID, model, usagePrompt, result.thinking, result.text, result.parsed.Calls)
if !shouldRetryResponsesNonStream(result, attempts) {
h.finishResponsesNonStreamResult(w, result, attempts, owner, responseID, toolChoice, traceID)
return
}
attempts++
config.Logger.Info("[openai_empty_retry] attempting synthetic retry", "surface", "responses", "stream", false, "retry_attempt", attempts)
nextResp, err := h.DS.CallCompletion(ctx, a, clonePayloadWithEmptyOutputRetryPrompt(payload), pow, 3)
if err != nil {
writeOpenAIError(w, http.StatusInternalServerError, "Failed to get completion.")
config.Logger.Warn("[openai_empty_retry] retry request failed", "surface", "responses", "stream", false, "retry_attempt", attempts, "error", err)
return
}
usagePrompt = usagePromptWithEmptyOutputRetry(finalPrompt, attempts)
currentResp = nextResp
}
}
func (h *Handler) collectResponsesNonStreamAttempt(w http.ResponseWriter, resp *http.Response, responseID, model, usagePrompt string, thinkingEnabled, searchEnabled bool, toolNames []string) (responsesNonStreamResult, bool) {
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(resp.Body)
writeOpenAIError(w, resp.StatusCode, strings.TrimSpace(string(body)))
return responsesNonStreamResult{}, false
}
result := sse.CollectStream(resp, thinkingEnabled, false)
stripReferenceMarkers := h.compatStripReferenceMarkers()
sanitizedThinking := cleanVisibleOutput(result.Thinking, stripReferenceMarkers)
toolDetectionThinking := cleanVisibleOutput(result.ToolDetectionThinking, stripReferenceMarkers)
sanitizedText := cleanVisibleOutput(result.Text, stripReferenceMarkers)
if searchEnabled {
sanitizedText = replaceCitationMarkersWithLinks(sanitizedText, result.CitationLinks)
}
textParsed := detectAssistantToolCalls(sanitizedText, sanitizedThinking, toolDetectionThinking, toolNames)
responseObj := openaifmt.BuildResponseObjectWithToolCalls(responseID, model, usagePrompt, sanitizedThinking, sanitizedText, textParsed.Calls)
return responsesNonStreamResult{
thinking: sanitizedThinking,
toolDetectionThinking: toolDetectionThinking,
text: sanitizedText,
contentFilter: result.ContentFilter,
parsed: textParsed,
body: responseObj,
}, true
}
func (h *Handler) finishResponsesNonStreamResult(w http.ResponseWriter, result responsesNonStreamResult, attempts int, owner, responseID string, toolChoice promptcompat.ToolChoicePolicy, traceID string) {
if len(result.parsed.Calls) == 0 && writeUpstreamEmptyOutputError(w, result.text, result.thinking, result.contentFilter) {
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "responses", "stream", false, "retry_attempts", attempts, "success_source", "none", "content_filter", result.contentFilter)
return
}
logResponsesToolPolicyRejection(traceID, toolChoice, result.parsed, "text")
if toolChoice.IsRequired() && len(result.parsed.Calls) == 0 {
writeOpenAIErrorWithCode(w, http.StatusUnprocessableEntity, "tool_choice requires at least one valid tool call.", "tool_choice_violation")
return
}
h.getResponseStore().put(owner, responseID, result.body)
writeJSON(w, http.StatusOK, result.body)
source := "first_attempt"
if attempts > 0 {
source = "synthetic_retry"
}
config.Logger.Info("[openai_empty_retry] completed", "surface", "responses", "stream", false, "retry_attempts", attempts, "success_source", source)
}
func shouldRetryResponsesNonStream(result responsesNonStreamResult, attempts int) bool {
return emptyOutputRetryEnabled() &&
attempts < emptyOutputRetryMaxAttempts() &&
!result.contentFilter &&
len(result.parsed.Calls) == 0 &&
strings.TrimSpace(result.text) == ""
}
func (h *Handler) handleResponsesStreamWithRetry(w http.ResponseWriter, r *http.Request, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolChoice promptcompat.ToolChoicePolicy, traceID string) {
streamRuntime, initialType, ok := h.prepareResponsesStreamRuntime(w, resp, owner, responseID, model, finalPrompt, thinkingEnabled, searchEnabled, toolNames, toolChoice, traceID)
if !ok {
return
}
attempts := 0
currentResp := resp
for {
terminalWritten, retryable := h.consumeResponsesStreamAttempt(r, currentResp, streamRuntime, initialType, thinkingEnabled, attempts < emptyOutputRetryMaxAttempts())
if terminalWritten {
logResponsesStreamTerminal(streamRuntime, attempts)
return
}
if !retryable || !emptyOutputRetryEnabled() || attempts >= emptyOutputRetryMaxAttempts() {
streamRuntime.finalize("stop", false)
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "responses", "stream", true, "retry_attempts", attempts, "success_source", "none", "error_code", streamRuntime.finalErrorCode)
return
}
attempts++
config.Logger.Info("[openai_empty_retry] attempting synthetic retry", "surface", "responses", "stream", true, "retry_attempt", attempts)
nextResp, err := h.DS.CallCompletion(r.Context(), a, clonePayloadWithEmptyOutputRetryPrompt(payload), pow, 3)
if err != nil {
streamRuntime.failResponse(http.StatusInternalServerError, "Failed to get completion.", "error")
config.Logger.Warn("[openai_empty_retry] retry request failed", "surface", "responses", "stream", true, "retry_attempt", attempts, "error", err)
return
}
if nextResp.StatusCode != http.StatusOK {
defer func() { _ = nextResp.Body.Close() }()
body, _ := io.ReadAll(nextResp.Body)
streamRuntime.failResponse(nextResp.StatusCode, strings.TrimSpace(string(body)), "error")
return
}
streamRuntime.finalPrompt = usagePromptWithEmptyOutputRetry(finalPrompt, attempts)
currentResp = nextResp
}
}
func (h *Handler) prepareResponsesStreamRuntime(w http.ResponseWriter, resp *http.Response, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolChoice promptcompat.ToolChoicePolicy, traceID string) (*responsesStreamRuntime, string, bool) {
if resp.StatusCode != http.StatusOK {
defer func() { _ = resp.Body.Close() }()
body, _ := io.ReadAll(resp.Body)
writeOpenAIError(w, resp.StatusCode, strings.TrimSpace(string(body)))
return nil, "", false
}
w.Header().Set("Content-Type", "text/event-stream")
w.Header().Set("Cache-Control", "no-cache, no-transform")
w.Header().Set("Connection", "keep-alive")
w.Header().Set("X-Accel-Buffering", "no")
rc := http.NewResponseController(w)
_, canFlush := w.(http.Flusher)
initialType := "text"
if thinkingEnabled {
initialType = "thinking"
}
streamRuntime := newResponsesStreamRuntime(
w, rc, canFlush, responseID, model, finalPrompt, thinkingEnabled, searchEnabled,
h.compatStripReferenceMarkers(), toolNames, len(toolNames) > 0,
h.toolcallFeatureMatchEnabled() && h.toolcallEarlyEmitHighConfidence(),
toolChoice, traceID, func(obj map[string]any) {
h.getResponseStore().put(owner, responseID, obj)
},
)
streamRuntime.sendCreated()
return streamRuntime, initialType, true
}
func (h *Handler) consumeResponsesStreamAttempt(r *http.Request, resp *http.Response, streamRuntime *responsesStreamRuntime, initialType string, thinkingEnabled bool, allowDeferEmpty bool) (bool, bool) {
defer func() { _ = resp.Body.Close() }()
finalReason := "stop"
streamengine.ConsumeSSE(streamengine.ConsumeConfig{
Context: r.Context(),
Body: resp.Body,
ThinkingEnabled: thinkingEnabled,
InitialType: initialType,
KeepAliveInterval: time.Duration(dsprotocol.KeepAliveTimeout) * time.Second,
IdleTimeout: time.Duration(dsprotocol.StreamIdleTimeout) * time.Second,
MaxKeepAliveNoInput: dsprotocol.MaxKeepaliveCount,
}, streamengine.ConsumeHooks{
OnParsed: streamRuntime.onParsed,
OnFinalize: func(reason streamengine.StopReason, _ error) {
if string(reason) == "content_filter" {
finalReason = "content_filter"
}
},
})
terminalWritten := streamRuntime.finalize(finalReason, allowDeferEmpty && finalReason != "content_filter")
if terminalWritten {
return true, false
}
return false, true
}
func logResponsesStreamTerminal(streamRuntime *responsesStreamRuntime, attempts int) {
source := "first_attempt"
if attempts > 0 {
source = "synthetic_retry"
}
if streamRuntime.failed {
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "responses", "stream", true, "retry_attempts", attempts, "success_source", "none", "error_code", streamRuntime.finalErrorCode)
return
}
config.Logger.Info("[openai_empty_retry] completed", "surface", "responses", "stream", true, "retry_attempts", attempts, "success_source", source)
}

View File

@@ -113,6 +113,22 @@ func writeUpstreamEmptyOutputError(w http.ResponseWriter, text, thinking string,
return shared.WriteUpstreamEmptyOutputError(w, text, thinking, contentFilter)
}
func emptyOutputRetryEnabled() bool {
return shared.EmptyOutputRetryEnabled()
}
func emptyOutputRetryMaxAttempts() int {
return shared.EmptyOutputRetryMaxAttempts()
}
func clonePayloadWithEmptyOutputRetryPrompt(payload map[string]any) map[string]any {
return shared.ClonePayloadWithEmptyOutputRetryPrompt(payload)
}
func usagePromptWithEmptyOutputRetry(originalPrompt string, retryAttempts int) string {
return shared.UsagePromptWithEmptyOutputRetry(originalPrompt, retryAttempts)
}
func filterIncrementalToolCallDeltasByAllowed(deltas []toolstream.ToolCallDelta, seenNames map[int]string) []toolstream.ToolCallDelta {
return shared.FilterIncrementalToolCallDeltasByAllowed(deltas, seenNames)
}

View File

@@ -115,10 +115,10 @@ func (h *Handler) Responses(w http.ResponseWriter, r *http.Request) {
responseID := "resp_" + strings.ReplaceAll(uuid.NewString(), "-", "")
if stdReq.Stream {
-h.handleResponsesStream(w, r, resp, owner, responseID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolChoice, traceID)
+h.handleResponsesStreamWithRetry(w, r, a, resp, payload, pow, owner, responseID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolChoice, traceID)
return
}
-h.handleResponsesNonStream(w, resp, owner, responseID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolChoice, traceID)
+h.handleResponsesNonStreamWithRetry(w, r.Context(), a, resp, payload, pow, owner, responseID, stdReq.ResponseModel, stdReq.FinalPrompt, stdReq.Thinking, stdReq.Search, stdReq.ToolNames, stdReq.ToolChoice, traceID)
}
func (h *Handler) handleResponsesNonStream(w http.ResponseWriter, resp *http.Response, owner, responseID, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolChoice promptcompat.ToolChoicePolicy, traceID string) {
@@ -206,8 +206,12 @@ func (h *Handler) handleResponsesStream(w http.ResponseWriter, r *http.Request,
MaxKeepAliveNoInput: dsprotocol.MaxKeepaliveCount,
}, streamengine.ConsumeHooks{
OnParsed: streamRuntime.onParsed,
-OnFinalize: func(_ streamengine.StopReason, _ error) {
-streamRuntime.finalize()
+OnFinalize: func(reason streamengine.StopReason, _ error) {
+if string(reason) == "content_filter" {
+streamRuntime.finalize("content_filter", false)
+return
+}
+streamRuntime.finalize("stop", false)
},
})
}

View File

@@ -53,6 +53,9 @@ type responsesStreamRuntime struct {
messagePartAdded bool
sequence int
failed bool
finalErrorStatus int
finalErrorMessage string
finalErrorCode string
persistResponse func(obj map[string]any)
}
@@ -103,6 +106,9 @@ func newResponsesStreamRuntime(
func (s *responsesStreamRuntime) failResponse(status int, message, code string) {
s.failed = true
s.finalErrorStatus = status
s.finalErrorMessage = message
s.finalErrorCode = code
failedResp := map[string]any{
"id": s.responseID,
"type": "response",
@@ -126,7 +132,11 @@ func (s *responsesStreamRuntime) failResponse(status int, message, code string)
s.sendDone()
}
-func (s *responsesStreamRuntime) finalize() {
+func (s *responsesStreamRuntime) finalize(finishReason string, deferEmptyOutput bool) bool {
+s.failed = false
+s.finalErrorStatus = 0
+s.finalErrorMessage = ""
+s.finalErrorCode = ""
finalThinking := s.thinking.String()
finalToolDetectionThinking := s.toolDetectionThinking.String()
finalText := cleanVisibleOutput(s.text.String(), s.stripReferenceMarkers)
@@ -150,12 +160,18 @@ func (s *responsesStreamRuntime) finalize() {
if s.toolChoice.IsRequired() && len(detected) == 0 {
s.failResponse(http.StatusUnprocessableEntity, "tool_choice requires at least one valid tool call.", "tool_choice_violation")
-return
+return true
}
if len(detected) == 0 && strings.TrimSpace(finalText) == "" {
-status, message, code := upstreamEmptyOutputDetail(false, finalText, finalThinking)
+status, message, code := upstreamEmptyOutputDetail(finishReason == "content_filter", finalText, finalThinking)
+if deferEmptyOutput {
+s.finalErrorStatus = status
+s.finalErrorMessage = message
+s.finalErrorCode = code
+return false
+}
s.failResponse(status, message, code)
-return
+return true
}
s.closeIncompleteFunctionItems()
@@ -165,6 +181,7 @@ func (s *responsesStreamRuntime) finalize() {
}
s.sendEvent("response.completed", openaifmt.BuildResponsesCompletedPayload(obj))
s.sendDone()
return true
}
func (s *responsesStreamRuntime) logToolPolicyRejections(textParsed toolcall.ToolCallParseResult) {
@@ -188,7 +205,10 @@ func (s *responsesStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Pa
if !parsed.Parsed {
return streamengine.ParsedDecision{}
}
-if parsed.ContentFilter || parsed.ErrorMessage != "" || parsed.Stop {
+if parsed.ContentFilter || parsed.ErrorMessage != "" {
+return streamengine.ParsedDecision{Stop: true, StopReason: streamengine.StopReason("content_filter")}
+}
+if parsed.Stop {
return streamengine.ParsedDecision{Stop: true}
}

View File

@@ -0,0 +1,45 @@
package shared
import "strings"
const EmptyOutputRetrySuffix = "Previous reply had no visible output. Please regenerate the visible final answer or tool call now."
func EmptyOutputRetryEnabled() bool {
return true
}
func EmptyOutputRetryMaxAttempts() int {
return 1
}
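// ClonePayloadWithEmptyOutputRetryPrompt shallow-copies the payload and swaps
// only the prompt, so a retry reuses the session/token/PoW fields of the
// original request; nested values are shared, not deep-copied.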
func ClonePayloadWithEmptyOutputRetryPrompt(payload map[string]any) map[string]any {
clone := make(map[string]any, len(payload))
for k, v := range payload {
clone[k] = v
}
original, _ := payload["prompt"].(string)
clone["prompt"] = AppendEmptyOutputRetrySuffix(original)
return clone
}
func AppendEmptyOutputRetrySuffix(prompt string) string {
prompt = strings.TrimRight(prompt, "\r\n\t ")
if prompt == "" {
return EmptyOutputRetrySuffix
}
return prompt + "\n\n" + EmptyOutputRetrySuffix
}
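// UsagePromptWithEmptyOutputRetry rebuilds the prompt text as actually sent
// across attempts (the original plus each suffixed regeneration round) so
// usage accounting reflects the retried request.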
func UsagePromptWithEmptyOutputRetry(originalPrompt string, retryAttempts int) string {
if retryAttempts <= 0 {
return originalPrompt
}
parts := make([]string, 0, retryAttempts+1)
parts = append(parts, originalPrompt)
next := originalPrompt
for i := 0; i < retryAttempts; i++ {
next = AppendEmptyOutputRetrySuffix(next)
parts = append(parts, next)
}
return strings.Join(parts, "\n")
}
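
A worked example of the usage-prompt expansion for a single retry (an Example-style test; assumes it sits in this shared package and imports fmt):

func ExampleUsagePromptWithEmptyOutputRetry() {
	fmt.Println(UsagePromptWithEmptyOutputRetry("hi", 1))
	// Output:
	// hi
	// hi
	//
	// Previous reply had no visible output. Please regenerate the visible final answer or tool call now.
}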

View File

@@ -66,6 +66,44 @@ func (m streamStatusDSStub) DeleteAllSessionsForToken(_ context.Context, _ strin
return nil
}
type streamStatusDSSeqStub struct {
resps []*http.Response
payloads []map[string]any
}
func (m *streamStatusDSSeqStub) CreateSession(_ context.Context, _ *auth.RequestAuth, _ int) (string, error) {
return "session-id", nil
}
func (m *streamStatusDSSeqStub) GetPow(_ context.Context, _ *auth.RequestAuth, _ int) (string, error) {
return "pow", nil
}
func (m *streamStatusDSSeqStub) UploadFile(_ context.Context, _ *auth.RequestAuth, _ dsclient.UploadFileRequest, _ int) (*dsclient.UploadFileResult, error) {
return &dsclient.UploadFileResult{ID: "file-id", Filename: "file.txt", Bytes: 1, Status: "uploaded"}, nil
}
func (m *streamStatusDSSeqStub) CallCompletion(_ context.Context, _ *auth.RequestAuth, payload map[string]any, _ string, _ int) (*http.Response, error) {
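// Record a shallow copy of every payload, then replay the canned responses in
// order, repeating the last one once the sequence is exhausted.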
clone := make(map[string]any, len(payload))
for k, v := range payload {
clone[k] = v
}
m.payloads = append(m.payloads, clone)
idx := len(m.payloads) - 1
if idx >= len(m.resps) {
idx = len(m.resps) - 1
}
return m.resps[idx], nil
}
func (m *streamStatusDSSeqStub) DeleteSessionForToken(_ context.Context, _ string, _ string) (*dsclient.DeleteSessionResult, error) {
return &dsclient.DeleteSessionResult{Success: true}, nil
}
func (m *streamStatusDSSeqStub) DeleteAllSessionsForToken(_ context.Context, _ string) error {
return nil
}
func makeOpenAISSEHTTPResponse(lines ...string) *http.Response {
body := strings.Join(lines, "\n")
if !strings.HasSuffix(body, "\n") {
@@ -78,6 +116,12 @@ func makeOpenAISSEHTTPResponse(lines ...string) *http.Response {
}
}
func newOpenAITestRouter(h *openAITestSurface) http.Handler {
r := chi.NewRouter()
registerOpenAITestRoutes(r, h)
return r
}
func captureStatusMiddleware(statuses *[]int) func(http.Handler) http.Handler {
return func(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -239,6 +283,125 @@ func TestChatCompletionsStreamEmitsFailureFrameWhenUpstreamOutputEmpty(t *testin
}
}
func TestChatCompletionsStreamRetriesEmptyOutputOnSameSession(t *testing.T) {
ds := &streamStatusDSSeqStub{resps: []*http.Response{
makeOpenAISSEHTTPResponse(`data: {"p":"response/thinking_content","v":"plan"}`, "data: [DONE]"),
makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"visible"}`, "data: [DONE]"),
}}
h := &openAITestSurface{
Store: mockOpenAIConfig{wideInput: true},
Auth: streamStatusAuthStub{},
DS: ds,
}
reqBody := `{"model":"deepseek-v4-pro","messages":[{"role":"user","content":"hi"}],"stream":true}`
req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(reqBody))
req.Header.Set("Authorization", "Bearer direct-token")
req.Header.Set("Content-Type", "application/json")
rec := httptest.NewRecorder()
newOpenAITestRouter(h).ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("expected 200, got %d body=%s", rec.Code, rec.Body.String())
}
if len(ds.payloads) != 2 {
t.Fatalf("expected one synthetic retry call, got %d", len(ds.payloads))
}
if ds.payloads[0]["chat_session_id"] != ds.payloads[1]["chat_session_id"] {
t.Fatalf("expected retry to reuse session, payloads=%#v", ds.payloads)
}
retryPrompt := asString(ds.payloads[1]["prompt"])
if !strings.Contains(retryPrompt, "Previous reply had no visible output. Please regenerate the visible final answer or tool call now.") {
t.Fatalf("expected retry suffix in prompt, got %q", retryPrompt)
}
frames, done := parseSSEDataFrames(t, rec.Body.String())
if !done {
t.Fatalf("expected [DONE], body=%s", rec.Body.String())
}
doneCount := strings.Count(rec.Body.String(), "data: [DONE]")
if doneCount != 1 {
t.Fatalf("expected one [DONE], got %d body=%s", doneCount, rec.Body.String())
}
if len(frames) != 3 {
t.Fatalf("expected reasoning, content, finish frames, got %#v body=%s", frames, rec.Body.String())
}
id := asString(frames[0]["id"])
for _, frame := range frames[1:] {
if asString(frame["id"]) != id {
t.Fatalf("expected same completion id across retry stream, frames=%#v", frames)
}
}
choices, _ := frames[1]["choices"].([]any)
choice, _ := choices[0].(map[string]any)
delta, _ := choice["delta"].(map[string]any)
if asString(delta["content"]) != "visible" {
t.Fatalf("expected retry content delta, got %#v body=%s", delta, rec.Body.String())
}
}
func TestChatCompletionsNonStreamRetriesThinkingOnlyOutput(t *testing.T) {
ds := &streamStatusDSSeqStub{resps: []*http.Response{
makeOpenAISSEHTTPResponse(`data: {"p":"response/thinking_content","v":"plan"}`, "data: [DONE]"),
makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"visible"}`, "data: [DONE]"),
}}
h := &openAITestSurface{
Store: mockOpenAIConfig{wideInput: true},
Auth: streamStatusAuthStub{},
DS: ds,
}
reqBody := `{"model":"deepseek-v4-pro","messages":[{"role":"user","content":"hi"}],"stream":false}`
req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(reqBody))
req.Header.Set("Authorization", "Bearer direct-token")
req.Header.Set("Content-Type", "application/json")
rec := httptest.NewRecorder()
newOpenAITestRouter(h).ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("expected 200 after retry, got %d body=%s", rec.Code, rec.Body.String())
}
if len(ds.payloads) != 2 {
t.Fatalf("expected one synthetic retry call, got %d", len(ds.payloads))
}
var out map[string]any
if err := json.Unmarshal(rec.Body.Bytes(), &out); err != nil {
t.Fatalf("decode response failed: %v body=%s", err, rec.Body.String())
}
choices, _ := out["choices"].([]any)
choice, _ := choices[0].(map[string]any)
message, _ := choice["message"].(map[string]any)
if asString(message["content"]) != "visible" {
t.Fatalf("expected retry visible content, got %#v", message)
}
if !strings.Contains(asString(message["reasoning_content"]), "plan") {
t.Fatalf("expected first-attempt reasoning to be preserved, got %#v", message)
}
}
func TestChatCompletionsContentFilterDoesNotRetry(t *testing.T) {
ds := &streamStatusDSSeqStub{resps: []*http.Response{
makeOpenAISSEHTTPResponse(`data: {"code":"content_filter"}`),
makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"visible"}`, "data: [DONE]"),
}}
h := &openAITestSurface{
Store: mockOpenAIConfig{wideInput: true},
Auth: streamStatusAuthStub{},
DS: ds,
}
reqBody := `{"model":"deepseek-v4-flash","messages":[{"role":"user","content":"hi"}],"stream":false}`
req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(reqBody))
req.Header.Set("Authorization", "Bearer direct-token")
req.Header.Set("Content-Type", "application/json")
rec := httptest.NewRecorder()
newOpenAITestRouter(h).ServeHTTP(rec, req)
if rec.Code != http.StatusBadRequest {
t.Fatalf("expected content_filter 400, got %d body=%s", rec.Code, rec.Body.String())
}
if len(ds.payloads) != 1 {
t.Fatalf("expected no retry on content_filter, got %d calls", len(ds.payloads))
}
}
func TestResponsesStreamUsageIgnoresBatchAccumulatedTokenUsage(t *testing.T) {
statuses := make([]int, 0, 1)
h := &openAITestSurface{
@@ -287,6 +450,86 @@ func TestResponsesStreamUsageIgnoresBatchAccumulatedTokenUsage(t *testing.T) {
}
}
func TestResponsesStreamRetriesThinkingOnlyOutput(t *testing.T) {
ds := &streamStatusDSSeqStub{resps: []*http.Response{
makeOpenAISSEHTTPResponse(`data: {"p":"response/thinking_content","v":"plan"}`, "data: [DONE]"),
makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"visible"}`, "data: [DONE]"),
}}
h := &openAITestSurface{
Store: mockOpenAIConfig{wideInput: true},
Auth: streamStatusAuthStub{},
DS: ds,
}
reqBody := `{"model":"deepseek-v4-pro","input":"hi","stream":true}`
req := httptest.NewRequest(http.MethodPost, "/v1/responses", strings.NewReader(reqBody))
req.Header.Set("Authorization", "Bearer direct-token")
req.Header.Set("Content-Type", "application/json")
rec := httptest.NewRecorder()
newOpenAITestRouter(h).ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("expected 200, got %d body=%s", rec.Code, rec.Body.String())
}
if len(ds.payloads) != 2 {
t.Fatalf("expected one synthetic retry call, got %d", len(ds.payloads))
}
body := rec.Body.String()
if strings.Contains(body, "response.failed") {
t.Fatalf("did not expect premature response.failed, body=%s", body)
}
if !strings.Contains(body, "response.reasoning.delta") || !strings.Contains(body, "response.output_text.delta") || !strings.Contains(body, "response.completed") {
t.Fatalf("expected reasoning, text delta, and completed events, body=%s", body)
}
if strings.Count(body, "data: [DONE]") != 1 {
t.Fatalf("expected one [DONE], body=%s", body)
}
}
func TestResponsesNonStreamRetriesThinkingOnlyOutput(t *testing.T) {
ds := &streamStatusDSSeqStub{resps: []*http.Response{
makeOpenAISSEHTTPResponse(`data: {"p":"response/thinking_content","v":"plan"}`, "data: [DONE]"),
makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"visible"}`, "data: [DONE]"),
}}
h := &openAITestSurface{
Store: mockOpenAIConfig{wideInput: true},
Auth: streamStatusAuthStub{},
DS: ds,
}
reqBody := `{"model":"deepseek-v4-pro","input":"hi","stream":false}`
req := httptest.NewRequest(http.MethodPost, "/v1/responses", strings.NewReader(reqBody))
req.Header.Set("Authorization", "Bearer direct-token")
req.Header.Set("Content-Type", "application/json")
rec := httptest.NewRecorder()
newOpenAITestRouter(h).ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("expected 200 after retry, got %d body=%s", rec.Code, rec.Body.String())
}
if len(ds.payloads) != 2 {
t.Fatalf("expected one synthetic retry call, got %d", len(ds.payloads))
}
var out map[string]any
if err := json.Unmarshal(rec.Body.Bytes(), &out); err != nil {
t.Fatalf("decode response failed: %v body=%s", err, rec.Body.String())
}
if asString(out["output_text"]) != "visible" {
t.Fatalf("expected retry visible output_text, got %#v", out["output_text"])
}
output, _ := out["output"].([]any)
if len(output) == 0 {
t.Fatalf("expected output items, got %#v", out)
}
item, _ := output[0].(map[string]any)
content, _ := item["content"].([]any)
if len(content) == 0 {
t.Fatalf("expected content entries, got %#v", item)
}
reasoning, _ := content[0].(map[string]any)
if asString(reasoning["type"]) != "reasoning" || !strings.Contains(asString(reasoning["text"]), "plan") {
t.Fatalf("expected preserved reasoning entry, got %#v", content)
}
}
func TestResponsesNonStreamUsageIgnoresPromptAndOutputTokenUsage(t *testing.T) {
statuses := make([]int, 0, 1)
h := &openAITestSurface{