refactor: unify empty-output retry logic into shared completionruntime package and normalize protocol adapter boundary.

This commit is contained in:
CJACK
2026-05-10 00:10:53 +08:00
parent 067cf465bb
commit 7c66742a19
32 changed files with 930 additions and 371 deletions

View File

@@ -145,7 +145,7 @@ func (h *Handler) handleClaudeDirectStream(w http.ResponseWriter, r *http.Reques
return
}
streamReq := start.Request
h.handleClaudeStreamRealtime(w, r, start.Response, streamReq.ResponseModel, streamReq.Messages, streamReq.Thinking, streamReq.Search, streamReq.ToolNames, streamReq.ToolsRaw, historySession)
h.handleClaudeStreamRealtimeWithRetry(w, r, a, start.Response, start.Payload, start.Pow, streamReq.ResponseModel, streamReq.Messages, streamReq.Thinking, streamReq.Search, streamReq.ToolNames, streamReq.ToolsRaw, streamReq.PromptTokenText, historySession)
}
func (h *Handler) proxyViaOpenAI(w http.ResponseWriter, r *http.Request, store ConfigReader) bool {
@@ -360,3 +360,112 @@ func (h *Handler) handleClaudeStreamRealtime(w http.ResponseWriter, r *http.Requ
OnFinalize: streamRuntime.onFinalize,
})
}
// handleClaudeStreamRealtimeWithRetry streams a Claude-format SSE response to
// the client, delegating empty-output retry orchestration to the shared
// completionruntime package. A non-200 upstream response is recorded in the
// history session and forwarded to the client as an error; otherwise SSE
// headers are written and each upstream attempt is consumed by
// consumeClaudeStreamAttempt until a terminal event is emitted or the retry
// budget is exhausted.
func (h *Handler) handleClaudeStreamRealtimeWithRetry(w http.ResponseWriter, r *http.Request, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, model string, messages []any, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, promptTokenText string, historySession *responsehistory.Session) {
	if resp.StatusCode != http.StatusOK {
		defer func() { _ = resp.Body.Close() }()
		body, _ := io.ReadAll(resp.Body)
		detail := strings.TrimSpace(string(body))
		if historySession != nil {
			historySession.Error(resp.StatusCode, detail, "error", "", "")
		}
		// Propagate the upstream status code instead of masking every failure
		// as 500, consistent with the history record above and with the
		// Gemini handler's treatment of the same condition.
		writeClaudeError(w, resp.StatusCode, detail)
		return
	}
	w.Header().Set("Content-Type", "text/event-stream")
	w.Header().Set("Cache-Control", "no-cache, no-transform")
	w.Header().Set("Connection", "keep-alive")
	w.Header().Set("X-Accel-Buffering", "no")
	rc := http.NewResponseController(w)
	_, canFlush := w.(http.Flusher)
	if !canFlush {
		config.Logger.Warn("[claude_stream] response writer does not support flush; streaming may be buffered")
	}
	streamRuntime := newClaudeStreamRuntime(
		w,
		rc,
		canFlush,
		model,
		messages,
		thinkingEnabled,
		searchEnabled,
		stripReferenceMarkersEnabled(),
		toolNames,
		toolsRaw,
		promptTokenText,
		historySession,
	)
	streamRuntime.sendMessageStart()
	completionruntime.ExecuteStreamWithRetry(r.Context(), h.DS, a, resp, payload, pow, completionruntime.StreamRetryOptions{
		Surface:      "claude.messages",
		Stream:       true,
		RetryEnabled: true,
		MaxAttempts:  3,
		UsagePrompt:  promptTokenText,
	}, completionruntime.StreamRetryHooks{
		// Consume one upstream attempt; reports whether a terminal event was
		// written and whether an empty result may be retried.
		ConsumeAttempt: func(currentResp *http.Response, allowDeferEmpty bool) (bool, bool) {
			return h.consumeClaudeStreamAttempt(r, currentResp, streamRuntime, thinkingEnabled, allowDeferEmpty)
		},
		// Invoked when the retry budget is exhausted without terminal output.
		Finalize: func(_ int) {
			streamRuntime.finalize("end_turn", false)
		},
		// Parent message ID used to chain a synthetic retry turn upstream.
		ParentMessageID: func() int {
			return streamRuntime.responseMessageID
		},
		// Keep usage accounting in sync with the synthetic retry prompt.
		OnRetryPrompt: func(prompt string) {
			streamRuntime.promptTokenText = prompt
		},
		OnRetryFailure: func(status int, message, code string) {
			streamRuntime.sendErrorWithCode(status, strings.TrimSpace(message), code)
		},
	})
}
// consumeClaudeStreamAttempt drains one upstream SSE attempt into the stream
// runtime. It returns (terminalWritten, retryable): terminalWritten is true
// when a terminal client event (error or completed turn) was emitted, and
// retryable is true only when the attempt ended empty and may be retried.
func (h *Handler) consumeClaudeStreamAttempt(r *http.Request, resp *http.Response, streamRuntime *claudeStreamRuntime, thinkingEnabled bool, allowDeferEmpty bool) (bool, bool) {
	defer func() { _ = resp.Body.Close() }()
	firstBlockType := "text"
	if thinkingEnabled {
		firstBlockType = "thinking"
	}
	var (
		stopReason streamengine.StopReason
		consumeErr error
	)
	streamengine.ConsumeSSE(streamengine.ConsumeConfig{
		Context:             r.Context(),
		Body:                resp.Body,
		ThinkingEnabled:     thinkingEnabled,
		InitialType:         firstBlockType,
		KeepAliveInterval:   claudeStreamPingInterval,
		IdleTimeout:         claudeStreamIdleTimeout,
		MaxKeepAliveNoInput: claudeStreamMaxKeepaliveCnt,
	}, streamengine.ConsumeHooks{
		OnKeepAlive: func() {
			streamRuntime.sendPing()
		},
		OnParsed: streamRuntime.onParsed,
		OnFinalize: func(reason streamengine.StopReason, err error) {
			stopReason, consumeErr = reason, err
		},
	})
	// Archive partial output and emit a terminal error event to the client.
	failWith := func(message, code string) (bool, bool) {
		if streamRuntime.history != nil {
			streamRuntime.history.Error(500, message, code, responsehistory.ThinkingForArchive(streamRuntime.rawThinking.String(), streamRuntime.toolDetectionThinking.String(), streamRuntime.thinking.String()), responsehistory.TextForArchive(streamRuntime.rawText.String(), streamRuntime.text.String()))
		}
		streamRuntime.sendError(message)
		return true, false
	}
	if string(stopReason) == "upstream_error" {
		return failWith(streamRuntime.upstreamErr, "upstream_error")
	}
	if consumeErr != nil {
		return failWith(consumeErr.Error(), "error")
	}
	// finalize reports whether a terminal event was written; when empty output
	// is deferred for retry, nothing terminal was emitted yet.
	if streamRuntime.finalize("end_turn", allowDeferEmpty) {
		return true, false
	}
	return false, true
}

View File

@@ -29,9 +29,10 @@ type claudeStreamRuntime struct {
bufferToolContent bool
stripReferenceMarkers bool
messageID string
thinking strings.Builder
text strings.Builder
messageID string
thinking strings.Builder
text strings.Builder
responseMessageID int
sieve toolstream.State
rawText strings.Builder
@@ -92,6 +93,9 @@ func (s *claudeStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Parse
s.upstreamErr = parsed.ErrorMessage
return streamengine.ParsedDecision{Stop: true, StopReason: streamengine.StopReason("upstream_error")}
}
if parsed.ResponseMessageID > 0 {
s.responseMessageID = parsed.ResponseMessageID
}
if parsed.Stop {
return streamengine.ParsedDecision{Stop: true}
}

View File

@@ -22,16 +22,27 @@ func (s *claudeStreamRuntime) send(event string, v any) {
}
// sendError emits a terminal SSE error event with HTTP status 500 and the
// generic "internal_error" code; see sendErrorWithCode for the event shape.
func (s *claudeStreamRuntime) sendError(message string) {
	s.sendErrorWithCode(500, message, "internal_error")
}
func (s *claudeStreamRuntime) sendErrorWithCode(status int, message, code string) {
msg := strings.TrimSpace(message)
if msg == "" {
msg = "upstream stream error"
}
if code == "" {
code = "internal_error"
}
errType := "api_error"
if status == 429 {
errType = "rate_limit_error"
}
s.send("error", map[string]any{
"type": "error",
"error": map[string]any{
"type": "api_error",
"type": errType,
"message": msg,
"code": "internal_error",
"code": code,
"param": nil,
},
})

View File

@@ -63,13 +63,10 @@ func (s *claudeStreamRuntime) sendToolUseBlock(idx int, tc toolcall.ParsedToolCa
})
}
func (s *claudeStreamRuntime) finalize(stopReason string) {
func (s *claudeStreamRuntime) finalize(stopReason string, deferEmptyOutput bool) bool {
if s.ended {
return
return true
}
s.ended = true
s.closeThinkingBlock()
if s.bufferToolContent {
for _, evt := range toolstream.Flush(&s.sieve, s.toolNames) {
@@ -123,6 +120,7 @@ func (s *claudeStreamRuntime) finalize(stopReason string) {
RawThinking: s.rawThinking.String(),
VisibleThinking: s.thinking.String(),
DetectionThinking: s.toolDetectionThinking.String(),
ResponseMessageID: s.responseMessageID,
AlreadyEmittedCalls: s.toolCallsDetected,
AlreadyEmittedToolRaw: s.toolCallsDetected,
}, assistantturn.BuildOptions{
@@ -137,6 +135,22 @@ func (s *claudeStreamRuntime) finalize(stopReason string) {
outcome := assistantturn.FinalizeTurn(turn, assistantturn.FinalizeOptions{
AlreadyEmittedToolCalls: s.toolCallsDetected,
})
if outcome.ShouldFail {
if deferEmptyOutput {
return false
}
s.ended = true
s.closeThinkingBlock()
s.closeTextBlock()
if s.history != nil {
s.history.Error(outcome.Error.Status, outcome.Error.Message, outcome.Error.Code, responsehistory.ThinkingForArchive(turn.RawThinking, turn.DetectionThinking, turn.Thinking), responsehistory.TextForArchive(turn.RawText, turn.Text))
}
s.sendErrorWithCode(outcome.Error.Status, outcome.Error.Message, outcome.Error.Code)
return true
}
s.ended = true
s.closeThinkingBlock()
if s.bufferToolContent && !s.toolCallsDetected {
if len(turn.ToolCalls) > 0 {
@@ -197,6 +211,7 @@ func (s *claudeStreamRuntime) finalize(stopReason string) {
},
})
s.send("message_stop", map[string]any{"type": "message_stop"})
return true
}
func (s *claudeStreamRuntime) onFinalize(reason streamengine.StopReason, scannerErr error) {
@@ -214,5 +229,5 @@ func (s *claudeStreamRuntime) onFinalize(reason streamengine.StopReason, scanner
s.sendError(scannerErr.Error())
return
}
s.finalize("end_turn")
s.finalize("end_turn", false)
}

View File

@@ -137,7 +137,7 @@ func (h *Handler) handleGeminiDirectStream(w http.ResponseWriter, r *http.Reques
return
}
streamReq := start.Request
h.handleStreamGenerateContent(w, r, start.Response, streamReq.ResponseModel, streamReq.PromptTokenText, streamReq.Thinking, streamReq.Search, streamReq.ToolNames, streamReq.ToolsRaw, historySession)
h.handleStreamGenerateContentWithRetry(w, r, a, start.Response, start.Payload, start.Pow, streamReq.ResponseModel, streamReq.PromptTokenText, streamReq.Thinking, streamReq.Search, streamReq.ToolNames, streamReq.ToolsRaw, historySession)
}
func (h *Handler) proxyViaOpenAI(w http.ResponseWriter, r *http.Request, stream bool) bool {

View File

@@ -1,6 +1,7 @@
package gemini
import (
"context"
"encoding/json"
"io"
"net/http"
@@ -8,6 +9,8 @@ import (
"time"
"ds2api/internal/assistantturn"
"ds2api/internal/auth"
"ds2api/internal/completionruntime"
dsprotocol "ds2api/internal/deepseek/protocol"
"ds2api/internal/responsehistory"
"ds2api/internal/sse"
@@ -54,7 +57,7 @@ func (h *Handler) handleStreamGenerateContent(w http.ResponseWriter, r *http.Req
}, streamengine.ConsumeHooks{
OnParsed: runtime.onParsed,
OnFinalize: func(_ streamengine.StopReason, _ error) {
runtime.finalize()
runtime.finalize(false)
},
})
}
@@ -78,9 +81,83 @@ type geminiStreamRuntime struct {
accumulator *assistantturn.Accumulator
contentFilter bool
responseMessageID int
finalErrorStatus int
finalErrorMessage string
finalErrorCode string
history *responsehistory.Session
}
// handleStreamGenerateContentWithRetry streams a Gemini generateContent
// response over SSE, handing empty-output retry orchestration to the shared
// completionruntime package. Non-200 upstream responses are archived in the
// history session and surfaced to the client immediately.
func (h *Handler) handleStreamGenerateContentWithRetry(w http.ResponseWriter, r *http.Request, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, model, finalPrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, historySession *responsehistory.Session) {
	if resp.StatusCode != http.StatusOK {
		defer func() { _ = resp.Body.Close() }()
		raw, _ := io.ReadAll(resp.Body)
		detail := strings.TrimSpace(string(raw))
		if historySession != nil {
			historySession.Error(resp.StatusCode, detail, "error", "", "")
		}
		writeGeminiError(w, resp.StatusCode, detail)
		return
	}
	hdr := w.Header()
	hdr.Set("Content-Type", "text/event-stream")
	hdr.Set("Cache-Control", "no-cache, no-transform")
	hdr.Set("Connection", "keep-alive")
	hdr.Set("X-Accel-Buffering", "no")
	ctrl := http.NewResponseController(w)
	_, flushable := w.(http.Flusher)
	rt := newGeminiStreamRuntime(w, ctrl, flushable, model, finalPrompt, thinkingEnabled, searchEnabled, stripReferenceMarkersEnabled(), toolNames, toolsRaw, historySession)
	completionruntime.ExecuteStreamWithRetry(r.Context(), h.DS, a, resp, payload, pow, completionruntime.StreamRetryOptions{
		Surface:      "gemini.generate_content",
		Stream:       true,
		RetryEnabled: true,
		MaxAttempts:  3,
		UsagePrompt:  finalPrompt,
	}, completionruntime.StreamRetryHooks{
		// Drain one upstream attempt; reports terminal-written / retryable.
		ConsumeAttempt: func(attemptResp *http.Response, allowDeferEmpty bool) (bool, bool) {
			return h.consumeGeminiStreamAttempt(r.Context(), attemptResp, rt, thinkingEnabled, allowDeferEmpty)
		},
		// Retry budget exhausted: finalize without deferring empty output.
		Finalize: func(_ int) {
			rt.finalize(false)
		},
		ParentMessageID: func() int {
			return rt.responseMessageID
		},
		// Keep usage accounting aligned with the synthetic retry prompt.
		OnRetryPrompt: func(prompt string) {
			rt.finalPrompt = prompt
		},
		OnRetryFailure: func(status int, message, _ string) {
			rt.sendErrorChunk(status, strings.TrimSpace(message))
		},
	})
}
// consumeGeminiStreamAttempt drains one upstream SSE attempt into the Gemini
// stream runtime and returns (terminalWritten, retryable): terminalWritten is
// true when finalize emitted a terminal chunk, retryable is true only when an
// empty result was deferred for another attempt.
func (h *Handler) consumeGeminiStreamAttempt(ctx context.Context, resp *http.Response, runtime *geminiStreamRuntime, thinkingEnabled bool, allowDeferEmpty bool) (bool, bool) {
	defer func() { _ = resp.Body.Close() }()
	blockType := "text"
	if thinkingEnabled {
		blockType = "thinking"
	}
	cfg := streamengine.ConsumeConfig{
		Context:             ctx,
		Body:                resp.Body,
		ThinkingEnabled:     thinkingEnabled,
		InitialType:         blockType,
		KeepAliveInterval:   time.Duration(dsprotocol.KeepAliveTimeout) * time.Second,
		IdleTimeout:         time.Duration(dsprotocol.StreamIdleTimeout) * time.Second,
		MaxKeepAliveNoInput: dsprotocol.MaxKeepaliveCount,
	}
	streamengine.ConsumeSSE(cfg, streamengine.ConsumeHooks{
		OnParsed: runtime.onParsed,
		// Final disposition is decided by runtime.finalize below, not here.
		OnFinalize: func(_ streamengine.StopReason, _ error) {
		},
	})
	if runtime.finalize(allowDeferEmpty) {
		return true, false
	}
	return false, true
}
//nolint:unused // retained for native Gemini stream handling path.
func newGeminiStreamRuntime(
w http.ResponseWriter,
@@ -127,6 +204,35 @@ func (s *geminiStreamRuntime) sendChunk(payload map[string]any) {
}
}
// sendErrorChunk emits a single Gemini-style error chunk, mapping the HTTP
// status code to a google.rpc-style status string. An empty message falls
// back to the standard HTTP status text.
func (s *geminiStreamRuntime) sendErrorChunk(status int, message string) {
	text := strings.TrimSpace(message)
	if text == "" {
		text = http.StatusText(status)
	}
	var rpcStatus string
	switch {
	case status == http.StatusUnauthorized:
		rpcStatus = "UNAUTHENTICATED"
	case status == http.StatusForbidden:
		rpcStatus = "PERMISSION_DENIED"
	case status == http.StatusTooManyRequests:
		rpcStatus = "RESOURCE_EXHAUSTED"
	case status == http.StatusNotFound:
		rpcStatus = "NOT_FOUND"
	case status >= 500:
		rpcStatus = "INTERNAL"
	default:
		// All other client errors are reported as invalid arguments.
		rpcStatus = "INVALID_ARGUMENT"
	}
	s.sendChunk(map[string]any{
		"error": map[string]any{
			"code":    status,
			"message": text,
			"status":  rpcStatus,
		},
	})
}
//nolint:unused // retained for native Gemini stream handling path.
func (s *geminiStreamRuntime) onParsed(parsed sse.LineResult) streamengine.ParsedDecision {
if !parsed.Parsed {
@@ -192,7 +298,7 @@ func (s *geminiStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Parse
}
//nolint:unused // retained for native Gemini stream handling path.
func (s *geminiStreamRuntime) finalize() {
func (s *geminiStreamRuntime) finalize(deferEmptyOutput bool) bool {
rawText, text, rawThinking, thinking, detectionThinking := s.accumulator.Snapshot()
turn := assistantturn.BuildTurnFromStreamSnapshot(assistantturn.StreamSnapshot{
RawText: rawText,
@@ -211,6 +317,19 @@ func (s *geminiStreamRuntime) finalize() {
ToolsRaw: s.toolsRaw,
})
outcome := assistantturn.FinalizeTurn(turn, assistantturn.FinalizeOptions{})
if outcome.ShouldFail {
if deferEmptyOutput {
s.finalErrorStatus = outcome.Error.Status
s.finalErrorMessage = outcome.Error.Message
s.finalErrorCode = outcome.Error.Code
return false
}
if s.history != nil {
s.history.Error(outcome.Error.Status, outcome.Error.Message, outcome.Error.Code, responsehistory.ThinkingForArchive(turn.RawThinking, turn.DetectionThinking, turn.Thinking), responsehistory.TextForArchive(turn.RawText, turn.Text))
}
s.sendErrorChunk(outcome.Error.Status, outcome.Error.Message)
return true
}
if s.history != nil {
s.history.Success(
http.StatusOK,
@@ -257,4 +376,5 @@ func (s *geminiStreamRuntime) finalize() {
"totalTokenCount": outcome.Usage.TotalTokens,
},
})
return true
}

View File

@@ -4,11 +4,11 @@ import (
"context"
"io"
"net/http"
"strings"
"time"
"ds2api/internal/assistantturn"
"ds2api/internal/auth"
"ds2api/internal/completionruntime"
"ds2api/internal/config"
dsprotocol "ds2api/internal/deepseek/protocol"
openaifmt "ds2api/internal/format/openai"
@@ -17,148 +17,53 @@ import (
streamengine "ds2api/internal/stream"
)
type chatNonStreamResult struct {
rawThinking string
rawText string
thinking string
toolDetectionThinking string
text string
contentFilter bool
detectedCalls int
body map[string]any
finishReason string
responseMessageID int
outputError *assistantturn.OutputError
}
func (r chatNonStreamResult) historyText() string {
return historyTextForArchive(r.rawText, r.text)
}
func (r chatNonStreamResult) historyThinking() string {
return historyThinkingForArchive(r.rawThinking, r.toolDetectionThinking, r.thinking)
}
func (h *Handler) handleNonStreamWithRetry(w http.ResponseWriter, ctx context.Context, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, completionID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, historySession *chatHistorySession) {
attempts := 0
currentResp := resp
usagePrompt := finalPrompt
accumulatedThinking := ""
accumulatedRawThinking := ""
accumulatedToolDetectionThinking := ""
for {
result, ok := h.collectChatNonStreamAttempt(w, currentResp, completionID, model, usagePrompt, thinkingEnabled, searchEnabled, toolNames, toolsRaw)
if !ok {
return
}
accumulatedThinking += sse.TrimContinuationOverlap(accumulatedThinking, result.thinking)
accumulatedRawThinking += sse.TrimContinuationOverlap(accumulatedRawThinking, result.rawThinking)
accumulatedToolDetectionThinking += sse.TrimContinuationOverlap(accumulatedToolDetectionThinking, result.toolDetectionThinking)
result.thinking = accumulatedThinking
result.rawThinking = accumulatedRawThinking
result.toolDetectionThinking = accumulatedToolDetectionThinking
detected := detectAssistantToolCalls(result.rawText, result.text, result.rawThinking, result.toolDetectionThinking, toolNames)
result.detectedCalls = len(detected.Calls)
result.body = openaifmt.BuildChatCompletionWithToolCalls(completionID, model, usagePrompt, result.thinking, result.text, detected.Calls, toolsRaw)
addRefFileTokensToUsage(result.body, refFileTokens)
result.finishReason = chatFinishReason(result.body)
if !shouldRetryChatNonStream(result, attempts) {
h.finishChatNonStreamResult(w, result, attempts, usagePrompt, refFileTokens, historySession)
return
}
attempts++
config.Logger.Info("[openai_empty_retry] attempting synthetic retry", "surface", "chat.completions", "stream", false, "retry_attempt", attempts, "parent_message_id", result.responseMessageID)
retryPow, powErr := h.DS.GetPow(ctx, a, 3)
if powErr != nil {
config.Logger.Warn("[openai_empty_retry] retry PoW fetch failed, falling back to original PoW", "surface", "chat.completions", "stream", false, "retry_attempt", attempts, "error", powErr)
retryPow = pow
}
retryPayload := clonePayloadForEmptyOutputRetry(payload, result.responseMessageID)
nextResp, err := h.DS.CallCompletion(ctx, a, retryPayload, retryPow, 3)
if err != nil {
if historySession != nil {
historySession.error(http.StatusInternalServerError, "Failed to get completion.", "error", result.historyThinking(), result.historyText())
}
writeOpenAIError(w, http.StatusInternalServerError, "Failed to get completion.")
config.Logger.Warn("[openai_empty_retry] retry request failed", "surface", "chat.completions", "stream", false, "retry_attempt", attempts, "error", err)
return
}
usagePrompt = usagePromptWithEmptyOutputRetry(usagePrompt, attempts)
currentResp = nextResp
}
}
func (h *Handler) collectChatNonStreamAttempt(w http.ResponseWriter, resp *http.Response, completionID, model, usagePrompt string, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any) (chatNonStreamResult, bool) {
if resp.StatusCode != http.StatusOK {
defer func() { _ = resp.Body.Close() }()
body, _ := io.ReadAll(resp.Body)
writeOpenAIError(w, resp.StatusCode, string(body))
return chatNonStreamResult{}, false
}
result := sse.CollectStream(resp, thinkingEnabled, true)
turn := assistantturn.BuildTurnFromCollected(result, assistantturn.BuildOptions{
Model: model,
Prompt: usagePrompt,
SearchEnabled: searchEnabled,
ToolNames: toolNames,
ToolsRaw: toolsRaw,
})
respBody := openaifmt.BuildChatCompletionWithToolCalls(completionID, model, usagePrompt, turn.Thinking, turn.Text, turn.ToolCalls, toolsRaw)
return chatNonStreamResult{
rawThinking: result.Thinking,
rawText: result.Text,
thinking: turn.Thinking,
toolDetectionThinking: result.ToolDetectionThinking,
text: turn.Text,
contentFilter: result.ContentFilter,
detectedCalls: len(turn.ToolCalls),
body: respBody,
finishReason: chatFinishReason(respBody),
responseMessageID: result.ResponseMessageID,
outputError: turn.Error,
}, true
}
func (h *Handler) finishChatNonStreamResult(w http.ResponseWriter, result chatNonStreamResult, attempts int, usagePrompt string, refFileTokens int, historySession *chatHistorySession) {
if result.detectedCalls == 0 && strings.TrimSpace(result.text) == "" {
status, message, code := upstreamEmptyOutputDetail(result.contentFilter, result.text, result.thinking)
if result.outputError != nil {
status, message, code = result.outputError.Status, result.outputError.Message, result.outputError.Code
}
if historySession != nil {
historySession.error(status, message, code, result.historyThinking(), result.historyText())
historySession.error(resp.StatusCode, string(body), "error", "", "")
}
writeOpenAIErrorWithCode(w, status, message, code)
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "chat.completions", "stream", false, "retry_attempts", attempts, "success_source", "none", "content_filter", result.contentFilter)
writeOpenAIError(w, resp.StatusCode, string(body))
return
}
if historySession != nil {
historySession.success(http.StatusOK, result.historyThinking(), result.historyText(), result.finishReason, openaifmt.BuildChatUsageForModel("", usagePrompt, result.thinking, result.text, refFileTokens))
stdReq := promptcompat.StandardRequest{
Surface: "chat.completions",
ResponseModel: model,
PromptTokenText: finalPrompt,
FinalPrompt: finalPrompt,
RefFileTokens: refFileTokens,
Thinking: thinkingEnabled,
Search: searchEnabled,
ToolNames: toolNames,
ToolsRaw: toolsRaw,
ToolChoice: promptcompat.DefaultToolChoicePolicy(),
}
writeJSON(w, http.StatusOK, result.body)
source := "first_attempt"
if attempts > 0 {
source = "synthetic_retry"
}
config.Logger.Info("[openai_empty_retry] completed", "surface", "chat.completions", "stream", false, "retry_attempts", attempts, "success_source", source)
}
func chatFinishReason(respBody map[string]any) string {
if choices, ok := respBody["choices"].([]map[string]any); ok && len(choices) > 0 {
if fr, _ := choices[0]["finish_reason"].(string); strings.TrimSpace(fr) != "" {
return fr
retryEnabled := h != nil && h.DS != nil && emptyOutputRetryEnabled()
result, outErr := completionruntime.ExecuteNonStreamStartedWithRetry(ctx, h.DS, a, completionruntime.StartResult{
SessionID: completionID,
Payload: payload,
Pow: pow,
Response: resp,
Request: stdReq,
}, completionruntime.Options{
RetryEnabled: retryEnabled,
RetryMaxAttempts: emptyOutputRetryMaxAttempts(),
})
if outErr != nil {
if historySession != nil {
historySession.error(outErr.Status, outErr.Message, outErr.Code, historyThinkingForArchive(result.Turn.RawThinking, result.Turn.DetectionThinking, result.Turn.Thinking), historyTextForArchive(result.Turn.RawText, result.Turn.Text))
}
writeOpenAIErrorWithCode(w, outErr.Status, outErr.Message, outErr.Code)
return
}
return "stop"
}
func shouldRetryChatNonStream(result chatNonStreamResult, attempts int) bool {
return emptyOutputRetryEnabled() &&
attempts < emptyOutputRetryMaxAttempts() &&
!result.contentFilter &&
result.detectedCalls == 0 &&
strings.TrimSpace(result.text) == ""
respBody := openaifmt.BuildChatCompletionWithToolCalls(result.SessionID, model, result.Turn.Prompt, result.Turn.Thinking, result.Turn.Text, result.Turn.ToolCalls, toolsRaw)
respBody["usage"] = assistantturn.OpenAIChatUsage(result.Turn)
outcome := assistantturn.FinalizeTurn(result.Turn, assistantturn.FinalizeOptions{})
if historySession != nil {
historySession.success(http.StatusOK, historyThinkingForArchive(result.Turn.RawThinking, result.Turn.DetectionThinking, result.Turn.Thinking), historyTextForArchive(result.Turn.RawText, result.Turn.Text), outcome.FinishReason, assistantturn.OpenAIChatUsage(result.Turn))
}
writeJSON(w, http.StatusOK, respBody)
}
func (h *Handler) handleStreamWithRetry(w http.ResponseWriter, r *http.Request, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, completionID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, toolChoice promptcompat.ToolChoicePolicy, historySession *chatHistorySession) {
@@ -166,42 +71,35 @@ func (h *Handler) handleStreamWithRetry(w http.ResponseWriter, r *http.Request,
if !ok {
return
}
attempts := 0
currentResp := resp
for {
terminalWritten, retryable := h.consumeChatStreamAttempt(r, currentResp, streamRuntime, initialType, thinkingEnabled, historySession, attempts < emptyOutputRetryMaxAttempts())
if terminalWritten {
logChatStreamTerminal(streamRuntime, attempts)
return
}
if !retryable || !emptyOutputRetryEnabled() || attempts >= emptyOutputRetryMaxAttempts() {
completionruntime.ExecuteStreamWithRetry(r.Context(), h.DS, a, resp, payload, pow, completionruntime.StreamRetryOptions{
Surface: "chat.completions",
Stream: true,
RetryEnabled: emptyOutputRetryEnabled(),
RetryMaxAttempts: emptyOutputRetryMaxAttempts(),
MaxAttempts: 3,
UsagePrompt: finalPrompt,
}, completionruntime.StreamRetryHooks{
ConsumeAttempt: func(currentResp *http.Response, allowDeferEmpty bool) (bool, bool) {
return h.consumeChatStreamAttempt(r, currentResp, streamRuntime, initialType, thinkingEnabled, historySession, allowDeferEmpty)
},
Finalize: func(attempts int) {
streamRuntime.finalize("stop", false)
recordChatStreamHistory(streamRuntime, historySession)
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "chat.completions", "stream", true, "retry_attempts", attempts, "success_source", "none")
return
}
attempts++
config.Logger.Info("[openai_empty_retry] attempting synthetic retry", "surface", "chat.completions", "stream", true, "retry_attempt", attempts, "parent_message_id", streamRuntime.responseMessageID)
retryPow, powErr := h.DS.GetPow(r.Context(), a, 3)
if powErr != nil {
config.Logger.Warn("[openai_empty_retry] retry PoW fetch failed, falling back to original PoW", "surface", "chat.completions", "stream", true, "retry_attempt", attempts, "error", powErr)
retryPow = pow
}
nextResp, err := h.DS.CallCompletion(r.Context(), a, clonePayloadForEmptyOutputRetry(payload, streamRuntime.responseMessageID), retryPow, 3)
if err != nil {
failChatStreamRetry(streamRuntime, historySession, http.StatusInternalServerError, "Failed to get completion.", "error")
config.Logger.Warn("[openai_empty_retry] retry request failed", "surface", "chat.completions", "stream", true, "retry_attempt", attempts, "error", err)
return
}
if nextResp.StatusCode != http.StatusOK {
defer func() { _ = nextResp.Body.Close() }()
body, _ := io.ReadAll(nextResp.Body)
failChatStreamRetry(streamRuntime, historySession, nextResp.StatusCode, string(body), "error")
return
}
streamRuntime.finalPrompt = usagePromptWithEmptyOutputRetry(finalPrompt, attempts)
currentResp = nextResp
}
},
ParentMessageID: func() int {
return streamRuntime.responseMessageID
},
OnRetryPrompt: func(prompt string) {
streamRuntime.finalPrompt = prompt
},
OnRetryFailure: func(status int, message, code string) {
failChatStreamRetry(streamRuntime, historySession, status, message, code)
},
OnTerminal: func(attempts int) {
logChatStreamTerminal(streamRuntime, attempts)
},
})
}
func (h *Handler) prepareChatStreamRuntime(w http.ResponseWriter, resp *http.Response, completionID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, toolChoice promptcompat.ToolChoicePolicy, historySession *chatHistorySession) (*chatStreamRuntime, string, bool) {

View File

@@ -106,10 +106,6 @@ func cleanVisibleOutput(text string, stripReferenceMarkers bool) string {
return shared.CleanVisibleOutput(text, stripReferenceMarkers)
}
func upstreamEmptyOutputDetail(contentFilter bool, text, thinking string) (int, string, string) {
return shared.UpstreamEmptyOutputDetail(contentFilter, text, thinking)
}
// emptyOutputRetryEnabled reports whether the empty-output retry feature is
// enabled; thin forwarding wrapper over the shared package.
func emptyOutputRetryEnabled() bool {
	return shared.EmptyOutputRetryEnabled()
}
@@ -118,14 +114,6 @@ func emptyOutputRetryMaxAttempts() int {
return shared.EmptyOutputRetryMaxAttempts()
}
func clonePayloadForEmptyOutputRetry(payload map[string]any, parentMessageID int) map[string]any {
return shared.ClonePayloadForEmptyOutputRetry(payload, parentMessageID)
}
func usagePromptWithEmptyOutputRetry(originalPrompt string, retryAttempts int) string {
return shared.UsagePromptWithEmptyOutputRetry(originalPrompt, retryAttempts)
}
// formatIncrementalStreamToolCallDeltas formats streamed tool-call deltas
// using the stable IDs in ids; thin forwarding wrapper over the shared
// package.
func formatIncrementalStreamToolCallDeltas(deltas []toolstream.ToolCallDelta, ids map[int]string) []map[string]any {
	return shared.FormatIncrementalStreamToolCallDeltas(deltas, ids)
}
@@ -137,7 +125,3 @@ func filterIncrementalToolCallDeltasByAllowed(deltas []toolstream.ToolCallDelta,
// formatFinalStreamToolCallsWithStableIDs formats the final set of parsed
// tool calls, reusing the stable IDs in ids; thin forwarding wrapper over the
// shared package.
func formatFinalStreamToolCallsWithStableIDs(calls []toolcall.ParsedToolCall, ids map[int]string, toolsRaw any) []map[string]any {
	return shared.FormatFinalStreamToolCallsWithStableIDs(calls, ids, toolsRaw)
}
func detectAssistantToolCalls(rawText, visibleText, exposedThinking, detectionThinking string, toolNames []string) toolcall.ToolCallParseResult {
return shared.DetectAssistantToolCalls(rawText, visibleText, exposedThinking, detectionThinking, toolNames)
}

View File

@@ -85,8 +85,7 @@ func streamFinishReason(frames []map[string]any) string {
return ""
}
// Backward-compatible alias for historical test name used in CI logs.
func TestHandleNonStreamReturns429WhenUpstreamOutputEmpty(t *testing.T) {
func TestHandleNonStreamSingleAttemptReturns503WhenUpstreamOutputEmpty(t *testing.T) {
h := &Handler{}
resp := makeSSEHTTPResponse(
`data: {"p":"response/content","v":""}`,
@@ -95,17 +94,17 @@ func TestHandleNonStreamReturns429WhenUpstreamOutputEmpty(t *testing.T) {
rec := httptest.NewRecorder()
h.handleNonStream(rec, resp, "cid-empty", "deepseek-v4-flash", "prompt", 0, false, false, nil, nil, nil)
if rec.Code != http.StatusTooManyRequests {
t.Fatalf("expected status 429 for empty upstream output, got %d body=%s", rec.Code, rec.Body.String())
if rec.Code != http.StatusServiceUnavailable {
t.Fatalf("expected status 503 for empty upstream output, got %d body=%s", rec.Code, rec.Body.String())
}
out := decodeJSONBody(t, rec.Body.String())
errObj, _ := out["error"].(map[string]any)
if asString(errObj["code"]) != "upstream_empty_output" {
t.Fatalf("expected code=upstream_empty_output, got %#v", out)
if asString(errObj["code"]) != "upstream_unavailable" {
t.Fatalf("expected code=upstream_unavailable, got %#v", out)
}
}
func TestHandleNonStreamReturnsContentFilterErrorWhenUpstreamFilteredWithoutOutput(t *testing.T) {
func TestHandleNonStreamSingleAttemptReturnsContentFilterErrorWhenUpstreamFilteredWithoutOutput(t *testing.T) {
h := &Handler{}
resp := makeSSEHTTPResponse(
`data: {"code":"content_filter"}`,
@@ -124,7 +123,7 @@ func TestHandleNonStreamReturnsContentFilterErrorWhenUpstreamFilteredWithoutOutp
}
}
func TestHandleNonStreamReturns429WhenUpstreamHasOnlyThinking(t *testing.T) {
func TestHandleNonStreamSingleAttemptReturns429WhenUpstreamHasOnlyThinking(t *testing.T) {
h := &Handler{}
resp := makeSSEHTTPResponse(
`data: {"p":"response/thinking_content","v":"Only thinking"}`,

View File

@@ -1,26 +0,0 @@
package chat
// addRefFileTokensToUsage adds inline-uploaded file token estimates to an
// existing usage map inside a response object. This keeps the token
// accounting aware of file content that the upstream model processes but that
// is not part of the prompt text. Keys that are absent or not plain ints are
// left untouched; nil objects and non-positive token counts are no-ops.
func addRefFileTokensToUsage(obj map[string]any, refFileTokens int) {
	if refFileTokens <= 0 || obj == nil {
		return
	}
	usage, ok := obj["usage"].(map[string]any)
	if !ok || usage == nil {
		return
	}
	// bump increments a counter in place when it exists and holds an int.
	bump := func(key string) {
		if n, ok := usage[key].(int); ok {
			usage[key] = n + refFileTokens
		}
	}
	bump("input_tokens")
	bump("prompt_tokens")
	bump("total_tokens")
}

View File

@@ -7,6 +7,7 @@ import (
"time"
"ds2api/internal/auth"
"ds2api/internal/completionruntime"
"ds2api/internal/config"
dsprotocol "ds2api/internal/deepseek/protocol"
"ds2api/internal/promptcompat"
@@ -19,41 +20,34 @@ func (h *Handler) handleResponsesStreamWithRetry(w http.ResponseWriter, r *http.
if !ok {
return
}
attempts := 0
currentResp := resp
for {
terminalWritten, retryable := h.consumeResponsesStreamAttempt(r, currentResp, streamRuntime, initialType, thinkingEnabled, attempts < emptyOutputRetryMaxAttempts())
if terminalWritten {
logResponsesStreamTerminal(streamRuntime, attempts)
return
}
if !retryable || !emptyOutputRetryEnabled() || attempts >= emptyOutputRetryMaxAttempts() {
completionruntime.ExecuteStreamWithRetry(r.Context(), h.DS, a, resp, payload, pow, completionruntime.StreamRetryOptions{
Surface: "responses",
Stream: true,
RetryEnabled: emptyOutputRetryEnabled(),
RetryMaxAttempts: emptyOutputRetryMaxAttempts(),
MaxAttempts: 3,
UsagePrompt: finalPrompt,
}, completionruntime.StreamRetryHooks{
ConsumeAttempt: func(currentResp *http.Response, allowDeferEmpty bool) (bool, bool) {
return h.consumeResponsesStreamAttempt(r, currentResp, streamRuntime, initialType, thinkingEnabled, allowDeferEmpty)
},
Finalize: func(attempts int) {
streamRuntime.finalize("stop", false)
config.Logger.Info("[openai_empty_retry] terminal empty output", "surface", "responses", "stream", true, "retry_attempts", attempts, "success_source", "none", "error_code", streamRuntime.finalErrorCode)
return
}
attempts++
config.Logger.Info("[openai_empty_retry] attempting synthetic retry", "surface", "responses", "stream", true, "retry_attempt", attempts, "parent_message_id", streamRuntime.responseMessageID)
retryPow, powErr := h.DS.GetPow(r.Context(), a, 3)
if powErr != nil {
config.Logger.Warn("[openai_empty_retry] retry PoW fetch failed, falling back to original PoW", "surface", "responses", "stream", true, "retry_attempt", attempts, "error", powErr)
retryPow = pow
}
nextResp, err := h.DS.CallCompletion(r.Context(), a, clonePayloadForEmptyOutputRetry(payload, streamRuntime.responseMessageID), retryPow, 3)
if err != nil {
streamRuntime.failResponse(http.StatusInternalServerError, "Failed to get completion.", "error")
config.Logger.Warn("[openai_empty_retry] retry request failed", "surface", "responses", "stream", true, "retry_attempt", attempts, "error", err)
return
}
if nextResp.StatusCode != http.StatusOK {
defer func() { _ = nextResp.Body.Close() }()
body, _ := io.ReadAll(nextResp.Body)
streamRuntime.failResponse(nextResp.StatusCode, strings.TrimSpace(string(body)), "error")
return
}
streamRuntime.finalPrompt = usagePromptWithEmptyOutputRetry(finalPrompt, attempts)
currentResp = nextResp
}
},
ParentMessageID: func() int {
return streamRuntime.responseMessageID
},
OnRetryPrompt: func(prompt string) {
streamRuntime.finalPrompt = prompt
},
OnRetryFailure: func(status int, message, code string) {
streamRuntime.failResponse(status, strings.TrimSpace(message), code)
},
OnTerminal: func(attempts int) {
logResponsesStreamTerminal(streamRuntime, attempts)
},
})
}
func (h *Handler) prepareResponsesStreamRuntime(w http.ResponseWriter, resp *http.Response, owner, responseID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, toolChoice promptcompat.ToolChoicePolicy, traceID string, historySession *responsehistory.Session) (*responsesStreamRuntime, string, bool) {

View File

@@ -103,14 +103,6 @@ func emptyOutputRetryMaxAttempts() int {
return shared.EmptyOutputRetryMaxAttempts()
}
func clonePayloadForEmptyOutputRetry(payload map[string]any, parentMessageID int) map[string]any {
return shared.ClonePayloadForEmptyOutputRetry(payload, parentMessageID)
}
func usagePromptWithEmptyOutputRetry(originalPrompt string, retryAttempts int) string {
return shared.UsagePromptWithEmptyOutputRetry(originalPrompt, retryAttempts)
}
func filterIncrementalToolCallDeltasByAllowed(deltas []toolstream.ToolCallDelta, seenNames map[int]string) []toolstream.ToolCallDelta {
return shared.FilterIncrementalToolCallDeltasByAllowed(deltas, seenNames)
}

View File

@@ -397,7 +397,7 @@ func TestHandleResponsesNonStreamRequiredToolChoiceIgnoresThinkingToolPayloadWhe
}
}
func TestHandleResponsesNonStreamReturns429WhenUpstreamOutputEmpty(t *testing.T) {
func TestHandleResponsesNonStreamSingleAttemptReturns503WhenUpstreamOutputEmpty(t *testing.T) {
h := &Handler{}
rec := httptest.NewRecorder()
resp := &http.Response{
@@ -409,17 +409,17 @@ func TestHandleResponsesNonStreamReturns429WhenUpstreamOutputEmpty(t *testing.T)
}
h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-v4-flash", "prompt", 0, false, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "")
if rec.Code != http.StatusTooManyRequests {
t.Fatalf("expected 429 for empty upstream output, got %d body=%s", rec.Code, rec.Body.String())
if rec.Code != http.StatusServiceUnavailable {
t.Fatalf("expected 503 for empty upstream output, got %d body=%s", rec.Code, rec.Body.String())
}
out := decodeJSONBody(t, rec.Body.String())
errObj, _ := out["error"].(map[string]any)
if asString(errObj["code"]) != "upstream_empty_output" {
t.Fatalf("expected code=upstream_empty_output, got %#v", out)
if asString(errObj["code"]) != "upstream_unavailable" {
t.Fatalf("expected code=upstream_unavailable, got %#v", out)
}
}
func TestHandleResponsesNonStreamReturnsContentFilterErrorWhenUpstreamFilteredWithoutOutput(t *testing.T) {
func TestHandleResponsesNonStreamSingleAttemptReturnsContentFilterErrorWhenUpstreamFilteredWithoutOutput(t *testing.T) {
h := &Handler{}
rec := httptest.NewRecorder()
resp := &http.Response{
@@ -441,7 +441,7 @@ func TestHandleResponsesNonStreamReturnsContentFilterErrorWhenUpstreamFilteredWi
}
}
func TestHandleResponsesNonStreamReturns429WhenUpstreamHasOnlyThinking(t *testing.T) {
func TestHandleResponsesNonStreamSingleAttemptReturns429WhenUpstreamHasOnlyThinking(t *testing.T) {
h := &Handler{}
rec := httptest.NewRecorder()
resp := &http.Response{

View File

@@ -17,7 +17,7 @@ func UpstreamEmptyOutputDetail(contentFilter bool, text, thinking string) (int,
if thinking != "" {
return http.StatusTooManyRequests, "Upstream account hit a rate limit and returned reasoning without visible output.", "upstream_empty_output"
}
return http.StatusTooManyRequests, "Upstream account hit a rate limit and returned empty output.", "upstream_empty_output"
return http.StatusServiceUnavailable, "Upstream service is unavailable and returned no output.", "upstream_unavailable"
}
func WriteUpstreamEmptyOutputError(w http.ResponseWriter, text, thinking string, contentFilter bool) bool {

View File

@@ -274,12 +274,12 @@ func TestChatCompletionsStreamEmitsFailureFrameWhenUpstreamOutputEmpty(t *testin
}
last := frames[0]
statusCode, ok := last["status_code"].(float64)
if !ok || int(statusCode) != http.StatusTooManyRequests {
t.Fatalf("expected status_code=429, got %#v body=%s", last["status_code"], rec.Body.String())
if !ok || int(statusCode) != http.StatusServiceUnavailable {
t.Fatalf("expected status_code=503, got %#v body=%s", last["status_code"], rec.Body.String())
}
errObj, _ := last["error"].(map[string]any)
if asString(errObj["code"]) != "upstream_empty_output" {
t.Fatalf("expected code=upstream_empty_output, got %#v", last)
if asString(errObj["code"]) != "upstream_unavailable" {
t.Fatalf("expected code=upstream_unavailable, got %#v", last)
}
}
@@ -345,7 +345,7 @@ func TestChatCompletionsStreamRetriesEmptyOutputOnSameSession(t *testing.T) {
func TestChatCompletionsNonStreamRetriesThinkingOnlyOutput(t *testing.T) {
ds := &streamStatusDSSeqStub{resps: []*http.Response{
makeOpenAISSEHTTPResponse(`data: {"response_message_id":99}`, "data: [DONE]"),
makeOpenAISSEHTTPResponse(`data: {"response_message_id":99,"p":"response/thinking_content","v":"plan"}`, "data: [DONE]"),
makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"visible"}`, "data: [DONE]"),
}}
h := &openAITestSurface{
@@ -496,7 +496,7 @@ func TestResponsesStreamRetriesThinkingOnlyOutput(t *testing.T) {
func TestResponsesNonStreamRetriesThinkingOnlyOutput(t *testing.T) {
ds := &streamStatusDSSeqStub{resps: []*http.Response{
makeOpenAISSEHTTPResponse(`data: {"response_message_id":88}`, "data: [DONE]"),
makeOpenAISSEHTTPResponse(`data: {"response_message_id":88,"p":"response/thinking_content","v":"plan"}`, "data: [DONE]"),
makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"visible"}`, "data: [DONE]"),
}}
h := &openAITestSurface{
@@ -537,8 +537,15 @@ func TestResponsesNonStreamRetriesThinkingOnlyOutput(t *testing.T) {
if len(content) == 0 {
t.Fatalf("expected content entries, got %#v", item)
}
textEntry, _ := content[0].(map[string]any)
if asString(textEntry["type"]) != "output_text" || asString(textEntry["text"]) != "visible" {
var textEntry map[string]any
for _, entry := range content {
obj, _ := entry.(map[string]any)
if asString(obj["type"]) == "output_text" {
textEntry = obj
break
}
}
if asString(textEntry["text"]) != "visible" {
t.Fatalf("expected visible text entry, got %#v", content)
}
}