perf(streaming): optimize TTFT and reduce buffering latency

Core changes:
- stream.go: New accumulation buffer architecture with scanner goroutine + select loop (MinChars=16, MaxWait=10ms, first flush is immediate)
- dedupe.go: Add TrimContinuationOverlapFromBuilder to avoid string copies
- claude/stream_runtime_core.go: Integrate toolstream for incremental text
- claude/stream_runtime_finalize.go: Add toolstream flush support
- stream_emitter.js: Reduce DeltaCoalescer thresholds (160->16 chars, 80->20ms)
- empty_retry: Add thinking-aware empty output detection
- Fix reasoning_content leak and finish_reason=null in edge cases
- Fix tail content truncation when max_tokens is exceeded

Tests: sync test expectations with upstream for thinking content
2026-05-16 22:25:15 +08:00 · 2026-05-02 20:28:30 +08:00
parent 20d71f528a
commit d407ccb773
18 changed files with 667 additions and 291 deletions
--- a/internal/httpapi/openai/chat/chat_stream_runtime.go
+++ b/internal/httpapi/openai/chat/chat_stream_runtime.go
@@ -280,52 +280,40 @@ func (s *chatStreamRuntime) onParsed(parsed sse.LineResult) streamengine.ParsedD
 	contentSeen := false
 	batch := chatDeltaBatch{runtime: s}
 	for _, p := range parsed.ToolDetectionThinkingParts {
-		trimmed := sse.TrimContinuationOverlap(s.toolDetectionThinking.String(), p.Text)
+		trimmed := sse.TrimContinuationOverlapFromBuilder(&s.toolDetectionThinking, p.Text)
 		if trimmed != "" {
 			s.toolDetectionThinking.WriteString(trimmed)
 		}
 	}
 	for _, p := range parsed.Parts {
 		if p.Type == "thinking" {
-			rawTrimmed := sse.TrimContinuationOverlap(s.rawThinking.String(), p.Text)
-			if rawTrimmed != "" {
-				s.rawThinking.WriteString(rawTrimmed)
-				contentSeen = true
-			}
+			s.rawThinking.WriteString(p.Text)
+			contentSeen = true
 			if s.thinkingEnabled {
-				cleanedText := cleanVisibleOutput(rawTrimmed, s.stripReferenceMarkers)
+				cleanedText := cleanVisibleOutput(p.Text, s.stripReferenceMarkers)
 				if cleanedText == "" {
 					continue
 				}
-				trimmed := sse.TrimContinuationOverlap(s.thinking.String(), cleanedText)
-				if trimmed == "" {
-					continue
-				}
-				s.thinking.WriteString(trimmed)
-				batch.append("reasoning_content", trimmed)
+				s.thinking.WriteString(cleanedText)
+				batch.append("reasoning_content", cleanedText)
 			}
 		} else {
-			rawTrimmed := sse.TrimContinuationOverlap(s.rawText.String(), p.Text)
-			if rawTrimmed == "" {
-				continue
-			}
-			s.rawText.WriteString(rawTrimmed)
+			s.rawText.WriteString(p.Text)
 			contentSeen = true
-			cleanedText := cleanVisibleOutput(rawTrimmed, s.stripReferenceMarkers)
+			cleanedText := cleanVisibleOutput(p.Text, s.stripReferenceMarkers)
 			if s.searchEnabled && sse.IsCitation(cleanedText) {
 				continue
 			}
-			trimmed := sse.TrimContinuationOverlap(s.text.String(), cleanedText)
-			if trimmed != "" {
-				s.text.WriteString(trimmed)
+			if cleanedText != "" {
+				s.text.WriteString(cleanedText)
 			}
 			if !s.bufferToolContent {
-				if trimmed == "" {
+				if cleanedText == "" {
 					continue
 				}
-				batch.append("content", trimmed)
+				batch.append("content", cleanedText)
 			} else {
-				events := toolstream.ProcessChunk(&s.toolSieve, rawTrimmed, s.toolNames)
+				events := toolstream.ProcessChunk(&s.toolSieve, p.Text, s.toolNames)
 				for _, evt := range events {
 					if len(evt.ToolCallDeltas) > 0 {
 						if !s.emitEarlyToolDeltas {
--- a/internal/httpapi/openai/chat/empty_retry_runtime.go
+++ b/internal/httpapi/openai/chat/empty_retry_runtime.go
@@ -109,7 +109,7 @@ func (h *Handler) collectChatNonStreamAttempt(w http.ResponseWriter, resp *http.
 }

 func (h *Handler) finishChatNonStreamResult(w http.ResponseWriter, result chatNonStreamResult, attempts int, usagePrompt string, refFileTokens int, historySession *chatHistorySession) {
-	if result.detectedCalls == 0 && shouldWriteUpstreamEmptyOutputError(result.text) {
+	if result.detectedCalls == 0 && shouldWriteUpstreamEmptyOutputError(result.text, result.thinking) {
 		status, message, code := upstreamEmptyOutputDetail(result.contentFilter, result.text, result.thinking)
 		if historySession != nil {
 			historySession.error(status, message, code, result.thinking, result.text)
@@ -143,7 +143,8 @@ func shouldRetryChatNonStream(result chatNonStreamResult, attempts int) bool {
 		attempts < emptyOutputRetryMaxAttempts() &&
 		!result.contentFilter &&
 		result.detectedCalls == 0 &&
-		strings.TrimSpace(result.text) == ""
+		strings.TrimSpace(result.text) == "" &&
+		strings.TrimSpace(result.thinking) == ""
 }

 func (h *Handler) handleStreamWithRetry(w http.ResponseWriter, r *http.Request, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, completionID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, historySession *chatHistorySession) {
--- a/internal/httpapi/openai/chat/handler.go
+++ b/internal/httpapi/openai/chat/handler.go
@@ -108,8 +108,8 @@ func replaceCitationMarkersWithLinks(text string, links map[int]string) string {
 	return shared.ReplaceCitationMarkersWithLinks(text, links)
 }

-func shouldWriteUpstreamEmptyOutputError(text string) bool {
-	return shared.ShouldWriteUpstreamEmptyOutputError(text)
+func shouldWriteUpstreamEmptyOutputError(text, thinking string) bool {
+	return shared.ShouldWriteUpstreamEmptyOutputError(text, thinking)
 }

 func upstreamEmptyOutputDetail(contentFilter bool, text, thinking string) (int, string, string) {
--- a/internal/httpapi/openai/chat/handler_chat.go
+++ b/internal/httpapi/openai/chat/handler_chat.go
@@ -168,7 +168,7 @@ func (h *Handler) handleNonStream(w http.ResponseWriter, resp *http.Response, co
 		finalText = replaceCitationMarkersWithLinks(finalText, result.CitationLinks)
 	}
 	detected := detectAssistantToolCalls(result.Text, finalText, result.Thinking, result.ToolDetectionThinking, toolNames)
-	if shouldWriteUpstreamEmptyOutputError(finalText) && len(detected.Calls) == 0 {
+	if shouldWriteUpstreamEmptyOutputError(finalText, finalThinking) && len(detected.Calls) == 0 {
 		status, message, code := upstreamEmptyOutputDetail(result.ContentFilter, finalText, finalThinking)
 		if historySession != nil {
 			historySession.error(status, message, code, finalThinking, finalText)
--- a/internal/httpapi/openai/chat/handler_toolcall_test.go
+++ b/internal/httpapi/openai/chat/handler_toolcall_test.go
@@ -133,13 +133,18 @@ func TestHandleNonStreamReturns429WhenUpstreamHasOnlyThinking(t *testing.T) {
 	rec := httptest.NewRecorder()

 	h.handleNonStream(rec, resp, "cid-thinking-only", "deepseek-v4-pro", "prompt", 0, true, false, nil, nil, nil)
-	if rec.Code != http.StatusTooManyRequests {
-		t.Fatalf("expected status 429 for thinking-only upstream output, got %d body=%s", rec.Code, rec.Body.String())
+	if rec.Code != http.StatusOK {
+		t.Fatalf("expected status 200 for thinking-only upstream output, got %d body=%s", rec.Code, rec.Body.String())
 	}
 	out := decodeJSONBody(t, rec.Body.String())
-	errObj, _ := out["error"].(map[string]any)
-	if asString(errObj["code"]) != "upstream_empty_output" {
-		t.Fatalf("expected code=upstream_empty_output, got %#v", out)
+	choices, _ := out["choices"].([]any)
+	if len(choices) == 0 {
+		t.Fatal("expected at least one choice")
+	}
+	first, _ := choices[0].(map[string]any)
+	msg, _ := first["message"].(map[string]any)
+	if asString(msg["reasoning_content"]) != "Only thinking" {
+		t.Fatalf("expected reasoning_content='Only thinking', got %#v", msg)
 	}
 }

--- a/internal/httpapi/openai/responses/empty_retry_runtime.go
+++ b/internal/httpapi/openai/responses/empty_retry_runtime.go
@@ -129,7 +129,8 @@ func shouldRetryResponsesNonStream(result responsesNonStreamResult, attempts int
 		attempts < emptyOutputRetryMaxAttempts() &&
 		!result.contentFilter &&
 		len(result.parsed.Calls) == 0 &&
-		strings.TrimSpace(result.text) == ""
+		strings.TrimSpace(result.text) == "" &&
+		strings.TrimSpace(result.thinking) == ""
 }

 func (h *Handler) handleResponsesStreamWithRetry(w http.ResponseWriter, r *http.Request, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, owner, responseID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, toolChoice promptcompat.ToolChoicePolicy, traceID string) {
--- a/internal/httpapi/openai/responses/responses_stream_runtime_core.go
+++ b/internal/httpapi/openai/responses/responses_stream_runtime_core.go
@@ -231,57 +231,45 @@ func (s *responsesStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Pa
 	contentSeen := false
 	batch := responsesDeltaBatch{runtime: s}
 	for _, p := range parsed.ToolDetectionThinkingParts {
-		trimmed := sse.TrimContinuationOverlap(s.toolDetectionThinking.String(), p.Text)
+		trimmed := sse.TrimContinuationOverlapFromBuilder(&s.toolDetectionThinking, p.Text)
 		if trimmed != "" {
 			s.toolDetectionThinking.WriteString(trimmed)
 		}
 	}
 	for _, p := range parsed.Parts {
 		if p.Type == "thinking" {
-			rawTrimmed := sse.TrimContinuationOverlap(s.rawThinking.String(), p.Text)
-			if rawTrimmed != "" {
-				s.rawThinking.WriteString(rawTrimmed)
-				contentSeen = true
-			}
+			s.rawThinking.WriteString(p.Text)
+			contentSeen = true
 			if !s.thinkingEnabled {
 				continue
 			}
-			cleanedText := cleanVisibleOutput(rawTrimmed, s.stripReferenceMarkers)
+			cleanedText := cleanVisibleOutput(p.Text, s.stripReferenceMarkers)
 			if cleanedText == "" {
 				continue
 			}
-			trimmed := sse.TrimContinuationOverlap(s.thinking.String(), cleanedText)
-			if trimmed == "" {
-				continue
-			}
-			s.thinking.WriteString(trimmed)
-			batch.append("reasoning", trimmed)
+			s.thinking.WriteString(cleanedText)
+			batch.append("reasoning", cleanedText)
 			continue
 		}

-		rawTrimmed := sse.TrimContinuationOverlap(s.rawText.String(), p.Text)
-		if rawTrimmed == "" {
-			continue
-		}
-		s.rawText.WriteString(rawTrimmed)
+		s.rawText.WriteString(p.Text)
 		contentSeen = true
-		cleanedText := cleanVisibleOutput(rawTrimmed, s.stripReferenceMarkers)
+		cleanedText := cleanVisibleOutput(p.Text, s.stripReferenceMarkers)
 		if s.searchEnabled && sse.IsCitation(cleanedText) {
 			continue
 		}
-		trimmed := sse.TrimContinuationOverlap(s.text.String(), cleanedText)
-		if trimmed != "" {
-			s.text.WriteString(trimmed)
+		if cleanedText != "" {
+			s.text.WriteString(cleanedText)
 		}
 		if !s.bufferToolContent {
-			if trimmed == "" {
+			if cleanedText == "" {
 				continue
 			}
-			batch.append("text", trimmed)
+			batch.append("text", cleanedText)
 			continue
 		}
 		batch.flush()
-		s.processToolStreamEvents(toolstream.ProcessChunk(&s.sieve, rawTrimmed, s.toolNames), true, true)
+		s.processToolStreamEvents(toolstream.ProcessChunk(&s.sieve, p.Text, s.toolNames), true, true)
 	}

 	batch.flush()
--- a/internal/httpapi/openai/responses/responses_stream_test.go
+++ b/internal/httpapi/openai/responses/responses_stream_test.go
@@ -453,13 +453,25 @@ func TestHandleResponsesNonStreamReturns429WhenUpstreamHasOnlyThinking(t *testin
 	}

 	h.handleResponsesNonStream(rec, resp, "owner-a", "resp_test", "deepseek-v4-pro", "prompt", 0, true, false, nil, nil, promptcompat.DefaultToolChoicePolicy(), "")
-	if rec.Code != http.StatusTooManyRequests {
-		t.Fatalf("expected 429 for thinking-only upstream output, got %d body=%s", rec.Code, rec.Body.String())
+	if rec.Code != http.StatusOK {
+		t.Fatalf("expected 200 for thinking-only upstream output, got %d body=%s", rec.Code, rec.Body.String())
 	}
 	out := decodeJSONBody(t, rec.Body.String())
-	errObj, _ := out["error"].(map[string]any)
-	if asString(errObj["code"]) != "upstream_empty_output" {
-		t.Fatalf("expected code=upstream_empty_output, got %#v", out)
+	output, _ := out["output"].([]any)
+	if len(output) == 0 {
+		t.Fatal("expected at least one output item")
+	}
+	first, _ := output[0].(map[string]any)
+	content, _ := first["content"].([]any)
+	if len(content) == 0 {
+		t.Fatal("expected at least one content item")
+	}
+	firstContent, _ := content[0].(map[string]any)
+	if asString(firstContent["type"]) != "reasoning" {
+		t.Fatalf("expected reasoning type, got %v", firstContent["type"])
+	}
+	if asString(firstContent["text"]) != "Only thinking" {
+		t.Fatalf("expected text='Only thinking', got %v", firstContent["text"])
 	}
 }

--- a/internal/httpapi/openai/shared/upstream_empty.go
+++ b/internal/httpapi/openai/shared/upstream_empty.go
@@ -1,9 +1,12 @@
 package shared

-import "net/http"
+import (
+	"net/http"
+	"strings"
+)

-func ShouldWriteUpstreamEmptyOutputError(text string) bool {
-	return text == ""
+func ShouldWriteUpstreamEmptyOutputError(text, thinking string) bool {
+	return strings.TrimSpace(text) == "" && strings.TrimSpace(thinking) == ""
 }

 func UpstreamEmptyOutputDetail(contentFilter bool, text, thinking string) (int, string, string) {
@@ -18,7 +21,7 @@ func UpstreamEmptyOutputDetail(contentFilter bool, text, thinking string) (int,
 }

 func WriteUpstreamEmptyOutputError(w http.ResponseWriter, text, thinking string, contentFilter bool) bool {
-	if !ShouldWriteUpstreamEmptyOutputError(text) {
+	if !ShouldWriteUpstreamEmptyOutputError(text, thinking) {
 		return false
 	}
 	status, message, code := UpstreamEmptyOutputDetail(contentFilter, text, thinking)
--- a/internal/httpapi/openai/stream_status_test.go
+++ b/internal/httpapi/openai/stream_status_test.go
@@ -345,7 +345,7 @@ func TestChatCompletionsStreamRetriesEmptyOutputOnSameSession(t *testing.T) {

 func TestChatCompletionsNonStreamRetriesThinkingOnlyOutput(t *testing.T) {
 	ds := &streamStatusDSSeqStub{resps: []*http.Response{
-		makeOpenAISSEHTTPResponse(`data: {"response_message_id":99,"p":"response/thinking_content","v":"plan"}`, "data: [DONE]"),
+		makeOpenAISSEHTTPResponse(`data: {"response_message_id":99}`, "data: [DONE]"),
 		makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"visible"}`, "data: [DONE]"),
 	}}
 	h := &openAITestSurface{
@@ -380,9 +380,6 @@ func TestChatCompletionsNonStreamRetriesThinkingOnlyOutput(t *testing.T) {
 	if asString(message["content"]) != "visible" {
 		t.Fatalf("expected retry visible content, got %#v", message)
 	}
-	if !strings.Contains(asString(message["reasoning_content"]), "plan") {
-		t.Fatalf("expected first-attempt reasoning to be preserved, got %#v", message)
-	}
 }

 func TestChatCompletionsContentFilterDoesNotRetry(t *testing.T) {
@@ -499,7 +496,7 @@ func TestResponsesStreamRetriesThinkingOnlyOutput(t *testing.T) {

 func TestResponsesNonStreamRetriesThinkingOnlyOutput(t *testing.T) {
 	ds := &streamStatusDSSeqStub{resps: []*http.Response{
-		makeOpenAISSEHTTPResponse(`data: {"response_message_id":88,"p":"response/thinking_content","v":"plan"}`, "data: [DONE]"),
+		makeOpenAISSEHTTPResponse(`data: {"response_message_id":88}`, "data: [DONE]"),
 		makeOpenAISSEHTTPResponse(`data: {"p":"response/content","v":"visible"}`, "data: [DONE]"),
 	}}
 	h := &openAITestSurface{
@@ -540,9 +537,9 @@ func TestResponsesNonStreamRetriesThinkingOnlyOutput(t *testing.T) {
 	if len(content) == 0 {
 		t.Fatalf("expected content entries, got %#v", item)
 	}
-	reasoning, _ := content[0].(map[string]any)
-	if asString(reasoning["type"]) != "reasoning" || !strings.Contains(asString(reasoning["text"]), "plan") {
-		t.Fatalf("expected preserved reasoning entry, got %#v", content)
+	textEntry, _ := content[0].(map[string]any)
+	if asString(textEntry["type"]) != "output_text" || asString(textEntry["text"]) != "visible" {
+		t.Fatalf("expected visible text entry, got %#v", content)
 	}
 }