mirror of
https://github.com/CJackHwang/ds2api.git
synced 2026-05-21 08:27:42 +08:00
perf(streaming): optimize TTFT and reduce buffering latency
Core changes: - stream.go: New accumulation buffer architecture with scanner goroutine + select loop, MinChars=16, MaxWait=10ms, first-flush-immediate - dedupe.go: Add TrimContinuationOverlapFromBuilder to avoid string copies - claude/stream_runtime_core.go: Integrate toolstream for incremental text - claude/stream_runtime_finalize.go: toolstream flush support - stream_emitter.js: Reduce DeltaCoalescer thresholds (160->16 chars, 80->20ms) - empty_retry: Add thinking-aware empty output detection - Fix reasoning_content leak and finish_reason=null in edge cases - Fix tail content truncation when max_tokens exceeded Tests: sync test expectations with upstream for thinking content
This commit is contained in:
@@ -280,52 +280,40 @@ func (s *chatStreamRuntime) onParsed(parsed sse.LineResult) streamengine.ParsedD
|
||||
contentSeen := false
|
||||
batch := chatDeltaBatch{runtime: s}
|
||||
for _, p := range parsed.ToolDetectionThinkingParts {
|
||||
trimmed := sse.TrimContinuationOverlap(s.toolDetectionThinking.String(), p.Text)
|
||||
trimmed := sse.TrimContinuationOverlapFromBuilder(&s.toolDetectionThinking, p.Text)
|
||||
if trimmed != "" {
|
||||
s.toolDetectionThinking.WriteString(trimmed)
|
||||
}
|
||||
}
|
||||
for _, p := range parsed.Parts {
|
||||
if p.Type == "thinking" {
|
||||
rawTrimmed := sse.TrimContinuationOverlap(s.rawThinking.String(), p.Text)
|
||||
if rawTrimmed != "" {
|
||||
s.rawThinking.WriteString(rawTrimmed)
|
||||
contentSeen = true
|
||||
}
|
||||
s.rawThinking.WriteString(p.Text)
|
||||
contentSeen = true
|
||||
if s.thinkingEnabled {
|
||||
cleanedText := cleanVisibleOutput(rawTrimmed, s.stripReferenceMarkers)
|
||||
cleanedText := cleanVisibleOutput(p.Text, s.stripReferenceMarkers)
|
||||
if cleanedText == "" {
|
||||
continue
|
||||
}
|
||||
trimmed := sse.TrimContinuationOverlap(s.thinking.String(), cleanedText)
|
||||
if trimmed == "" {
|
||||
continue
|
||||
}
|
||||
s.thinking.WriteString(trimmed)
|
||||
batch.append("reasoning_content", trimmed)
|
||||
s.thinking.WriteString(cleanedText)
|
||||
batch.append("reasoning_content", cleanedText)
|
||||
}
|
||||
} else {
|
||||
rawTrimmed := sse.TrimContinuationOverlap(s.rawText.String(), p.Text)
|
||||
if rawTrimmed == "" {
|
||||
continue
|
||||
}
|
||||
s.rawText.WriteString(rawTrimmed)
|
||||
s.rawText.WriteString(p.Text)
|
||||
contentSeen = true
|
||||
cleanedText := cleanVisibleOutput(rawTrimmed, s.stripReferenceMarkers)
|
||||
cleanedText := cleanVisibleOutput(p.Text, s.stripReferenceMarkers)
|
||||
if s.searchEnabled && sse.IsCitation(cleanedText) {
|
||||
continue
|
||||
}
|
||||
trimmed := sse.TrimContinuationOverlap(s.text.String(), cleanedText)
|
||||
if trimmed != "" {
|
||||
s.text.WriteString(trimmed)
|
||||
if cleanedText != "" {
|
||||
s.text.WriteString(cleanedText)
|
||||
}
|
||||
if !s.bufferToolContent {
|
||||
if trimmed == "" {
|
||||
if cleanedText == "" {
|
||||
continue
|
||||
}
|
||||
batch.append("content", trimmed)
|
||||
batch.append("content", cleanedText)
|
||||
} else {
|
||||
events := toolstream.ProcessChunk(&s.toolSieve, rawTrimmed, s.toolNames)
|
||||
events := toolstream.ProcessChunk(&s.toolSieve, p.Text, s.toolNames)
|
||||
for _, evt := range events {
|
||||
if len(evt.ToolCallDeltas) > 0 {
|
||||
if !s.emitEarlyToolDeltas {
|
||||
|
||||
@@ -109,7 +109,7 @@ func (h *Handler) collectChatNonStreamAttempt(w http.ResponseWriter, resp *http.
|
||||
}
|
||||
|
||||
func (h *Handler) finishChatNonStreamResult(w http.ResponseWriter, result chatNonStreamResult, attempts int, usagePrompt string, refFileTokens int, historySession *chatHistorySession) {
|
||||
if result.detectedCalls == 0 && shouldWriteUpstreamEmptyOutputError(result.text) {
|
||||
if result.detectedCalls == 0 && shouldWriteUpstreamEmptyOutputError(result.text, result.thinking) {
|
||||
status, message, code := upstreamEmptyOutputDetail(result.contentFilter, result.text, result.thinking)
|
||||
if historySession != nil {
|
||||
historySession.error(status, message, code, result.thinking, result.text)
|
||||
@@ -143,7 +143,8 @@ func shouldRetryChatNonStream(result chatNonStreamResult, attempts int) bool {
|
||||
attempts < emptyOutputRetryMaxAttempts() &&
|
||||
!result.contentFilter &&
|
||||
result.detectedCalls == 0 &&
|
||||
strings.TrimSpace(result.text) == ""
|
||||
strings.TrimSpace(result.text) == "" &&
|
||||
strings.TrimSpace(result.thinking) == ""
|
||||
}
|
||||
|
||||
func (h *Handler) handleStreamWithRetry(w http.ResponseWriter, r *http.Request, a *auth.RequestAuth, resp *http.Response, payload map[string]any, pow, completionID, model, finalPrompt string, refFileTokens int, thinkingEnabled, searchEnabled bool, toolNames []string, toolsRaw any, historySession *chatHistorySession) {
|
||||
|
||||
@@ -108,8 +108,8 @@ func replaceCitationMarkersWithLinks(text string, links map[int]string) string {
|
||||
return shared.ReplaceCitationMarkersWithLinks(text, links)
|
||||
}
|
||||
|
||||
func shouldWriteUpstreamEmptyOutputError(text string) bool {
|
||||
return shared.ShouldWriteUpstreamEmptyOutputError(text)
|
||||
func shouldWriteUpstreamEmptyOutputError(text, thinking string) bool {
|
||||
return shared.ShouldWriteUpstreamEmptyOutputError(text, thinking)
|
||||
}
|
||||
|
||||
func upstreamEmptyOutputDetail(contentFilter bool, text, thinking string) (int, string, string) {
|
||||
|
||||
@@ -168,7 +168,7 @@ func (h *Handler) handleNonStream(w http.ResponseWriter, resp *http.Response, co
|
||||
finalText = replaceCitationMarkersWithLinks(finalText, result.CitationLinks)
|
||||
}
|
||||
detected := detectAssistantToolCalls(result.Text, finalText, result.Thinking, result.ToolDetectionThinking, toolNames)
|
||||
if shouldWriteUpstreamEmptyOutputError(finalText) && len(detected.Calls) == 0 {
|
||||
if shouldWriteUpstreamEmptyOutputError(finalText, finalThinking) && len(detected.Calls) == 0 {
|
||||
status, message, code := upstreamEmptyOutputDetail(result.ContentFilter, finalText, finalThinking)
|
||||
if historySession != nil {
|
||||
historySession.error(status, message, code, finalThinking, finalText)
|
||||
|
||||
@@ -133,13 +133,18 @@ func TestHandleNonStreamReturns429WhenUpstreamHasOnlyThinking(t *testing.T) {
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleNonStream(rec, resp, "cid-thinking-only", "deepseek-v4-pro", "prompt", 0, true, false, nil, nil, nil)
|
||||
if rec.Code != http.StatusTooManyRequests {
|
||||
t.Fatalf("expected status 429 for thinking-only upstream output, got %d body=%s", rec.Code, rec.Body.String())
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("expected status 200 for thinking-only upstream output, got %d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
out := decodeJSONBody(t, rec.Body.String())
|
||||
errObj, _ := out["error"].(map[string]any)
|
||||
if asString(errObj["code"]) != "upstream_empty_output" {
|
||||
t.Fatalf("expected code=upstream_empty_output, got %#v", out)
|
||||
choices, _ := out["choices"].([]any)
|
||||
if len(choices) == 0 {
|
||||
t.Fatal("expected at least one choice")
|
||||
}
|
||||
first, _ := choices[0].(map[string]any)
|
||||
msg, _ := first["message"].(map[string]any)
|
||||
if asString(msg["reasoning_content"]) != "Only thinking" {
|
||||
t.Fatalf("expected reasoning_content='Only thinking', got %#v", msg)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user