perf(streaming): optimize TTFT and reduce buffering latency

Core changes:
- stream.go: New accumulation buffer architecture using a scanner goroutine
  and a select loop (MinChars=16, MaxWait=10ms, first flush is immediate)
- dedupe.go: Add TrimContinuationOverlapFromBuilder to avoid string copies
- claude/stream_runtime_core.go: Integrate toolstream for incremental text
- claude/stream_runtime_finalize.go: toolstream flush support
- stream_emitter.js: Reduce DeltaCoalescer thresholds (160->16 chars, 80->20ms)
- empty_retry: Add thinking-aware empty output detection
- Fix reasoning_content leak and finish_reason=null in edge cases
- Fix tail content truncation when max_tokens is exceeded

Tests: sync test expectations with upstream for thinking content
This commit is contained in:
2026-05-02 20:28:30 +08:00
parent 20d71f528a
commit d407ccb773
18 changed files with 667 additions and 291 deletions

View File

@@ -127,19 +127,16 @@ func (s *geminiStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Parse
contentSeen = true
if p.Type == "thinking" {
if s.thinkingEnabled {
trimmed := sse.TrimContinuationOverlap(s.thinking.String(), cleanedText)
if trimmed == "" {
continue
if cleanedText != "" {
s.thinking.WriteString(cleanedText)
}
s.thinking.WriteString(trimmed)
}
continue
}
trimmed := sse.TrimContinuationOverlap(s.text.String(), cleanedText)
if trimmed == "" {
if cleanedText == "" {
continue
}
s.text.WriteString(trimmed)
s.text.WriteString(cleanedText)
if s.bufferContent {
continue
}
@@ -149,7 +146,7 @@ func (s *geminiStreamRuntime) onParsed(parsed sse.LineResult) streamengine.Parse
"index": 0,
"content": map[string]any{
"role": "model",
"parts": []map[string]any{{"text": trimmed}},
"parts": []map[string]any{{"text": cleanedText}},
},
},
},