perf(streaming): optimize TTFT and reduce buffering latency

Core changes:
- stream.go: New accumulation buffer architecture with scanner goroutine
  + select loop, MinChars=16, MaxWait=10ms, first-flush-immediate
- dedupe.go: Add TrimContinuationOverlapFromBuilder to avoid string copies
- claude/stream_runtime_core.go: Integrate toolstream for incremental text
- claude/stream_runtime_finalize.go: toolstream flush support
- stream_emitter.js: Reduce DeltaCoalescer thresholds (160->16 chars, 80->20ms)
- empty_retry: Add thinking-aware empty output detection
- Fix reasoning_content leak and finish_reason=null in edge cases
- Fix tail content truncation when max_tokens is exceeded

Tests: sync test expectations with upstream for thinking content
This commit is contained in:
2026-05-02 20:28:30 +08:00
parent 20d71f528a
commit d407ccb773
18 changed files with 667 additions and 291 deletions

View File

@@ -1,9 +1,12 @@
package shared
import "net/http"
import (
"net/http"
"strings"
)
func ShouldWriteUpstreamEmptyOutputError(text string) bool {
return text == ""
func ShouldWriteUpstreamEmptyOutputError(text, thinking string) bool {
return strings.TrimSpace(text) == "" && strings.TrimSpace(thinking) == ""
}
func UpstreamEmptyOutputDetail(contentFilter bool, text, thinking string) (int, string, string) {
@@ -18,7 +21,7 @@ func UpstreamEmptyOutputDetail(contentFilter bool, text, thinking string) (int,
}
func WriteUpstreamEmptyOutputError(w http.ResponseWriter, text, thinking string, contentFilter bool) bool {
if !ShouldWriteUpstreamEmptyOutputError(text) {
if !ShouldWriteUpstreamEmptyOutputError(text, thinking) {
return false
}
status, message, code := UpstreamEmptyOutputDetail(contentFilter, text, thinking)