diff --git a/internal/adapter/openai/tool_sieve_core.go b/internal/adapter/openai/tool_sieve_core.go index e651445..bc81882 100644 --- a/internal/adapter/openai/tool_sieve_core.go +++ b/internal/adapter/openai/tool_sieve_core.go @@ -114,8 +114,14 @@ func flushToolSieve(state *toolStreamSieveState, toolNames []string) []toolStrea } else { content := state.capture.String() if content != "" { - state.noteText(content) - events = append(events, toolStreamEvent{Content: content}) + // If the captured text looks like an incomplete XML tool call block, + // swallow it to prevent leaking raw XML tags to the client. + if hasOpenXMLToolTag(content) { + // Drop it silently — incomplete tool call. + } else { + state.noteText(content) + events = append(events, toolStreamEvent{Content: content}) + } } } state.capture.Reset() @@ -200,6 +206,11 @@ func findToolSegmentStart(s string) int { if start < 0 { start = bestKeyIdx } + // If the keyword matched inside an XML tag (e.g. "tool_calls" in ""), + // back up past the '<' to capture the full tag. + if start > 0 && s[start-1] == '<' { + start-- + } if fenceStart, ok := openFenceStartBefore(s, start); ok { return fenceStart } diff --git a/internal/adapter/openai/tool_sieve_xml.go b/internal/adapter/openai/tool_sieve_xml.go index 885f50a..c4474af 100644 --- a/internal/adapter/openai/tool_sieve_xml.go +++ b/internal/adapter/openai/tool_sieve_xml.go @@ -12,6 +12,17 @@ import ( var xmlToolCallClosingTags = []string{"", "", "", "", "", ""} var xmlToolCallOpeningTags = []string{""}, + {""}, + {""}, + {""}, + {""}, + {""}, +} + // xmlToolCallBlockPattern matches a complete XML tool call block (wrapper or standalone). var xmlToolCallBlockPattern = regexp.MustCompile(`(?is)(\s*(?:.*?)\s*|\s*(?:.*?)\s*|]*>(?:.*?)|]*>(?:.*?)|(?:.*?))`) @@ -22,59 +33,45 @@ var xmlToolTagsToDetect = []string{"", " // consumeXMLToolCapture tries to extract complete XML tool call blocks from captured text. 
func consumeXMLToolCapture(captured string, toolNames []string) (prefix string, calls []util.ParsedToolCall, suffix string, ready bool) { lower := strings.ToLower(captured) - // Find the earliest XML tool opening tag. - openIdx := -1 - for _, tag := range xmlToolCallOpeningTags { - idx := strings.Index(lower, tag) - if idx >= 0 && (openIdx < 0 || idx < openIdx) { - openIdx = idx + // Find the FIRST matching open/close pair, preferring wrapper tags. + // Tag pairs are ordered longest-first (e.g. = 0 { - absEnd := openIdx + idx + len(tag) - if closeIdx < 0 || absEnd > closeIdx { - closeIdx = absEnd - } + // Find the LAST occurrence of the specific closing tag to get the outermost block. + closeIdx := strings.LastIndex(lower, pair.close) + if closeIdx < openIdx { + // Opening tag is present but its specific closing tag hasn't arrived. + // Return not-ready so we keep buffering — do NOT fall through to + // try inner pairs (e.g. 0 { - prefixPart, suffixPart = trimWrappingJSONFence(prefixPart, suffixPart) - return prefixPart, parsed, suffixPart, true + xmlBlock := captured[openIdx:closeEnd] + prefixPart := captured[:openIdx] + suffixPart := captured[closeEnd:] + parsed := util.ParseToolCalls(xmlBlock, toolNames) + if len(parsed) > 0 { + prefixPart, suffixPart = trimWrappingJSONFence(prefixPart, suffixPart) + return prefixPart, parsed, suffixPart, true + } + // Looks like XML tool syntax but failed to parse — consume it to avoid leak. + return prefixPart, nil, suffixPart, true } - // Looks like XML tool syntax but failed to parse — consume it to avoid leak. - return prefixPart, nil, suffixPart, true + return "", nil, "", false } // hasOpenXMLToolTag returns true if captured text contains an XML tool opening tag -// but no corresponding closing tag yet. +// whose SPECIFIC closing tag has not appeared yet. 
func hasOpenXMLToolTag(captured string) bool { lower := strings.ToLower(captured) - for _, tag := range xmlToolCallOpeningTags { - if strings.Contains(lower, tag) { - hasClosed := false - for _, ct := range xmlToolCallClosingTags { - if strings.Contains(lower, ct) { - hasClosed = true - break - } - } - if !hasClosed { + for _, pair := range xmlToolCallTagPairs { + if strings.Contains(lower, pair.open) { + if !strings.Contains(lower, pair.close) { return true } } diff --git a/internal/adapter/openai/tool_sieve_xml_test.go b/internal/adapter/openai/tool_sieve_xml_test.go index b04f87b..b678345 100644 --- a/internal/adapter/openai/tool_sieve_xml_test.go +++ b/internal/adapter/openai/tool_sieve_xml_test.go @@ -153,3 +153,135 @@ func TestHasOpenXMLToolTag(t *testing.T) { t.Fatal("should return false for plain text") } } + +// Test the EXACT scenario the user reports: token-by-token streaming where +// tag arrives in small pieces. +func TestProcessToolSieveTokenByTokenXMLNoLeak(t *testing.T) { + var state toolStreamSieveState + // Simulate DeepSeek model generating tokens one at a time. + chunks := []string{ + "<", + "tool", + "_calls", + ">\n", + " <", + "tool", + "_call", + ">\n", + " <", + "tool", + "_name", + ">", + "read", + "_file", + "\n", + " <", + "parameters", + ">", + `{"path"`, + `: "README.MD"`, + `}`, + "\n", + " \n", + "", + } + var events []toolStreamEvent + for _, c := range chunks { + events = append(events, processToolSieveChunk(&state, c, []string{"read_file"})...) + } + events = append(events, flushToolSieve(&state, []string{"read_file"})...) 
+ + var textContent string + var toolCalls int + for _, evt := range events { + if evt.Content != "" { + textContent += evt.Content + } + toolCalls += len(evt.ToolCalls) + } + + if strings.Contains(textContent, "") { + t.Fatalf("closing tag fragment leaked to text: %q", textContent) + } + if strings.Contains(textContent, "read_file") { + t.Fatalf("tool name leaked to text: %q", textContent) + } + if toolCalls == 0 { + t.Fatal("expected tool calls to be extracted, got none") + } +} + +// Test that flushToolSieve on incomplete XML does NOT leak the raw XML content. +func TestFlushToolSieveIncompleteXMLDoesNotLeak(t *testing.T) { + var state toolStreamSieveState + // XML block starts but stream ends before completion. + chunks := []string{ + "\n", + " \n", + " read_file\n", + } + var events []toolStreamEvent + for _, c := range chunks { + events = append(events, processToolSieveChunk(&state, c, []string{"read_file"})...) + } + // Stream ends abruptly - flush should NOT dump raw XML. + events = append(events, flushToolSieve(&state, []string{"read_file"})...) + + var textContent string + for _, evt := range events { + if evt.Content != "" { + textContent += evt.Content + } + } + + if strings.Contains(textContent, "\n " is NOT emitted as text content. +func TestOpeningXMLTagNotLeakedAsContent(t *testing.T) { + var state toolStreamSieveState + // First chunk is the opening tag - should be held, not emitted. + evts1 := processToolSieveChunk(&state, "\n ", []string{"read_file"}) + for _, evt := range evts1 { + if strings.Contains(evt.Content, "") { + t.Fatalf("opening tag leaked on first chunk: %q", evt.Content) + } + } + + // Remaining content arrives. + evts2 := processToolSieveChunk(&state, "\n read_file\n {\"path\":\"README.MD\"}\n \n", []string{"read_file"}) + evts2 = append(evts2, flushToolSieve(&state, []string{"read_file"})...) + + var textContent string + var toolCalls int + allEvents := append(evts1, evts2...) 
+ for _, evt := range allEvents { + if evt.Content != "" { + textContent += evt.Content + } + toolCalls += len(evt.ToolCalls) + } + + if strings.Contains(textContent, "' }, + { open: '' }, + { open: '' }, + { open: '' }, + { open: '' }, + { open: '' }, +]; + +const XML_TOOL_OPENING_TAGS = XML_TOOL_TAG_PAIRS.map(p => p.open); function consumeXMLToolCapture(captured, toolNames, trimWrappingJSONFence) { const lower = captured.toLowerCase(); - let openIdx = -1; - for (const tag of XML_TOOL_OPENING_TAGS) { - const idx = lower.indexOf(tag); - if (idx >= 0 && (openIdx < 0 || idx < openIdx)) { - openIdx = idx; + // Find the FIRST matching open/close pair, preferring wrapper tags. + for (const pair of XML_TOOL_TAG_PAIRS) { + const openIdx = lower.indexOf(pair.open); + if (openIdx < 0) { + continue; } - } - if (openIdx < 0) { - return { ready: false, prefix: '', calls: [], suffix: '' }; - } - let closeIdx = -1; - for (const tag of XML_TOOL_CLOSING_TAGS) { - const idx = lower.indexOf(tag, openIdx); - if (idx >= 0) { - const absEnd = idx + tag.length; - if (closeIdx < 0 || absEnd > closeIdx) { - closeIdx = absEnd; - } + // Find the LAST occurrence of the specific closing tag. + const closeIdx = lower.lastIndexOf(pair.close); + if (closeIdx < openIdx) { + // Opening tag present but specific closing tag hasn't arrived. + // Return not-ready — do NOT fall through to inner pairs. 
+ return { ready: false, prefix: '', calls: [], suffix: '' }; } + const closeEnd = closeIdx + pair.close.length; + const xmlBlock = captured.slice(openIdx, closeEnd); + let prefixPart = captured.slice(0, openIdx); + let suffixPart = captured.slice(closeEnd); + const parsed = parseToolCalls(xmlBlock, toolNames); + if (Array.isArray(parsed) && parsed.length > 0) { + const trimmedFence = trimWrappingJSONFence(prefixPart, suffixPart); + return { + ready: true, + prefix: trimmedFence.prefix, + calls: parsed, + suffix: trimmedFence.suffix, + }; + } + // XML tool syntax but failed to parse — consume to avoid leak. + return { ready: true, prefix: prefixPart, calls: [], suffix: suffixPart }; } - if (closeIdx <= 0) { - return { ready: false, prefix: '', calls: [], suffix: '' }; - } - const xmlBlock = captured.slice(openIdx, closeIdx); - let prefixPart = captured.slice(0, openIdx); - let suffixPart = captured.slice(closeIdx); - const parsed = parseToolCalls(xmlBlock, toolNames); - if (Array.isArray(parsed) && parsed.length > 0) { - const trimmedFence = trimWrappingJSONFence(prefixPart, suffixPart); - return { - ready: true, - prefix: trimmedFence.prefix, - calls: parsed, - suffix: trimmedFence.suffix, - }; - } - return { ready: true, prefix: prefixPart, calls: [], suffix: suffixPart }; + return { ready: false, prefix: '', calls: [], suffix: '' }; } function hasOpenXMLToolTag(captured) { const lower = captured.toLowerCase(); - for (const tag of XML_TOOL_OPENING_TAGS) { - if (lower.includes(tag)) { - let hasClosed = false; - for (const ct of XML_TOOL_CLOSING_TAGS) { - if (lower.includes(ct)) { - hasClosed = true; - break; - } - } - if (!hasClosed) { + for (const pair of XML_TOOL_TAG_PAIRS) { + if (lower.includes(pair.open)) { + if (!lower.includes(pair.close)) { return true; } } diff --git a/internal/js/helpers/stream-tool-sieve/sieve.js b/internal/js/helpers/stream-tool-sieve/sieve.js index bd7e7cc..43c9224 100644 --- a/internal/js/helpers/stream-tool-sieve/sieve.js +++ 
b/internal/js/helpers/stream-tool-sieve/sieve.js @@ -191,7 +191,12 @@ function findToolSegmentStart(state, s) { } const keyIdx = bestKeyIdx; const start = s.slice(0, keyIdx).lastIndexOf('{'); - const candidateStart = start >= 0 ? start : keyIdx; + let candidateStart = start >= 0 ? start : keyIdx; + // If the keyword matched inside an XML tag (e.g. "tool_calls" in "<tool_calls>"), + // back up past the '<' to capture the full tag. + if (candidateStart > 0 && s[candidateStart - 1] === '<') { + candidateStart--; + } if (!insideCodeFenceWithState(state, s.slice(0, candidateStart))) { return candidateStart; }