fix: prevent XML tool call leakage by strictly matching opening and closing tag pairs during streaming

This commit is contained in:
CJACK
2026-03-29 14:40:47 +08:00
parent 302bcefeb5
commit 4b42fe9086
5 changed files with 235 additions and 95 deletions

View File

@@ -114,8 +114,14 @@ func flushToolSieve(state *toolStreamSieveState, toolNames []string) []toolStrea
} else {
content := state.capture.String()
if content != "" {
state.noteText(content)
events = append(events, toolStreamEvent{Content: content})
// If the captured text looks like an incomplete XML tool call block,
// swallow it to prevent leaking raw XML tags to the client.
if hasOpenXMLToolTag(content) {
// Drop it silently — incomplete tool call.
} else {
state.noteText(content)
events = append(events, toolStreamEvent{Content: content})
}
}
}
state.capture.Reset()
@@ -200,6 +206,11 @@ func findToolSegmentStart(s string) int {
if start < 0 {
start = bestKeyIdx
}
// If the keyword matched inside an XML tag (e.g. "tool_calls" in "<tool_calls>"),
// back up past the '<' to capture the full tag.
if start > 0 && s[start-1] == '<' {
start--
}
if fenceStart, ok := openFenceStartBefore(s, start); ok {
return fenceStart
}

View File

@@ -12,6 +12,17 @@ import (
// xmlToolCallClosingTags lists the complete, lowercase closing tags of the XML
// tool-call elements handled here.
var xmlToolCallClosingTags = []string{"</tool_calls>", "</tool_call>", "</invoke>", "</function_call>", "</function_calls>", "</tool_use>"}
// xmlToolCallOpeningTags lists the corresponding opening tags as lowercase
// prefixes written without the trailing '>'.
// NOTE(review): presumably the '>' is left off so that a substring search also
// hits opening tags that carry attributes — confirm against the callers.
var xmlToolCallOpeningTags = []string{"<tool_calls", "<tool_call", "<invoke", "<function_call", "<function_calls", "<tool_use"}
// xmlToolCallTagPairs associates every recognised opening-tag prefix with the
// exact closing tag that terminates it.
// The order is significant: "<tool_call" is itself a prefix of "<tool_calls"
// (and "<function_call" of "<function_calls"), so each plural wrapper entry
// is listed before its singular counterpart.
var xmlToolCallTagPairs = []struct {
	open  string
	close string
}{
	{open: "<tool_calls", close: "</tool_calls>"},
	{open: "<tool_call", close: "</tool_call>"},
	{open: "<function_calls", close: "</function_calls>"},
	{open: "<function_call", close: "</function_call>"},
	{open: "<invoke", close: "</invoke>"},
	{open: "<tool_use", close: "</tool_use>"},
}
// xmlToolCallBlockPattern matches a complete XML tool call block (wrapper or standalone).
// The (?is) flags make the match case-insensitive and let '.' span newlines.
// Every alternative uses a lazy .*? body, so a match stops at the first closing
// tag that alternative accepts. Only the <invoke> and <function_call(s)>
// alternatives allow attributes on the opening tag; <tool_calls>, <tool_call>
// and <tool_use> must appear without attributes.
var xmlToolCallBlockPattern = regexp.MustCompile(`(?is)(<tool_calls>\s*(?:.*?)\s*</tool_calls>|<tool_call>\s*(?:.*?)\s*</tool_call>|<invoke\b[^>]*>(?:.*?)</invoke>|<function_calls?\b[^>]*>(?:.*?)</function_calls?>|<tool_use>(?:.*?)</tool_use>)`)
@@ -22,59 +33,45 @@ var xmlToolTagsToDetect = []string{"<tool_calls>", "<tool_calls\n", "<tool_call>
// consumeXMLToolCapture tries to extract complete XML tool call blocks from captured text.
func consumeXMLToolCapture(captured string, toolNames []string) (prefix string, calls []util.ParsedToolCall, suffix string, ready bool) {
lower := strings.ToLower(captured)
// Find the earliest XML tool opening tag.
openIdx := -1
for _, tag := range xmlToolCallOpeningTags {
idx := strings.Index(lower, tag)
if idx >= 0 && (openIdx < 0 || idx < openIdx) {
openIdx = idx
// Find the FIRST matching open/close pair, preferring wrapper tags.
// Tag pairs are ordered longest-first (e.g. <tool_calls before <tool_call)
// so wrapper tags are checked before inner tags.
for _, pair := range xmlToolCallTagPairs {
openIdx := strings.Index(lower, pair.open)
if openIdx < 0 {
continue
}
}
if openIdx < 0 {
return "", nil, "", false
}
// Look for a matching closing tag.
closeIdx := -1
for _, tag := range xmlToolCallClosingTags {
idx := strings.Index(lower[openIdx:], tag)
if idx >= 0 {
absEnd := openIdx + idx + len(tag)
if closeIdx < 0 || absEnd > closeIdx {
closeIdx = absEnd
}
// Find the LAST occurrence of the specific closing tag to get the outermost block.
closeIdx := strings.LastIndex(lower, pair.close)
if closeIdx < openIdx {
// Opening tag is present but its specific closing tag hasn't arrived.
// Return not-ready so we keep buffering — do NOT fall through to
// try inner pairs (e.g. <tool_call inside <tool_calls).
return "", nil, "", false
}
}
if closeIdx <= 0 {
return "", nil, "", false
}
closeEnd := closeIdx + len(pair.close)
xmlBlock := captured[openIdx:closeIdx]
prefixPart := captured[:openIdx]
suffixPart := captured[closeIdx:]
parsed := util.ParseToolCalls(xmlBlock, toolNames)
if len(parsed) > 0 {
prefixPart, suffixPart = trimWrappingJSONFence(prefixPart, suffixPart)
return prefixPart, parsed, suffixPart, true
xmlBlock := captured[openIdx:closeEnd]
prefixPart := captured[:openIdx]
suffixPart := captured[closeEnd:]
parsed := util.ParseToolCalls(xmlBlock, toolNames)
if len(parsed) > 0 {
prefixPart, suffixPart = trimWrappingJSONFence(prefixPart, suffixPart)
return prefixPart, parsed, suffixPart, true
}
// Looks like XML tool syntax but failed to parse — consume it to avoid leak.
return prefixPart, nil, suffixPart, true
}
// Looks like XML tool syntax but failed to parse — consume it to avoid leak.
return prefixPart, nil, suffixPart, true
return "", nil, "", false
}
// hasOpenXMLToolTag returns true if captured text contains an XML tool opening tag
// but no corresponding closing tag yet.
// whose SPECIFIC closing tag has not appeared yet.
func hasOpenXMLToolTag(captured string) bool {
lower := strings.ToLower(captured)
for _, tag := range xmlToolCallOpeningTags {
if strings.Contains(lower, tag) {
hasClosed := false
for _, ct := range xmlToolCallClosingTags {
if strings.Contains(lower, ct) {
hasClosed = true
break
}
}
if !hasClosed {
for _, pair := range xmlToolCallTagPairs {
if strings.Contains(lower, pair.open) {
if !strings.Contains(lower, pair.close) {
return true
}
}

View File

@@ -153,3 +153,135 @@ func TestHasOpenXMLToolTag(t *testing.T) {
t.Fatal("should return false for plain text")
}
}
// TestProcessToolSieveTokenByTokenXMLNoLeak feeds a complete <tool_calls>
// block to the sieve in token-sized fragments, so that no single chunk holds a
// whole tag. It verifies that none of the XML (opening tag, closing-tag
// fragment or tool name) reaches the text output and that at least one tool
// call is extracted.
func TestProcessToolSieveTokenByTokenXMLNoLeak(t *testing.T) {
	const toolName = "read_file"

	// One entry per simulated model token; every tag is deliberately split
	// across several entries.
	fragments := []string{
		"<", "tool", "_calls", ">\n",
		" <", "tool", "_call", ">\n",
		" <", "tool", "_name", ">", "read", "_file", "</", "tool", "_name", ">\n",
		" <", "parameters", ">", `{"path"`, `: "README.MD"`, `}`, "</", "parameters", ">\n",
		" </", "tool", "_call", ">\n",
		"</", "tool", "_calls", ">",
	}

	var (
		sieve     toolStreamSieveState
		text      strings.Builder
		callCount int
	)
	// collect folds a batch of sieve events into the running text and the
	// running tool-call count.
	collect := func(batch []toolStreamEvent) {
		for i := range batch {
			text.WriteString(batch[i].Content)
			callCount += len(batch[i].ToolCalls)
		}
	}

	for _, frag := range fragments {
		collect(processToolSieveChunk(&sieve, frag, []string{toolName}))
	}
	collect(flushToolSieve(&sieve, []string{toolName}))

	// The cases are ordered like the original checks; the first failing one
	// aborts the test.
	emitted := text.String()
	switch {
	case strings.Contains(emitted, "<tool_call"):
		t.Fatalf("XML tool call content leaked to text in token-by-token mode: %q", emitted)
	case strings.Contains(emitted, "tool_calls>"):
		t.Fatalf("closing tag fragment leaked to text: %q", emitted)
	case strings.Contains(emitted, "read_file"):
		t.Fatalf("tool name leaked to text: %q", emitted)
	case callCount == 0:
		t.Fatal("expected tool calls to be extracted, got none")
	}
}
// TestFlushToolSieveIncompleteXMLDoesNotLeak starts an XML tool-call block,
// ends the stream before the block is closed, and verifies that flushing the
// sieve does not emit the raw, unfinished XML as text.
func TestFlushToolSieveIncompleteXMLDoesNotLeak(t *testing.T) {
	const toolName = "read_file"

	// The block is opened but never closed: the stream stops after the tool name.
	partial := []string{
		"<tool_calls>\n",
		" <tool_call>\n",
		" <tool_name>read_file</tool_name>\n",
	}

	var (
		sieve toolStreamSieveState
		text  strings.Builder
	)
	// collect appends the text content of every event in the batch.
	collect := func(batch []toolStreamEvent) {
		for i := range batch {
			text.WriteString(batch[i].Content)
		}
	}

	for _, piece := range partial {
		collect(processToolSieveChunk(&sieve, piece, []string{toolName}))
	}
	// Abrupt end of stream: the flush must not dump the buffered XML.
	collect(flushToolSieve(&sieve, []string{toolName}))

	if emitted := text.String(); strings.Contains(emitted, "<tool_call") {
		t.Fatalf("incomplete XML leaked on flush: %q", emitted)
	}
}
// TestOpeningXMLTagNotLeakedAsContent sends the opening "<tool_calls>\n " tag
// as a chunk of its own and checks that it is not emitted as text right away.
// It then delivers the rest of the block and checks that no XML reaches the
// text output while a tool call is still extracted.
func TestOpeningXMLTagNotLeakedAsContent(t *testing.T) {
	const toolName = "read_file"
	var sieve toolStreamSieveState

	// Step 1: the opening tag alone must be held back, not emitted.
	head := processToolSieveChunk(&sieve, "<tool_calls>\n ", []string{toolName})
	for i := range head {
		if leaked := head[i].Content; strings.Contains(leaked, "<tool_calls>") {
			t.Fatalf("opening tag leaked on first chunk: %q", leaked)
		}
	}

	// Step 2: the remainder of the block arrives, followed by end of stream.
	tail := processToolSieveChunk(&sieve, "<tool_call>\n <tool_name>read_file</tool_name>\n <parameters>{\"path\":\"README.MD\"}</parameters>\n </tool_call>\n</tool_calls>", []string{toolName})
	tail = append(tail, flushToolSieve(&sieve, []string{toolName})...)

	// Step 3: aggregate text and tool calls over both batches, in arrival order.
	var (
		text      strings.Builder
		callCount int
	)
	for _, batch := range [][]toolStreamEvent{head, tail} {
		for i := range batch {
			text.WriteString(batch[i].Content)
			callCount += len(batch[i].ToolCalls)
		}
	}

	emitted := text.String()
	switch {
	case strings.Contains(emitted, "<tool_call"):
		t.Fatalf("XML content leaked: %q", emitted)
	case callCount == 0:
		t.Fatal("expected tool calls to be extracted")
	}
}

View File

@@ -1,63 +1,58 @@
'use strict';
const { parseToolCalls } = require('./parse');
const {
XML_TOOL_OPENING_TAGS,
XML_TOOL_CLOSING_TAGS,
} = require('./tool-keywords');
// Each entry pairs an opening-tag prefix (written without the trailing '>')
// with the exact closing tag that ends it.
// Order matters: '<tool_call' is itself a prefix of '<tool_calls' (and
// '<function_call' of '<function_calls'), so each plural wrapper entry is
// listed before its singular form and is therefore checked first.
const XML_TOOL_TAG_PAIRS = [
{ open: '<tool_calls', close: '</tool_calls>' },
{ open: '<tool_call', close: '</tool_call>' },
{ open: '<function_calls', close: '</function_calls>' },
{ open: '<function_call', close: '</function_call>' },
{ open: '<invoke', close: '</invoke>' },
{ open: '<tool_use', close: '</tool_use>' },
];
const XML_TOOL_OPENING_TAGS = XML_TOOL_TAG_PAIRS.map(p => p.open);
function consumeXMLToolCapture(captured, toolNames, trimWrappingJSONFence) {
const lower = captured.toLowerCase();
let openIdx = -1;
for (const tag of XML_TOOL_OPENING_TAGS) {
const idx = lower.indexOf(tag);
if (idx >= 0 && (openIdx < 0 || idx < openIdx)) {
openIdx = idx;
// Find the FIRST matching open/close pair, preferring wrapper tags.
for (const pair of XML_TOOL_TAG_PAIRS) {
const openIdx = lower.indexOf(pair.open);
if (openIdx < 0) {
continue;
}
}
if (openIdx < 0) {
return { ready: false, prefix: '', calls: [], suffix: '' };
}
let closeIdx = -1;
for (const tag of XML_TOOL_CLOSING_TAGS) {
const idx = lower.indexOf(tag, openIdx);
if (idx >= 0) {
const absEnd = idx + tag.length;
if (closeIdx < 0 || absEnd > closeIdx) {
closeIdx = absEnd;
}
// Find the LAST occurrence of the specific closing tag.
const closeIdx = lower.lastIndexOf(pair.close);
if (closeIdx < openIdx) {
// Opening tag present but specific closing tag hasn't arrived.
// Return not-ready — do NOT fall through to inner pairs.
return { ready: false, prefix: '', calls: [], suffix: '' };
}
const closeEnd = closeIdx + pair.close.length;
const xmlBlock = captured.slice(openIdx, closeEnd);
let prefixPart = captured.slice(0, openIdx);
let suffixPart = captured.slice(closeEnd);
const parsed = parseToolCalls(xmlBlock, toolNames);
if (Array.isArray(parsed) && parsed.length > 0) {
const trimmedFence = trimWrappingJSONFence(prefixPart, suffixPart);
return {
ready: true,
prefix: trimmedFence.prefix,
calls: parsed,
suffix: trimmedFence.suffix,
};
}
// XML tool syntax but failed to parse — consume to avoid leak.
return { ready: true, prefix: prefixPart, calls: [], suffix: suffixPart };
}
if (closeIdx <= 0) {
return { ready: false, prefix: '', calls: [], suffix: '' };
}
const xmlBlock = captured.slice(openIdx, closeIdx);
let prefixPart = captured.slice(0, openIdx);
let suffixPart = captured.slice(closeIdx);
const parsed = parseToolCalls(xmlBlock, toolNames);
if (Array.isArray(parsed) && parsed.length > 0) {
const trimmedFence = trimWrappingJSONFence(prefixPart, suffixPart);
return {
ready: true,
prefix: trimmedFence.prefix,
calls: parsed,
suffix: trimmedFence.suffix,
};
}
return { ready: true, prefix: prefixPart, calls: [], suffix: suffixPart };
return { ready: false, prefix: '', calls: [], suffix: '' };
}
function hasOpenXMLToolTag(captured) {
const lower = captured.toLowerCase();
for (const tag of XML_TOOL_OPENING_TAGS) {
if (lower.includes(tag)) {
let hasClosed = false;
for (const ct of XML_TOOL_CLOSING_TAGS) {
if (lower.includes(ct)) {
hasClosed = true;
break;
}
}
if (!hasClosed) {
for (const pair of XML_TOOL_TAG_PAIRS) {
if (lower.includes(pair.open)) {
if (!lower.includes(pair.close)) {
return true;
}
}

View File

@@ -191,7 +191,12 @@ function findToolSegmentStart(state, s) {
}
const keyIdx = bestKeyIdx;
const start = s.slice(0, keyIdx).lastIndexOf('{');
const candidateStart = start >= 0 ? start : keyIdx;
let candidateStart = start >= 0 ? start : keyIdx;
// If the keyword matched inside an XML tag (e.g. "tool_calls" in "<tool_calls>"),
// back up past the '<' to capture the full tag.
if (candidateStart > 0 && s[candidateStart - 1] === '<') {
candidateStart--;
}
if (!insideCodeFenceWithState(state, s.slice(0, candidateStart))) {
return candidateStart;
}