diff --git a/internal/adapter/openai/prompt_build_test.go b/internal/adapter/openai/prompt_build_test.go index 724fef8..bc89bd6 100644 --- a/internal/adapter/openai/prompt_build_test.go +++ b/internal/adapter/openai/prompt_build_test.go @@ -74,7 +74,7 @@ func TestBuildOpenAIFinalPrompt_VercelPreparePathKeepsFinalAnswerInstruction(t * } finalPrompt, _ := buildOpenAIFinalPrompt(messages, tools, "", false) - if !strings.Contains(finalPrompt, "Remember: Output ONLY the ... XML block when calling tools.") { + if !strings.Contains(finalPrompt, "Remember: The ONLY valid way to use tools is the XML block at the end of your response.") { t.Fatalf("vercel prepare finalPrompt missing final tool-call anchor instruction: %q", finalPrompt) } if !strings.Contains(finalPrompt, "TOOL CALL FORMAT") { diff --git a/internal/adapter/openai/tool_sieve_core.go b/internal/adapter/openai/tool_sieve_core.go index 241cc06..4fbd64d 100644 --- a/internal/adapter/openai/tool_sieve_core.go +++ b/internal/adapter/openai/tool_sieve_core.go @@ -60,7 +60,7 @@ func processToolSieveChunk(state *toolStreamSieveState, chunk string, toolNames if pending == "" { break } - start := findToolSegmentStart(pending) + start := findToolSegmentStart(state, pending) if start >= 0 { prefix := pending[:start] if prefix != "" { @@ -74,7 +74,7 @@ func processToolSieveChunk(state *toolStreamSieveState, chunk string, toolNames continue } - safe, hold := splitSafeContentForToolDetection(pending) + safe, hold := splitSafeContentForToolDetection(state, pending) if safe == "" { break } @@ -114,14 +114,10 @@ func flushToolSieve(state *toolStreamSieveState, toolNames []string) []toolStrea } else { content := state.capture.String() if content != "" { - // If the captured text looks like an incomplete XML tool call block, - // swallow it to prevent leaking raw XML tags to the client. - if hasOpenXMLToolTag(content) { - // Drop it silently — incomplete tool call. - } else { - state.noteText(content) - events = append(events, toolStreamEvent{Content: content}) - } + // If capture never resolved into a real tool call, release the + // buffered text instead of swallowing it. + state.noteText(content) + events = append(events, toolStreamEvent{Content: content}) } } state.capture.Reset() @@ -130,24 +126,22 @@ func flushToolSieve(state *toolStreamSieveState, toolNames []string) []toolStrea } if state.pending.Len() > 0 { content := state.pending.String() - // Safety: if pending contains XML tool tag fragments (e.g. "tool_calls>" - // from a split closing tag), swallow them instead of leaking. - if hasOpenXMLToolTag(content) || looksLikeXMLToolTagFragment(content) { - // Drop it — likely an incomplete tool call fragment. - } else { - state.noteText(content) - events = append(events, toolStreamEvent{Content: content}) - } + // If pending never resolved into a real tool call, release it as text. + state.noteText(content) + events = append(events, toolStreamEvent{Content: content}) state.pending.Reset() } return events } -func splitSafeContentForToolDetection(s string) (safe, hold string) { +func splitSafeContentForToolDetection(state *toolStreamSieveState, s string) (safe, hold string) { if s == "" { return "", "" } if xmlIdx := findPartialXMLToolTagStart(s); xmlIdx >= 0 { + if insideCodeFenceWithState(state, s[:xmlIdx]) { + return s, "" + } if xmlIdx > 0 { return s[:xmlIdx], s[xmlIdx:] } @@ -156,19 +150,33 @@ func splitSafeContentForToolDetection(s string) (safe, hold string) { return s, "" } -func findToolSegmentStart(s string) int { +func findToolSegmentStart(state *toolStreamSieveState, s string) int { if s == "" { return -1 } lower := strings.ToLower(s) - bestKeyIdx := -1 - for _, tag := range xmlToolTagsToDetect { - idx := strings.Index(lower, tag) - if idx >= 0 && (bestKeyIdx < 0 || idx < bestKeyIdx) { - bestKeyIdx = idx + offset := 0 + for { + bestKeyIdx := -1 + matchedTag := "" + for _, tag := range xmlToolTagsToDetect { + idx := strings.Index(lower[offset:], tag) + if idx >= 0 { + idx += offset + if bestKeyIdx < 0 || idx < bestKeyIdx { + bestKeyIdx = idx + matchedTag = tag + } + } } + if bestKeyIdx < 0 { + return -1 + } + if !insideCodeFenceWithState(state, s[:bestKeyIdx]) { + return bestKeyIdx + } + offset = bestKeyIdx + len(matchedTag) } - return bestKeyIdx } func consumeToolCapture(state *toolStreamSieveState, toolNames []string) (prefix string, calls []toolcall.ParsedToolCall, suffix string, ready bool) { diff --git a/internal/adapter/openai/tool_sieve_state.go b/internal/adapter/openai/tool_sieve_state.go index 1b1b96e..09de2a5 100644 --- a/internal/adapter/openai/tool_sieve_state.go +++ b/internal/adapter/openai/tool_sieve_state.go @@ -6,19 +6,22 @@ import ( ) type toolStreamSieveState struct { - pending strings.Builder - capture strings.Builder - capturing bool - recentTextTail string - pendingToolRaw string - pendingToolCalls []toolcall.ParsedToolCall - disableDeltas bool - toolNameSent bool - toolName string - toolArgsStart int - toolArgsSent int - toolArgsString bool - toolArgsDone bool + pending strings.Builder + capture strings.Builder + capturing bool + codeFenceStack []int + codeFencePendingTicks int + codeFenceLineStart bool + recentTextTail string + pendingToolRaw string + pendingToolCalls []toolcall.ParsedToolCall + disableDeltas bool + toolNameSent bool + toolName string + toolArgsStart int + toolArgsSent int + toolArgsString bool + toolArgsDone bool } type toolStreamEvent struct { @@ -47,9 +50,10 @@ func (s *toolStreamSieveState) resetIncrementalToolState() { } func (s *toolStreamSieveState) noteText(content string) { - if content == "" { + if !hasMeaningfulText(content) { return } + updateCodeFenceState(s, content) s.recentTextTail = appendTail(s.recentTextTail, content, toolSieveContextTailLimit) } @@ -63,3 +67,107 @@ func appendTail(prev, next string, max int) string { } return combined[len(combined)-max:] } + +func hasMeaningfulText(text string) bool { + return strings.TrimSpace(text) != "" +} + +func insideCodeFenceWithState(state *toolStreamSieveState, text string) bool { + if state == nil { + return insideCodeFence(text) + } + simulated := simulateCodeFenceState( + state.codeFenceStack, + state.codeFencePendingTicks, + state.codeFenceLineStart, + text, + ) + return len(simulated.stack) > 0 +} + +func insideCodeFence(text string) bool { + if text == "" { + return false + } + return len(simulateCodeFenceState(nil, 0, true, text).stack) > 0 +} + +func updateCodeFenceState(state *toolStreamSieveState, text string) { + if state == nil || !hasMeaningfulText(text) { + return + } + next := simulateCodeFenceState( + state.codeFenceStack, + state.codeFencePendingTicks, + state.codeFenceLineStart, + text, + ) + state.codeFenceStack = next.stack + state.codeFencePendingTicks = next.pendingTicks + state.codeFenceLineStart = next.lineStart +} + +type codeFenceSimulation struct { + stack []int + pendingTicks int + lineStart bool +} + +func simulateCodeFenceState(stack []int, pendingTicks int, lineStart bool, text string) codeFenceSimulation { + chunk := text + nextStack := append([]int(nil), stack...) + ticks := pendingTicks + atLineStart := lineStart + + flushTicks := func() { + if ticks > 0 { + if atLineStart && ticks >= 3 { + applyFenceMarker(&nextStack, ticks) + } + atLineStart = false + ticks = 0 + } + } + + for i := 0; i < len(chunk); i++ { + ch := chunk[i] + if ch == '`' { + ticks++ + continue + } + flushTicks() + switch ch { + case '\n', '\r': + atLineStart = true + case ' ', '\t': + if atLineStart { + continue + } + atLineStart = false + default: + atLineStart = false + } + } + + return codeFenceSimulation{ + stack: nextStack, + pendingTicks: ticks, + lineStart: atLineStart, + } +} + +func applyFenceMarker(stack *[]int, ticks int) { + if stack == nil || ticks <= 0 { + return + } + if len(*stack) == 0 { + *stack = append(*stack, ticks) + return + } + top := (*stack)[len(*stack)-1] + if ticks >= top { + *stack = (*stack)[:len(*stack)-1] + return + } + *stack = append(*stack, ticks) +} diff --git a/internal/adapter/openai/tool_sieve_xml.go b/internal/adapter/openai/tool_sieve_xml.go index bcafe08..b019b93 100644 --- a/internal/adapter/openai/tool_sieve_xml.go +++ b/internal/adapter/openai/tool_sieve_xml.go @@ -26,8 +26,8 @@ var xmlToolCallTagPairs = []struct{ open, close string }{ {""}, {""}, // Agent-style: these are XML "tool call" patterns from coding agents. - // They get captured → parsed. If parsing fails, the block is consumed - // (swallowed) to prevent raw XML from leaking to the client. + // They get captured → parsed. If parsing fails, the raw XML is preserved + // so the caller can still see the original text. {""}, {""}, {""}, @@ -73,31 +73,12 @@ func consumeXMLToolCapture(captured string, toolNames []string) (prefix string, prefixPart, suffixPart = trimWrappingJSONFence(prefixPart, suffixPart) return prefixPart, parsed, suffixPart, true } - // If this block does not look like an executable tool-call payload, - // pass it through as normal content (e.g. user-requested XML snippets). - if !looksLikeExecutableXMLToolCallBlock(xmlBlock, pair.open) { - return prefixPart + xmlBlock, nil, suffixPart, true - } - // Looks like XML tool syntax but failed to parse — consume it to avoid leak. - return prefixPart, nil, suffixPart, true + // If this block failed to become a tool call, pass it through as text. + return prefixPart + xmlBlock, nil, suffixPart, true } return "", nil, "", false } -func looksLikeExecutableXMLToolCallBlock(xmlBlock, openTag string) bool { - lower := strings.ToLower(xmlBlock) - // Agent wrapper tags are always treated as internal tool-call wrappers. - switch openTag { - case "" or "/tool_call>\n". -// These fragments arise when '<' was consumed separately and the tail remains. -func looksLikeXMLToolTagFragment(s string) bool { - trimmed := strings.TrimSpace(s) - if trimmed == "" { - return false - } - lower := strings.ToLower(trimmed) - // Check for closing tag tails like "tool_calls>" or "/tool_calls>" - fragments := []string{ - "tool_calls>", "tool_call>", "/tool_calls>", "/tool_call>", - "function_calls>", "function_call>", "/function_calls>", "/function_call>", - "invoke>", "/invoke>", "tool_use>", "/tool_use>", - "tool_name>", "/tool_name>", "parameters>", "/parameters>", - // Agent-style tag fragments - "attempt_completion>", "/attempt_completion>", - "ask_followup_question>", "/ask_followup_question>", - "new_task>", "/new_task>", - "result>", "/result>", - } - for _, f := range fragments { - if strings.Contains(lower, f) { - return true - } - } - return false -} diff --git a/internal/adapter/openai/tool_sieve_xml_test.go b/internal/adapter/openai/tool_sieve_xml_test.go index e7ad842..7fd123d 100644 --- a/internal/adapter/openai/tool_sieve_xml_test.go +++ b/internal/adapter/openai/tool_sieve_xml_test.go @@ -121,6 +121,105 @@ func TestProcessToolSieveNonToolXMLKeepsSuffixForToolParsing(t *testing.T) { } } +func TestProcessToolSievePassesThroughMalformedExecutableXMLBlock(t *testing.T) { + var state toolStreamSieveState + chunk := `{"path":"README.md"}` + events := processToolSieveChunk(&state, chunk, []string{"read_file"}) + events = append(events, flushToolSieve(&state, []string{"read_file"})...) + + var textContent strings.Builder + toolCalls := 0 + for _, evt := range events { + textContent.WriteString(evt.Content) + toolCalls += len(evt.ToolCalls) + } + + if toolCalls != 0 { + t.Fatalf("expected malformed executable-looking XML to stay text, got %d events=%#v", toolCalls, events) + } + if textContent.String() != chunk { + t.Fatalf("expected malformed executable-looking XML to pass through unchanged, got %q", textContent.String()) + } +} + +func TestProcessToolSievePassesThroughFencedXMLToolCallExamples(t *testing.T) { + var state toolStreamSieveState + input := strings.Join([]string{ + "Before first example.\n```", + "xml\nread_file{\"path\":\"README.md\"}\n```\n", + "Between examples.\n```xml\n", + "search{\"q\":\"golang\"}\n", + "```\nAfter examples.", + }, "") + + chunks := []string{ + "Before first example.\n```", + "xml\nread_file{\"path\":\"README.md\"}\n```\n", + "Between examples.\n```xml\n", + "search{\"q\":\"golang\"}\n", + "```\nAfter examples.", + } + + var events []toolStreamEvent + for _, c := range chunks { + events = append(events, processToolSieveChunk(&state, c, []string{"read_file", "search"})...) + } + events = append(events, flushToolSieve(&state, []string{"read_file", "search"})...) + + var textContent strings.Builder + toolCalls := 0 + for _, evt := range events { + if evt.Content != "" { + textContent.WriteString(evt.Content) + } + toolCalls += len(evt.ToolCalls) + } + + if toolCalls != 0 { + t.Fatalf("expected fenced XML examples to stay text, got %d tool calls events=%#v", toolCalls, events) + } + if textContent.String() != input { + t.Fatalf("expected fenced XML examples to pass through unchanged, got %q", textContent.String()) + } +} + +func TestProcessToolSieveKeepsPartialXMLTagInsideFencedExample(t *testing.T) { + var state toolStreamSieveState + input := strings.Join([]string{ + "Example:\n```xml\nread_file{\"path\":\"README.md\"}\n```\n", + "Done.", + }, "") + + chunks := []string{ + "Example:\n```xml\nread_file{\"path\":\"README.md\"}\n```\n", + "Done.", + } + + var events []toolStreamEvent + for _, c := range chunks { + events = append(events, processToolSieveChunk(&state, c, []string{"read_file"})...) + } + events = append(events, flushToolSieve(&state, []string{"read_file"})...) + + var textContent strings.Builder + toolCalls := 0 + for _, evt := range events { + if evt.Content != "" { + textContent.WriteString(evt.Content) + } + toolCalls += len(evt.ToolCalls) + } + + if toolCalls != 0 { + t.Fatalf("expected partial fenced XML to stay text, got %d tool calls events=%#v", toolCalls, events) + } + if textContent.String() != input { + t.Fatalf("expected partial fenced XML to pass through unchanged, got %q", textContent.String()) + } +} + func TestProcessToolSievePartialXMLTagHeldBack(t *testing.T) { var state toolStreamSieveState // Chunk ends with a partial XML tool tag. @@ -149,13 +248,14 @@ func TestFindToolSegmentStartDetectsXMLToolCalls(t *testing.T) { {"tool_calls_tag", "some text \n", 10}, {"tool_call_tag", "prefix \n", 7}, {"invoke_tag", "text body", 5}, + {"xml_inside_code_fence", "```xml\nread_file\n```", -1}, {"function_call_tag", "body", 0}, {"no_xml", "just plain text", -1}, {"gemini_json_no_detect", `some text {"functionCall":{"name":"search"}}`, -1}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { - got := findToolSegmentStart(tc.input) + got := findToolSegmentStart(nil, tc.input) if got != tc.want { t.Fatalf("findToolSegmentStart(%q) = %d, want %d", tc.input, got, tc.want) } @@ -269,8 +369,8 @@ func TestProcessToolSieveTokenByTokenXMLNoLeak(t *testing.T) { } } -// Test that flushToolSieve on incomplete XML does NOT leak the raw XML content. -func TestFlushToolSieveIncompleteXMLDoesNotLeak(t *testing.T) { +// Test that flushToolSieve on incomplete XML falls back to raw text. +func TestFlushToolSieveIncompleteXMLFallsBackToText(t *testing.T) { var state toolStreamSieveState // XML block starts but stream ends before completion. chunks := []string{ @@ -292,8 +392,8 @@ func TestFlushToolSieveIncompleteXMLDoesNotLeak(t *testing.T) { } } - if strings.Contains(textContent, "\n", @@ -357,7 +457,7 @@ func TestProcessToolSieveInterceptsAttemptCompletionLeak(t *testing.T) { t.Fatalf("expected leading text to be emitted, got %q", textContent) } - if strings.Contains(textContent, "") || strings.Contains(textContent, "result>") { - t.Fatalf("agent XML tag content leaked to text: %q", textContent) + if textContent != strings.Join(chunks, "") { + t.Fatalf("expected agent XML to fall back to raw text, got %q", textContent) } } diff --git a/internal/js/helpers/stream-tool-sieve/sieve-xml.js b/internal/js/helpers/stream-tool-sieve/sieve-xml.js index 7e61b24..6442dbc 100644 --- a/internal/js/helpers/stream-tool-sieve/sieve-xml.js +++ b/internal/js/helpers/stream-tool-sieve/sieve-xml.js @@ -42,8 +42,8 @@ function consumeXMLToolCapture(captured, toolNames, trimWrappingJSONFence) { suffix: trimmedFence.suffix, }; } - // XML tool syntax but failed to parse — consume to avoid leak. - return { ready: true, prefix: prefixPart, calls: [], suffix: suffixPart }; + // If this block failed to become a tool call, pass it through as text. + return { ready: true, prefix: prefixPart + xmlBlock, calls: [], suffix: suffixPart }; } return { ready: false, prefix: '', calls: [], suffix: '' }; } @@ -79,22 +79,8 @@ function findPartialXMLToolTagStart(s) { return -1; } -function looksLikeXMLToolTagFragment(s) { - const trimmed = (s || '').trim(); - if (!trimmed) return false; - const lower = trimmed.toLowerCase(); - const fragments = [ - 'tool_calls>', 'tool_call>', '/tool_calls>', '/tool_call>', - 'function_calls>', 'function_call>', '/function_calls>', '/function_call>', - 'invoke>', '/invoke>', 'tool_use>', '/tool_use>', - 'tool_name>', '/tool_name>', 'parameters>', '/parameters>', - ]; - return fragments.some(f => lower.includes(f)); -} - module.exports = { consumeXMLToolCapture, hasOpenXMLToolTag, findPartialXMLToolTagStart, - looksLikeXMLToolTagFragment, }; diff --git a/internal/js/helpers/stream-tool-sieve/sieve.js b/internal/js/helpers/stream-tool-sieve/sieve.js index 8916712..6ae85f7 100644 --- a/internal/js/helpers/stream-tool-sieve/sieve.js +++ b/internal/js/helpers/stream-tool-sieve/sieve.js @@ -12,7 +12,6 @@ const { consumeXMLToolCapture: consumeXMLToolCaptureImpl, hasOpenXMLToolTag, findPartialXMLToolTagStart, - looksLikeXMLToolTagFragment, } = require('./sieve-xml'); function processToolSieveChunk(state, chunk, toolNames) { if (!state) { @@ -77,7 +76,7 @@ function processToolSieveChunk(state, chunk, toolNames) { resetIncrementalToolState(state); continue; } - const [safe, hold] = splitSafeContentForToolDetection(pending); + const [safe, hold] = splitSafeContentForToolDetection(state, pending); if (!safe) { break; } @@ -114,26 +113,22 @@ function flushToolSieve(state, toolNames) { } } else if (state.capture) { const content = state.capture; - if (!hasOpenXMLToolTag(content) && !looksLikeXMLToolTagFragment(content)) { - noteText(state, content); - events.push({ type: 'text', text: content }); - } + noteText(state, content); + events.push({ type: 'text', text: content }); } state.capture = ''; state.capturing = false; resetIncrementalToolState(state); } if (state.pending) { - if (!hasOpenXMLToolTag(state.pending) && !looksLikeXMLToolTagFragment(state.pending)) { - noteText(state, state.pending); - events.push({ type: 'text', text: state.pending }); - } + noteText(state, state.pending); + events.push({ type: 'text', text: state.pending }); state.pending = ''; } return events; } -function splitSafeContentForToolDetection(s) { +function splitSafeContentForToolDetection(state, s) { const text = s || ''; if (!text) { return ['', '']; @@ -141,6 +136,9 @@ function splitSafeContentForToolDetection(s) { // Only hold back partial XML tool tags. const xmlIdx = findPartialXMLToolTagStart(text); if (xmlIdx >= 0) { + if (insideCodeFenceWithState(state, text.slice(0, xmlIdx))) { + return [text, '']; + } if (xmlIdx > 0) { return [text.slice(0, xmlIdx), text.slice(xmlIdx)]; } diff --git a/internal/toolcall/tool_prompt.go b/internal/toolcall/tool_prompt.go index 5ee60b6..8e896e9 100644 --- a/internal/toolcall/tool_prompt.go +++ b/internal/toolcall/tool_prompt.go @@ -36,8 +36,6 @@ func BuildToolCallInstructions(toolNames []string) string { return `TOOL CALL FORMAT — FOLLOW EXACTLY: -If you need to call tools, your entire response must be exactly one XML block and nothing else. - TOOL_NAME_HERE @@ -63,7 +61,8 @@ PARAMETER SHAPES: - array => repeated tags or children - number/bool/null => plain text -❌ WRONG — Do NOT do these: +【WRONG — Do NOT do these】: + Wrong 1 — mixed text after XML: ... I hope this helps. Wrong 2 — function-call syntax: @@ -74,14 +73,10 @@ Wrong 4 — Markdown code fences: ` + "```xml" + ` ... ` + "```" + ` -Wrong 5 — native tool tokens: - <|Tool|>call_some_tool{"param":1}<|Tool|> -Wrong 6 — role markers in response: - <|Assistant|> Here is the result... Remember: The ONLY valid way to use tools is the XML block at the end of your response. -✅ CORRECT EXAMPLES: +【CORRECT EXAMPLES】: Example A — Single tool: @@ -127,7 +122,7 @@ fi -Remember: Output ONLY the ... XML block when calling tools.` +` } func matchAny(name string, candidates ...string) bool { diff --git a/tests/node/stream-tool-sieve.test.js b/tests/node/stream-tool-sieve.test.js index 18a31c4..57c29f0 100644 --- a/tests/node/stream-tool-sieve.test.js +++ b/tests/node/stream-tool-sieve.test.js @@ -126,7 +126,16 @@ test('sieve keeps embedded invalid tool-like json as normal text to avoid stream assert.equal(leakedText.toLowerCase().includes('tool_calls'), true); }); -test('sieve flushes incomplete captured XML tool blocks without leaking raw tags', () => { +test('sieve passes malformed executable-looking XML through as text', () => { + const chunk = '{"path":"README.MD"}'; + const events = runSieve([chunk], ['read_file']); + const leakedText = collectText(events); + const hasToolCalls = events.some((evt) => evt.type === 'tool_calls' && evt.calls?.length > 0); + assert.equal(hasToolCalls, false); + assert.equal(leakedText, chunk); +}); + +test('sieve flushes incomplete captured XML tool blocks by falling back to raw text', () => { const events = runSieve( [ '前置正文G。', @@ -137,9 +146,10 @@ test('sieve flushes incomplete captured XML tool blocks without leaking raw tags ['read_file'], ); const leakedText = collectText(events); - assert.equal(leakedText.includes('前置正文G。'), true); - assert.equal(leakedText.toLowerCase().includes('tool_calls'), false); - assert.equal(leakedText.includes('\n', ' \n', ' read_file\n'].join(''); + const hasToolCalls = events.some((evt) => evt.type === 'tool_calls' && evt.calls?.length > 0); + assert.equal(hasToolCalls, false); + assert.equal(leakedText, expected); }); test('sieve captures XML wrapper tags with attributes without leaking wrapper text', () => {