From 26d195f2a6c74daf45eca224e9031936348e40ad Mon Sep 17 00:00:00 2001 From: CJACK Date: Sun, 19 Apr 2026 18:51:25 +0800 Subject: [PATCH] refactor: update tool call format to prefer XML-style parameters with CDATA support for robust content handling --- .../stream-tool-sieve/parse_payload.js | 54 ++++++++++++--- internal/toolcall/regression_test.go | 67 +++++++++++++++++++ internal/toolcall/tool_prompt.go | 53 ++++++++++----- internal/toolcall/toolcalls_markup.go | 46 +++++++++++-- internal/toolcall/toolcalls_parse_markup.go | 15 +++-- 5 files changed, 194 insertions(+), 41 deletions(-) create mode 100644 internal/toolcall/regression_test.go diff --git a/internal/js/helpers/stream-tool-sieve/parse_payload.js b/internal/js/helpers/stream-tool-sieve/parse_payload.js index 5a86c43..ecf6346 100644 --- a/internal/js/helpers/stream-tool-sieve/parse_payload.js +++ b/internal/js/helpers/stream-tool-sieve/parse_payload.js @@ -19,6 +19,8 @@ const TOOL_CALL_MARKUP_ARGS_PATTERNS = [ /<(?:[a-z0-9_:-]+:)?args\b[^>]*>([\s\S]*?)<\/(?:[a-z0-9_:-]+:)?args>/i, /<(?:[a-z0-9_:-]+:)?params\b[^>]*>([\s\S]*?)<\/(?:[a-z0-9_:-]+:)?params>/i, ]; +const CDATA_PATTERN = //i; +const HTML_ENTITIES_PATTERN = /&[a-z0-9#]+;/gi; const { toStringSafe, @@ -74,7 +76,7 @@ function parseMarkupSingleToolCall(attrs, inner) { name = toStringSafe(attrMatch[2]).trim(); } if (!name) { - name = stripTagText(findMarkupTagValue(inner, TOOL_CALL_MARKUP_NAME_PATTERNS)); + name = extractRawTagValue(findMarkupTagValue(inner, TOOL_CALL_MARKUP_NAME_PATTERNS)); } if (!name) { return null; @@ -95,18 +97,21 @@ function parseMarkupSingleToolCall(attrs, inner) { function parseMarkupInput(raw) { const s = toStringSafe(raw).trim(); - if (!s) { - return {}; - } - const parsed = parseToolCallInput(s); - if (parsed && typeof parsed === 'object' && !Array.isArray(parsed) && Object.keys(parsed).length > 0) { - return parsed; - } + // Prioritize XML-style KV tags (e.g., val) const kv = parseMarkupKVObject(s); if (Object.keys(kv).length > 0) { return kv; } - return { _raw: stripTagText(s) }; + + // Fallback to JSON parsing + const parsed = parseToolCallInput(s); + if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) { + if (Object.keys(parsed).length > 0) { + return parsed; + } + } + + return { _raw: extractRawTagValue(s) }; } function parseMarkupKVObject(text) { @@ -120,7 +125,7 @@ function parseMarkupKVObject(text) { if (!key) { continue; } - const valueRaw = stripTagText(m[2]); + const valueRaw = extractRawTagValue(m[2]); if (!valueRaw) { continue; } @@ -133,6 +138,33 @@ function parseMarkupKVObject(text) { return out; } +function extractRawTagValue(inner) { + const s = toStringSafe(inner).trim(); + if (!s) { + return ''; + } + + // 1. Check for CDATA + const cdataMatch = s.match(CDATA_PATTERN); + if (cdataMatch && cdataMatch[1] !== undefined) { + return cdataMatch[1]; + } + + // 2. Fallback to unescaping standard HTML entities + // Note: we avoid broad tag stripping here to preserve user content (like < symbols in code) + return unescapeHtml(inner); +} + +function unescapeHtml(safe) { + if (!safe) return ''; + return safe.replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/'/g, "'"); +} + function stripTagText(text) { return toStringSafe(text).replace(/<[^>]+>/g, ' ').trim(); } @@ -141,7 +173,7 @@ function findMarkupTagValue(text, patterns) { const source = toStringSafe(text); for (const p of patterns) { const m = source.match(p); - if (m && m[1]) { + if (m && m[1] !== undefined) { return toStringSafe(m[1]); } } diff --git a/internal/toolcall/regression_test.go b/internal/toolcall/regression_test.go new file mode 100644 index 0000000..8f94557 --- /dev/null +++ b/internal/toolcall/regression_test.go @@ -0,0 +1,67 @@ +package toolcall + +import ( + "reflect" + "testing" +) + +func TestRegression_RobustXMLAndCDATA(t *testing.T) { + tests := []struct { + name string + text string + expected []ParsedToolCall + }{ + { + name: "Standard JSON parameters (Regression)", + text: `foo{"a": 1}`, + expected: []ParsedToolCall{{Name: "foo", Input: map[string]any{"a": float64(1)}}}, + }, + { + name: "XML tags parameters (Regression)", + text: `foohello`, + expected: []ParsedToolCall{{Name: "foo", Input: map[string]any{"arg1": "hello"}}}, + }, + { + name: "CDATA parameters (New Feature)", + text: `write_file and & symbols]]>`, + expected: []ParsedToolCall{{ + Name: "write_file", + Input: map[string]any{"content": "line 1\nline 2 with and & symbols"}, + }}, + }, + { + name: "Dirty XML with unescaped symbols (Robustness Improvement)", + text: `bashecho "hello" > out.txt && cat out.txt`, + expected: []ParsedToolCall{{ + Name: "bash", + Input: map[string]any{"command": "echo \"hello\" > out.txt && cat out.txt"}, + }}, + }, + { + name: "Mixed JSON inside CDATA (New Hybrid Case)", + text: `foo`, + expected: []ParsedToolCall{{ + Name: "foo", + Input: map[string]any{"json_param": "works"}, + }}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ParseToolCalls(tt.text, []string{"foo", "write_file", "bash"}) + if len(got) != len(tt.expected) { + t.Fatalf("expected %d calls, got %d", len(tt.expected), len(got)) + } + for i := range got { + if got[i].Name != tt.expected[i].Name { + t.Errorf("expected name %q, got %q", tt.expected[i].Name, got[i].Name) + } + if !reflect.DeepEqual(got[i].Input, tt.expected[i].Input) { + t.Errorf("expected input %#v, got %#v", tt.expected[i].Input, got[i].Input) + } + } + }) + } +} diff --git a/internal/toolcall/tool_prompt.go b/internal/toolcall/tool_prompt.go index bcf1046..d6ec711 100644 --- a/internal/toolcall/tool_prompt.go +++ b/internal/toolcall/tool_prompt.go @@ -41,20 +41,21 @@ When calling tools, emit ONLY raw XML at the very end of your response. No text TOOL_NAME_HERE - {"key":"value"} + + PARAMETER_VALUE + RULES: 1) When calling tools, you MUST use the XML format. 2) No text is allowed AFTER the XML block. -3) MUST be a single-line strict JSON object. Use double quotes. -4) Multiple tools must be inside the same root. -5) Do NOT wrap XML in markdown fences (` + "```" + `). -6) Do NOT invent parameters. Use only the provided schema. -7) CRITICAL: Do NOT use native tool markers like "<|Tool|>" or "<|tool|>". -8) CRITICAL: Do NOT output role markers like "<|System|>", "<|User|>", or "<|Assistant|>". -9) CRITICAL: Do NOT output internal monologues (e.g. "I will list files now..."). Just output your answer or the XML. +3) should be a list of XML tags (e.g., value). For simple inputs, a single-line JSON string is also acceptable. +4) For long text, scripts, or code content, YOU MUST wrap the value in to preserve formatting and avoid character escaping errors. +5) Multiple tools must be inside the same root. +6) Do NOT wrap XML in markdown fences (` + "```" + `). +7) Do NOT invent parameters. Use only the provided schema. +8) CRITICAL: Do NOT output internal monologues (e.g. "I will list files now..."). Just output your answer or the XML. ❌ WRONG — Do NOT do these: Wrong 1 — mixed text after XML: @@ -103,6 +104,22 @@ Example C — Tool with complex nested JSON parameters: ` + ex3Params + ` + +Example D — Tool with long script using CDATA (RELIABLE FOR CODE/SCRIPTS): + + + ` + ex2 + ` + + script.sh + + + + Remember: Output ONLY the ... XML block when calling tools.` } @@ -119,34 +136,34 @@ func matchAny(name string, candidates ...string) bool { func exampleReadParams(name string) string { switch strings.TrimSpace(name) { case "Read": - return `{"file_path":"README.md"}` + return `README.md` case "Glob": - return `{"pattern":"**/*.go","path":"."}` + return `**/*.go.` default: - return `{"path":"src/main.go"}` + return `src/main.go` } } func exampleWriteOrExecParams(name string) string { switch strings.TrimSpace(name) { case "Bash", "execute_command": - return `{"command":"pwd"}` + return `pwd` case "exec_command": - return `{"cmd":"pwd"}` + return `pwd` case "Edit": - return `{"file_path":"README.md","old_string":"foo","new_string":"bar"}` + return `README.mdfoobar` case "MultiEdit": - return `{"file_path":"README.md","edits":[{"old_string":"foo","new_string":"bar"}]}` + return `README.mdfoobar` default: - return `{"path":"output.txt","content":"Hello world"}` + return `output.txtHello world` } } func exampleInteractiveParams(name string) string { switch strings.TrimSpace(name) { case "Task": - return `{"description":"Investigate flaky tests","prompt":"Run targeted tests and summarize failures"}` + return `Investigate flaky testsRun targeted tests and summarize failures` default: - return `{"question":"Which approach do you prefer?","follow_up":[{"text":"Option A"},{"text":"Option B"}]}` + return `Which approach do you prefer?Option AOption B` } } diff --git a/internal/toolcall/toolcalls_markup.go b/internal/toolcall/toolcalls_markup.go index d17d1ff..7e94621 100644 --- a/internal/toolcall/toolcalls_markup.go +++ b/internal/toolcall/toolcalls_markup.go @@ -22,6 +22,9 @@ var toolCallMarkupNamePatternByTag = map[string]*regexp.Regexp{ "name": regexp.MustCompile(`(?is)<(?:[a-z0-9_:-]+:)?name\b[^>]*>(.*?)`), "function": regexp.MustCompile(`(?is)<(?:[a-z0-9_:-]+:)?function\b[^>]*>(.*?)`), } + +// cdataPattern matches CDATA sections to handle them separately from normal tags. +var cdataPattern = regexp.MustCompile(`(?is)`) var toolCallMarkupArgsTagNames = []string{"input", "arguments", "argument", "parameters", "parameter", "args", "params"} var toolCallMarkupArgsPatternByTag = map[string]*regexp.Regexp{ "input": regexp.MustCompile(`(?is)<(?:[a-z0-9_:-]+:)?input\b[^>]*>(.*?)`), @@ -120,12 +123,15 @@ func parseMarkupInput(raw string) map[string]any { if raw == "" { return map[string]any{} } - if parsed := parseToolCallInput(raw); len(parsed) > 0 { - return parsed - } + // Prioritize XML-style KV tags as they are more robust for long text/scripts. if kv := parseMarkupKVObject(raw); len(kv) > 0 { return kv } + + // Fallback to JSON parsing for standard/legacy tool calls. + if parsed := parseToolCallInput(raw); len(parsed) > 0 { + return parsed + } return map[string]any{"_raw": html.UnescapeString(stripTagText(raw))} } @@ -147,7 +153,13 @@ func parseMarkupKVObject(text string) map[string]any { if !strings.EqualFold(key, endKey) { continue } - value := strings.TrimSpace(html.UnescapeString(stripTagText(m[2]))) + // Robustly extract value to handle CDATA and mixed content + value := extractRawTagValue(m[2]) + if value == "" && m[2] != "" { + // If it wasn't empty but extracted to empty, could be whitespace or just tags + value = strings.TrimSpace(m[2]) + } + if value == "" { continue } @@ -164,6 +176,30 @@ func parseMarkupKVObject(text string) map[string]any { return out } +// extractRawTagValue treats the inner content of a tag robustly. +// It detects CDATA and strips it, otherwise it unescapes standard HTML entities. +// It avoids over-aggressive tag stripping that might break user content. +func extractRawTagValue(inner string) string { + trimmed := strings.TrimSpace(inner) + if trimmed == "" { + return "" + } + + // 1. Check for CDATA - if present, it's the ultimate "safe" container. + if cdataMatches := cdataPattern.FindStringSubmatch(trimmed); len(cdataMatches) >= 2 { + return cdataMatches[1] // Return raw content between CDATA brackets + } + + // 2. If no CDATA, we still want to be robust. + // We unescape standard HTML entities (like < > &) + // but we DON'T recursively strip tags unless they are actually valid XML tags + // at the start/end (which should have been handled by the outer matcher anyway). + + // If it contains what looks like a single tag and no other text, it might be nested XML + // but for KV objects we usually want the value. + return html.UnescapeString(inner) +} + func stripTagText(text string) string { return strings.TrimSpace(anyTagPattern.ReplaceAllString(text, "")) } @@ -175,7 +211,7 @@ func findMarkupTagValue(text string, tagNames []string, patternByTag map[string] continue } if m := pattern.FindStringSubmatch(text); len(m) >= 2 { - value := strings.TrimSpace(m[1]) + value := extractRawTagValue(m[1]) if value != "" { return value } diff --git a/internal/toolcall/toolcalls_parse_markup.go b/internal/toolcall/toolcalls_parse_markup.go index 899f8f3..d03ff4d 100644 --- a/internal/toolcall/toolcalls_parse_markup.go +++ b/internal/toolcall/toolcalls_parse_markup.go @@ -115,11 +115,12 @@ func parseSingleXMLToolCall(block string) (ParsedToolCall, bool) { if err := dec.DecodeElement(&node, &t); err == nil { inner := strings.TrimSpace(node.Inner) if inner != "" { - unescapedInner := html.UnescapeString(inner) - if parsed := parseToolCallInput(unescapedInner); len(parsed) > 0 { + // Cleanly extract content (handles CDATA, entities, etc.) + extracted := extractRawTagValue(inner) + if parsed := parseToolCallInput(extracted); len(parsed) > 0 { if len(parsed) == 1 { if _, onlyRaw := parsed["_raw"]; onlyRaw { - if kv := parseMarkupKVObject(unescapedInner); len(kv) > 0 { + if kv := parseMarkupKVObject(extracted); len(kv) > 0 { for k, vv := range kv { params[k] = vv } @@ -130,7 +131,7 @@ func parseSingleXMLToolCall(block string) (ParsedToolCall, bool) { for k, vv := range parsed { params[k] = vv } - } else if kv := parseMarkupKVObject(unescapedInner); len(kv) > 0 { + } else if kv := parseMarkupKVObject(extracted); len(kv) > 0 { for k, vv := range kv { params[k] = vv } @@ -293,7 +294,7 @@ func parseSingleAntmlFunctionCallMatch(m []string) (ParsedToolCall, bool) { continue } k := strings.TrimSpace(am[1]) - v := strings.TrimSpace(html.UnescapeString(am[2])) + v := extractRawTagValue(am[2]) if k != "" { input[k] = v } @@ -316,7 +317,7 @@ func parseInvokeFunctionCallStyle(text string) (ParsedToolCall, bool) { continue } k := strings.TrimSpace(pm[1]) - v := strings.TrimSpace(html.UnescapeString(pm[2])) + v := extractRawTagValue(pm[2]) if k != "" { input[k] = v } @@ -347,7 +348,7 @@ func parseToolUseFunctionStyle(text string) (ParsedToolCall, bool) { continue } k := strings.TrimSpace(pm[1]) - v := strings.TrimSpace(html.UnescapeString(pm[2])) + v := extractRawTagValue(pm[2]) if k != "" { input[k] = v }