feat: add support for CJK angle bracket and trailing attribute separator drift in DSML tool parsing

This commit is contained in:
CJACK
2026-05-10 01:54:31 +08:00
parent 77b6d83266
commit 61d42f8b72
12 changed files with 222 additions and 35 deletions

View File

@@ -1,6 +1,6 @@
'use strict';
const CDATA_PATTERN = /^<!\[CDATA\[([\s\S]*?)]](?:>|)$/i;
const CDATA_PATTERN = /^(?:<|〈)!\[CDATA\[([\s\S]*?)]](?:>||〉)$/i;
const XML_ATTR_PATTERN = /\b([a-z0-9_:-]+)\s*=\s*("([^"]*)"|'([^']*)')/gi;
const TOOL_MARKUP_NAMES = [
{ raw: 'tool_calls', canonical: 'tool_calls' },
@@ -431,26 +431,31 @@ function skipXmlIgnoredSection(lower, i) {
/**
 * Locate the earliest CDATA terminator at or after `from`.
 *
 * Three closers are recognised: ASCII `]]>`, the fullwidth form `]]` + U+FF1E,
 * and the CJK angle-bracket form `]]〉`.
 *
 * @param {string} text - Text to search.
 * @param {number} from - Index at which the search starts.
 * @returns {{index: number, len: number}} Index and length (in UTF-16 code
 *   units) of the earliest terminator, or `{ index: -1, len: 0 }` when no
 *   terminator is present.
 */
function findCDATAEnd(text, from) {
  // The fullwidth '>' is spelled as an escape so the needle cannot silently
  // degrade to a bare ']]' when the file passes through a lossy re-encoding;
  // a bare ']]' would cut CDATA bodies short at the first nested "]]".
  const closers = [']]>', ']]\uFF1E', ']]〉'];
  let best = { index: -1, len: 0 };
  for (const closer of closers) {
    const index = text.indexOf(closer, from);
    // Keep whichever closer starts first; misses (-1) never win.
    if (index >= 0 && (best.index < 0 || index < best.index)) {
      best = { index, len: closer.length };
    }
  }
  return best;
}
function scanToolMarkupTagAt(text, start) {
const raw = toStringSafe(text);
if (!raw || start < 0 || start >= raw.length || raw[start] !== '<') {
if (!raw || start < 0 || start >= raw.length || normalizeFullwidthASCIIChar(raw[start]) !== '<') {
return null;
}
const lower = raw.toLowerCase();
let i = start + 1;
while (i < raw.length && raw[i] === '<') {
while (i < raw.length && normalizeFullwidthASCIIChar(raw[i]) === '<') {
i += 1;
}
const closing = raw[i] === '/';
@@ -822,6 +827,12 @@ function normalizeFullwidthASCIIChar(ch) {
if (!ch) {
return ch;
}
if (ch === '〈') {
return '<';
}
if (ch === '〉') {
return '>';
}
const code = ch.charCodeAt(0);
if (code >= 0xff01 && code <= 0xff5e) {
return String.fromCharCode(code - 0xfee0);
@@ -858,9 +869,28 @@ function matchNormalizedASCII(raw, start, expected) {
function normalizeToolMarkupTagTailForXML(tail) {
let out = '';
for (const ch of typeof tail === 'string' ? tail : String(tail || '')) {
const raw = typeof tail === 'string' ? tail : String(tail || '');
let quote = '';
for (let i = 0; i < raw.length; i += 1) {
const ch = raw[i];
const normalized = normalizeFullwidthASCIIChar(ch);
if (['>', '/', '=', '"', "'"].includes(normalized)) {
if (quote) {
out += normalized;
if (normalized === quote) {
quote = '';
}
} else if (normalized === '"' || normalized === "'") {
quote = normalized;
out += normalized;
} else if (normalized === '|') {
let j = i + 1;
while (j < raw.length && [' ', '\t', '\r', '\n'].includes(raw[j])) {
j += 1;
}
if (normalizeFullwidthASCIIChar(raw[j] || '') !== '>') {
out += normalized;
}
} else if (['>', '/', '='].includes(normalized)) {
out += normalized;
} else {
out += ch;

View File

@@ -65,6 +65,7 @@ func normalizeToolMarkupTagTailForXML(tail string) string {
}
var b strings.Builder
b.Grow(len(tail))
quote := rune(0)
for i := 0; i < len(tail); {
r, size := utf8.DecodeRuneInString(tail[i:])
if r == utf8.RuneError && size == 1 {
@@ -72,9 +73,38 @@ func normalizeToolMarkupTagTailForXML(tail string) string {
i++
continue
}
switch normalizeFullwidthASCII(r) {
case '>', '/', '=', '"', '\'':
b.WriteRune(normalizeFullwidthASCII(r))
ch := normalizeFullwidthASCII(r)
if quote != 0 {
b.WriteRune(ch)
if ch == quote {
quote = 0
}
i += size
continue
}
switch ch {
case '"', '\'':
quote = ch
b.WriteRune(ch)
case '|':
j := i + size
for j < len(tail) {
next, nextSize := utf8.DecodeRuneInString(tail[j:])
if nextSize <= 0 {
break
}
if next == ' ' || next == '\t' || next == '\r' || next == '\n' {
j += nextSize
continue
}
break
}
next, _ := normalizedASCIIAt(tail, j)
if next != '>' {
b.WriteRune(ch)
}
case '>', '/', '=':
b.WriteRune(ch)
default:
b.WriteString(tail[i : i+size])
}

View File

@@ -10,7 +10,7 @@ import (
// toolCallMarkupKVPattern matches one <name ...>body</name> element. The (?is)
// flags make it case-insensitive and let '.' span newlines. Each tag name may
// carry an optional "prefix:" that is not captured; the capture groups are the
// opening name, the body (non-greedy), and the closing name. The pattern itself
// does not require the opening and closing names to be equal.
var toolCallMarkupKVPattern = regexp.MustCompile(`(?is)<(?:[a-z0-9_:-]+:)?([a-z0-9_\-.]+)\b[^>]*>(.*?)</(?:[a-z0-9_:-]+:)?([a-z0-9_\-.]+)>`)
// cdataPattern matches a standalone CDATA section: it is anchored at both ends
// and captures the section body as group 1.
// NOTE(review): cdataPattern is declared twice in a row here; presumably the
// first line is the pre-change form and the second (which also accepts the
// 〈 / 〉 brackets) is its replacement — confirm against the repository.
// NOTE(review): the closer group contains an empty alternative as written; a
// fullwidth '>' may have been dropped from this listing — confirm.
var cdataPattern = regexp.MustCompile(`(?is)^<!\[CDATA\[(.*?)]](?:>|)$`)
var cdataPattern = regexp.MustCompile(`(?is)^(?:<|〈)!\[CDATA\[(.*?)]](?:>||〉)$`)
func parseMarkupKVObject(text string) map[string]any {
matches := toolCallMarkupKVPattern.FindAllStringSubmatch(strings.TrimSpace(text), -1)

View File

@@ -305,19 +305,23 @@ func indexToolCDATAClose(text string, from int) int {
}
asciiIdx := strings.Index(text[from:], "]]>")
fullIdx := strings.Index(text[from:], "]]")
if asciiIdx < 0 && fullIdx < 0 {
cjkIdx := strings.Index(text[from:], "]]〉")
if asciiIdx < 0 && fullIdx < 0 && cjkIdx < 0 {
return -1
}
if asciiIdx < 0 {
return from + fullIdx
best := -1
for _, idx := range []int{asciiIdx, fullIdx, cjkIdx} {
if idx >= 0 && (best < 0 || idx < best) {
best = idx
}
}
if fullIdx < 0 || asciiIdx < fullIdx {
return from + asciiIdx
}
return from + fullIdx
return from + best
}
func toolCDATACloseLenAt(text string, idx int) int {
if strings.HasPrefix(text[idx:], "]]〉") {
return len("]]〉")
}
if strings.HasPrefix(text[idx:], "]]") {
return len("]]")
}

View File

@@ -134,12 +134,17 @@ func FindMatchingToolMarkupClose(text string, open ToolMarkupTag) (ToolMarkupTag
}
func scanToolMarkupTagAt(text string, start int) (ToolMarkupTag, bool) {
if start < 0 || start >= len(text) || text[start] != '<' {
next, ok := consumeToolMarkupLessThan(text, start)
if !ok {
return ToolMarkupTag{}, false
}
i := start + 1
for i < len(text) && text[i] == '<' {
i++
i := next
for {
next, ok := consumeToolMarkupLessThan(text, i)
if !ok {
break
}
i = next
}
closing := false
if i < len(text) && text[i] == '/' {
@@ -459,6 +464,14 @@ func consumeToolMarkupPipe(text string, idx int) (int, bool) {
return idx, false
}
// consumeToolMarkupLessThan checks whether the character at idx is reported by
// normalizedASCIIAt as '<'. On a match it returns the index just past that
// character and true; otherwise it returns idx unchanged and false.
func consumeToolMarkupLessThan(text string, idx int) (int, bool) {
	if ch, width := normalizedASCIIAt(text, idx); width > 0 && ch == '<' {
		return idx + width, true
	}
	return idx, false
}
func hasToolMarkupBoundary(text string, idx int) bool {
if idx >= len(text) {
return true
@@ -488,6 +501,12 @@ func normalizedASCIIAt(text string, idx int) (byte, int) {
}
func normalizeFullwidthASCII(r rune) rune {
switch r {
case '〈':
return '<'
case '〉':
return '>'
}
if r >= '' && r <= '' {
return r - 0xFEE0
}

View File

@@ -132,6 +132,37 @@ func TestParseToolCallsSupportsFullwidthDSMLShell(t *testing.T) {
}
}
// TestParseToolCallsSupportsCJKAngleDSMDrift feeds ParseToolCalls a tool_calls
// block whose CDATA wrappers and closing tags use 〈 / 〉 brackets, and expects
// three Bash calls whose parameters carry the unwrapped CDATA payloads.
func TestParseToolCallsSupportsCJKAngleDSMDrift(t *testing.T) {
	input := `<DSMtool_calls>
<DSMinvoke name="Bash">
<DSMparameter name="description">〈![CDATA[Show commits on local dev not on origin/dev]]〉〈/DSMparameter〉
<DSMparameter name="command">〈![CDATA[git log --oneline origin/dev..dev]]〉〈/DSMparameter〉
〈/DSMinvoke〉
<DSMinvoke name="Bash">
<DSMparameter name="description">〈![CDATA[Show commits on origin/dev not on local dev]]〉〈/DSMparameter〉
<DSMparameter name="command">〈![CDATA[git log --oneline dev..origin/dev]]〉〈/DSMparameter〉
〈/DSMinvoke〉
<DSMinvoke name="Bash">
<DSMparameter name="description">〈![CDATA[Check tracking branch status]]〉〈/DSMparameter〉
<DSMparameter name="command">〈![CDATA[git status -b --short]]〉〈/DSMparameter〉
〈/DSMinvoke〉
〈/DSMtool_calls〉`

	parsed := ParseToolCalls(input, []string{"Bash"})
	if count := len(parsed); count != 3 {
		t.Fatalf("expected three CJK-angle DSM drift calls, got %#v", parsed)
	}

	// Spot-check one field per call rather than every parameter.
	first, second, third := parsed[0], parsed[1], parsed[2]
	if !(first.Name == "Bash" && first.Input["command"] == "git log --oneline origin/dev..dev") {
		t.Fatalf("unexpected first CJK-angle DSM drift call: %#v", first)
	}
	if !(second.Name == "Bash" && second.Input["description"] == "Show commits on origin/dev not on local dev") {
		t.Fatalf("unexpected second CJK-angle DSM drift call: %#v", second)
	}
	if !(third.Name == "Bash" && third.Input["command"] == "git status -b --short") {
		t.Fatalf("unexpected third CJK-angle DSM drift call: %#v", third)
	}
}
func TestParseToolCallsIgnoresBareHyphenatedToolCallsLookalike(t *testing.T) {
text := `<tool-calls><invoke name="Bash"><parameter name="command">pwd</parameter></invoke></tool-calls>`
calls := ParseToolCalls(text, []string{"Bash"})

View File

@@ -1228,3 +1228,37 @@ func TestProcessToolSieveDSMLBarePrefixVariantDoesNotLeak(t *testing.T) {
t.Fatalf("expected one tool call from DSML bare prefix variant, got %d events=%#v", toolCalls, events)
}
}
// TestProcessToolSieveCJKAngleDSMDriftDoesNotLeak streams a tool_calls block
// that uses 〈 / 〉 brackets through ProcessChunk one chunk at a time, flushes,
// and then checks two things: none of the markup or command text shows up in
// the emitted text content, and exactly one Bash call is produced.
func TestProcessToolSieveCJKAngleDSMDriftDoesNotLeak(t *testing.T) {
	stream := []string{
		"<DSMtool_calls>\n",
		"<DSMinvoke name=\"Bash\">\n",
		"<DSMparameter name=\"description\">〈![CDATA[Check tracking branch status]]〉〈/DSMparameter〉\n",
		"<DSMparameter name=\"command\">〈![CDATA[git status -b --short]]〉〈/DSMparameter〉\n",
		"〈/DSMinvoke〉\n",
		"〈/DSMtool_calls〉",
	}

	var (
		state  State
		events []Event
	)
	for i := range stream {
		events = append(events, ProcessChunk(&state, stream[i], []string{"Bash"})...)
	}
	events = append(events, Flush(&state, []string{"Bash"})...)

	// Gather everything that was emitted as text and as parsed calls.
	var visible strings.Builder
	var calls []toolcall.ParsedToolCall
	for _, evt := range events {
		visible.WriteString(evt.Content)
		calls = append(calls, evt.ToolCalls...)
	}
	textContent := visible.String()

	if strings.Contains(textContent, "DSM") || strings.Contains(textContent, "git status") {
		t.Fatalf("CJK-angle DSM drift leaked to text: %q events=%#v", textContent, events)
	}
	if len(calls) != 1 {
		t.Fatalf("expected one CJK-angle DSM drift tool call, got %d events=%#v", len(calls), events)
	}
	if only := calls[0]; !(only.Name == "Bash" && only.Input["command"] == "git status -b --short") {
		t.Fatalf("unexpected CJK-angle DSM drift call: %#v", only)
	}
}