feat: add support for CJK angle bracket and trailing attribute separator drift in DSML tool parsing

This commit is contained in:
CJACK
2026-05-10 01:54:31 +08:00
parent 77b6d83266
commit 61d42f8b72
12 changed files with 222 additions and 35 deletions

View File

@@ -65,6 +65,7 @@ func normalizeToolMarkupTagTailForXML(tail string) string {
}
var b strings.Builder
b.Grow(len(tail))
quote := rune(0)
for i := 0; i < len(tail); {
r, size := utf8.DecodeRuneInString(tail[i:])
if r == utf8.RuneError && size == 1 {
@@ -72,9 +73,38 @@ func normalizeToolMarkupTagTailForXML(tail string) string {
i++
continue
}
switch normalizeFullwidthASCII(r) {
case '>', '/', '=', '"', '\'':
b.WriteRune(normalizeFullwidthASCII(r))
ch := normalizeFullwidthASCII(r)
if quote != 0 {
b.WriteRune(ch)
if ch == quote {
quote = 0
}
i += size
continue
}
switch ch {
case '"', '\'':
quote = ch
b.WriteRune(ch)
case '|':
j := i + size
for j < len(tail) {
next, nextSize := utf8.DecodeRuneInString(tail[j:])
if nextSize <= 0 {
break
}
if next == ' ' || next == '\t' || next == '\r' || next == '\n' {
j += nextSize
continue
}
break
}
next, _ := normalizedASCIIAt(tail, j)
if next != '>' {
b.WriteRune(ch)
}
case '>', '/', '=':
b.WriteRune(ch)
default:
b.WriteString(tail[i : i+size])
}

View File

@@ -10,7 +10,7 @@ import (
var toolCallMarkupKVPattern = regexp.MustCompile(`(?is)<(?:[a-z0-9_:-]+:)?([a-z0-9_\-.]+)\b[^>]*>(.*?)</(?:[a-z0-9_:-]+:)?([a-z0-9_\-.]+)>`)
// cdataPattern matches a standalone CDATA section.
var cdataPattern = regexp.MustCompile(`(?is)^<!\[CDATA\[(.*?)]](?:>|)$`)
var cdataPattern = regexp.MustCompile(`(?is)^(?:<|〈)!\[CDATA\[(.*?)]](?:>||〉)$`)
func parseMarkupKVObject(text string) map[string]any {
matches := toolCallMarkupKVPattern.FindAllStringSubmatch(strings.TrimSpace(text), -1)

View File

@@ -305,19 +305,23 @@ func indexToolCDATAClose(text string, from int) int {
}
asciiIdx := strings.Index(text[from:], "]]>")
fullIdx := strings.Index(text[from:], "]]")
if asciiIdx < 0 && fullIdx < 0 {
cjkIdx := strings.Index(text[from:], "]]〉")
if asciiIdx < 0 && fullIdx < 0 && cjkIdx < 0 {
return -1
}
if asciiIdx < 0 {
return from + fullIdx
best := -1
for _, idx := range []int{asciiIdx, fullIdx, cjkIdx} {
if idx >= 0 && (best < 0 || idx < best) {
best = idx
}
}
if fullIdx < 0 || asciiIdx < fullIdx {
return from + asciiIdx
}
return from + fullIdx
return from + best
}
func toolCDATACloseLenAt(text string, idx int) int {
if strings.HasPrefix(text[idx:], "]]〉") {
return len("]]〉")
}
if strings.HasPrefix(text[idx:], "]]") {
return len("]]")
}

View File

@@ -134,12 +134,17 @@ func FindMatchingToolMarkupClose(text string, open ToolMarkupTag) (ToolMarkupTag
}
func scanToolMarkupTagAt(text string, start int) (ToolMarkupTag, bool) {
if start < 0 || start >= len(text) || text[start] != '<' {
next, ok := consumeToolMarkupLessThan(text, start)
if !ok {
return ToolMarkupTag{}, false
}
i := start + 1
for i < len(text) && text[i] == '<' {
i++
i := next
for {
next, ok := consumeToolMarkupLessThan(text, i)
if !ok {
break
}
i = next
}
closing := false
if i < len(text) && text[i] == '/' {
@@ -459,6 +464,14 @@ func consumeToolMarkupPipe(text string, idx int) (int, bool) {
return idx, false
}
// consumeToolMarkupLessThan tests whether the character that
// normalizedASCIIAt reports at idx is '<'.
//
// On a match it returns the index immediately after that character (idx plus
// the size reported by normalizedASCIIAt) and true. Otherwise, including when
// the reported size is not positive, it returns idx unchanged and false.
//
// NOTE(review): normalizedASCIIAt is defined elsewhere; presumably it folds
// fullwidth/CJK look-alikes to their ASCII form, which is why the advance uses
// the reported size instead of a fixed single byte. Confirm against its
// definition.
func consumeToolMarkupLessThan(text string, idx int) (int, bool) {
	if ch, width := normalizedASCIIAt(text, idx); width > 0 && ch == '<' {
		return idx + width, true
	}
	return idx, false
}
func hasToolMarkupBoundary(text string, idx int) bool {
if idx >= len(text) {
return true
@@ -488,6 +501,12 @@ func normalizedASCIIAt(text string, idx int) (byte, int) {
}
func normalizeFullwidthASCII(r rune) rune {
switch r {
case '〈':
return '<'
case '〉':
return '>'
}
if r >= '' && r <= '' {
return r - 0xFEE0
}

View File

@@ -132,6 +132,37 @@ func TestParseToolCallsSupportsFullwidthDSMLShell(t *testing.T) {
}
}
// TestParseToolCallsSupportsCJKAngleDSMDrift feeds ParseToolCalls a tool-call
// payload in which the CDATA delimiters and every closing tag are written with
// CJK angle brackets (〈 〉) instead of ASCII < and >, while the opening tags
// keep ASCII brackets. It expects exactly three "Bash" calls and spot-checks
// one parameter value on each of them.
func TestParseToolCallsSupportsCJKAngleDSMDrift(t *testing.T) {
	const payload = `<DSMtool_calls>
<DSMinvoke name="Bash">
<DSMparameter name="description">〈![CDATA[Show commits on local dev not on origin/dev]]〉〈/DSMparameter〉
<DSMparameter name="command">〈![CDATA[git log --oneline origin/dev..dev]]〉〈/DSMparameter〉
〈/DSMinvoke〉
<DSMinvoke name="Bash">
<DSMparameter name="description">〈![CDATA[Show commits on origin/dev not on local dev]]〉〈/DSMparameter〉
<DSMparameter name="command">〈![CDATA[git log --oneline dev..origin/dev]]〉〈/DSMparameter〉
〈/DSMinvoke〉
<DSMinvoke name="Bash">
<DSMparameter name="description">〈![CDATA[Check tracking branch status]]〉〈/DSMparameter〉
<DSMparameter name="command">〈![CDATA[git status -b --short]]〉〈/DSMparameter〉
〈/DSMinvoke〉
〈/DSMtool_calls〉`

	parsed := ParseToolCalls(payload, []string{"Bash"})

	// The count check must come first: the indexed checks below rely on it.
	if got := len(parsed); got != 3 {
		t.Fatalf("expected three CJK-angle DSM drift calls, got %#v", parsed)
	}

	// First call: the command parameter survives the CJK-delimited CDATA.
	firstOK := parsed[0].Name == "Bash" &&
		parsed[0].Input["command"] == "git log --oneline origin/dev..dev"
	if !firstOK {
		t.Fatalf("unexpected first CJK-angle DSM drift call: %#v", parsed[0])
	}

	// Second call: the description parameter is checked instead of the command.
	secondOK := parsed[1].Name == "Bash" &&
		parsed[1].Input["description"] == "Show commits on origin/dev not on local dev"
	if !secondOK {
		t.Fatalf("unexpected second CJK-angle DSM drift call: %#v", parsed[1])
	}

	// Third call: back to the command parameter.
	thirdOK := parsed[2].Name == "Bash" &&
		parsed[2].Input["command"] == "git status -b --short"
	if !thirdOK {
		t.Fatalf("unexpected third CJK-angle DSM drift call: %#v", parsed[2])
	}
}
func TestParseToolCallsIgnoresBareHyphenatedToolCallsLookalike(t *testing.T) {
text := `<tool-calls><invoke name="Bash"><parameter name="command">pwd</parameter></invoke></tool-calls>`
calls := ParseToolCalls(text, []string{"Bash"})