refactor: generalize DSML tag parsing to tolerate model noise; split tiktoken by build tags

Replace hardcoded DSML typo variant lists in Go/Node tool call parsers with
generalized prefix consumption that tolerates repeated leading <, repeated DSML
prefix noise, and trailing pipe terminators. Split tiktoken-dependent token
counting into a build-tagged file for non-cgo platform compatibility. Add /data
directory to Dockerfile for bind-mount permissions.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
CJACK
2026-05-01 15:17:11 +08:00
parent 2671298439
commit fd0ec29991
21 changed files with 493 additions and 239 deletions

View File

@@ -44,6 +44,9 @@ func rewriteDSMLToolMarkupOutsideIgnored(text string) string {
}
b.WriteString(tag.Name)
b.WriteString(text[tag.NameEnd : tag.End+1])
if text[tag.End] != '>' {
b.WriteByte('>')
}
i = tag.End + 1
continue
}

View File

@@ -128,34 +128,39 @@ func scanToolMarkupTagAt(text string, start int) (ToolMarkupTag, bool) {
}
lower := strings.ToLower(text)
i := start + 1
for i < len(text) && text[i] == '<' {
i++
}
closing := false
if i < len(text) && text[i] == '/' {
closing = true
i++
}
dsmlLike := false
if next, ok := consumeToolMarkupPipe(text, i); ok {
dsmlLike = true
i = next
}
if strings.HasPrefix(lower[i:], "dsml") {
dsmlLike = true
i += len("dsml")
for next, ok := consumeToolMarkupSeparator(text, i); ok; next, ok = consumeToolMarkupSeparator(text, i) {
i = next
}
}
i, dsmlLike := consumeToolMarkupNamePrefix(lower, text, i)
name, nameLen := matchToolMarkupName(lower, i)
if nameLen == 0 {
return ToolMarkupTag{}, false
}
nameEnd := i + nameLen
nameEndBeforePipes := nameEnd
for next, ok := consumeToolMarkupPipe(text, nameEnd); ok; next, ok = consumeToolMarkupPipe(text, nameEnd) {
nameEnd = next
}
hasTrailingPipe := nameEnd > nameEndBeforePipes
if !hasToolMarkupBoundary(text, nameEnd) {
return ToolMarkupTag{}, false
}
end := findXMLTagEnd(text, nameEnd)
if end < 0 {
return ToolMarkupTag{}, false
if !hasTrailingPipe {
return ToolMarkupTag{}, false
}
end = nameEnd - 1
}
if hasTrailingPipe {
if nextLT := strings.IndexByte(text[nameEnd:], '<'); nextLT >= 0 && end >= nameEnd+nextLT {
end = nameEnd - 1
}
}
trimmed := strings.TrimSpace(text[start : end+1])
return ToolMarkupTag{
@@ -171,6 +176,74 @@ func scanToolMarkupTagAt(text string, start int) (ToolMarkupTag, bool) {
}, true
}
func IsPartialToolMarkupTagPrefix(text string) bool {
if text == "" || text[0] != '<' || strings.Contains(text, ">") {
return false
}
lower := strings.ToLower(text)
i := 1
for i < len(text) && text[i] == '<' {
i++
}
if i >= len(text) {
return true
}
if text[i] == '/' {
i++
}
for i <= len(text) {
if i == len(text) {
return true
}
if hasToolMarkupNamePrefix(lower[i:]) {
return true
}
if strings.HasPrefix("dsml", lower[i:]) {
return true
}
next, ok := consumeToolMarkupNamePrefixOnce(lower, text, i)
if !ok {
return false
}
i = next
}
return false
}
func consumeToolMarkupNamePrefix(lower, text string, idx int) (int, bool) {
dsmlLike := false
for {
next, ok := consumeToolMarkupNamePrefixOnce(lower, text, idx)
if !ok {
return idx, dsmlLike
}
idx = next
dsmlLike = true
}
}
func consumeToolMarkupNamePrefixOnce(lower, text string, idx int) (int, bool) {
if next, ok := consumeToolMarkupPipe(text, idx); ok {
return next, true
}
if idx < len(text) && (text[idx] == ' ' || text[idx] == '\t' || text[idx] == '\r' || text[idx] == '\n') {
return idx + 1, true
}
if strings.HasPrefix(lower[idx:], "dsml") {
return idx + len("dsml"), true
}
return idx, false
}
func hasToolMarkupNamePrefix(lowerTail string) bool {
for _, name := range toolMarkupNames {
if strings.HasPrefix(lowerTail, name) || strings.HasPrefix(name, lowerTail) {
return true
}
}
return false
}
func matchToolMarkupName(lower string, start int) (string, int) {
for _, name := range toolMarkupNames {
if strings.HasPrefix(lower[start:], name) {
@@ -193,19 +266,6 @@ func consumeToolMarkupPipe(text string, idx int) (int, bool) {
return idx, false
}
func consumeToolMarkupSeparator(text string, idx int) (int, bool) {
if idx >= len(text) {
return idx, false
}
if text[idx] == ' ' || text[idx] == '\t' || text[idx] == '\r' || text[idx] == '\n' {
return idx + 1, true
}
if next, ok := consumeToolMarkupPipe(text, idx); ok {
return next, true
}
return idx, false
}
func hasToolMarkupBoundary(text string, idx int) bool {
if idx >= len(text) {
return true

View File

@@ -41,6 +41,52 @@ func TestParseToolCallsSupportsDSMLShell(t *testing.T) {
}
}
func TestParseToolCallsToleratesDSMLTrailingPipeTagTerminator(t *testing.T) {
text := strings.Join([]string{
`<|DSML|tool_calls| `,
` <|DSML|invoke name="terminal">`,
` <|DSML|parameter name="command"><![CDATA[find "/home" -type d]]></|DSML|parameter>`,
` <|DSML|parameter name="timeout"><![CDATA[10]]></|DSML|parameter>`,
` </|DSML|invoke>`,
`</|DSML|tool_calls>`,
}, "\n")
calls := ParseToolCalls(text, []string{"terminal"})
if len(calls) != 1 {
t.Fatalf("expected one trailing-pipe DSML call, got %#v", calls)
}
if calls[0].Name != "terminal" {
t.Fatalf("expected terminal tool, got %#v", calls[0])
}
if calls[0].Input["command"] != `find "/home" -type d` {
t.Fatalf("expected command argument, got %#v", calls[0].Input)
}
if calls[0].Input["timeout"] != float64(10) {
t.Fatalf("expected numeric timeout, got %#v", calls[0].Input)
}
}
func TestParseToolCallsToleratesExtraLeadingLessThanBeforeDSML(t *testing.T) {
text := `<<|DSML|tool_calls><<|DSML|invoke name="Bash"><<|DSML|parameter name="command"><![CDATA[pwd]]></|DSML|parameter></|DSML|invoke></|DSML|tool_calls>`
calls := ParseToolCalls(text, []string{"Bash"})
if len(calls) != 1 {
t.Fatalf("expected one extra-leading-less-than DSML call, got %#v", calls)
}
if calls[0].Name != "Bash" || calls[0].Input["command"] != "pwd" {
t.Fatalf("unexpected extra-leading-less-than DSML parse result: %#v", calls[0])
}
}
func TestParseToolCallsToleratesRepeatedDSMLPrefixNoise(t *testing.T) {
text := `<<DSML|DSML|tool_calls><<DSML|DSML|invoke name="Bash"><<DSML|DSML|parameter name="command"><![CDATA[git status]]></DSML|DSML|parameter></DSML|DSML|invoke></DSML|DSML|tool_calls>`
calls := ParseToolCalls(text, []string{"Bash"})
if len(calls) != 1 {
t.Fatalf("expected one repeated-prefix DSML call, got %#v", calls)
}
if calls[0].Name != "Bash" || calls[0].Input["command"] != "git status" {
t.Fatalf("unexpected repeated-prefix DSML parse result: %#v", calls[0])
}
}
func TestParseToolCallsSupportsDSMLShellWithCanonicalExampleInCDATA(t *testing.T) {
content := `<tool_calls><invoke name="demo"><parameter name="value">x</parameter></invoke></tool_calls>`
text := `<|DSML|tool_calls><|DSML|invoke name="Write"><|DSML|parameter name="file_path">notes.md</|DSML|parameter><|DSML|parameter name="content"><![CDATA[` + content + `]]></|DSML|parameter></|DSML|invoke></|DSML|tool_calls>`