feat: improve CDATA and DSML tag parsing robustness with support for fullwidth-bang, ideographic-comma, and extended quote/separator normalization.

This commit is contained in:
CJACK
2026-05-10 03:41:55 +08:00
parent 61d42f8b72
commit 7a28b9e265
16 changed files with 460 additions and 32 deletions

View File

@@ -1,6 +1,6 @@
'use strict';
const CDATA_PATTERN = /^(?:<|〈)!\[CDATA\[([\s\S]*?)]](?:>||〉)$/i;
const CDATA_PATTERN = /^(?:<|〈)(?:!|)\[CDATA\[([\s\S]*?)]](?:>||〉)$/i;
const XML_ATTR_PATTERN = /\b([a-z0-9_:-]+)\s*=\s*("([^"]*)"|'([^']*)')/gi;
const TOOL_MARKUP_NAMES = [
{ raw: 'tool_calls', canonical: 'tool_calls' },
@@ -87,7 +87,8 @@ function isFenceCloseLine(trimmed, fenceChar, fenceLen) {
}
function cdataStartsBeforeFence(line) {
const cdataIdx = line.toLowerCase().indexOf('<![cdata[');
const cdataOpen = findNextCDATAOpen(line, 0);
const cdataIdx = cdataOpen.ok ? cdataOpen.start : -1;
if (cdataIdx < 0) return false;
const fenceIdx = Math.min(
line.indexOf('```') >= 0 ? line.indexOf('```') : Infinity,
@@ -109,9 +110,9 @@ function updateCDATAStateLine(inCDATA, line) {
state = false;
continue;
}
const start = lower.indexOf('<![cdata[', pos);
if (start < 0) return false;
pos = start + '<![cdata['.length;
const start = findNextCDATAOpen(line, pos);
if (!start.ok) return false;
pos = start.bodyStart;
state = true;
}
return state;
@@ -410,8 +411,9 @@ function findMatchingXmlEndTagOutsideCDATA(text, tag, from) {
}
function skipXmlIgnoredSection(lower, i) {
if (lower.startsWith('<![cdata[', i)) {
const cdataEnd = findCDATAEnd(lower, i + '<![cdata['.length);
const cdataOpen = matchCDATAOpenAt(lower, i);
if (cdataOpen.ok) {
const cdataEnd = findCDATAEnd(lower, cdataOpen.bodyStart);
const end = cdataEnd.index;
if (end < 0) {
return { advanced: false, blocked: true, next: i };
@@ -428,6 +430,50 @@ function skipXmlIgnoredSection(lower, i) {
return { advanced: false, blocked: false, next: i };
}
/**
 * Scan forward for the first position at which a CDATA opener is recognised.
 *
 * Only positions whose character normalises to '<' are handed to
 * matchCDATAOpenAt; every other position is stepped over.
 *
 * @param {*} text - Input to scan; coerced with toStringSafe.
 * @param {number} [from] - Index to start scanning at; falsy or negative values start at 0.
 * @returns {{ok: boolean, start: number, bodyStart: number}} On success, `start` is the
 *   index of the opening angle character and `bodyStart` is the value reported by
 *   matchCDATAOpenAt. On failure both indices are -1.
 */
function findNextCDATAOpen(text, from) {
  const source = toStringSafe(text);
  let cursor = Math.max(0, from || 0);
  while (cursor < source.length) {
    if (normalizeFullwidthASCIIChar(source[cursor]) === '<') {
      const candidate = matchCDATAOpenAt(source, cursor);
      if (candidate.ok) {
        return { ok: true, start: cursor, bodyStart: candidate.bodyStart };
      }
    }
    cursor += 1;
  }
  return { ok: false, start: -1, bodyStart: -1 };
}
/**
 * Try to recognise a CDATA opener that begins exactly at `start`.
 *
 * The character at `start` must normalise to '<'. After it, the keyword
 * '[cdata[' is looked for via matchNormalizedASCII; between the angle character
 * and the keyword, a short run of separator characters (as judged by
 * isCDATAOpenSeparator) is tolerated. The keyword is attempted at most five
 * times, i.e. after skipping 0 to 4 separators.
 *
 * @param {*} text - Input to inspect; coerced with toStringSafe.
 * @param {number} start - Index of the candidate opening angle character.
 * @returns {{ok: boolean, bodyStart: number}} `bodyStart` is the index just past
 *   the matched keyword on success, or `start` unchanged on failure.
 */
function matchCDATAOpenAt(text, start) {
  const source = toStringSafe(text);
  const miss = { ok: false, bodyStart: start };
  if (start < 0 || start >= source.length || normalizeFullwidthASCIIChar(source[start]) !== '<') {
    return miss;
  }
  let cursor = start + 1;
  let attempts = 0;
  while (attempts <= 4 && cursor < source.length) {
    const keyword = matchNormalizedASCII(source, cursor, '[cdata[');
    if (keyword.ok) {
      return { ok: true, bodyStart: cursor + keyword.len };
    }
    // Anything that is not a tolerated separator ends the attempt immediately.
    if (!isCDATAOpenSeparator(source[cursor])) {
      return miss;
    }
    cursor += 1;
    attempts += 1;
  }
  return miss;
}
/**
 * Decide whether a character may sit between the opening angle character and
 * the '[cdata[' keyword of a CDATA opener.
 *
 * After normalisation through normalizeFullwidthASCIIChar, the character is
 * rejected when it is empty, one of the markup characters < > / = " ' [,
 * ASCII whitespace (space, tab, LF, CR), or a single ASCII letter or digit.
 * Everything else is accepted.
 *
 * @param {string} ch - Character to classify; falsy values are treated as ''.
 * @returns {boolean} True when the character is a tolerated separator.
 */
function isCDATAOpenSeparator(ch) {
  const ascii = normalizeFullwidthASCIIChar(ch || '');
  if (!ascii) {
    return false;
  }
  switch (ascii) {
    case '<':
    case '>':
    case '/':
    case '=':
    case '"':
    case "'":
    case '[':
      return false;
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return false;
    default:
      return !/^[A-Za-z0-9]$/.test(ascii);
  }
}
function findCDATAEnd(text, from) {
const ascii = text.indexOf(']]>', from);
const fullwidth = text.indexOf(']]', from);
@@ -458,7 +504,7 @@ function scanToolMarkupTagAt(text, start) {
while (i < raw.length && normalizeFullwidthASCIIChar(raw[i]) === '<') {
i += 1;
}
const closing = raw[i] === '/';
let closing = raw[i] === '/';
if (closing) {
i += 1;
}
@@ -472,6 +518,9 @@ function scanToolMarkupTagAt(text, start) {
if (!fallback.ok) {
return null;
}
if (!closing && toolMarkupPrefixContainsSlash(raw.slice(prefixStart, fallback.start))) {
closing = true;
}
name = fallback.name;
i = fallback.start;
len = fallback.len;
@@ -692,6 +741,15 @@ function toolMarkupPrefixAllowsLocalName(prefix) {
return !/^[A-Za-z0-9]$/.test(previous);
}
/**
 * Report whether any character of `prefix` normalises to '/'.
 *
 * @param {*} prefix - Text to inspect; coerced with toStringSafe and iterated
 *   character by character using the string iterator.
 * @returns {boolean} True as soon as one character normalises to '/', otherwise false.
 */
function toolMarkupPrefixContainsSlash(prefix) {
  const symbols = [...toStringSafe(prefix)];
  return symbols.some((symbol) => normalizeFullwidthASCIIChar(symbol) === '/');
}
/**
 * Check whether the character at `idx` closes a tool-markup tag.
 *
 * A literal ASCII '>' is accepted directly; any other character is accepted
 * only if normalizeFullwidthASCIIChar maps it to '>'. A missing character is
 * passed to the normaliser as ''.
 *
 * @param {string} raw - Text being scanned.
 * @param {number} idx - Index of the character to test.
 * @returns {boolean} True when the character terminates a tag.
 */
function isToolMarkupTagTerminator(raw, idx) {
  const ch = raw[idx];
  if (ch === '>') {
    return true;
  }
  return normalizeFullwidthASCIIChar(ch || '') === '>';
}
@@ -833,6 +891,12 @@ function normalizeFullwidthASCIIChar(ch) {
if (ch === '〉') {
return '>';
}
if (ch === '“' || ch === '”') {
return '"';
}
if (ch === '' || ch === '') {
return "'";
}
const code = ch.charCodeAt(0);
if (code >= 0xff01 && code <= 0xff5e) {
return String.fromCharCode(code - 0xfee0);
@@ -882,7 +946,7 @@ function normalizeToolMarkupTagTailForXML(tail) {
} else if (normalized === '"' || normalized === "'") {
quote = normalized;
out += normalized;
} else if (normalized === '|') {
} else if (normalized === '|' || normalized === '!') {
let j = i + 1;
while (j < raw.length && [' ', '\t', '\r', '\n'].includes(raw[j])) {
j += 1;
@@ -1239,16 +1303,34 @@ function unescapeHtml(safe) {
function extractStandaloneCDATA(inner) {
const s = toStringSafe(inner).trim();
const cdataMatch = s.match(CDATA_PATTERN);
if (cdataMatch && cdataMatch[1] !== undefined) {
return { ok: true, value: cdataMatch[1] };
}
if (s.toLowerCase().startsWith('<![cdata[')) {
return { ok: true, value: s.slice('<![CDATA['.length) };
const open = matchCDATAOpenAt(s, 0);
if (open.ok) {
const close = findStandaloneCDATAEnd(s, open.bodyStart);
if (close.index < 0) {
return { ok: true, value: s.slice(open.bodyStart) };
}
return { ok: true, value: s.slice(open.bodyStart, close.index) };
}
return { ok: false, value: '' };
}
/**
 * Locate the CDATA terminator that is followed only by whitespace.
 *
 * Terminators are enumerated left to right with findCDATAEnd. One is recorded
 * only when everything after it trims to the empty string, so terminators
 * embedded in the body are passed over.
 *
 * @param {*} text - Input to scan; coerced with toStringSafe.
 * @param {number} [from] - Index to start searching at; falsy or negative values start at 0.
 * @returns {{index: number, len: number}} The recorded result from findCDATAEnd,
 *   or `{ index: -1, len: 0 }` when no terminator qualifies.
 */
function findStandaloneCDATAEnd(text, from) {
  const source = toStringSafe(text);
  let lastTrailing = { index: -1, len: 0 };
  let cursor = Math.max(0, from || 0);
  while (cursor < source.length) {
    const candidate = findCDATAEnd(source, cursor);
    if (candidate.index < 0) {
      break;
    }
    // Resume the search just past this terminator.
    cursor = candidate.index + candidate.len;
    if (source.slice(cursor).trim() === '') {
      lastTrailing = candidate;
    }
  }
  return lastTrailing;
}
function parseJSONLiteralValue(raw) {
const s = toStringSafe(raw).trim();
if (!s) {

View File

@@ -4,6 +4,7 @@ import (
"strings"
"ds2api/internal/prompt"
"ds2api/internal/toolcall"
)
const assistantReasoningLabel = "reasoning_content"
@@ -62,6 +63,9 @@ func buildAssistantContentForPrompt(msg map[string]any) string {
reasoning = strings.TrimSpace(extractOpenAIReasoningContentFromMessage(msg["content"]))
}
toolHistory := prompt.FormatToolCallsForPrompt(msg["tool_calls"])
if toolHistory == "" {
content = normalizeAssistantToolMarkupContentForPrompt(content)
}
parts := make([]string, 0, 3)
if reasoning != "" {
parts = append(parts, formatPromptLabeledBlock(assistantReasoningLabel, reasoning))
@@ -82,6 +86,40 @@ func buildAssistantContentForPrompt(msg map[string]any) string {
}
}
// normalizeAssistantToolMarkupContentForPrompt rewrites assistant content that
// consists solely of a tool_calls markup block into the output of
// prompt.FormatToolCallsForPrompt.
//
// The original content is returned unchanged when it is blank, is not a
// standalone tool markup block, yields no parsed calls, or formats to an empty
// string.
func normalizeAssistantToolMarkupContentForPrompt(content string) string {
	candidate := strings.TrimSpace(content)
	if candidate == "" {
		return content
	}
	if !isStandaloneAssistantToolMarkupBlock(candidate) {
		return content
	}
	result := toolcall.ParseStandaloneToolCallsDetailed(candidate, nil)
	if len(result.Calls) == 0 {
		return content
	}
	// Re-shape each parsed call into the name/input map handed to the formatter.
	entries := make([]any, len(result.Calls))
	for idx, call := range result.Calls {
		entries[idx] = map[string]any{
			"name":  call.Name,
			"input": call.Input,
		}
	}
	formatted := prompt.FormatToolCallsForPrompt(entries)
	if formatted == "" {
		return content
	}
	return formatted
}
// isStandaloneAssistantToolMarkupBlock reports whether trimmed is exactly one
// tool_calls block: the first tool markup tag found must start at offset 0, be
// an opening tag named "tool_calls", have a matching close tag, and be followed
// by nothing but whitespace.
func isStandaloneAssistantToolMarkupBlock(trimmed string) bool {
	openTag, found := toolcall.FindToolMarkupTagOutsideIgnored(trimmed, 0)
	if !found {
		return false
	}
	if openTag.Start != 0 || openTag.Closing || openTag.Name != "tool_calls" {
		return false
	}
	closing, matched := toolcall.FindMatchingToolMarkupClose(trimmed, openTag)
	if !matched {
		return false
	}
	trailing := trimmed[closing.End+1:]
	return strings.TrimSpace(trailing) == ""
}
func normalizeOpenAIReasoningContentForPrompt(v any) string {
switch x := v.(type) {
case string:

View File

@@ -263,6 +263,42 @@ func TestNormalizeOpenAIMessagesForPrompt_AssistantNilContentDoesNotInjectNullLi
}
}
// TestNormalizeOpenAIMessagesForPrompt_CanonicalizesStandaloneAssistantToolMarkupContent
// passes a single assistant message whose content is a drifted tool-call markup
// block (curly-quoted attribute values and a CDATA opener that differs from
// "<![CDATA[") through NormalizeOpenAIMessagesForPrompt. It expects exactly one
// normalized message whose content contains the canonical tags with straight
// quotes and "<![CDATA[" openers, and none of the listed malformed fragments.
//
// NOTE(review): as this fixture is rendered here, the forbidden list contains
// "tool_calls" and "<DSML", which are also substrings of the required strings;
// some separator characters may have been lost in rendering — confirm against
// the original source file.
func TestNormalizeOpenAIMessagesForPrompt_CanonicalizesStandaloneAssistantToolMarkupContent(t *testing.T) {
raw := []any{
map[string]any{
"role": "assistant",
"content": `<DSMLtool_calls>
<DSMLinvoke name=“Bash”>
<DSMLparameter name=“command”><[CDATA[lsof -i :4321 -t]]></DSMLparameter>
<DSMLparameter name=“description”><[CDATA[Verify port 4321 is free]]></DSMLparameter>
</DSMLinvoke>
</DSMLtool_calls>`,
},
}
normalized := NormalizeOpenAIMessagesForPrompt(raw, "")
if len(normalized) != 1 {
t.Fatalf("expected one normalized assistant message, got %#v", normalized)
}
content, _ := normalized[0]["content"].(string)
// Every canonical fragment must be present in the rewritten content.
for _, want := range []string{
"<DSMLtool_calls>",
`<DSMLinvoke name="Bash">`,
`<DSMLparameter name="command"><![CDATA[lsof -i :4321 -t]]></DSMLparameter>`,
`<DSMLparameter name="description"><![CDATA[Verify port 4321 is free]]></DSMLparameter>`,
"</DSMLtool_calls>",
} {
if !strings.Contains(content, want) {
t.Fatalf("expected canonicalized assistant tool markup to contain %q, got %q", want, content)
}
}
// None of the drifted fragments may survive into the prompt history.
for _, bad := range []string{"<DSML", "tool_calls", "“", "”"} {
if strings.Contains(content, bad) {
t.Fatalf("expected malformed assistant tool markup to be removed from prompt history, found %q in %q", bad, content)
}
}
}
func TestNormalizeOpenAIMessagesForPrompt_DeveloperRoleMapsToSystem(t *testing.T) {
raw := []any{
map[string]any{"role": "developer", "content": "必须先走工具调用"},

View File

@@ -21,6 +21,7 @@ RULES:
1) Use the <DSMLtool_calls> wrapper format.
2) Put one or more <DSMLinvoke> entries under a single <DSMLtool_calls> root.
3) Put the tool name in the invoke name attribute: <DSMLinvoke name="TOOL_NAME">.
3a) Tag punctuation alphabet: ASCII < > / = " plus the fullwidth vertical bar ｜.
4) All string values must use <![CDATA[...]]>, even short ones. This includes code, scripts, file contents, prompts, paths, names, and queries.
5) Every top-level argument must be a <DSMLparameter name="ARG_NAME">...</DSMLparameter> node.
6) Objects use nested XML elements inside the parameter body. Arrays may repeat <item> children.

View File

@@ -133,6 +133,19 @@ func TestBuildToolCallInstructions_RejectsEmptyParametersInPrompt(t *testing.T)
}
}
// TestBuildToolCallInstructions_UsesPositiveTagPunctuationAlphabet checks that
// the instructions built for the "Bash" tool contain the positive "Tag
// punctuation alphabet" sentence and contain none of the listed negative
// examples.
//
// NOTE(review): as rendered here the forbidden list includes an empty string
// literal, which strings.Contains always matches; a character was probably
// lost in rendering — confirm against the original source file.
func TestBuildToolCallInstructions_UsesPositiveTagPunctuationAlphabet(t *testing.T) {
out := BuildToolCallInstructions([]string{"Bash"})
want := `Tag punctuation alphabet: ASCII < > / = " plus the fullwidth vertical bar .`
if !strings.Contains(out, want) {
t.Fatalf("expected positive tag punctuation alphabet %q, got: %s", want, out)
}
// The prompt must not list negative punctuation examples.
for _, bad := range []string{"lookalike", "substitute", "", "〈", "〉", "“", "”", "、"} {
if strings.Contains(out, bad) {
t.Fatalf("tool prompt should not include negative punctuation examples %q, got: %s", bad, out)
}
}
}
func findInvokeBlocks(text, name string) []string {
open := `<DSMLinvoke name="` + name + `">`
remaining := text

View File

@@ -86,7 +86,7 @@ func normalizeToolMarkupTagTailForXML(tail string) string {
case '"', '\'':
quote = ch
b.WriteRune(ch)
case '|':
case '|', '!':
j := i + size
for j < len(tail) {
next, nextSize := utf8.DecodeRuneInString(tail[j:])

View File

@@ -10,7 +10,7 @@ import (
var toolCallMarkupKVPattern = regexp.MustCompile(`(?is)<(?:[a-z0-9_:-]+:)?([a-z0-9_\-.]+)\b[^>]*>(.*?)</(?:[a-z0-9_:-]+:)?([a-z0-9_\-.]+)>`)
// cdataPattern matches a standalone CDATA section.
var cdataPattern = regexp.MustCompile(`(?is)^(?:<|〈)!\[CDATA\[(.*?)]](?:>||〉)$`)
var cdataPattern = regexp.MustCompile(`(?is)^(?:<|〈)(?:!|)\[CDATA\[(.*?)]](?:>||〉)$`)
func parseMarkupKVObject(text string) map[string]any {
matches := toolCallMarkupKVPattern.FindAllStringSubmatch(strings.TrimSpace(text), -1)
@@ -108,15 +108,32 @@ func extractRawTagValue(inner string) string {
func extractStandaloneCDATA(inner string) (string, bool) {
trimmed := strings.TrimSpace(inner)
if cdataMatches := cdataPattern.FindStringSubmatch(trimmed); len(cdataMatches) >= 2 {
return cdataMatches[1], true
}
if strings.HasPrefix(strings.ToLower(trimmed), "<![cdata[") {
return trimmed[len("<![CDATA["):], true
if bodyStart, ok := matchToolCDATAOpenAt(trimmed, 0); ok {
end := findStandaloneCDATAEnd(trimmed, bodyStart)
if end < 0 {
return trimmed[bodyStart:], true
}
return trimmed[bodyStart:end], true
}
return "", false
}
// findStandaloneCDATAEnd returns the byte offset of the CDATA terminator that
// is followed only by whitespace, or -1 when there is none.
//
// Terminators are enumerated left to right starting at from. One is recorded
// only when the remainder of text after it trims to the empty string, so
// terminators embedded in the body are passed over.
func findStandaloneCDATAEnd(text string, from int) int {
	lastTrailing := -1
	cursor := from
	for cursor < len(text) {
		closeAt := indexToolCDATAClose(text, cursor)
		if closeAt < 0 {
			return lastTrailing
		}
		// Resume the search just past this terminator.
		cursor = closeAt + toolCDATACloseLenAt(text, closeAt)
		if len(strings.TrimSpace(text[cursor:])) == 0 {
			lastTrailing = closeAt
		}
	}
	return lastTrailing
}
func parseJSONLiteralValue(raw string) (any, bool) {
trimmed := strings.TrimSpace(raw)
if trimmed == "" {

View File

@@ -209,13 +209,14 @@ func skipXMLIgnoredSection(text string, i int) (next int, advanced bool, blocked
if i < 0 || i >= len(text) {
return i, false, false
}
switch {
case hasASCIIPrefixFoldAt(text, i, "<![cdata["):
end := findToolCDATAEnd(text, i+len("<![cdata["))
if bodyStart, ok := matchToolCDATAOpenAt(text, i); ok {
end := findToolCDATAEnd(text, bodyStart)
if end < 0 {
return 0, false, true
}
return end + toolCDATACloseLenAt(text, end), true, false
}
switch {
case strings.HasPrefix(text[i:], "<!--"):
end := strings.Index(text[i+len("<!--"):], "-->")
if end < 0 {
@@ -227,6 +228,38 @@ func skipXMLIgnoredSection(text string, i int) (next int, advanced bool, blocked
}
}
// matchToolCDATAOpenAt tries to recognise a CDATA opener beginning at start.
//
// consumeToolMarkupLessThan must accept the text at start. After it, the
// keyword "[cdata[" is looked for with matchASCIIPrefixFoldAt; between the two,
// a short run of separator runes (as judged by isToolCDATAOpenSeparator) is
// tolerated. The keyword is attempted at most five times, i.e. after skipping
// 0 to 4 separators.
//
// It returns the offset just past the keyword and true on success, or start
// and false otherwise.
func matchToolCDATAOpenAt(text string, start int) (int, bool) {
	cursor, ok := consumeToolMarkupLessThan(text, start)
	if !ok {
		return start, false
	}
	attempts := 0
	for attempts <= 4 && cursor < len(text) {
		if n, matched := matchASCIIPrefixFoldAt(text, cursor, "[cdata["); matched {
			return cursor + n, true
		}
		r, width := utf8.DecodeRuneInString(text[cursor:])
		if width <= 0 {
			break
		}
		// Anything that is not a tolerated separator ends the attempt.
		if !isToolCDATAOpenSeparator(r) {
			break
		}
		cursor += width
		attempts++
	}
	return start, false
}
// isToolCDATAOpenSeparator reports whether r may sit between the opening angle
// character and the "[cdata[" keyword of a CDATA opener.
//
// After normalizeFullwidthASCII, the rune is rejected when it is NUL, one of
// the markup characters < > / = " ' [, ASCII whitespace (space, tab, LF, CR),
// or an ASCII letter or digit. Everything else is accepted.
func isToolCDATAOpenSeparator(r rune) bool {
	ch := normalizeFullwidthASCII(r)
	switch ch {
	case 0, '<', '>', '/', '=', '"', '\'', '[':
		return false
	case ' ', '\t', '\n', '\r':
		return false
	}
	isLetter := (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
	isDigit := ch >= '0' && ch <= '9'
	return !isLetter && !isDigit
}
func hasASCIIPrefixFoldAt(text string, start int, prefix string) bool {
_, ok := matchASCIIPrefixFoldAt(text, start, prefix)
return ok

View File

@@ -159,6 +159,9 @@ func scanToolMarkupTagAt(text string, start int) (ToolMarkupTag, bool) {
if !ok {
return ToolMarkupTag{}, false
}
if !closing && toolMarkupPrefixContainsSlash(text[prefixStart:fallbackStart]) {
closing = true
}
name = fallbackName
i = fallbackStart
nameLen = fallbackLen
@@ -461,6 +464,9 @@ func consumeToolMarkupPipe(text string, idx int) (int, bool) {
if strings.HasPrefix(text[idx:], "␂") {
return idx + len("␂"), true
}
if ch, size := normalizedASCIIAt(text, idx); ch == '!' {
return idx + size, true
}
return idx, false
}
@@ -506,9 +512,22 @@ func normalizeFullwidthASCII(r rune) rune {
return '<'
case '〉':
return '>'
case '“', '”':
return '"'
case '', '':
return '\''
}
if r >= '' && r <= '' {
return r - 0xFEE0
}
return r
}
// toolMarkupPrefixContainsSlash reports whether any rune of prefix maps to '/'
// under normalizeFullwidthASCII.
func toolMarkupPrefixContainsSlash(prefix string) bool {
	isSlash := func(r rune) bool {
		return normalizeFullwidthASCII(r) == '/'
	}
	return strings.IndexFunc(prefix, isSlash) >= 0
}

View File

@@ -163,6 +163,49 @@ func TestParseToolCallsSupportsCJKAngleDSMDrift(t *testing.T) {
}
}
// TestParseToolCallsSupportsFullwidthBangDSMLDrift checks that ParseToolCalls
// yields exactly one "Bash" call, with the expected command and description
// inputs, from markup that uses curly-quoted attribute values and a CDATA
// opener that differs from "<![CDATA[".
//
// NOTE(review): the test name refers to a fullwidth bang that is not visible in
// the fixture as rendered here — confirm against the original source file.
func TestParseToolCallsSupportsFullwidthBangDSMLDrift(t *testing.T) {
text := `<DSMLtool_calls>
<DSMLinvoke name=“Bash”>
<DSMLparameter name=“command”><[CDATA[lsof -i :4321 -t]]></DSMLparameter>
<DSMLparameter name=“description”><[CDATA[Verify port 4321 is free]]></DSMLparameter>
</DSMLinvoke>
</DSMLtool_calls>`
calls := ParseToolCalls(text, []string{"Bash"})
if len(calls) != 1 {
t.Fatalf("expected one fullwidth-bang DSML drift call, got %#v", calls)
}
if calls[0].Name != "Bash" || calls[0].Input["command"] != "lsof -i :4321 -t" || calls[0].Input["description"] != "Verify port 4321 is free" {
t.Fatalf("unexpected fullwidth-bang DSML drift call: %#v", calls[0])
}
}
// TestParseToolCallsSupportsIdeographicCommaDSMLDrift checks that ParseToolCalls
// yields exactly one "Bash" call from markup in which the ideographic comma "、"
// appears where tag separators would be, including inside the CDATA opener and
// the closing tags. The multi-line command body (a heredoc-style git commit)
// must be preserved, and the description must match exactly.
func TestParseToolCallsSupportsIdeographicCommaDSMLDrift(t *testing.T) {
text := `<、DSML、tool_calls>
<、DSML、invoke name="Bash">
<、DSML、parameter name="command"><、[CDATA[git commit -m "$(cat <<'EOF'
feat: expand fullwidth bang separator and curly quote tolerance in DSML tool parsing
Co-Authored-By: Claude Opus 4.6 noreply@anthropic.com
EOF
)"]]><、/DSML、parameter>
<、DSML、parameter name="description"><、[CDATA[Create commit with staged changes]]><、/DSML、parameter>
<、/DSML、invoke>
<、/DSML、tool_calls>`
calls := ParseToolCalls(text, []string{"Bash"})
if len(calls) != 1 {
t.Fatalf("expected one ideographic-comma DSML drift call, got %#v", calls)
}
// The command is checked by substring because it spans several lines.
command, _ := calls[0].Input["command"].(string)
if calls[0].Name != "Bash" || !strings.Contains(command, `git commit -m "$(cat <<'EOF'`) || !strings.Contains(command, "Co-Authored-By: Claude Opus 4.6 noreply@anthropic.com") {
t.Fatalf("unexpected ideographic-comma DSML drift call: %#v", calls[0])
}
if calls[0].Input["description"] != "Create commit with staged changes" {
t.Fatalf("unexpected ideographic-comma description: %#v", calls[0])
}
}
func TestParseToolCallsIgnoresBareHyphenatedToolCallsLookalike(t *testing.T) {
text := `<tool-calls><invoke name="Bash"><parameter name="command">pwd</parameter></invoke></tool-calls>`
calls := ParseToolCalls(text, []string{"Bash"})

View File

@@ -1262,3 +1262,76 @@ func TestProcessToolSieveCJKAngleDSMDriftDoesNotLeak(t *testing.T) {
t.Fatalf("unexpected CJK-angle DSM drift call: %#v", calls[0])
}
}
// TestProcessToolSieveFullwidthBangDSMLDriftDoesNotLeak streams the drifted
// markup (curly-quoted attribute values, non-canonical CDATA opener) through
// ProcessChunk one line at a time, then calls Flush. It asserts that no markup
// or command text appears in the emitted text content and that exactly one
// "Bash" tool call with the expected command is produced.
func TestProcessToolSieveFullwidthBangDSMLDriftDoesNotLeak(t *testing.T) {
var state State
chunks := []string{
"<DSMLtool_calls>\n",
" <DSMLinvoke name=“Bash”>\n",
" <DSMLparameter name=“command”><[CDATA[lsof -i :4321 -t]]></DSMLparameter>\n",
" <DSMLparameter name=“description”><[CDATA[Verify port 4321 is free]]></DSMLparameter>\n",
" </DSMLinvoke>\n",
" </DSMLtool_calls>",
}
var events []Event
for _, c := range chunks {
events = append(events, ProcessChunk(&state, c, []string{"Bash"})...)
}
events = append(events, Flush(&state, []string{"Bash"})...)
// Collect all emitted text and tool calls across every event.
var textContent string
var calls []toolcall.ParsedToolCall
for _, evt := range events {
textContent += evt.Content
calls = append(calls, evt.ToolCalls...)
}
if strings.Contains(textContent, "DSML") || strings.Contains(textContent, "lsof") {
t.Fatalf("fullwidth-bang DSML drift leaked to text: %q events=%#v", textContent, events)
}
if len(calls) != 1 {
t.Fatalf("expected one fullwidth-bang DSML drift tool call, got %d events=%#v", len(calls), events)
}
if calls[0].Name != "Bash" || calls[0].Input["command"] != "lsof -i :4321 -t" {
t.Fatalf("unexpected fullwidth-bang DSML drift call: %#v", calls[0])
}
}
// TestProcessToolSieveIdeographicCommaDSMLDriftDoesNotLeak streams markup that
// uses the ideographic comma "、" where tag separators would be through
// ProcessChunk in several chunks (the CDATA body is split across chunks), then
// calls Flush. It asserts that no markup or command text appears in the emitted
// text content and that exactly one "Bash" tool call containing the git commit
// command is produced.
func TestProcessToolSieveIdeographicCommaDSMLDriftDoesNotLeak(t *testing.T) {
var state State
chunks := []string{
"<、DSML、tool_calls>\n",
" <、DSML、invoke name=\"Bash\">\n",
" <、DSML、parameter name=\"command\"><、[CDATA[git commit -m \"$(cat <<'EOF'\n",
"feat: expand fullwidth bang separator and curly quote tolerance in DSML tool parsing\n\n",
"Co-Authored-By: Claude Opus 4.6 noreply@anthropic.com\n",
"EOF\n",
")\"]]><、/DSML、parameter>\n",
" <、DSML、parameter name=\"description\"><、[CDATA[Create commit with staged changes]]><、/DSML、parameter>\n",
" <、/DSML、invoke>\n",
"<、/DSML、tool_calls>",
}
var events []Event
for _, c := range chunks {
events = append(events, ProcessChunk(&state, c, []string{"Bash"})...)
}
events = append(events, Flush(&state, []string{"Bash"})...)
// Collect all emitted text and tool calls across every event.
var textContent string
var calls []toolcall.ParsedToolCall
for _, evt := range events {
textContent += evt.Content
calls = append(calls, evt.ToolCalls...)
}
if strings.Contains(textContent, "DSML") || strings.Contains(textContent, "git commit") {
t.Fatalf("ideographic-comma DSML drift leaked to text: %q events=%#v", textContent, events)
}
if len(calls) != 1 {
t.Fatalf("expected one ideographic-comma DSML drift tool call, got %d events=%#v", len(calls), events)
}
command, _ := calls[0].Input["command"].(string)
if calls[0].Name != "Bash" || !strings.Contains(command, "git commit -m") {
t.Fatalf("unexpected ideographic-comma DSML drift call: %#v", calls[0])
}
}