feat: implement support for arbitrary tool markup prefixes and control character separators in tool sieve parsing

This commit is contained in:
CJACK
2026-05-10 00:19:03 +08:00
parent 7c66742a19
commit 3cc7f469f3
7 changed files with 335 additions and 9 deletions

View File

@@ -458,11 +458,19 @@ function scanToolMarkupTagAt(text, start) {
i += 1;
}
const prefix = consumeToolMarkupNamePrefix(raw, lower, i);
const prefixStart = i;
i = prefix.next;
const dsmlLike = prefix.dsmlLike;
const { name, len } = matchToolMarkupName(raw, i, dsmlLike);
let dsmlLike = prefix.dsmlLike;
let { name, len } = matchToolMarkupName(raw, i, dsmlLike);
if (!name) {
return null;
const fallback = matchToolMarkupNameAfterArbitraryPrefix(raw, prefixStart);
if (!fallback.ok) {
return null;
}
name = fallback.name;
i = fallback.start;
len = fallback.len;
dsmlLike = true;
}
const originalNameEnd = i + len;
let nameEnd = originalNameEnd;
@@ -574,7 +582,7 @@ function includeDuplicateLeadingLessThan(text, idx) {
}
function isToolMarkupPipe(ch) {
return ch === '|' || ch === '';
return ch === '|' || ch === '' || ch === '␂' || ch === '\x02';
}
function isPartialToolMarkupTagPrefix(text) {
@@ -603,6 +611,9 @@ function isPartialToolMarkupTagPrefix(text) {
if (normalizedASCIITailAt(raw, i).startsWith('dsml') || 'dsml'.startsWith(normalizedASCIITailAt(raw, i))) {
return true;
}
if (hasPartialToolMarkupNameAfterArbitraryPrefix(raw, i)) {
return true;
}
const next = consumeToolMarkupNamePrefixOnce(raw, lower, i);
if (!next.ok) {
return false;
@@ -625,6 +636,53 @@ function consumeToolMarkupNamePrefix(raw, lower, idx) {
}
}
/**
 * Walks forward from `start` looking for a known tool markup name that sits
 * behind an arbitrary (non-standard) prefix.
 *
 * The walk stops without a match as soon as a tag terminator is reached. A
 * candidate name only counts when the text between `start` and the candidate
 * position is accepted by `toolMarkupPrefixAllowsLocalName`.
 *
 * @param {string} raw - Text being scanned.
 * @param {number} start - Index where the prefix begins.
 * @returns {{ok: false} | {ok: true, name: string, start: number, len: number}}
 *   On success: the canonical name, the index where the name starts, and the
 *   matched length as reported by `matchNormalizedASCII`.
 */
function matchToolMarkupNameAfterArbitraryPrefix(raw, start) {
  let pos = start;
  while (pos < raw.length) {
    if (isToolMarkupTagTerminator(raw, pos)) {
      // The tag closed before any name was found.
      break;
    }
    for (const candidate of TOOL_MARKUP_NAMES) {
      const hit = matchNormalizedASCII(raw, pos, candidate.raw);
      if (hit.ok && toolMarkupPrefixAllowsLocalName(raw.slice(start, pos))) {
        return { ok: true, name: candidate.canonical, start: pos, len: hit.len };
      }
    }
    pos += 1;
  }
  return { ok: false };
}
/**
 * Reports whether `raw`, scanned from `start`, could still turn into a tool
 * markup name that follows an arbitrary prefix.
 *
 * The scan gives up (returns false) at the first tag terminator.
 *
 * @param {string} raw - Text being scanned.
 * @param {number} start - Index where the prefix begins.
 * @returns {boolean} True when some position before the terminator both starts
 *   a tool markup name prefix and is preceded by an acceptable prefix.
 */
function hasPartialToolMarkupNameAfterArbitraryPrefix(raw, start) {
  for (let idx = start; idx < raw.length; idx += 1) {
    if (isToolMarkupTagTerminator(raw, idx)) {
      return false;
    }
    // Probe for a name prefix first. toolMarkupPrefixAllowsLocalName copies and
    // rescans the whole prefix, so calling it at every position made this loop
    // quadratic in the prefix length. Both operands are predicates, so the
    // result is unchanged by the reordering.
    // NOTE(review): assumes hasToolMarkupNamePrefix has no side effects — confirm.
    if (hasToolMarkupNamePrefix(raw, idx) && toolMarkupPrefixAllowsLocalName(raw.slice(start, idx))) {
      return true;
    }
  }
  return false;
}
/**
 * Decides whether `prefix` (the text between the tag opener and a candidate
 * name) is acceptable in front of a tool markup name.
 *
 * Rules, applied in order:
 *   1. An empty prefix is rejected.
 *   2. A prefix whose normalized ASCII form contains "dsml" is accepted.
 *   3. A prefix containing `=`, `"` or `'` is rejected.
 *   4. Otherwise the prefix is accepted only when its last character
 *      (after fullwidth normalization) is not an ASCII letter or digit.
 *
 * @param {string} prefix - Text preceding the candidate name.
 * @returns {boolean} True when a name may follow this prefix.
 */
function toolMarkupPrefixAllowsLocalName(prefix) {
  if (!prefix) {
    return false;
  }

  const mentionsDsml = normalizedASCIITailAt(prefix, 0).includes('dsml');
  if (mentionsDsml) {
    return true;
  }

  const hasAttributeSyntax = /[="'"]/.test(prefix);
  if (hasAttributeSyntax) {
    return false;
  }

  const lastChar = normalizeFullwidthASCIIChar(prefix[prefix.length - 1] || '');
  const endsWithAlnum = /^[A-Za-z0-9]$/.test(lastChar);
  return !endsWithAlnum;
}
/**
 * Reports whether the character at `idx` closes a tag: either a literal `>`
 * or a character that `normalizeFullwidthASCIIChar` maps to `>`.
 *
 * @param {string} raw - Text being scanned.
 * @param {number} idx - Index of the character to test.
 * @returns {boolean} True when the character terminates a tag.
 */
function isToolMarkupTagTerminator(raw, idx) {
  const ch = raw[idx];
  if (ch === '>') {
    return true;
  }
  // Out-of-range indexes yield undefined; fall back to '' before normalizing.
  return normalizeFullwidthASCIIChar(ch || '') === '>';
}
function consumeToolMarkupNamePrefixOnce(raw, lower, idx) {
if (idx < raw.length && isToolMarkupPipe(raw[idx])) {
return { next: idx + 1, ok: true };

View File

@@ -146,10 +146,18 @@ func scanToolMarkupTagAt(text string, start int) (ToolMarkupTag, bool) {
closing = true
i++
}
prefixStart := i
i, dsmlLike := consumeToolMarkupNamePrefix(text, i)
name, nameLen := matchToolMarkupName(text, i, dsmlLike)
if nameLen == 0 {
return ToolMarkupTag{}, false
fallbackName, fallbackStart, fallbackLen, ok := matchToolMarkupNameAfterArbitraryPrefix(text, prefixStart)
if !ok {
return ToolMarkupTag{}, false
}
name = fallbackName
i = fallbackStart
nameLen = fallbackLen
dsmlLike = true
}
nameEnd := i + nameLen
nameEndBeforePipes := nameEnd
@@ -210,6 +218,9 @@ func IsPartialToolMarkupTagPrefix(text string) bool {
if hasASCIIPartialPrefixFoldAt(text, i, "dsml") {
return true
}
if hasPartialToolMarkupNameAfterArbitraryPrefix(text, i) {
return true
}
next, ok := consumeToolMarkupNamePrefixOnce(text, i)
if !ok {
return false
@@ -340,6 +351,88 @@ func matchToolMarkupName(text string, start int, dsmlLike bool) (string, int) {
return "", 0
}
// matchToolMarkupNameAfterArbitraryPrefix walks text rune by rune from start,
// looking for a known tool markup name that follows an arbitrary prefix.
//
// The walk ends without a match at the first tag terminator or at the end of
// text. A candidate only counts when text[start:pos] is accepted by
// toolMarkupPrefixAllowsLocalName. On success it returns the canonical name,
// the byte offset where the name starts, the matched length reported by
// matchASCIIPrefixFoldAt, and true.
func matchToolMarkupNameAfterArbitraryPrefix(text string, start int) (string, int, int, bool) {
	pos := start
	for pos < len(text) && !isToolMarkupTagTerminator(text, pos) {
		for _, candidate := range toolMarkupNames {
			matched, ok := matchASCIIPrefixFoldAt(text, pos, candidate.raw)
			if ok && toolMarkupPrefixAllowsLocalName(text[start:pos]) {
				return candidate.canonical, pos, matched, true
			}
		}
		// Step over one rune, always making progress by at least one byte.
		_, width := utf8.DecodeRuneInString(text[pos:])
		if width < 1 {
			width = 1
		}
		pos += width
	}
	return "", 0, 0, false
}
// hasPartialToolMarkupNameAfterArbitraryPrefix reports whether text, scanned
// rune by rune from start, could still turn into a tool markup name that
// follows an arbitrary prefix.
//
// It returns false at the first tag terminator, and true at the first position
// that both starts a tool markup name prefix and is preceded by a prefix
// accepted by toolMarkupPrefixAllowsLocalName.
func hasPartialToolMarkupNameAfterArbitraryPrefix(text string, start int) bool {
	for idx := start; idx < len(text); {
		if isToolMarkupTagTerminator(text, idx) {
			return false
		}
		// Probe for a name prefix first. toolMarkupPrefixAllowsLocalName
		// normalizes the whole prefix into a fresh string, so calling it at
		// every position made this loop quadratic in the prefix length. Both
		// operands are predicates, so the result is unchanged.
		// NOTE(review): assumes hasToolMarkupNamePrefix has no side effects — confirm.
		if hasToolMarkupNamePrefix(text, idx) && toolMarkupPrefixAllowsLocalName(text[start:idx]) {
			return true
		}
		// Step over one rune, always making progress by at least one byte.
		_, size := utf8.DecodeRuneInString(text[idx:])
		if size <= 0 {
			size = 1
		}
		idx += size
	}
	return false
}
// toolMarkupPrefixAllowsLocalName decides whether prefix (the text between the
// tag opener and a candidate name) is acceptable in front of a tool markup name.
//
// Rules, applied in order:
//  1. An empty prefix is rejected.
//  2. A prefix whose normalized lowercase ASCII form contains "dsml" is accepted.
//  3. A prefix containing '=', '"' or '\'' is rejected.
//  4. Otherwise the prefix is accepted only when its last rune, after
//     normalizeFullwidthASCII, is not an ASCII letter or digit.
func toolMarkupPrefixAllowsLocalName(prefix string) bool {
	switch {
	case prefix == "":
		return false
	case strings.Contains(normalizedASCIILowerString(prefix), "dsml"):
		return true
	case strings.ContainsAny(prefix, "=\"'"):
		return false
	}

	last, _ := utf8.DecodeLastRuneInString(prefix)
	last = normalizeFullwidthASCII(last)
	isLetter := (last >= 'a' && last <= 'z') || (last >= 'A' && last <= 'Z')
	isDigit := last >= '0' && last <= '9'
	return !isLetter && !isDigit
}
// normalizedASCIILowerString returns text with every rune passed through
// normalizeFullwidthASCII, ASCII uppercase letters folded to lowercase, and
// any rune that is still above 0x7f after normalization dropped.
func normalizedASCIILowerString(text string) string {
	var out strings.Builder
	out.Grow(len(text))
	for _, r := range text {
		switch folded := normalizeFullwidthASCII(r); {
		case folded >= 'A' && folded <= 'Z':
			out.WriteRune(folded + ('a' - 'A'))
		case folded <= 0x7f:
			out.WriteRune(folded)
		}
	}
	return out.String()
}
// isToolMarkupTagTerminator reports whether the rune starting at byte offset
// idx closes a tag: either a literal '>' or a rune that normalizeFullwidthASCII
// maps to '>'. An idx at or past the end of text yields false.
func isToolMarkupTagTerminator(text string, idx int) bool {
	if idx >= len(text) {
		return false
	}
	if text[idx] != '>' {
		r, _ := utf8.DecodeRuneInString(text[idx:])
		return normalizeFullwidthASCII(r) == '>'
	}
	return true
}
func consumeToolMarkupPipe(text string, idx int) (int, bool) {
if idx >= len(text) {
return idx, false
@@ -347,9 +440,15 @@ func consumeToolMarkupPipe(text string, idx int) (int, bool) {
if text[idx] == '|' {
return idx + 1, true
}
if text[idx] == '\x02' {
return idx + 1, true
}
if strings.HasPrefix(text[idx:], "") {
return idx + len(""), true
}
if strings.HasPrefix(text[idx:], "␂") {
return idx + len("␂"), true
}
return idx, false
}

View File

@@ -104,6 +104,111 @@ func TestProcessToolSieveInterceptsDSMLTrailingPipeToolCallWithoutLeak(t *testin
}
}
// TestProcessToolSieveInterceptsDSMLControlSeparatorWithoutLeak streams a DSML
// tool-call block whose separator is either the control picture "␂" or the raw
// 0x02 byte, split across chunks, and checks that no markup leaks into the
// text output and that exactly one tool call is emitted.
func TestProcessToolSieveInterceptsDSMLControlSeparatorWithoutLeak(t *testing.T) {
	cases := []struct {
		name string
		sep  string
	}{
		{name: "control_picture", sep: "␂"},
		{name: "raw_stx", sep: "\x02"},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			sep := tc.sep
			chunks := []string{
				"<DSML" + sep + "tool",
				"_calls>\n",
				` <DSML` + sep + `invoke name="Read">` + "\n",
				` <DSML` + sep + `parameter name="file_path"><![CDATA[/tmp/input.txt]]></DSML` + sep + `parameter>` + "\n",
				" </DSML" + sep + "invoke>\n",
				"</DSML" + sep + "tool_calls>",
			}

			var (
				state  State
				events []Event
			)
			for i := range chunks {
				events = append(events, ProcessChunk(&state, chunks[i], []string{"Read"})...)
			}
			events = append(events, Flush(&state, []string{"Read"})...)

			// Gather all emitted text and count every emitted tool call.
			var leaked strings.Builder
			callCount := 0
			for _, evt := range events {
				leaked.WriteString(evt.Content)
				callCount += len(evt.ToolCalls)
			}

			text := leaked.String()
			if strings.Contains(strings.ToLower(text), "dsml") || strings.Contains(text, "Read") || strings.Contains(text, sep) {
				t.Fatalf("control-separator DSML tool call leaked to text: %q events=%#v", text, events)
			}
			if callCount != 1 {
				t.Fatalf("expected one control-separator DSML tool call, got %d events=%#v", callCount, events)
			}
		})
	}
}
// TestProcessToolSieveInterceptsArbitraryPrefixedToolTagsWithoutLeak streams a
// tool-call block whose tags carry the arbitrary prefix "proto💥", split across
// chunks, and checks that none of it leaks into the text output and that
// exactly one tool call is emitted.
func TestProcessToolSieveInterceptsArbitraryPrefixedToolTagsWithoutLeak(t *testing.T) {
	chunks := []string{
		"<proto💥tool",
		"_calls>\n",
		` <proto💥invoke name="Read">` + "\n",
		` <proto💥parameter name="file_path"><![CDATA[/tmp/input.txt]]></proto💥parameter>` + "\n",
		" </proto💥invoke>\n",
		"</proto💥tool_calls>",
	}

	var (
		state  State
		events []Event
	)
	for i := range chunks {
		events = append(events, ProcessChunk(&state, chunks[i], []string{"Read"})...)
	}
	events = append(events, Flush(&state, []string{"Read"})...)

	// Gather all emitted text and count every emitted tool call.
	var leaked strings.Builder
	callCount := 0
	for _, evt := range events {
		leaked.WriteString(evt.Content)
		callCount += len(evt.ToolCalls)
	}

	text := leaked.String()
	if strings.Contains(text, "proto") || strings.Contains(text, "Read") || strings.Contains(text, "💥") {
		t.Fatalf("arbitrary-prefixed tool call leaked to text: %q events=%#v", text, events)
	}
	if callCount != 1 {
		t.Fatalf("expected one arbitrary-prefixed tool call, got %d events=%#v", callCount, events)
	}
}
// TestProcessToolSieveSuppressesEmptyDSMLControlSeparatorBlockWithoutLeak
// streams a "␂"-separated DSML block whose only parameter is empty and checks
// that it produces no tool calls and that none of the markup leaks into the
// text output.
func TestProcessToolSieveSuppressesEmptyDSMLControlSeparatorBlockWithoutLeak(t *testing.T) {
	const sep = "␂"
	chunks := []string{
		"<DSML" + sep + "tool_calls>\n",
		` <DSML` + sep + `invoke name="Read">` + "\n",
		` <DSML` + sep + `parameter name="file_path"></DSML` + sep + `parameter>` + "\n",
		" </DSML" + sep + "invoke>\n",
		"</DSML" + sep + "tool_calls>",
	}

	var (
		state  State
		events []Event
	)
	for i := range chunks {
		events = append(events, ProcessChunk(&state, chunks[i], []string{"Read"})...)
	}
	events = append(events, Flush(&state, []string{"Read"})...)

	// Gather all emitted text and count every emitted tool call.
	var leaked strings.Builder
	emitted := 0
	for i := range events {
		leaked.WriteString(events[i].Content)
		emitted += len(events[i].ToolCalls)
	}

	if emitted != 0 {
		t.Fatalf("expected empty control-separator block not to produce calls, got %d events=%#v", emitted, events)
	}
	text := leaked.String()
	if strings.Contains(strings.ToLower(text), "dsml") || strings.Contains(text, "Read") || strings.Contains(text, sep) {
		t.Fatalf("expected empty control-separator block not to leak as text, got %q", text)
	}
}
func TestProcessToolSieveInterceptsExtraLeadingLessThanDSMLToolCallWithoutLeak(t *testing.T) {
var state State
chunks := []string{