mirror of
https://github.com/CJackHwang/ds2api.git
synced 2026-05-16 06:05:07 +08:00
feat: add support for CJK angle bracket and trailing attribute separator drift in DSML tool parsing
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
'use strict';
|
||||
|
||||
const CDATA_PATTERN = /^<!\[CDATA\[([\s\S]*?)]](?:>|>)$/i;
|
||||
const CDATA_PATTERN = /^(?:<|〈)!\[CDATA\[([\s\S]*?)]](?:>|>|〉)$/i;
|
||||
const XML_ATTR_PATTERN = /\b([a-z0-9_:-]+)\s*=\s*("([^"]*)"|'([^']*)')/gi;
|
||||
const TOOL_MARKUP_NAMES = [
|
||||
{ raw: 'tool_calls', canonical: 'tool_calls' },
|
||||
@@ -431,26 +431,31 @@ function skipXmlIgnoredSection(lower, i) {
|
||||
function findCDATAEnd(text, from) {
|
||||
const ascii = text.indexOf(']]>', from);
|
||||
const fullwidth = text.indexOf(']]>', from);
|
||||
if (ascii < 0 && fullwidth < 0) {
|
||||
const cjk = text.indexOf(']]〉', from);
|
||||
if (ascii < 0 && fullwidth < 0 && cjk < 0) {
|
||||
return { index: -1, len: 0 };
|
||||
}
|
||||
if (ascii < 0) {
|
||||
return { index: fullwidth, len: ']]>'.length };
|
||||
let best = { index: -1, len: 0 };
|
||||
for (const candidate of [
|
||||
{ index: ascii, len: ']]>'.length },
|
||||
{ index: fullwidth, len: ']]>'.length },
|
||||
{ index: cjk, len: ']]〉'.length },
|
||||
]) {
|
||||
if (candidate.index >= 0 && (best.index < 0 || candidate.index < best.index)) {
|
||||
best = candidate;
|
||||
}
|
||||
}
|
||||
if (fullwidth < 0 || ascii < fullwidth) {
|
||||
return { index: ascii, len: ']]>'.length };
|
||||
}
|
||||
return { index: fullwidth, len: ']]>'.length };
|
||||
return best;
|
||||
}
|
||||
|
||||
function scanToolMarkupTagAt(text, start) {
|
||||
const raw = toStringSafe(text);
|
||||
if (!raw || start < 0 || start >= raw.length || raw[start] !== '<') {
|
||||
if (!raw || start < 0 || start >= raw.length || normalizeFullwidthASCIIChar(raw[start]) !== '<') {
|
||||
return null;
|
||||
}
|
||||
const lower = raw.toLowerCase();
|
||||
let i = start + 1;
|
||||
while (i < raw.length && raw[i] === '<') {
|
||||
while (i < raw.length && normalizeFullwidthASCIIChar(raw[i]) === '<') {
|
||||
i += 1;
|
||||
}
|
||||
const closing = raw[i] === '/';
|
||||
@@ -822,6 +827,12 @@ function normalizeFullwidthASCIIChar(ch) {
|
||||
if (!ch) {
|
||||
return ch;
|
||||
}
|
||||
if (ch === '〈') {
|
||||
return '<';
|
||||
}
|
||||
if (ch === '〉') {
|
||||
return '>';
|
||||
}
|
||||
const code = ch.charCodeAt(0);
|
||||
if (code >= 0xff01 && code <= 0xff5e) {
|
||||
return String.fromCharCode(code - 0xfee0);
|
||||
@@ -858,9 +869,28 @@ function matchNormalizedASCII(raw, start, expected) {
|
||||
|
||||
function normalizeToolMarkupTagTailForXML(tail) {
|
||||
let out = '';
|
||||
for (const ch of typeof tail === 'string' ? tail : String(tail || '')) {
|
||||
const raw = typeof tail === 'string' ? tail : String(tail || '');
|
||||
let quote = '';
|
||||
for (let i = 0; i < raw.length; i += 1) {
|
||||
const ch = raw[i];
|
||||
const normalized = normalizeFullwidthASCIIChar(ch);
|
||||
if (['>', '/', '=', '"', "'"].includes(normalized)) {
|
||||
if (quote) {
|
||||
out += normalized;
|
||||
if (normalized === quote) {
|
||||
quote = '';
|
||||
}
|
||||
} else if (normalized === '"' || normalized === "'") {
|
||||
quote = normalized;
|
||||
out += normalized;
|
||||
} else if (normalized === '|') {
|
||||
let j = i + 1;
|
||||
while (j < raw.length && [' ', '\t', '\r', '\n'].includes(raw[j])) {
|
||||
j += 1;
|
||||
}
|
||||
if (normalizeFullwidthASCIIChar(raw[j] || '') !== '>') {
|
||||
out += normalized;
|
||||
}
|
||||
} else if (['>', '/', '='].includes(normalized)) {
|
||||
out += normalized;
|
||||
} else {
|
||||
out += ch;
|
||||
|
||||
@@ -65,6 +65,7 @@ func normalizeToolMarkupTagTailForXML(tail string) string {
|
||||
}
|
||||
var b strings.Builder
|
||||
b.Grow(len(tail))
|
||||
quote := rune(0)
|
||||
for i := 0; i < len(tail); {
|
||||
r, size := utf8.DecodeRuneInString(tail[i:])
|
||||
if r == utf8.RuneError && size == 1 {
|
||||
@@ -72,9 +73,38 @@ func normalizeToolMarkupTagTailForXML(tail string) string {
|
||||
i++
|
||||
continue
|
||||
}
|
||||
switch normalizeFullwidthASCII(r) {
|
||||
case '>', '/', '=', '"', '\'':
|
||||
b.WriteRune(normalizeFullwidthASCII(r))
|
||||
ch := normalizeFullwidthASCII(r)
|
||||
if quote != 0 {
|
||||
b.WriteRune(ch)
|
||||
if ch == quote {
|
||||
quote = 0
|
||||
}
|
||||
i += size
|
||||
continue
|
||||
}
|
||||
switch ch {
|
||||
case '"', '\'':
|
||||
quote = ch
|
||||
b.WriteRune(ch)
|
||||
case '|':
|
||||
j := i + size
|
||||
for j < len(tail) {
|
||||
next, nextSize := utf8.DecodeRuneInString(tail[j:])
|
||||
if nextSize <= 0 {
|
||||
break
|
||||
}
|
||||
if next == ' ' || next == '\t' || next == '\r' || next == '\n' {
|
||||
j += nextSize
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
next, _ := normalizedASCIIAt(tail, j)
|
||||
if next != '>' {
|
||||
b.WriteRune(ch)
|
||||
}
|
||||
case '>', '/', '=':
|
||||
b.WriteRune(ch)
|
||||
default:
|
||||
b.WriteString(tail[i : i+size])
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ import (
|
||||
var toolCallMarkupKVPattern = regexp.MustCompile(`(?is)<(?:[a-z0-9_:-]+:)?([a-z0-9_\-.]+)\b[^>]*>(.*?)</(?:[a-z0-9_:-]+:)?([a-z0-9_\-.]+)>`)
|
||||
|
||||
// cdataPattern matches a standalone CDATA section.
|
||||
var cdataPattern = regexp.MustCompile(`(?is)^<!\[CDATA\[(.*?)]](?:>|>)$`)
|
||||
var cdataPattern = regexp.MustCompile(`(?is)^(?:<|〈)!\[CDATA\[(.*?)]](?:>|>|〉)$`)
|
||||
|
||||
func parseMarkupKVObject(text string) map[string]any {
|
||||
matches := toolCallMarkupKVPattern.FindAllStringSubmatch(strings.TrimSpace(text), -1)
|
||||
|
||||
@@ -305,19 +305,23 @@ func indexToolCDATAClose(text string, from int) int {
|
||||
}
|
||||
asciiIdx := strings.Index(text[from:], "]]>")
|
||||
fullIdx := strings.Index(text[from:], "]]>")
|
||||
if asciiIdx < 0 && fullIdx < 0 {
|
||||
cjkIdx := strings.Index(text[from:], "]]〉")
|
||||
if asciiIdx < 0 && fullIdx < 0 && cjkIdx < 0 {
|
||||
return -1
|
||||
}
|
||||
if asciiIdx < 0 {
|
||||
return from + fullIdx
|
||||
best := -1
|
||||
for _, idx := range []int{asciiIdx, fullIdx, cjkIdx} {
|
||||
if idx >= 0 && (best < 0 || idx < best) {
|
||||
best = idx
|
||||
}
|
||||
}
|
||||
if fullIdx < 0 || asciiIdx < fullIdx {
|
||||
return from + asciiIdx
|
||||
}
|
||||
return from + fullIdx
|
||||
return from + best
|
||||
}
|
||||
|
||||
func toolCDATACloseLenAt(text string, idx int) int {
|
||||
if strings.HasPrefix(text[idx:], "]]〉") {
|
||||
return len("]]〉")
|
||||
}
|
||||
if strings.HasPrefix(text[idx:], "]]>") {
|
||||
return len("]]>")
|
||||
}
|
||||
|
||||
@@ -134,12 +134,17 @@ func FindMatchingToolMarkupClose(text string, open ToolMarkupTag) (ToolMarkupTag
|
||||
}
|
||||
|
||||
func scanToolMarkupTagAt(text string, start int) (ToolMarkupTag, bool) {
|
||||
if start < 0 || start >= len(text) || text[start] != '<' {
|
||||
next, ok := consumeToolMarkupLessThan(text, start)
|
||||
if !ok {
|
||||
return ToolMarkupTag{}, false
|
||||
}
|
||||
i := start + 1
|
||||
for i < len(text) && text[i] == '<' {
|
||||
i++
|
||||
i := next
|
||||
for {
|
||||
next, ok := consumeToolMarkupLessThan(text, i)
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
i = next
|
||||
}
|
||||
closing := false
|
||||
if i < len(text) && text[i] == '/' {
|
||||
@@ -459,6 +464,14 @@ func consumeToolMarkupPipe(text string, idx int) (int, bool) {
|
||||
return idx, false
|
||||
}
|
||||
|
||||
// consumeToolMarkupLessThan checks whether the character that
// normalizedASCIIAt reports at byte offset idx is '<'.
//
// On a match it returns the offset just past that character (idx plus the
// size reported by normalizedASCIIAt) and true. Otherwise it returns idx
// unchanged and false, so callers can use it both as a test and as a cursor
// advance.
//
// NOTE(review): normalizedASCIIAt is defined outside this view; presumably it
// folds fullwidth/CJK look-alikes to their ASCII byte and returns size <= 0
// when idx is out of range — confirm against its definition.
func consumeToolMarkupLessThan(text string, idx int) (int, bool) {
|
||||
ch, size := normalizedASCIIAt(text, idx)
|
||||
// A non-positive size means nothing could be read at idx; treat it as no match.
if size <= 0 || ch != '<' {
|
||||
return idx, false
|
||||
}
|
||||
// Advance by the reported size rather than by one byte.
return idx + size, true
|
||||
}
|
||||
|
||||
func hasToolMarkupBoundary(text string, idx int) bool {
|
||||
if idx >= len(text) {
|
||||
return true
|
||||
@@ -488,6 +501,12 @@ func normalizedASCIIAt(text string, idx int) (byte, int) {
|
||||
}
|
||||
|
||||
func normalizeFullwidthASCII(r rune) rune {
|
||||
switch r {
|
||||
case '〈':
|
||||
return '<'
|
||||
case '〉':
|
||||
return '>'
|
||||
}
|
||||
if r >= '!' && r <= '~' {
|
||||
return r - 0xFEE0
|
||||
}
|
||||
|
||||
@@ -132,6 +132,37 @@ func TestParseToolCallsSupportsFullwidthDSMLShell(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseToolCallsSupportsCJKAngleDSMDrift checks that ParseToolCalls still
// extracts tool calls when the markup has drifted: the closing tags and the
// CDATA delimiters are written with CJK angle brackets (〈 〉), and the
// parameter opening tags carry a trailing `|` right before the closing
// bracket.
//
// The fixture holds three "Bash" invocations, each with a "description" and a
// "command" parameter. The test expects exactly three parsed calls and
// spot-checks one field on each of them.
func TestParseToolCallsSupportsCJKAngleDSMDrift(t *testing.T) {
|
||||
text := `<DSM|tool_calls>
|
||||
<DSM|invoke name="Bash">
|
||||
<DSM|parameter name="description"|>〈![CDATA[Show commits on local dev not on origin/dev]]〉〈/DSM|parameter〉
|
||||
<DSM|parameter name="command"|>〈![CDATA[git log --oneline origin/dev..dev]]〉〈/DSM|parameter〉
|
||||
〈/DSM|invoke〉
|
||||
<DSM|invoke name="Bash">
|
||||
<DSM|parameter name="description"|>〈![CDATA[Show commits on origin/dev not on local dev]]〉〈/DSM|parameter〉
|
||||
<DSM|parameter name="command"|>〈![CDATA[git log --oneline dev..origin/dev]]〉〈/DSM|parameter〉
|
||||
〈/DSM|invoke〉
|
||||
<DSM|invoke name="Bash">
|
||||
<DSM|parameter name="description"|>〈![CDATA[Check tracking branch status]]〉〈/DSM|parameter〉
|
||||
<DSM|parameter name="command"|>〈![CDATA[git status -b --short]]〉〈/DSM|parameter〉
|
||||
〈/DSM|invoke〉
|
||||
〈/DSM|tool_calls〉`
|
||||
|
||||
// "Bash" is the only name passed in the second argument.
calls := ParseToolCalls(text, []string{"Bash"})
|
||||
if len(calls) != 3 {
|
||||
t.Fatalf("expected three CJK-angle DSM drift calls, got %#v", calls)
|
||||
}
|
||||
// First call: the command must come back as the bare CDATA payload.
if calls[0].Name != "Bash" || calls[0].Input["command"] != "git log --oneline origin/dev..dev" {
|
||||
t.Fatalf("unexpected first CJK-angle DSM drift call: %#v", calls[0])
|
||||
}
|
||||
// Second call: checked through its description parameter instead.
if calls[1].Name != "Bash" || calls[1].Input["description"] != "Show commits on origin/dev not on local dev" {
|
||||
t.Fatalf("unexpected second CJK-angle DSM drift call: %#v", calls[1])
|
||||
}
|
||||
// Third call: checked through its command parameter.
if calls[2].Name != "Bash" || calls[2].Input["command"] != "git status -b --short" {
|
||||
t.Fatalf("unexpected third CJK-angle DSM drift call: %#v", calls[2])
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseToolCallsIgnoresBareHyphenatedToolCallsLookalike(t *testing.T) {
|
||||
text := `<tool-calls><invoke name="Bash"><parameter name="command">pwd</parameter></invoke></tool-calls>`
|
||||
calls := ParseToolCalls(text, []string{"Bash"})
|
||||
|
||||
@@ -1228,3 +1228,37 @@ func TestProcessToolSieveDSMLBarePrefixVariantDoesNotLeak(t *testing.T) {
|
||||
t.Fatalf("expected one tool call from DSML bare prefix variant, got %d events=%#v", toolCalls, events)
|
||||
}
|
||||
}
|
||||
|
||||
// TestProcessToolSieveCJKAngleDSMDriftDoesNotLeak feeds drifted tool markup
// (CJK angle brackets on closing tags and CDATA delimiters, a trailing `|`
// before the closing bracket of parameter tags) through ProcessChunk in six
// chunks that share one State, then calls Flush.
//
// It concatenates the Content of every emitted event and collects every
// ToolCalls entry, then asserts that:
//   - the text output contains neither "DSM" nor "git status", i.e. none of
//     the markup or its payload leaked as plain text;
//   - exactly one tool call was produced;
//   - that call is "Bash" with command "git status -b --short".
func TestProcessToolSieveCJKAngleDSMDriftDoesNotLeak(t *testing.T) {
|
||||
var state State
|
||||
chunks := []string{
|
||||
"<DSM|tool_calls>\n",
|
||||
"<DSM|invoke name=\"Bash\">\n",
|
||||
"<DSM|parameter name=\"description\"|>〈![CDATA[Check tracking branch status]]〉〈/DSM|parameter〉\n",
|
||||
"<DSM|parameter name=\"command\"|>〈![CDATA[git status -b --short]]〉〈/DSM|parameter〉\n",
|
||||
"〈/DSM|invoke〉\n",
|
||||
"〈/DSM|tool_calls〉",
|
||||
}
|
||||
var events []Event
|
||||
// Process the chunks in order against the same state.
for _, c := range chunks {
|
||||
events = append(events, ProcessChunk(&state, c, []string{"Bash"})...)
|
||||
}
|
||||
// Flush after the last chunk and keep whatever events it returns.
events = append(events, Flush(&state, []string{"Bash"})...)
|
||||
|
||||
var textContent string
|
||||
var calls []toolcall.ParsedToolCall
|
||||
for _, evt := range events {
|
||||
textContent += evt.Content
|
||||
calls = append(calls, evt.ToolCalls...)
|
||||
}
|
||||
|
||||
// Neither the tag prefix nor the command payload may appear in plain text.
if strings.Contains(textContent, "DSM") || strings.Contains(textContent, "git status") {
|
||||
t.Fatalf("CJK-angle DSM drift leaked to text: %q events=%#v", textContent, events)
|
||||
}
|
||||
if len(calls) != 1 {
|
||||
t.Fatalf("expected one CJK-angle DSM drift tool call, got %d events=%#v", len(calls), events)
|
||||
}
|
||||
if calls[0].Name != "Bash" || calls[0].Input["command"] != "git status -b --short" {
|
||||
t.Fatalf("unexpected CJK-angle DSM drift call: %#v", calls[0])
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user