Files
ds2api/internal/js/helpers/stream-tool-sieve/parse_payload.js
CJACK a13293e113 feat: expand DSML tool-call alias and fence handling
Add support for DSML wrapper aliases (<dsml|tool_calls>, <|tool_calls>,
<|tool_calls>) alongside canonical XML. Normalize mixed DSML/canonical
tags instead of rejecting them. Add tilde fence (~~~) support, fix
nested fence and unclosed fence handling, support CDATA-protected fence
content, and skip prose mentions when scanning for real tool blocks.
Mirror all changes between Go and Node.js runtimes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-27 13:39:50 +08:00

623 lines
16 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
'use strict';
const TOOL_CALL_MARKUP_KV_PATTERN = /<(?:[a-z0-9_:-]+:)?([a-z0-9_.-]+)\b[^>]*>([\s\S]*?)<\/(?:[a-z0-9_:-]+:)?\1>/gi;
const CDATA_PATTERN = /^<!\[CDATA\[([\s\S]*?)]]>$/i;
const XML_ATTR_PATTERN = /\b([a-z0-9_:-]+)\s*=\s*("([^"]*)"|'([^']*)')/gi;
const {
toStringSafe,
} = require('./state');
function stripFencedCodeBlocks(text) {
const t = typeof text === 'string' ? text : '';
if (!t) {
return '';
}
const lines = t.split('\n');
const out = [];
let inFence = false;
let fenceChar = '';
let fenceLen = 0;
let inCDATA = false;
let beforeFenceIdx = 0;
for (let li = 0; li < lines.length; li += 1) {
const line = lines[li];
const lineWithNL = li < lines.length - 1 ? line + '\n' : line;
// CDATA protection
if (inCDATA || cdataStartsBeforeFence(line)) {
out.push(lineWithNL);
inCDATA = updateCDATAStateLine(inCDATA, line);
continue;
}
const trimmed = line.replace(/^[ \t]+/, '');
if (!inFence) {
const fence = parseFenceOpenLine(trimmed);
if (fence) {
inFence = true;
fenceChar = fence.ch;
fenceLen = fence.count;
beforeFenceIdx = out.length;
continue;
}
out.push(lineWithNL);
continue;
}
if (isFenceCloseLine(trimmed, fenceChar, fenceLen)) {
inFence = false;
fenceChar = '';
fenceLen = 0;
}
}
if (inFence) {
// Unclosed fence: keep content before the fence started.
if (beforeFenceIdx > 0) {
return out.slice(0, beforeFenceIdx).join('');
}
return '';
}
return out.join('');
}
function parseFenceOpenLine(trimmed) {
if (trimmed.length < 3) return null;
const ch = trimmed[0];
if (ch !== '`' && ch !== '~') return null;
let count = 0;
while (count < trimmed.length && trimmed[count] === ch) count++;
if (count < 3) return null;
return { ch, count };
}
function isFenceCloseLine(trimmed, fenceChar, fenceLen) {
if (!fenceChar || !trimmed || trimmed[0] !== fenceChar) return false;
let count = 0;
while (count < trimmed.length && trimmed[count] === fenceChar) count++;
if (count < fenceLen) return false;
return trimmed.slice(count).trim() === '';
}
function cdataStartsBeforeFence(line) {
const cdataIdx = line.toLowerCase().indexOf('<![cdata[');
if (cdataIdx < 0) return false;
const fenceIdx = Math.min(
line.indexOf('```') >= 0 ? line.indexOf('```') : Infinity,
line.indexOf('~~~') >= 0 ? line.indexOf('~~~') : Infinity,
);
return fenceIdx === Infinity || cdataIdx < fenceIdx;
}
function updateCDATAStateLine(inCDATA, line) {
const lower = line.toLowerCase();
let pos = 0;
let state = inCDATA;
while (pos < lower.length) {
if (state) {
const end = lower.indexOf(']]>', pos);
if (end < 0) return true;
pos = end + ']]>'.length;
state = false;
continue;
}
const start = lower.indexOf('<![cdata[', pos);
if (start < 0) return false;
pos = start + '<![cdata['.length;
state = true;
}
return state;
}
function parseMarkupToolCalls(text) {
const normalized = normalizeDSMLToolCallMarkup(toStringSafe(text));
if (!normalized.ok) {
return [];
}
const raw = normalized.text.trim();
if (!raw) {
return [];
}
const out = [];
for (const wrapper of findXmlElementBlocks(raw, 'tool_calls')) {
const body = toStringSafe(wrapper.body);
for (const block of findXmlElementBlocks(body, 'invoke')) {
const parsed = parseMarkupSingleToolCall(block);
if (parsed) {
out.push(parsed);
}
}
}
return out;
}
function normalizeDSMLToolCallMarkup(text) {
const raw = toStringSafe(text);
if (!raw) {
return { text: '', ok: true };
}
const styles = toolMarkupStylesOutsideIgnored(raw);
if (!styles.dsml) {
return { text: raw, ok: true };
}
// Always normalize DSML aliases to canonical form, even when canonical
// tags coexist. Models frequently mix DSML wrapper tags with canonical
// inner tags (e.g., <tool_calls><invoke name="...">).
return {
text: replaceDSMLToolMarkupOutsideIgnored(raw),
ok: true,
};
}
function containsDSMLToolMarkup(text) {
return toolMarkupStylesOutsideIgnored(text).dsml;
}
function containsCanonicalToolMarkup(text) {
return toolMarkupStylesOutsideIgnored(text).canonical;
}
const DSML_TOOL_MARKUP_ALIASES = [
{ from: '<|dsml|tool_calls', to: '<tool_calls' },
{ from: '</|dsml|tool_calls>', to: '</tool_calls>' },
{ from: '<|dsml|invoke', to: '<invoke' },
{ from: '</|dsml|invoke>', to: '</invoke>' },
{ from: '<|dsml|parameter', to: '<parameter' },
{ from: '</|dsml|parameter>', to: '</parameter>' },
{ from: '<dsml|tool_calls', to: '<tool_calls' },
{ from: '</dsml|tool_calls>', to: '</tool_calls>' },
{ from: '<dsml|invoke', to: '<invoke' },
{ from: '</dsml|invoke>', to: '</invoke>' },
{ from: '<dsml|parameter', to: '<parameter' },
{ from: '</dsml|parameter>', to: '</parameter>' },
{ from: '<|tool_calls', to: '<tool_calls' },
{ from: '</|tool_calls>', to: '</tool_calls>' },
{ from: '<|invoke', to: '<invoke' },
{ from: '</|invoke>', to: '</invoke>' },
{ from: '<|parameter', to: '<parameter' },
{ from: '</|parameter>', to: '</parameter>' },
{ from: '<tool_calls', to: '<tool_calls' },
{ from: '</tool_calls>', to: '</tool_calls>' },
{ from: '<invoke', to: '<invoke' },
{ from: '</invoke>', to: '</invoke>' },
{ from: '<parameter', to: '<parameter' },
{ from: '</parameter>', to: '</parameter>' },
];
const CANONICAL_TOOL_MARKUP_PREFIXES = [
'<tool_calls',
'</tool_calls>',
'<invoke',
'</invoke>',
'<parameter',
'</parameter>',
];
function toolMarkupStylesOutsideIgnored(text) {
const lower = toStringSafe(text).toLowerCase();
const styles = { dsml: false, canonical: false };
for (let i = 0; i < lower.length;) {
const skipped = skipXmlIgnoredSection(lower, i);
if (skipped.blocked) {
return styles;
}
if (skipped.advanced) {
i = skipped.next;
continue;
}
if (CANONICAL_TOOL_MARKUP_PREFIXES.some(prefix => lower.startsWith(prefix, i))) {
styles.canonical = true;
}
if (DSML_TOOL_MARKUP_ALIASES.some(alias => lower.startsWith(alias.from, i))) {
styles.dsml = true;
}
if (styles.dsml && styles.canonical) {
return styles;
}
i += 1;
}
return styles;
}
function replaceDSMLToolMarkupOutsideIgnored(text) {
const raw = toStringSafe(text);
const lower = raw.toLowerCase();
let out = '';
for (let i = 0; i < raw.length;) {
const skipped = skipXmlIgnoredSection(lower, i);
if (skipped.blocked) {
out += raw.slice(i);
break;
}
if (skipped.advanced) {
out += raw.slice(i, skipped.next);
i = skipped.next;
continue;
}
const alias = DSML_TOOL_MARKUP_ALIASES.find(item => lower.startsWith(item.from, i));
if (alias) {
out += alias.to;
i += alias.from.length;
continue;
}
out += raw[i];
i += 1;
}
return out;
}
function parseMarkupSingleToolCall(block) {
const attrs = parseTagAttributes(block.attrs);
const name = toStringSafe(attrs.name).trim();
if (!name) {
return null;
}
const inner = toStringSafe(block.body).trim();
if (inner) {
try {
const decoded = JSON.parse(inner);
if (decoded && typeof decoded === 'object' && !Array.isArray(decoded)) {
return {
name,
input: decoded.input && typeof decoded.input === 'object' && !Array.isArray(decoded.input)
? decoded.input
: decoded.parameters && typeof decoded.parameters === 'object' && !Array.isArray(decoded.parameters)
? decoded.parameters
: {},
};
}
} catch (_err) {
// Not JSON, continue with markup parsing.
}
}
const input = {};
for (const match of findXmlElementBlocks(inner, 'parameter')) {
const parameterAttrs = parseTagAttributes(match.attrs);
const paramName = toStringSafe(parameterAttrs.name).trim();
if (!paramName) {
continue;
}
appendMarkupValue(input, paramName, parseMarkupValue(match.body));
}
if (Object.keys(input).length === 0 && inner.trim() !== '') {
return null;
}
return { name, input };
}
function findXmlElementBlocks(text, tag) {
const source = toStringSafe(text);
const name = toStringSafe(tag).toLowerCase();
if (!source || !name) {
return [];
}
const out = [];
let pos = 0;
while (pos < source.length) {
const start = findXmlStartTagOutsideCDATA(source, name, pos);
if (!start) {
break;
}
const end = findMatchingXmlEndTagOutsideCDATA(source, name, start.bodyStart);
if (!end) {
pos = start.bodyStart;
continue;
}
out.push({
attrs: start.attrs,
body: source.slice(start.bodyStart, end.closeStart),
start: start.start,
end: end.closeEnd,
});
pos = end.closeEnd;
}
return out;
}
function findXmlStartTagOutsideCDATA(text, tag, from) {
const lower = text.toLowerCase();
const target = `<${tag}`;
for (let i = Math.max(0, from || 0); i < text.length;) {
const skipped = skipXmlIgnoredSection(lower, i);
if (skipped.blocked) {
return null;
}
if (skipped.advanced) {
i = skipped.next;
continue;
}
if (lower.startsWith(target, i) && hasXmlTagBoundary(text, i + target.length)) {
const tagEnd = findXmlTagEnd(text, i + target.length);
if (tagEnd < 0) {
return null;
}
return {
start: i,
bodyStart: tagEnd + 1,
attrs: text.slice(i + target.length, tagEnd),
};
}
i += 1;
}
return null;
}
function findMatchingXmlEndTagOutsideCDATA(text, tag, from) {
const lower = text.toLowerCase();
const openTarget = `<${tag}`;
const closeTarget = `</${tag}`;
let depth = 1;
for (let i = Math.max(0, from || 0); i < text.length;) {
const skipped = skipXmlIgnoredSection(lower, i);
if (skipped.blocked) {
return null;
}
if (skipped.advanced) {
i = skipped.next;
continue;
}
if (lower.startsWith(closeTarget, i) && hasXmlTagBoundary(text, i + closeTarget.length)) {
const tagEnd = findXmlTagEnd(text, i + closeTarget.length);
if (tagEnd < 0) {
return null;
}
depth -= 1;
if (depth === 0) {
return { closeStart: i, closeEnd: tagEnd + 1 };
}
i = tagEnd + 1;
continue;
}
if (lower.startsWith(openTarget, i) && hasXmlTagBoundary(text, i + openTarget.length)) {
const tagEnd = findXmlTagEnd(text, i + openTarget.length);
if (tagEnd < 0) {
return null;
}
if (!isSelfClosingXmlTag(text.slice(i, tagEnd))) {
depth += 1;
}
i = tagEnd + 1;
continue;
}
i += 1;
}
return null;
}
function skipXmlIgnoredSection(lower, i) {
if (lower.startsWith('<![cdata[', i)) {
const end = lower.indexOf(']]>', i + '<![cdata['.length);
if (end < 0) {
return { advanced: false, blocked: true, next: i };
}
return { advanced: true, blocked: false, next: end + ']]>'.length };
}
if (lower.startsWith('<!--', i)) {
const end = lower.indexOf('-->', i + '<!--'.length);
if (end < 0) {
return { advanced: false, blocked: true, next: i };
}
return { advanced: true, blocked: false, next: end + '-->'.length };
}
return { advanced: false, blocked: false, next: i };
}
function findXmlTagEnd(text, from) {
let quote = '';
for (let i = Math.max(0, from || 0); i < text.length; i += 1) {
const ch = text[i];
if (quote) {
if (ch === quote) {
quote = '';
}
continue;
}
if (ch === '"' || ch === "'") {
quote = ch;
continue;
}
if (ch === '>') {
return i;
}
}
return -1;
}
function hasXmlTagBoundary(text, idx) {
if (idx >= text.length) {
return true;
}
return [' ', '\t', '\n', '\r', '>', '/'].includes(text[idx]);
}
function isSelfClosingXmlTag(startTag) {
return toStringSafe(startTag).trim().endsWith('/');
}
function parseMarkupInput(raw) {
const s = toStringSafe(raw).trim();
if (!s) {
return {};
}
// Prioritize XML-style KV tags (e.g., <arg>val</arg>)
const kv = parseMarkupKVObject(s);
if (Object.keys(kv).length > 0) {
return kv;
}
// Fallback to JSON parsing
const parsed = parseToolCallInput(s);
if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
if (Object.keys(parsed).length > 0) {
return parsed;
}
}
return { _raw: extractRawTagValue(s) };
}
function parseMarkupKVObject(text) {
const raw = toStringSafe(text).trim();
if (!raw) {
return {};
}
const out = {};
for (const m of raw.matchAll(TOOL_CALL_MARKUP_KV_PATTERN)) {
const key = toStringSafe(m[1]).trim();
if (!key) {
continue;
}
const value = parseMarkupValue(m[2]);
if (value === undefined || value === null) {
continue;
}
appendMarkupValue(out, key, value);
}
return out;
}
function parseMarkupValue(raw) {
const cdata = extractStandaloneCDATA(raw);
if (cdata.ok) {
return cdata.value;
}
const s = toStringSafe(extractRawTagValue(raw)).trim();
if (!s) {
return '';
}
if (s.includes('<') && s.includes('>')) {
const nested = parseMarkupInput(s);
if (nested && typeof nested === 'object' && !Array.isArray(nested)) {
if (isOnlyRawValue(nested)) {
return toStringSafe(nested._raw);
}
return nested;
}
}
if (s.startsWith('{') || s.startsWith('[')) {
try {
return JSON.parse(s);
} catch (_err) {
return s;
}
}
return s;
}
function extractRawTagValue(inner) {
const s = toStringSafe(inner).trim();
if (!s) {
return '';
}
// 1. Check for CDATA
const cdata = extractStandaloneCDATA(s);
if (cdata.ok) {
return cdata.value;
}
// 2. Fallback to unescaping standard HTML entities
// Note: we avoid broad tag stripping here to preserve user content (like < symbols in code)
return unescapeHtml(inner);
}
function unescapeHtml(safe) {
if (!safe) return '';
return safe.replace(/&amp;/g, '&')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/&#039;/g, "'")
.replace(/&#x27;/g, "'");
}
function extractStandaloneCDATA(inner) {
const s = toStringSafe(inner).trim();
const cdataMatch = s.match(CDATA_PATTERN);
if (cdataMatch && cdataMatch[1] !== undefined) {
return { ok: true, value: cdataMatch[1] };
}
return { ok: false, value: '' };
}
function parseTagAttributes(raw) {
const source = toStringSafe(raw);
const out = {};
if (!source) {
return out;
}
for (const match of source.matchAll(XML_ATTR_PATTERN)) {
const key = toStringSafe(match[1]).trim().toLowerCase();
if (!key) {
continue;
}
out[key] = match[3] || match[4] || '';
}
return out;
}
function parseToolCallInput(v) {
if (v == null) {
return {};
}
if (typeof v === 'string') {
const raw = toStringSafe(v);
if (!raw) {
return {};
}
try {
const parsed = JSON.parse(raw);
if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
return parsed;
}
return { _raw: raw };
} catch (_err) {
return { _raw: raw };
}
}
if (typeof v === 'object' && !Array.isArray(v)) {
return v;
}
try {
const parsed = JSON.parse(JSON.stringify(v));
if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
return parsed;
}
} catch (_err) {
return {};
}
return {};
}
function appendMarkupValue(out, key, value) {
if (Object.prototype.hasOwnProperty.call(out, key)) {
const current = out[key];
if (Array.isArray(current)) {
current.push(value);
return;
}
out[key] = [current, value];
return;
}
out[key] = value;
}
function isOnlyRawValue(obj) {
if (!obj || typeof obj !== 'object' || Array.isArray(obj)) {
return false;
}
const keys = Object.keys(obj);
return keys.length === 1 && keys[0] === '_raw';
}
module.exports = {
stripFencedCodeBlocks,
parseMarkupToolCalls,
normalizeDSMLToolCallMarkup,
};