feat: implement XML-based tool call extraction and refactor sieve utilities into dedicated modules

This commit is contained in:
CJACK
2026-03-29 13:01:11 +08:00
parent 19b4f879c5
commit 302bcefeb5
10 changed files with 563 additions and 113 deletions

View File

@@ -140,9 +140,58 @@ function extractJSONObjectFrom(text, start) {
return { ok: false, end: 0 };
}
/**
 * Locate a bracketed tool-history block that begins exactly at `keyIdx`.
 *
 * Recognizes `[tool_call_history]...[/tool_call_history]` and
 * `[tool_result_history]...[/tool_result_history]`; tag names are matched
 * case-insensitively.
 *
 * @param {string} captured - Text to inspect.
 * @param {number} keyIdx - Index in `captured` where the opening tag must start.
 * @returns {{ok: boolean, start: number, end: number}} When `ok` is true,
 *   `start` equals `keyIdx` and `end` is the index just past the closing tag.
 *   Otherwise `start` and `end` are both 0.
 */
function extractToolHistoryBlock(captured, keyIdx) {
  if (typeof captured !== 'string' || keyIdx < 0 || keyIdx >= captured.length) {
    return { ok: false, start: 0, end: 0 };
  }
  // Match against the original text with case-insensitive patterns instead of
  // a lowercased copy: toLowerCase() can change the string length (U+0130
  // becomes two code units), which would shift the closing-tag offset relative
  // to `captured`.
  const rest = captured.slice(keyIdx);
  const tagPatterns = [
    { open: /^\[tool_call_history\]/i, close: /\[\/tool_call_history\]/i },
    { open: /^\[tool_result_history\]/i, close: /\[\/tool_result_history\]/i },
  ];
  for (const { open, close } of tagPatterns) {
    if (!open.test(rest)) {
      continue;
    }
    const closing = close.exec(rest);
    if (closing === null) {
      // Opening tag seen but the block has not been terminated.
      return { ok: false, start: 0, end: 0 };
    }
    return { ok: true, start: keyIdx, end: keyIdx + closing.index + closing[0].length };
  }
  return { ok: false, start: 0, end: 0 };
}
/**
 * Remove a Markdown code fence that wraps the text sitting between `prefix`
 * and `suffix`.
 *
 * The fence is removed only when all of the following hold:
 *  - `prefix` (ignoring trailing spaces/tabs/newlines) contains a ``` whose
 *    running count is odd, i.e. it opens a block rather than closing one;
 *  - the text after that fence is empty or the word `json` (any case);
 *  - `suffix` (ignoring leading spaces/tabs/newlines) starts with ```.
 *
 * @param {string} prefix - Text preceding the wrapped content.
 * @param {string} suffix - Text following the wrapped content.
 * @returns {{prefix: *, suffix: *}} The inputs untouched when no wrapping
 *   fence is found; otherwise the prefix cut before the opening fence and the
 *   suffix cut after the closing fence.
 */
function trimWrappingJSONFence(prefix, suffix) {
  const FENCE = '```';
  const head = (prefix || '').replace(/[ \t\r\n]+$/g, '');
  const openAt = head.lastIndexOf(FENCE);
  if (openAt < 0) {
    return { prefix, suffix };
  }

  // Count fences up to and including the last one; an even total means the
  // last fence closes an earlier block and is not wrapping our content.
  const fencesSoFar = head.slice(0, openAt + 3).split(FENCE).length - 1;
  if (fencesSoFar % 2 === 0) {
    return { prefix, suffix };
  }

  // Only a bare fence or a `json` info string qualifies.
  const info = head.slice(openAt + 3).trim().toLowerCase();
  if (info !== '' && info !== 'json') {
    return { prefix, suffix };
  }

  const tail = suffix || '';
  const tailBody = tail.replace(/^[ \t\r\n]+/g, '');
  if (!tailBody.startsWith(FENCE)) {
    return { prefix, suffix };
  }

  // Skip the leading whitespace that was ignored plus the fence itself.
  const skipped = tail.length - tailBody.length;
  return {
    prefix: head.slice(0, openAt),
    suffix: tail.slice(skipped + 3),
  };
}
module.exports = {
findObjectFieldValueStart,
parseJSONStringLiteral,
skipSpaces,
extractJSONObjectFrom,
extractToolHistoryBlock,
trimWrappingJSONFence,
};

View File

@@ -0,0 +1,91 @@
'use strict';
const { parseToolCalls } = require('./parse');
const {
XML_TOOL_OPENING_TAGS,
XML_TOOL_CLOSING_TAGS,
} = require('./tool-keywords');
/**
 * Try to extract one XML tool-call block from buffered text.
 *
 * The block spans from the earliest opening tag in `XML_TOOL_OPENING_TAGS` to
 * the farthest end among the first occurrence (at or after the opening tag)
 * of each tag in `XML_TOOL_CLOSING_TAGS`. The block is handed to
 * `parseToolCalls`.
 *
 * @param {string} captured - Buffered text to inspect.
 * @param {*} toolNames - Passed through unchanged to `parseToolCalls`.
 * @param {function(string, string): {prefix: string, suffix: string}} trimWrappingJSONFence
 *   Callback applied to the surrounding text when calls were parsed.
 * @returns {{ready: boolean, prefix: string, calls: Array, suffix: string}}
 *   `ready: false` when no opening tag or no closing tag was found. Otherwise
 *   `ready: true` with the text before/after the block; `calls` is empty (and
 *   the surrounding text is left untrimmed) when parsing produced nothing.
 */
function consumeXMLToolCapture(captured, toolNames, trimWrappingJSONFence) {
  // Fold only ASCII letters so `lower` keeps the exact length of `captured`.
  // String.prototype.toLowerCase can lengthen the string (U+0130 becomes two
  // code units), which would make every index below point at the wrong place
  // in `captured`. The tag tables are plain ASCII, so this is sufficient.
  const lower = captured.replace(/[A-Z]/g, (ch) => ch.toLowerCase());

  // Earliest opening tag wins.
  let openIdx = -1;
  for (const tag of XML_TOOL_OPENING_TAGS) {
    const idx = lower.indexOf(tag);
    if (idx >= 0 && (openIdx < 0 || idx < openIdx)) {
      openIdx = idx;
    }
  }
  if (openIdx < 0) {
    return { ready: false, prefix: '', calls: [], suffix: '' };
  }

  // Farthest end among the first match of each closing tag, so an outer
  // wrapper's closing tag is preferred over an inner one.
  let closeIdx = -1;
  for (const tag of XML_TOOL_CLOSING_TAGS) {
    const idx = lower.indexOf(tag, openIdx);
    if (idx >= 0) {
      const absEnd = idx + tag.length;
      if (closeIdx < 0 || absEnd > closeIdx) {
        closeIdx = absEnd;
      }
    }
  }
  if (closeIdx <= 0) {
    return { ready: false, prefix: '', calls: [], suffix: '' };
  }

  const xmlBlock = captured.slice(openIdx, closeIdx);
  const prefixPart = captured.slice(0, openIdx);
  const suffixPart = captured.slice(closeIdx);

  const parsed = parseToolCalls(xmlBlock, toolNames);
  if (Array.isArray(parsed) && parsed.length > 0) {
    // Only strip a wrapping code fence when the block really was a tool call.
    const trimmedFence = trimWrappingJSONFence(prefixPart, suffixPart);
    return {
      ready: true,
      prefix: trimmedFence.prefix,
      calls: parsed,
      suffix: trimmedFence.suffix,
    };
  }
  return { ready: true, prefix: prefixPart, calls: [], suffix: suffixPart };
}
/**
 * Report whether `captured` contains an XML tool opening tag that has not yet
 * been followed by any closing tag.
 *
 * The closing tag is searched at or after the earliest opening tag, the same
 * rule `consumeXMLToolCapture` uses, so a stray closing tag that appears
 * before the opening tag does not make an unfinished block look complete.
 *
 * @param {string} captured - Buffered text to inspect (matched case-insensitively).
 * @returns {boolean} True when an opening tag is present with no closing tag after it.
 */
function hasOpenXMLToolTag(captured) {
  // Indexes are only compared within `lower`, never applied to `captured`.
  const lower = captured.toLowerCase();

  let openIdx = -1;
  for (const tag of XML_TOOL_OPENING_TAGS) {
    const idx = lower.indexOf(tag);
    if (idx >= 0 && (openIdx < 0 || idx < openIdx)) {
      openIdx = idx;
    }
  }
  if (openIdx < 0) {
    return false;
  }

  const closedAfterOpen = XML_TOOL_CLOSING_TAGS.some((tag) => lower.indexOf(tag, openIdx) >= 0);
  return !closedAfterOpen;
}
/**
 * Detect an unfinished XML tool opening tag at the very end of `s`.
 *
 * Looks at the text from the last `<` to the end of the string. If that text
 * contains no `>` and is, case-insensitively, a prefix of one of
 * `XML_TOOL_OPENING_TAGS`, the position of that `<` is returned.
 *
 * @param {string} s - Text whose tail is inspected.
 * @returns {number} Index of the trailing `<`, or -1 when the tail cannot be
 *   the beginning of a tool tag.
 */
function findPartialXMLToolTagStart(s) {
  const ltPos = s.lastIndexOf('<');
  if (ltPos === -1) {
    return -1;
  }

  const fragment = s.slice(ltPos);
  if (fragment.indexOf('>') !== -1) {
    // The last tag is already terminated, so nothing is pending.
    return -1;
  }

  const needle = fragment.toLowerCase();
  const couldBecomeToolTag = XML_TOOL_OPENING_TAGS.some((tag) => {
    const fullTag = tag[0] === '<' ? tag : '<' + tag;
    return fullTag.startsWith(needle);
  });
  return couldBecomeToolTag ? ltPos : -1;
}
module.exports = {
consumeXMLToolCapture,
hasOpenXMLToolTag,
findPartialXMLToolTagStart,
};

View File

@@ -5,8 +5,17 @@ const {
insideCodeFenceWithState,
} = require('./state');
const { parseStandaloneToolCallsDetailed } = require('./parse');
const { extractJSONObjectFrom } = require('./jsonscan');
const { TOOL_SEGMENT_KEYWORDS, earliestKeywordIndex } = require('./tool-keywords');
const { extractJSONObjectFrom, extractToolHistoryBlock, trimWrappingJSONFence } = require('./jsonscan');
const {
TOOL_SEGMENT_KEYWORDS,
XML_TOOL_SEGMENT_TAGS,
earliestKeywordIndex,
} = require('./tool-keywords');
const {
consumeXMLToolCapture: consumeXMLToolCaptureImpl,
hasOpenXMLToolTag,
findPartialXMLToolTagStart,
} = require('./sieve-xml');
function processToolSieveChunk(state, chunk, toolNames) {
if (!state) {
return [];
@@ -144,6 +153,11 @@ function findSuspiciousPrefixStart(s) {
start = idx;
}
}
// Also check for partial XML tool tag at end of string.
const xmlIdx = findPartialXMLToolTagStart(s);
if (xmlIdx >= 0 && xmlIdx > start) {
start = xmlIdx;
}
return start;
}
@@ -154,10 +168,27 @@ function findToolSegmentStart(state, s) {
const lower = s.toLowerCase();
let offset = 0;
while (true) {
const { index: bestKeyIdx, keyword: matchedKeyword } = earliestKeywordIndex(lower, TOOL_SEGMENT_KEYWORDS, offset);
// Check JSON keywords.
let { index: bestKeyIdx, keyword: matchedKeyword } = earliestKeywordIndex(lower, TOOL_SEGMENT_KEYWORDS, offset);
// Also check XML tool tags.
for (const tag of XML_TOOL_SEGMENT_TAGS) {
const idx = lower.indexOf(tag, offset);
if (idx >= 0 && (bestKeyIdx < 0 || idx < bestKeyIdx)) {
bestKeyIdx = idx;
matchedKeyword = tag;
}
}
if (bestKeyIdx < 0) {
return -1;
}
// For XML tags, the '<' is itself the segment start.
if (s[bestKeyIdx] === '<') {
if (!insideCodeFenceWithState(state, s.slice(0, bestKeyIdx))) {
return bestKeyIdx;
}
offset = bestKeyIdx + matchedKeyword.length;
continue;
}
const keyIdx = bestKeyIdx;
const start = s.slice(0, keyIdx).lastIndexOf('{');
const candidateStart = start >= 0 ? start : keyIdx;
@@ -173,6 +204,17 @@ function consumeToolCapture(state, toolNames) {
if (!captured) {
return { ready: false, prefix: '', calls: [], suffix: '' };
}
// Try XML tool call extraction first.
const xmlResult = consumeXMLToolCaptureImpl(captured, toolNames, trimWrappingJSONFence);
if (xmlResult.ready) {
return xmlResult;
}
// If XML tags are present but block is incomplete, keep buffering.
if (hasOpenXMLToolTag(captured)) {
return { ready: false, prefix: '', calls: [], suffix: '' };
}
const lower = captured.toLowerCase();
const { index: keyIdx } = earliestKeywordIndex(lower, TOOL_SEGMENT_KEYWORDS);
if (keyIdx < 0) {
@@ -231,52 +273,6 @@ function consumeToolCapture(state, toolNames) {
};
}
/**
 * Locate a bracketed tool-history block that begins exactly at `keyIdx`.
 *
 * Recognizes `[tool_call_history]...[/tool_call_history]` and
 * `[tool_result_history]...[/tool_result_history]`; tag names are matched
 * case-insensitively.
 *
 * @param {string} captured - Text to inspect.
 * @param {number} keyIdx - Index in `captured` where the opening tag must start.
 * @returns {{ok: boolean, start: number, end: number}} When `ok` is true,
 *   `start` equals `keyIdx` and `end` is the index just past the closing tag.
 *   Otherwise `start` and `end` are both 0.
 */
function extractToolHistoryBlock(captured, keyIdx) {
  if (typeof captured !== 'string' || keyIdx < 0 || keyIdx >= captured.length) {
    return { ok: false, start: 0, end: 0 };
  }
  // Match against the original text with case-insensitive patterns instead of
  // a lowercased copy: toLowerCase() can change the string length (U+0130
  // becomes two code units), which would shift the closing-tag offset relative
  // to `captured`.
  const rest = captured.slice(keyIdx);
  const tagPatterns = [
    { open: /^\[tool_call_history\]/i, close: /\[\/tool_call_history\]/i },
    { open: /^\[tool_result_history\]/i, close: /\[\/tool_result_history\]/i },
  ];
  for (const { open, close } of tagPatterns) {
    if (!open.test(rest)) {
      continue;
    }
    const closing = close.exec(rest);
    if (closing === null) {
      // Opening tag seen but the block has not been terminated.
      return { ok: false, start: 0, end: 0 };
    }
    return { ok: true, start: keyIdx, end: keyIdx + closing.index + closing[0].length };
  }
  return { ok: false, start: 0, end: 0 };
}
/**
 * Remove a Markdown code fence that wraps the text sitting between `prefix`
 * and `suffix`.
 *
 * The fence is removed only when all of the following hold:
 *  - `prefix` (ignoring trailing spaces/tabs/newlines) contains a ``` whose
 *    running count is odd, i.e. it opens a block rather than closing one;
 *  - the text after that fence is empty or the word `json` (any case);
 *  - `suffix` (ignoring leading spaces/tabs/newlines) starts with ```.
 *
 * @param {string} prefix - Text preceding the wrapped content.
 * @param {string} suffix - Text following the wrapped content.
 * @returns {{prefix: *, suffix: *}} The inputs untouched when no wrapping
 *   fence is found; otherwise the prefix cut before the opening fence and the
 *   suffix cut after the closing fence.
 */
function trimWrappingJSONFence(prefix, suffix) {
  const FENCE = '```';
  const head = (prefix || '').replace(/[ \t\r\n]+$/g, '');
  const openAt = head.lastIndexOf(FENCE);
  if (openAt < 0) {
    return { prefix, suffix };
  }

  // Count fences up to and including the last one; an even total means the
  // last fence closes an earlier block and is not wrapping our content.
  const fencesSoFar = head.slice(0, openAt + 3).split(FENCE).length - 1;
  if (fencesSoFar % 2 === 0) {
    return { prefix, suffix };
  }

  // Only a bare fence or a `json` info string qualifies.
  const info = head.slice(openAt + 3).trim().toLowerCase();
  if (info !== '' && info !== 'json') {
    return { prefix, suffix };
  }

  const tail = suffix || '';
  const tailBody = tail.replace(/^[ \t\r\n]+/g, '');
  if (!tailBody.startsWith(FENCE)) {
    return { prefix, suffix };
  }

  // Skip the leading whitespace that was ignored plus the fence itself.
  const skipped = tail.length - tailBody.length;
  return {
    prefix: head.slice(0, openAt),
    suffix: tail.slice(skipped + 3),
  };
}
module.exports = {
processToolSieveChunk,
flushToolSieve,

View File

@@ -8,6 +8,19 @@ const TOOL_SEGMENT_KEYWORDS = [
'[tool_result_history]',
];
// Tag spellings that mark the start of an XML tool-call segment.
// They are searched for in lowercased text, so every entry must be lowercase.
// Several entries include the character right after the tag name
// ('>', '\n' or ' ').
const XML_TOOL_SEGMENT_TAGS = [
'<tool_calls>', '<tool_calls\n', '<tool_call>', '<tool_call\n',
'<invoke ', '<invoke>', '<function_call', '<function_calls', '<tool_use>',
];
// Opening-tag prefixes (no terminating '>'), matched as substrings/prefixes of
// lowercased text. '<function_call' is itself a prefix of '<function_calls'.
const XML_TOOL_OPENING_TAGS = [
'<tool_calls', '<tool_call', '<invoke', '<function_call', '<function_calls', '<tool_use',
];
// Complete closing tags, matched as substrings of lowercased text.
const XML_TOOL_CLOSING_TAGS = [
'</tool_calls>', '</tool_call>', '</invoke>', '</function_call>', '</function_calls>', '</tool_use>',
];
function earliestKeywordIndex(text, keywords = TOOL_SEGMENT_KEYWORDS, offset = 0) {
if (!text) {
return { index: -1, keyword: '' };
@@ -26,5 +39,8 @@ function earliestKeywordIndex(text, keywords = TOOL_SEGMENT_KEYWORDS, offset = 0
module.exports = {
TOOL_SEGMENT_KEYWORDS,
XML_TOOL_SEGMENT_TAGS,
XML_TOOL_OPENING_TAGS,
XML_TOOL_CLOSING_TAGS,
earliestKeywordIndex,
};