mirror of
https://github.com/CJackHwang/ds2api.git
synced 2026-05-17 06:35:14 +08:00
refactor: generalize DSML tag parsing to tolerate model noise; split tiktoken by build tags
Replace hardcoded DSML typo variant lists in Go/Node tool call parsers with generalized prefix consumption that tolerates repeated leading <, repeated DSML prefix noise, and trailing pipe terminators. Split tiktoken-dependent token counting into a build-tagged file for non-cgo platform compatibility. Add /data directory to Dockerfile for bind-mount permissions. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,10 +1,6 @@
|
||||
package toolstream
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"ds2api/internal/toolcall"
|
||||
)
|
||||
import "ds2api/internal/toolcall"
|
||||
|
||||
func ProcessChunk(state *State, chunk string, toolNames []string) []Event {
|
||||
if state == nil {
|
||||
@@ -174,31 +170,27 @@ func findToolSegmentStart(state *State, s string) int {
|
||||
if s == "" {
|
||||
return -1
|
||||
}
|
||||
lower := strings.ToLower(s)
|
||||
offset := 0
|
||||
for {
|
||||
bestKeyIdx := -1
|
||||
matchedTag := ""
|
||||
for _, tag := range xmlToolTagsToDetect {
|
||||
idx := strings.Index(lower[offset:], tag)
|
||||
if idx >= 0 {
|
||||
idx += offset
|
||||
if bestKeyIdx < 0 || idx < bestKeyIdx {
|
||||
bestKeyIdx = idx
|
||||
matchedTag = tag
|
||||
}
|
||||
}
|
||||
}
|
||||
if bestKeyIdx < 0 {
|
||||
tag, ok := toolcall.FindToolMarkupTagOutsideIgnored(s, offset)
|
||||
if !ok {
|
||||
return -1
|
||||
}
|
||||
if !insideCodeFenceWithState(state, s[:bestKeyIdx]) {
|
||||
return bestKeyIdx
|
||||
start := includeDuplicateLeadingLessThan(s, tag.Start)
|
||||
if !insideCodeFenceWithState(state, s[:start]) {
|
||||
return start
|
||||
}
|
||||
offset = bestKeyIdx + len(matchedTag)
|
||||
offset = tag.End + 1
|
||||
}
|
||||
}
|
||||
|
||||
func includeDuplicateLeadingLessThan(s string, idx int) int {
|
||||
for idx > 0 && s[idx-1] == '<' {
|
||||
idx--
|
||||
}
|
||||
return idx
|
||||
}
|
||||
|
||||
func consumeToolCapture(state *State, toolNames []string) (prefix string, calls []toolcall.ParsedToolCall, suffix string, ready bool) {
|
||||
captured := state.capture.String()
|
||||
if captured == "" {
|
||||
|
||||
@@ -153,27 +153,14 @@ func findPartialXMLToolTagStart(s string) int {
|
||||
if lastLT < 0 {
|
||||
return -1
|
||||
}
|
||||
tail := s[lastLT:]
|
||||
start := includeDuplicateLeadingLessThan(s, lastLT)
|
||||
tail := s[start:]
|
||||
// If there's a '>' in the tail, the tag is closed — not partial.
|
||||
if strings.Contains(tail, ">") {
|
||||
return -1
|
||||
}
|
||||
lowerTail := strings.ToLower(tail)
|
||||
for _, tag := range []string{
|
||||
"<tool_calls", "<invoke", "<parameter",
|
||||
"<|tool_calls", "<|invoke", "<|parameter",
|
||||
"<|tool_calls", "<|invoke", "<|parameter",
|
||||
"<|dsml|tool_calls", "<|dsml|invoke", "<|dsml|parameter",
|
||||
"<|dsml|tool_calls", "<|dsml|invoke", "<|dsml|parameter",
|
||||
"<dsmltool_calls", "<dsmlinvoke", "<dsmlparameter",
|
||||
"<dsml tool_calls", "<dsml invoke", "<dsml parameter",
|
||||
"<dsml|tool_calls", "<dsml|invoke", "<dsml|parameter",
|
||||
"<|dsmltool_calls", "<|dsmlinvoke", "<|dsmlparameter",
|
||||
"<|dsml tool_calls", "<|dsml invoke", "<|dsml parameter",
|
||||
} {
|
||||
if strings.HasPrefix(tag, lowerTail) {
|
||||
return lastLT
|
||||
}
|
||||
if toolcall.IsPartialToolMarkupTagPrefix(tail) {
|
||||
return start
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
@@ -1,35 +0,0 @@
|
||||
package toolstream
|
||||
|
||||
import "regexp"
|
||||
|
||||
// --- XML tool call support for the streaming sieve ---
|
||||
|
||||
//nolint:unused // kept as explicit tag inventory for future XML sieve refinements.
|
||||
var xmlToolCallClosingTags = []string{"</tool_calls>", "</|dsml|tool_calls>", "</|dsmltool_calls>", "</|dsml tool_calls>", "</dsml|tool_calls>", "</dsmltool_calls>", "</dsml tool_calls>", "</|tool_calls>", "</|tool_calls>"}
|
||||
|
||||
// xmlToolCallBlockPattern matches a complete canonical XML tool call block.
|
||||
//
|
||||
//nolint:unused // reserved for future fast-path XML block detection.
|
||||
var xmlToolCallBlockPattern = regexp.MustCompile(`(?is)((?:<tool_calls\b|<\|dsml\|tool_calls\b)[^>]*>\s*(?:.*?)\s*(?:</tool_calls>|</\|dsml\|tool_calls>))`)
|
||||
|
||||
// xmlToolTagsToDetect is the set of XML tag prefixes used by findToolSegmentStart.
|
||||
var xmlToolTagsToDetect = []string{
|
||||
"<|dsml|tool_calls>", "<|dsml|tool_calls\n", "<|dsml|tool_calls ",
|
||||
"<|dsml|tool_calls>", "<|dsml|tool_calls\n", "<|dsml|tool_calls ",
|
||||
"<|dsml|invoke ", "<|dsml|invoke\n", "<|dsml|invoke\t", "<|dsml|invoke\r",
|
||||
"<|dsmltool_calls>", "<|dsmltool_calls\n", "<|dsmltool_calls ",
|
||||
"<|dsmlinvoke ", "<|dsmlinvoke\n", "<|dsmlinvoke\t", "<|dsmlinvoke\r",
|
||||
"<|dsml tool_calls>", "<|dsml tool_calls\n", "<|dsml tool_calls ",
|
||||
"<|dsml invoke ", "<|dsml invoke\n", "<|dsml invoke\t", "<|dsml invoke\r",
|
||||
"<dsml|tool_calls>", "<dsml|tool_calls\n", "<dsml|tool_calls ",
|
||||
"<dsml|invoke ", "<dsml|invoke\n", "<dsml|invoke\t", "<dsml|invoke\r",
|
||||
"<dsmltool_calls>", "<dsmltool_calls\n", "<dsmltool_calls ",
|
||||
"<dsmlinvoke ", "<dsmlinvoke\n", "<dsmlinvoke\t", "<dsmlinvoke\r",
|
||||
"<dsml tool_calls>", "<dsml tool_calls\n", "<dsml tool_calls ",
|
||||
"<dsml invoke ", "<dsml invoke\n", "<dsml invoke\t", "<dsml invoke\r",
|
||||
"<|tool_calls>", "<|tool_calls\n", "<|tool_calls ",
|
||||
"<|invoke ", "<|invoke\n", "<|invoke\t", "<|invoke\r",
|
||||
"<|tool_calls>", "<|tool_calls\n", "<|tool_calls ",
|
||||
"<|invoke ", "<|invoke\n", "<|invoke\t", "<|invoke\r",
|
||||
"<tool_calls>", "<tool_calls\n", "<tool_calls ", "<invoke ", "<invoke\n", "<invoke\t", "<invoke\r",
|
||||
}
|
||||
@@ -72,6 +72,97 @@ func TestProcessToolSieveInterceptsDSMLToolCallWithoutLeak(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestProcessToolSieveInterceptsDSMLTrailingPipeToolCallWithoutLeak(t *testing.T) {
|
||||
var state State
|
||||
chunks := []string{
|
||||
"<|DSML|tool_calls| \n",
|
||||
` <|DSML|invoke name="terminal">` + "\n",
|
||||
` <|DSML|parameter name="command"><![CDATA[find "/home" -type d]]></|DSML|parameter>` + "\n",
|
||||
` <|DSML|parameter name="timeout"><![CDATA[10]]></|DSML|parameter>` + "\n",
|
||||
" </|DSML|invoke>\n",
|
||||
"</|DSML|tool_calls>",
|
||||
}
|
||||
var events []Event
|
||||
for _, c := range chunks {
|
||||
events = append(events, ProcessChunk(&state, c, []string{"terminal"})...)
|
||||
}
|
||||
events = append(events, Flush(&state, []string{"terminal"})...)
|
||||
|
||||
var textContent strings.Builder
|
||||
var calls []any
|
||||
for _, evt := range events {
|
||||
textContent.WriteString(evt.Content)
|
||||
for _, call := range evt.ToolCalls {
|
||||
calls = append(calls, call)
|
||||
}
|
||||
}
|
||||
if text := textContent.String(); strings.Contains(strings.ToLower(text), "dsml") || strings.Contains(text, "terminal") {
|
||||
t.Fatalf("trailing-pipe DSML tool call leaked to text: %q events=%#v", text, events)
|
||||
}
|
||||
if len(calls) != 1 {
|
||||
t.Fatalf("expected one trailing-pipe DSML tool call, got %d events=%#v", len(calls), events)
|
||||
}
|
||||
}
|
||||
|
||||
func TestProcessToolSieveInterceptsExtraLeadingLessThanDSMLToolCallWithoutLeak(t *testing.T) {
|
||||
var state State
|
||||
chunks := []string{
|
||||
"<<|DSML|tool_calls>\n",
|
||||
` <<|DSML|invoke name="Bash">` + "\n",
|
||||
` <<|DSML|parameter name="command"><![CDATA[pwd]]></|DSML|parameter>` + "\n",
|
||||
" </|DSML|invoke>\n",
|
||||
"</|DSML|tool_calls>",
|
||||
}
|
||||
var events []Event
|
||||
for _, c := range chunks {
|
||||
events = append(events, ProcessChunk(&state, c, []string{"Bash"})...)
|
||||
}
|
||||
events = append(events, Flush(&state, []string{"Bash"})...)
|
||||
|
||||
var textContent strings.Builder
|
||||
toolCalls := 0
|
||||
for _, evt := range events {
|
||||
textContent.WriteString(evt.Content)
|
||||
toolCalls += len(evt.ToolCalls)
|
||||
}
|
||||
if text := textContent.String(); strings.Contains(text, "<") || strings.Contains(text, "Bash") {
|
||||
t.Fatalf("extra-leading-less-than DSML tool call leaked to text: %q events=%#v", text, events)
|
||||
}
|
||||
if toolCalls != 1 {
|
||||
t.Fatalf("expected one extra-leading-less-than DSML tool call, got %d events=%#v", toolCalls, events)
|
||||
}
|
||||
}
|
||||
|
||||
func TestProcessToolSieveInterceptsRepeatedDSMLPrefixNoiseWithoutLeak(t *testing.T) {
|
||||
var state State
|
||||
chunks := []string{
|
||||
"<<DSML|DSML|tool",
|
||||
"_calls>\n",
|
||||
` <<DSML|DSML|invoke name="Bash">` + "\n",
|
||||
` <<DSML|DSML|parameter name="command"><![CDATA[git status]]></DSML|DSML|parameter>` + "\n",
|
||||
" </DSML|DSML|invoke>\n",
|
||||
"</DSML|DSML|tool_calls>",
|
||||
}
|
||||
var events []Event
|
||||
for _, c := range chunks {
|
||||
events = append(events, ProcessChunk(&state, c, []string{"Bash"})...)
|
||||
}
|
||||
events = append(events, Flush(&state, []string{"Bash"})...)
|
||||
|
||||
var textContent strings.Builder
|
||||
toolCalls := 0
|
||||
for _, evt := range events {
|
||||
textContent.WriteString(evt.Content)
|
||||
toolCalls += len(evt.ToolCalls)
|
||||
}
|
||||
if text := textContent.String(); strings.Contains(strings.ToLower(text), "dsml") || strings.Contains(text, "Bash") {
|
||||
t.Fatalf("repeated-prefix DSML tool call leaked to text: %q events=%#v", text, events)
|
||||
}
|
||||
if toolCalls != 1 {
|
||||
t.Fatalf("expected one repeated-prefix DSML tool call, got %d events=%#v", toolCalls, events)
|
||||
}
|
||||
}
|
||||
|
||||
func TestProcessToolSieveHandlesLongXMLToolCall(t *testing.T) {
|
||||
var state State
|
||||
const toolName = "write_to_file"
|
||||
@@ -442,6 +533,8 @@ func TestFindToolSegmentStartDetectsXMLToolCalls(t *testing.T) {
|
||||
want int
|
||||
}{
|
||||
{"tool_calls_tag", "some text <tool_calls>\n", 10},
|
||||
{"dsml_trailing_pipe_tag", "some text <|DSML|tool_calls| \n", 10},
|
||||
{"dsml_extra_leading_less_than", "some text <<|DSML|tool_calls>\n", 10},
|
||||
{"invoke_tag_missing_wrapper", "some text <invoke name=\"read_file\">\n", 10},
|
||||
{"bare_tool_call_text", "prefix <tool_call>\n", -1},
|
||||
{"xml_inside_code_fence", "```xml\n<tool_calls><invoke name=\"read_file\"></invoke></tool_calls>\n```", -1},
|
||||
@@ -465,6 +558,8 @@ func TestFindPartialXMLToolTagStart(t *testing.T) {
|
||||
want int
|
||||
}{
|
||||
{"partial_tool_calls", "Hello <tool_ca", 6},
|
||||
{"partial_dsml_trailing_pipe", "Hello <|DSML|tool_calls|", 6},
|
||||
{"partial_dsml_extra_leading_less_than", "Hello <<|DSML|tool_calls", 6},
|
||||
{"partial_invoke", "Hello <inv", 6},
|
||||
{"bare_tool_call_not_held", "Hello <tool_name", -1},
|
||||
{"partial_lt_only", "Text <", 5},
|
||||
|
||||
Reference in New Issue
Block a user