revert: replace fullwidth pipe | (U+FF5C) with halfwidth | (U+007C) in DSML tool markup

PR #460 introduced fullwidth pipe characters (|, U+FF5C) in DSML tool call formatting
to improve parsing robustness, but models exposed to these fullwidth pipes in
system prompts exhibit significantly higher rates of tool output hallucinations.
Reverting to halfwidth pipes (|) drastically reduces tokenizer/perplexity-driven
hallucinations while retaining the existing confusable-hardening in the parser.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
CJACK
2026-05-10 15:18:54 +08:00
parent 3beb31309f
commit cee8757d14
45 changed files with 725 additions and 342 deletions

View File

@@ -316,11 +316,11 @@ func TestSieve_CharByCharToolCall(t *testing.T) {
func TestSieve_FullwidthPipeWrapperDSMLInvoke(t *testing.T) {
var state State
chunks := []string{
"<tool_calls>\n",
"<|tool_calls>\n",
"<|DSML|invoke name=\"read_file\">\n",
"<|DSML|parameter name=\"path\">README.md</|DSML|parameter>\n",
"</|DSML|invoke>\n",
"</tool_calls>",
"</|tool_calls>",
}
var events []Event
for _, c := range chunks {
@@ -382,7 +382,7 @@ func TestSieve_TagMentionInTextThenRealToolCall(t *testing.T) {
chunks := []string{
"建议的 commit message\n\nfeat: expand DSML alias support\n\n",
"Add support for <dsml|tool_calls>, ",
"<tool_calls> (fullwidth pipe),\n",
"<|tool_calls> (pipe alias),\n",
"and <|tool_calls> wrapper variants.\n\n",
"<|DSML|tool_calls>\n",
"<|DSML|invoke name=\"Bash\">\n",
@@ -466,14 +466,14 @@ func TestSieve_ReviewSampleWithAliasMentionsPreservesBodyAndToolCalls(t *testing
chunks := []string{
"Done reviewing the diff. Here's my analysis before we commit:\n\n",
"Summary of Changes\n",
"DSML wrapper variant support — recognize aliases (<dsml|tool_calls>, <|tool_calls>, <tool_calls>) alongside canonical <tool_calls> and <|DSML|tool_calls> wrappers.\n\n",
"DSML wrapper variant support — recognize aliases (<dsml|tool_calls>, <|tool_calls>) alongside canonical <tool_calls> and <|DSML|tool_calls> wrappers.\n\n",
"<|DSML|tool_calls>\n",
"<|DSML|invoke name=\"Bash\">\n",
"<|DSML|parameter name=\"command\"><![CDATA[git add docs/toolcall-semantics.md internal/toolstream/tool_sieve_xml.go]]></|DSML|parameter>\n",
"<|DSML|parameter name=\"description\"><![CDATA[Stage all relevant changed files]]></|DSML|parameter>\n",
"</|DSML|invoke>\n",
"<|DSML|invoke name=\"Bash\">\n",
"<|DSML|parameter name=\"command\"><![CDATA[git commit -m \"$(cat <<'EOF'\nfeat(toolstream): expand DSML wrapper detection\n\nSupport DSML wrapper aliases: <dsml|tool_calls>, <|tool_calls>, <tool_calls> alongside existing canonical wrappers.\nEOF\n)\"]]></|DSML|parameter>\n",
"<|DSML|parameter name=\"command\"><![CDATA[git commit -m \"$(cat <<'EOF'\nfeat(toolstream): expand DSML wrapper detection\n\nSupport DSML wrapper aliases: <dsml|tool_calls> and <|tool_calls> alongside existing canonical wrappers.\nEOF\n)\"]]></|DSML|parameter>\n",
"<|DSML|parameter name=\"description\"><![CDATA[Create commit with all staged changes]]></|DSML|parameter>\n",
"</|DSML|invoke>\n",
"</|DSML|tool_calls>",

View File

@@ -626,13 +626,13 @@ func TestProcessToolSieveEmitsAllEmptyDSMLToolBlock(t *testing.T) {
func TestProcessToolSieveEmitsChunkedAllEmptyArbitraryPrefixedToolBlock(t *testing.T) {
chunk := strings.Join([]string{
`<TDSMLtool_calls>`,
` <TDSMLinvoke name="TaskOutput">`,
` <TDSMLparameter name="task_id"></TDSMLparameter>`,
` <TDSMLparameter name="block"></TDSMLparameter>`,
` <TDSMLparameter name="timeout"></TDSMLparameter>`,
` </TDSMLinvoke>`,
` </TDSMLtool_calls>`,
`<T|DSML|tool_calls>`,
` <T|DSML|invoke name="TaskOutput">`,
` <T|DSML|parameter name="task_id"></T|DSML|parameter>`,
` <T|DSML|parameter name="block"></T|DSML|parameter>`,
` <T|DSML|parameter name="timeout"></T|DSML|parameter>`,
` </T|DSML|invoke>`,
` </T|DSML|tool_calls>`,
}, "\n")
calls := collectToolCallsForChunks(t, splitEveryNRBytes(chunk, 8), []string{"TaskOutput"})
if len(calls) != 1 {
@@ -811,8 +811,8 @@ func TestFindPartialXMLToolTagStart(t *testing.T) {
{"partial_tool_calls", "Hello <tool_ca", 6},
{"partial_dsml_trailing_pipe", "Hello <|DSML|tool_calls|", 6},
{"partial_dsml_extra_leading_less_than", "Hello <<|DSML|tool_calls", 6},
{"partial_arbitrary_prefix_before_dsml", "Hello <TDS", 6},
{"partial_arbitrary_prefix_after_dsml_pipe", "Hello <TDSML", 6},
{"partial_arbitrary_prefix_before_dsml", "Hello <T|DS", 6},
{"partial_arbitrary_prefix_after_dsml_pipe", "Hello <T|DSML|", 6},
{"partial_invoke", "Hello <inv", 6},
{"bare_tool_call_not_held", "Hello <tool_name", -1},
{"partial_lt_only", "Text <", 5},
@@ -1091,7 +1091,7 @@ func TestProcessToolSieveRepairsMissingOpeningWrapperWithoutLeakingInvokeText(t
}
}
// Test fullwidth pipe variant: <tool_calls> (U+FF5C) should be buffered and parsed.
// Test escaped U+FF5C pipe variant: <\uff5ctool_calls> should be buffered and parsed.
func TestProcessToolSieveFullwidthPipeVariantDoesNotLeak(t *testing.T) {
var state State
chunks := []string{
@@ -1115,19 +1115,19 @@ func TestProcessToolSieveFullwidthPipeVariantDoesNotLeak(t *testing.T) {
}
if strings.Contains(textContent, "invoke") || strings.Contains(textContent, "execute_command") {
t.Fatalf("fullwidth pipe variant leaked to text: %q", textContent)
t.Fatalf("escaped U+FF5C pipe variant leaked to text: %q", textContent)
}
if toolCalls != 1 {
t.Fatalf("expected one tool call from fullwidth pipe variant, got %d events=%#v", toolCalls, events)
t.Fatalf("expected one tool call from escaped U+FF5C pipe variant, got %d events=%#v", toolCalls, events)
}
}
// Test <DSML|tool_calls> with DSML invoke/parameter tags should buffer the
// Test <|DSML|tool_calls> with DSML invoke/parameter tags should buffer the
// wrapper instead of leaking it before the block is complete.
func TestProcessToolSieveFullwidthDSMLPrefixVariantDoesNotLeak(t *testing.T) {
var state State
chunks := []string{
"<DSML|tool",
"<|DSML|tool",
"_calls>\n",
"<|DSML|invoke name=\"Bash\">\n",
"<|DSML|parameter name=\"command\"><![CDATA[ls -la /Users/aq/Desktop/myproject/ds2api/]]></|DSML|parameter>\n",
@@ -1232,12 +1232,12 @@ func TestProcessToolSieveDSMLBarePrefixVariantDoesNotLeak(t *testing.T) {
func TestProcessToolSieveCJKAngleDSMDriftDoesNotLeak(t *testing.T) {
var state State
chunks := []string{
"<DSMtool_calls>\n",
"<DSMinvoke name=\"Bash\">\n",
"<DSMparameter name=\"description\">〈![CDATA[Check tracking branch status]]〉〈/DSMparameter〉\n",
"<DSMparameter name=\"command\">〈![CDATA[git status -b --short]]〉〈/DSMparameter〉\n",
"〈/DSMinvoke〉\n",
"〈/DSMtool_calls〉",
"<DSM|tool_calls>\n",
"<DSM|invoke name=\"Bash\">\n",
"<DSM|parameter name=\"description\"|>〈![CDATA[Check tracking branch status]]〉〈/DSM|parameter〉\n",
"<DSM|parameter name=\"command\"|>〈![CDATA[git status -b --short]]〉〈/DSM|parameter〉\n",
"〈/DSM|invoke〉\n",
"〈/DSM|tool_calls〉",
}
var events []Event
for _, c := range chunks {
@@ -1338,7 +1338,7 @@ func TestProcessToolSieveIdeographicCommaDSMLDriftDoesNotLeak(t *testing.T) {
func TestProcessToolSieveParsesFullwidthClosingSlashAndKeepsSuffixText(t *testing.T) {
var state State
chunk := `<DSMLtool_calls><DSMLinvoke name="execute_code"><DSMLparameter name="code"><![CDATA[print("hi")]]></DSMLparameter></DSMLinvoke><DSMLtool_calls> sao cụm này lại đc trả là 1 message`
chunk := `<|DSML|tool_calls><|DSML|invoke name="execute_code"><|DSML|parameter name="code"><![CDATA[print("hi")]]></|DSML|parameter></|DSML|invoke><DSML|tool_calls> sao cụm này lại đc trả là 1 message`
events := ProcessChunk(&state, chunk, []string{"execute_code"})
events = append(events, Flush(&state, []string{"execute_code"})...)
@@ -1365,7 +1365,7 @@ func TestProcessToolSieveParsesFullwidthClosingSlashAndKeepsSuffixText(t *testin
func TestProcessToolSieveParsesSentencePieceSeparatorAndFullwidthTerminator(t *testing.T) {
var state State
chunk := `<DSML▁tool_calls><DSML▁invoke▁name="execute_code"><DSML▁parameter▁name="code"><![CDATA[print("hi")]]></DSML▁parameter></DSML▁invoke></DSML▁tool_calls suffix`
chunk := `<|DSML▁tool_calls|><|DSML▁invoke▁name="execute_code"><|DSML▁parameter▁name="code"><![CDATA[print("hi")]]></|DSML▁parameter></|DSML▁invoke></|DSML▁tool_calls suffix`
events := ProcessChunk(&state, chunk, []string{"execute_code"})
events = append(events, Flush(&state, []string{"execute_code"})...)
@@ -1392,7 +1392,7 @@ func TestProcessToolSieveParsesSentencePieceSeparatorAndFullwidthTerminator(t *t
func TestProcessToolSieveParsesFullwidthOpeningDelimiterAndUnicodeAttributes(t *testing.T) {
var state State
chunk := `DSML tool_callsDSML invoke name“execute_code”DSML parameter name“code”<![CDATA[print("hi")]]>DSMLparameterDSMLinvokeDSMLtool_calls suffix`
chunk := `|DSML tool_calls|DSML invoke name“execute_code”|DSML parameter name“code”<![CDATA[print("hi")]]>DSML|parameterDSML|invokeDSML|tool_calls suffix`
events := ProcessChunk(&state, chunk, []string{"execute_code"})
events = append(events, Flush(&state, []string{"execute_code"})...)