From 0a23c77ff73b56c7fd196ee6afa69b131df492a1 Mon Sep 17 00:00:00 2001
From: CJACK
Date: Sun, 12 Apr 2026 17:43:57 +0800
Subject: [PATCH] feat: add sanitization for think tags and BOS markers in leaked output and update golang.org/x/net dependency

---
 go.mod                                            |  2 +-
 internal/adapter/openai/leaked_output_sanitize.go | 13 ++++++++++++-
 .../adapter/openai/leaked_output_sanitize_test.go |  8 ++++++++
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/go.mod b/go.mod
index 1471913..2613f89 100644
--- a/go.mod
+++ b/go.mod
@@ -18,7 +18,7 @@ require (
 	github.com/tidwall/pretty v1.2.1 // indirect
 	github.com/tidwall/sjson v1.2.5 // indirect
 	golang.org/x/crypto v0.49.0 // indirect
-	golang.org/x/net v0.52.0 // indirect
+	golang.org/x/net v0.52.0
 	golang.org/x/sys v0.42.0 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
diff --git a/internal/adapter/openai/leaked_output_sanitize.go b/internal/adapter/openai/leaked_output_sanitize.go
index c139feb..bcd8227 100644
--- a/internal/adapter/openai/leaked_output_sanitize.go
+++ b/internal/adapter/openai/leaked_output_sanitize.go
@@ -8,7 +8,16 @@ var emptyJSONFencePattern = regexp.MustCompile("(?is)```json\\s*```")
 var leakedToolCallArrayPattern = regexp.MustCompile(`(?is)\[\{\s*"function"\s*:\s*\{[\s\S]*?\}\s*,\s*"id"\s*:\s*"call[^"]*"\s*,\s*"type"\s*:\s*"function"\s*}\]`)
 var leakedToolResultBlobPattern = regexp.MustCompile(`(?is)<\s*\|\s*tool\s*\|\s*>\s*\{[\s\S]*?"tool_call_id"\s*:\s*"call[^"]*"\s*}`)
 
-// leakedMetaMarkerPattern matches DeepSeek special tokens in BOTH forms:
+// leakedThinkTagPattern matches leaked <think> and </think> tags in any letter
+// case; only the tags themselves are stripped, the text between them is kept.
+var leakedThinkTagPattern = regexp.MustCompile(`(?i)</?\s*think\s*>`)
+
+// leakedBOSMarkerPattern matches DeepSeek BOS markers in BOTH forms:
+// - ASCII underscore: <|begin_of_sentence|>
+// - U+2581 variant: <|begin▁of▁sentence|>
+var leakedBOSMarkerPattern = regexp.MustCompile(`(?i)<[|\|]\s*begin[_▁]of[_▁]sentence\s*[|\|]>`)
+
+// leakedMetaMarkerPattern matches the remaining DeepSeek special tokens in BOTH forms:
 // - ASCII underscore: <|end_of_sentence|>, <|end_of_toolresults|>, <|end_of_instructions|>
 // - U+2581 variant: <|end▁of▁sentence|>, <|end▁of▁toolresults|>, <|end▁of▁instructions|>
 var leakedMetaMarkerPattern = regexp.MustCompile(`(?i)<[|\|]\s*(?:assistant|tool|end[_▁]of[_▁]sentence|end[_▁]of[_▁]thinking|end[_▁]of[_▁]toolresults|end[_▁]of[_▁]instructions)\s*[|\|]>`)
@@ -35,6 +44,8 @@ func sanitizeLeakedOutput(text string) string {
 	out := emptyJSONFencePattern.ReplaceAllString(text, "")
 	out = leakedToolCallArrayPattern.ReplaceAllString(out, "")
 	out = leakedToolResultBlobPattern.ReplaceAllString(out, "")
+	out = leakedThinkTagPattern.ReplaceAllString(out, "")
+	out = leakedBOSMarkerPattern.ReplaceAllString(out, "")
 	out = leakedMetaMarkerPattern.ReplaceAllString(out, "")
 	out = sanitizeLeakedAgentXMLBlocks(out)
 	return out
diff --git a/internal/adapter/openai/leaked_output_sanitize_test.go b/internal/adapter/openai/leaked_output_sanitize_test.go
index 558cc48..6fd1485 100644
--- a/internal/adapter/openai/leaked_output_sanitize_test.go
+++ b/internal/adapter/openai/leaked_output_sanitize_test.go
@@ -26,6 +26,14 @@ func TestSanitizeLeakedOutputRemovesStandaloneMetaMarkers(t *testing.T) {
 	}
 }
 
+func TestSanitizeLeakedOutputRemovesThinkAndBosMarkers(t *testing.T) {
+	raw := "A<think>B</think>C<|begin▁of▁sentence|>D<| begin_of_sentence |>E<|begin_of_sentence|>F"
+	got := sanitizeLeakedOutput(raw)
+	if got != "ABCDEF" {
+		t.Fatalf("unexpected sanitize result for think/BOS markers: %q", got)
+	}
+}
+
 func TestSanitizeLeakedOutputRemovesAgentXMLLeaks(t *testing.T) {
 	raw := "Done.Some final answer"
 	got := sanitizeLeakedOutput(raw)