mirror of
https://github.com/CJackHwang/ds2api.git
synced 2026-05-11 03:37:40 +08:00
159 lines
6.0 KiB
Go
159 lines
6.0 KiB
Go
package shared
|
|
|
|
import (
|
|
"regexp"
|
|
"strings"
|
|
|
|
"ds2api/internal/toolcall"
|
|
)
|
|
|
|
// Patterns for structured payloads and reasoning tags that can leak into
// user-visible text. All are case-insensitive and let "." span newlines.
var (
	// emptyJSONFencePattern matches a ```json fence that contains nothing but
	// optional whitespace before its closing fence.
	emptyJSONFencePattern = regexp.MustCompile("(?is)```json\\s*```")

	// leakedToolCallArrayPattern matches a bracketed JSON object whose keys
	// appear in the order "function" (an object), "id" (a string starting
	// with "call") and "type" (the string "function").
	leakedToolCallArrayPattern = regexp.MustCompile(`(?is)\[\{\s*"function"\s*:\s*\{[\s\S]*?\}\s*,\s*"id"\s*:\s*"call[^"]*"\s*,\s*"type"\s*:\s*"function"\s*}\]`)

	// leakedToolResultBlobPattern matches a <|tool|> marker followed by a JSON
	// object that ends with a "tool_call_id" whose value starts with "call".
	leakedToolResultBlobPattern = regexp.MustCompile(`(?is)<\s*\|\s*tool\s*\|\s*>\s*\{[\s\S]*?"tool_call_id"\s*:\s*"call[^"]*"\s*}`)

	// leakedThinkTagPattern matches an opening or closing think tag. The
	// optional "/" must directly follow "<"; whitespace is tolerated around
	// the tag name.
	leakedThinkTagPattern = regexp.MustCompile(`(?is)</?\s*think\s*>`)
)
|
|
|
|
// leakedBOSMarkerPattern matches a leaked begin-of-sentence marker,
// case-insensitively. Each delimiter may be an ASCII "|" or the fullwidth
// U+FF5C bar, and the words may be joined by "_" or by U+2581:
//   - <|begin_of_sentence|>
//   - <|begin▁of▁sentence|>
//
// Whitespace is allowed between the delimiters and the marker name.
var leakedBOSMarkerPattern = regexp.MustCompile(
	`(?i)<[\|\x{ff5c}]\s*` +
		`begin[_▁]of[_▁]sentence` +
		`\s*[\|\x{ff5c}]>`,
)
|
|
|
|
// leakedThoughtMarkerPattern matches leaked "of thought" control markers,
// case-insensitively, with or without a leading "begin" word. Delimiters may
// be an ASCII "|" or the fullwidth U+FF5C bar; words may be joined by "_" or
// by U+2581, and any number of those separators may precede "of":
//   - <| of_thought |>, <| begin_of_thought |>
//   - <|▁of▁thought|>, <|begin▁of▁thought|>
var leakedThoughtMarkerPattern = regexp.MustCompile(
	`(?i)<[\|\x{ff5c}]\s*` +
		`(?:begin[_▁])?[_▁]*of[_▁]thought` +
		`\s*[\|\x{ff5c}]>`,
)
|
|
|
|
// leakedMetaMarkerPattern matches the remaining special tokens handled by
// this file, case-insensitively. Delimiters may be an ASCII "|" or the
// fullwidth U+FF5C bar; words may be joined by "_" or by U+2581. The
// recognised marker names are:
//   - assistant, tool
//   - end_of_sentence, end_of_thinking, end_of_thought
//   - end_of_toolresults, end_of_instructions
//
// Whitespace is allowed between the delimiters and the marker name.
var leakedMetaMarkerPattern = regexp.MustCompile(
	`(?i)<[\|\x{ff5c}]\s*(?:` +
		`assistant|tool|` +
		`end[_▁]of[_▁]sentence|` +
		`end[_▁]of[_▁]thinking|` +
		`end[_▁]of[_▁]thought|` +
		`end[_▁]of[_▁]toolresults|` +
		`end[_▁]of[_▁]instructions` +
		`)\s*[\|\x{ff5c}]>`,
)
|
|
|
|
// leakedAgentXMLBlockPatterns holds one pattern per agent wrapper element
// (attempt_completion, ask_followup_question, new_task). Each pattern matches
// a complete open/close pair, case-insensitively and across newlines, and
// captures the inner text as group 1. Because only whole wrapper blocks
// match, a standalone "<result>" example in ordinary output is not matched.
var leakedAgentXMLBlockPatterns = func() []*regexp.Regexp {
	wrappers := []string{"attempt_completion", "ask_followup_question", "new_task"}
	compiled := make([]*regexp.Regexp, 0, len(wrappers))
	for _, name := range wrappers {
		compiled = append(compiled, regexp.MustCompile(`(?is)<`+name+`\b[^>]*>(.*?)</`+name+`>`))
	}
	return compiled
}()
|
|
|
|
// Patterns for individual agent wrapper tags and the <result> tags next to
// them. All are case-insensitive.
var (
	// leakedAgentWrapperTagPattern matches a single opening or closing
	// attempt_completion, ask_followup_question or new_task tag.
	leakedAgentWrapperTagPattern = regexp.MustCompile(`(?is)</?(?:attempt_completion|ask_followup_question|new_task)\b[^>]*>`)

	// leakedAgentWrapperPlusResultOpenPattern matches an opening wrapper tag
	// followed, after optional whitespace, by "<result>".
	leakedAgentWrapperPlusResultOpenPattern = regexp.MustCompile(`(?is)<(?:attempt_completion|ask_followup_question|new_task)\b[^>]*>\s*<result>`)

	// leakedAgentResultPlusWrapperClosePattern matches "</result>" followed,
	// after optional whitespace, by a closing wrapper tag.
	leakedAgentResultPlusWrapperClosePattern = regexp.MustCompile(`(?is)</result>\s*</(?:attempt_completion|ask_followup_question|new_task)\b[^>]*>`)

	// leakedAgentResultTagPattern matches a bare "<result>" or "</result>".
	leakedAgentResultTagPattern = regexp.MustCompile(`(?is)</?result>`)
)
|
|
|
|
func sanitizeLeakedOutput(text string) string {
|
|
if text == "" {
|
|
return text
|
|
}
|
|
out := emptyJSONFencePattern.ReplaceAllString(text, "")
|
|
out = leakedToolCallArrayPattern.ReplaceAllString(out, "")
|
|
out = leakedToolResultBlobPattern.ReplaceAllString(out, "")
|
|
out = stripDanglingThinkSuffix(out)
|
|
out = leakedThinkTagPattern.ReplaceAllString(out, "")
|
|
out = leakedBOSMarkerPattern.ReplaceAllString(out, "")
|
|
out = leakedThoughtMarkerPattern.ReplaceAllString(out, "")
|
|
out = leakedMetaMarkerPattern.ReplaceAllString(out, "")
|
|
out = stripLeakedToolCallWrapperBlocks(out)
|
|
out = sanitizeLeakedAgentXMLBlocks(out)
|
|
return out
|
|
}
|
|
|
|
func stripLeakedToolCallWrapperBlocks(text string) string {
|
|
if text == "" {
|
|
return text
|
|
}
|
|
var b strings.Builder
|
|
pos := 0
|
|
for pos < len(text) {
|
|
tag, ok := toolcall.FindToolMarkupTagOutsideIgnored(text, pos)
|
|
if !ok {
|
|
b.WriteString(text[pos:])
|
|
break
|
|
}
|
|
if tag.Start > pos {
|
|
b.WriteString(text[pos:tag.Start])
|
|
}
|
|
if tag.Closing || tag.Name != "tool_calls" {
|
|
b.WriteString(text[tag.Start : tag.End+1])
|
|
pos = tag.End + 1
|
|
continue
|
|
}
|
|
closeTag, ok := toolcall.FindMatchingToolMarkupClose(text, tag)
|
|
if !ok {
|
|
b.WriteString(text[tag.Start : tag.End+1])
|
|
pos = tag.End + 1
|
|
continue
|
|
}
|
|
pos = closeTag.End + 1
|
|
}
|
|
return b.String()
|
|
}
|
|
|
|
func stripDanglingThinkSuffix(text string) string {
|
|
matches := leakedThinkTagPattern.FindAllStringIndex(text, -1)
|
|
if len(matches) == 0 {
|
|
return text
|
|
}
|
|
depth := 0
|
|
lastOpen := -1
|
|
for _, loc := range matches {
|
|
tag := strings.ToLower(text[loc[0]:loc[1]])
|
|
compact := strings.ReplaceAll(strings.ReplaceAll(strings.TrimSpace(tag), " ", ""), "\t", "")
|
|
if strings.HasPrefix(compact, "</") {
|
|
if depth > 0 {
|
|
depth--
|
|
if depth == 0 {
|
|
lastOpen = -1
|
|
}
|
|
}
|
|
continue
|
|
}
|
|
if depth == 0 {
|
|
lastOpen = loc[0]
|
|
}
|
|
depth++
|
|
}
|
|
if depth == 0 || lastOpen < 0 {
|
|
return text
|
|
}
|
|
prefix := text[:lastOpen]
|
|
if strings.TrimSpace(prefix) == "" {
|
|
return ""
|
|
}
|
|
return prefix
|
|
}
|
|
|
|
func sanitizeLeakedAgentXMLBlocks(text string) string {
|
|
out := text
|
|
for _, pattern := range leakedAgentXMLBlockPatterns {
|
|
out = pattern.ReplaceAllStringFunc(out, func(match string) string {
|
|
submatches := pattern.FindStringSubmatch(match)
|
|
if len(submatches) < 2 {
|
|
return match
|
|
}
|
|
// Preserve the inner text so leaked agent instructions do not erase
|
|
// the actual answer, but strip the wrapper/result markup itself.
|
|
return leakedAgentResultTagPattern.ReplaceAllString(submatches[1], "")
|
|
})
|
|
}
|
|
// Fallback for truncated output streams: strip any dangling wrapper tags
|
|
// that were not part of a complete block replacement. If we detect leaked
|
|
// wrapper tags, strip only adjacent <result> tags to avoid exposing agent
|
|
// markup without altering unrelated user-visible <result> examples.
|
|
if leakedAgentWrapperTagPattern.MatchString(out) {
|
|
out = leakedAgentWrapperPlusResultOpenPattern.ReplaceAllStringFunc(out, func(match string) string {
|
|
return leakedAgentResultTagPattern.ReplaceAllString(match, "")
|
|
})
|
|
out = leakedAgentResultPlusWrapperClosePattern.ReplaceAllStringFunc(out, func(match string) string {
|
|
return leakedAgentResultTagPattern.ReplaceAllString(match, "")
|
|
})
|
|
out = leakedAgentWrapperTagPattern.ReplaceAllString(out, "")
|
|
}
|
|
return out
|
|
}
|