From 03ea3728e7ffb6af32310c1ed5ca54f4b4dd8838 Mon Sep 17 00:00:00 2001 From: CJACK Date: Sun, 10 May 2026 17:01:22 +0800 Subject: [PATCH 1/3] fix security advisory issues --- internal/server/router.go | 40 +++++++++++++++++++++++++ internal/server/router_log_test.go | 47 ++++++++++++++++++++++++++++++ internal/webui/handler.go | 7 +++-- internal/webui/handler_test.go | 27 +++++++++++++++++ 4 files changed, 118 insertions(+), 3 deletions(-) create mode 100644 internal/server/router_log_test.go diff --git a/internal/server/router.go b/internal/server/router.go index 07b5e15..b8a78b4 100644 --- a/internal/server/router.go +++ b/internal/server/router.go @@ -6,6 +6,7 @@ import ( "fmt" "log" "net/http" + "net/url" "os" "runtime" "strings" @@ -160,6 +161,16 @@ func (f *filteredLogFormatter) NewLogEntry(r *http.Request) middleware.LogEntry return noopLogEntry{} } } + if r != nil && r.URL != nil { + if redacted, changed := redactSensitiveQueryParams(r.URL); changed { + cloned := *r + clonedURL := *r.URL + clonedURL.RawQuery = redacted + cloned.URL = &clonedURL + cloned.RequestURI = clonedURL.RequestURI() + return f.base.NewLogEntry(&cloned) + } + } return f.base.NewLogEntry(r) } @@ -169,6 +180,35 @@ func (noopLogEntry) Write(_ int, _ int, _ http.Header, _ time.Duration, _ interf func (noopLogEntry) Panic(_ interface{}, _ []byte) {} +func redactSensitiveQueryParams(u *url.URL) (string, bool) { + if u == nil || u.RawQuery == "" { + return "", false + } + values, err := url.ParseQuery(u.RawQuery) + if err != nil && len(values) == 0 { + return "", false + } + changed := false + for name, vals := range values { + if !isSensitiveQueryParam(name) { + continue + } + for i := range vals { + vals[i] = "REDACTED" + } + values[name] = vals + changed = true + } + if !changed { + return "", false + } + return values.Encode(), true +} + +func isSensitiveQueryParam(name string) bool { + return strings.EqualFold(name, "key") || strings.EqualFold(name, "api_key") +} + var defaultCORSAllowHeaders = []string{ "Content-Type", "Authorization", diff --git a/internal/server/router_log_test.go b/internal/server/router_log_test.go new file mode 100644 index 0000000..e5f43df --- /dev/null +++ b/internal/server/router_log_test.go @@ -0,0 +1,47 @@ +package server + +import ( + "bytes" + "log" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/go-chi/chi/v5/middleware" +) + +func TestFilteredLogFormatterRedactsSensitiveQueryParams(t *testing.T) { + var buf bytes.Buffer + formatter := &filteredLogFormatter{ + base: &middleware.DefaultLogFormatter{ + Logger: log.New(&buf, "", 0), + NoColor: true, + }, + } + req := httptest.NewRequest( + http.MethodPost, + "/v1beta/models/gemini-2.5-pro:generateContent?key=caller-secret&api_key=second-secret&alt=sse", + nil, + ) + + entry := formatter.NewLogEntry(req) + entry.Write(http.StatusOK, 0, http.Header{}, time.Millisecond, nil) + + got := buf.String() + for _, secret := range []string{"caller-secret", "second-secret"} { + if strings.Contains(got, secret) { + t.Fatalf("log line contains sensitive query value %q: %s", secret, got) + } + } + if !strings.Contains(got, "key=REDACTED") || !strings.Contains(got, "api_key=REDACTED") { + t.Fatalf("log line did not include redacted sensitive params: %s", got) + } + if !strings.Contains(got, "alt=sse") { + t.Fatalf("log line did not preserve non-sensitive query param: %s", got) + } + if req.URL.RawQuery != "key=caller-secret&api_key=second-secret&alt=sse" { + t.Fatalf("request was mutated, RawQuery = %q", req.URL.RawQuery) + } +} diff --git a/internal/webui/handler.go b/internal/webui/handler.go index da9649d..5742bb4 100644 --- a/internal/webui/handler.go +++ b/internal/webui/handler.go @@ -95,11 +95,12 @@ func setStaticContentType(w http.ResponseWriter, fullPath string) { } func (h *Handler) serveFromDisk(w http.ResponseWriter, r *http.Request, staticDir string) { + root := filepath.Clean(staticDir) path := strings.TrimPrefix(r.URL.Path, "/admin") path = strings.TrimPrefix(path, "/") if path != "" && strings.Contains(path, ".") { - full := filepath.Join(staticDir, filepath.Clean(path)) - if !strings.HasPrefix(full, staticDir) { + full := filepath.Join(root, filepath.Clean(path)) + if full != root && !strings.HasPrefix(full, root+string(os.PathSeparator)) { http.NotFound(w, r) return } @@ -116,7 +117,7 @@ func (h *Handler) serveFromDisk(w http.ResponseWriter, r *http.Request, staticDi http.NotFound(w, r) return } - index := filepath.Join(staticDir, "index.html") + index := filepath.Join(root, "index.html") if _, err := os.Stat(index); err != nil { http.Error(w, "index.html not found", http.StatusNotFound) return diff --git a/internal/webui/handler_test.go b/internal/webui/handler_test.go index 99832c5..a819d8c 100644 --- a/internal/webui/handler_test.go +++ b/internal/webui/handler_test.go @@ -78,6 +78,33 @@ func TestServeFromDiskPinsContentType(t *testing.T) { } } +func TestServeFromDiskRejectsSiblingDirectoryWithSharedPrefix(t *testing.T) { + parent := t.TempDir() + staticDir := filepath.Join(parent, "admin") + siblingDir := filepath.Join(parent, "admin-leak") + if err := os.MkdirAll(staticDir, 0o755); err != nil { + t.Fatalf("mkdir static dir: %v", err) + } + if err := os.MkdirAll(siblingDir, 0o755); err != nil { + t.Fatalf("mkdir sibling dir: %v", err) + } + if err := os.WriteFile(filepath.Join(siblingDir, "secret.txt"), []byte("secret"), 0o644); err != nil { + t.Fatalf("write sibling secret: %v", err) + } + + h := &Handler{StaticDir: staticDir} + req := httptest.NewRequest(http.MethodGet, "/admin/../admin-leak/secret.txt", nil) + rec := httptest.NewRecorder() + h.serveFromDisk(rec, req, staticDir) + + if rec.Code != http.StatusNotFound { + t.Fatalf("status = %d, want 404", rec.Code) + } + if body := rec.Body.String(); strings.Contains(body, "secret") { + t.Fatal("served content from sibling directory") + } +} + // TestSetStaticContentTypeUnknownExtensionFallsThrough verifies that unknown // extensions leave the Content-Type header unset, so http.ServeFile can apply // its own detection (sniffing or mime.TypeByExtension) for cases the pinned From 243860f6d3ee1a483777fa9abe9e5a3937b2b2ad Mon Sep 17 00:00:00 2001 From: CJACK Date: Sun, 10 May 2026 17:02:40 +0800 Subject: [PATCH 2/3] bump version to 4.6.1 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 6016e8a..8ac28bf 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -4.6.0 +4.6.1 From e393110121280ba8806376a58824e013dc735cd7 Mon Sep 17 00:00:00 2001 From: CJACK Date: Sun, 10 May 2026 18:00:04 +0800 Subject: [PATCH 3/3] fix toolcall inline code and query redaction --- API.md | 2 +- README.MD | 2 +- docs/prompt-compatibility.md | 2 +- docs/toolcall-semantics.md | 5 +- .../stream-tool-sieve/parse_payload.js | 180 ++++++++++++++++-- .../js/helpers/stream-tool-sieve/sieve.js | 5 +- .../js/helpers/stream-tool-sieve/state.js | 66 +++++++ internal/server/router.go | 55 +++++- internal/server/router_log_test.go | 57 ++++++ internal/toolcall/fence_edge_test.go | 41 ++++ internal/toolcall/toolcalls_candidates.go | 18 +- internal/toolcall/toolcalls_dsml.go | 5 + internal/toolcall/toolcalls_parse.go | 23 +++ internal/toolcall/toolcalls_parse_markup.go | 91 +++++++-- internal/toolcall/toolcalls_scan.go | 12 ++ internal/toolstream/fence_edge_sieve_test.go | 65 +++++++ internal/toolstream/tool_sieve_core.go | 4 +- internal/toolstream/tool_sieve_state.go | 64 +++++++ tests/node/stream-tool-sieve.test.js | 60 ++++++ 19 files changed, 708 insertions(+), 49 deletions(-) diff --git a/API.md b/API.md index 63b4539..3ddd83c 100644 --- a/API.md +++ b/API.md @@ -360,7 +360,7 @@ data: [DONE] - 解析器当前把推荐半角管道符 DSML 外壳(`<|DSML|tool_calls>` / `<|DSML|invoke name="...">` / `<|DSML|parameter name="...">`)、DSML wrapper 别名(``、`<|tool_calls>`)、常见 DSML 分隔符漏写形态(如 `<|DSML tool_calls>` / `<|DSML invoke>` / `<|DSML parameter>`)、`DSML` 与工具标签名黏连的常见 typo(如 `` / `` / ``)、控制分隔符漂移(如 `` / 原始 STX `\x02`)、CJK 尖括号、全角感叹号、顿号、PascalCase 本地名、弯引号属性值与属性尾部分隔符漂移(如 `...〈/DSM|parameter〉` / `<!DSML!invoke name=“Bash”>` / `<、DSML、tool_calls>` / `` / ``)、任意协议前缀壳(如 ``)和旧式 canonical XML 工具块(`` / `` / ``)作为可执行调用解析;这些非结构性分隔符壳会先归一化回 XML,内部仍以 XML 解析语义为准,CDATA 开头也会容错 `<![CDATA[` / `<、[CDATA[`。旧式 ``、``、``、``、``、`tool_use`、antml 风格与纯 JSON `tool_calls` 片段默认都会按普通文本处理;完整但 malformed 的 wrapper 同样会作为普通文本释放。 - 解析层不会因为参数值为空而丢弃工具调用;显式空字符串或纯空白参数会按空字符串进入结构化 `tool_calls`。Prompt 会要求模型不要主动输出空参数,缺参/空命令的拒绝应由工具执行侧或客户端 schema 校验负责。 - 当最终可见正文为空但思维链里包含可执行工具调用时,Chat / Responses 会在收尾阶段补发标准 OpenAI `tool_calls` / `function_call` 输出;如果客户端未开启 thinking / reasoning,该思维链只用于检测,不会作为可见正文或 `reasoning_content` 暴露。 -- Markdown fenced code block(例如 ```json ... ```)中的 `tool_calls` 仅视为示例文本,不会被执行。 +- Markdown fenced code block(例如 ```json ... ```)和行内 code span(例如 `` `...` ``)中的 `tool_calls` 仅视为示例文本,不会被执行。 --- diff --git a/README.MD b/README.MD index c32c09c..920201f 100644 --- a/README.MD +++ b/README.MD @@ -372,7 +372,7 @@ Gemini 路由还可以使用 `x-goog-api-key`,或在没有认证头时使用 ` 当请求中带 `tools` 时,DS2API 会做防泄漏处理与结构化转译: -1. 只在**非代码块上下文**启用执行型 toolcall 识别(代码块示例默认不触发) +1. 只在**非 Markdown 代码上下文**启用执行型 toolcall 识别(fenced code block 和行内 code span 中的示例默认不触发) 2. 解析层当前把半角管道符 DSML 外壳视为推荐可执行调用:`<|DSML|tool_calls>` → `<|DSML|invoke name="...">` → `<|DSML|parameter name="...">`;兼容旧式 canonical XML `` → `` → ``,以及若干 DSML 前缀/分隔符漂移。DSML 只是外壳别名,内部仍以 XML 解析语义为准;旧式 `` / `` / `` / ``、``、`tool_use` / antml 变体与纯 JSON `tool_calls` 片段都会按普通文本处理,完整但 malformed 的 wrapper 也会作为普通文本释放 3. `responses` 流式严格使用官方 item 生命周期事件(`response.output_item.*`、`response.content_part.*`、`response.function_call_arguments.*`) 4. `responses` 支持并执行 `tool_choice`(`auto`/`none`/`required`/强制函数);`required` 违规时非流式返回 `422`,流式返回 `response.failed` diff --git a/docs/prompt-compatibility.md b/docs/prompt-compatibility.md index 7b49865..1cce051 100644 --- a/docs/prompt-compatibility.md +++ b/docs/prompt-compatibility.md @@ -168,7 +168,7 @@ OpenAI Chat / Responses 在标准化后、current input file 之前,会默认 4. 普通直传请求会把“工具描述 + 格式约束”一起并入 system prompt;如果 `current_input_file` 触发,则工具描述/schema 会单独上传成 `DS2API_TOOLS.txt`,live prompt 和 system tool 格式提示都会明确要求模型把 `DS2API_TOOLS.txt` 当作可调用工具和参数 schema 的权威来源。 工具调用正例现在优先示范半角管道符 DSML 风格:`<|DSML|tool_calls>` → `<|DSML|invoke name="...">` → `<|DSML|parameter name="...">`。 -兼容层仍接受旧式纯 `` wrapper,并会容错若干 DSML 标签变体,包括短横线形式 `` / `` / ``、下划线形式 `` / `` / ``,以及其他前缀分隔形态如 `` / `` / ``;标签壳扫描还会把全角 ASCII 漂移归一化,例如 `<dSML|tool_calls>` 与全角 `>` 结束符,也会容错 CJK 尖括号、全角感叹号或顿号分隔符、弯引号属性值、PascalCase 本地名和属性尾部分隔符漂移,例如 `...〈/DSM|parameter〉`、`<!DSML!invoke name=“Bash”>`、`<、DSML、tool_calls>`、``、``。更一般地,Go / Node tag 扫描以固定本地标签名 `tool_calls` / `invoke` / `parameter` 为准,标签名前或标签名后的非结构性协议分隔符都会在解析入口剥离,例如 ``、`` 这类控制符或非 ASCII 分隔符漂移也会归一化回现有 XML 标签后继续走同一套 parser;结构性字符如 `<` / `>` / `/` / `=` / 引号、空白和 ASCII 字母数字不会被当作这类分隔符。进入现有 DSML rewrite / XML parse 之前,Go / Node 还会先对“已经识别成工具标签壳的 candidate span”做一次窄 canonicalization:只折叠 wrapper / `invoke` / `parameter` / `name` / `CDATA` / `DSML` 及其壳层分隔符里的 confusable 字符,清理零宽 / BOM / 控制类干扰,并把引号、空白、dash / underscore 变体等统一回可解析的工具语法。这个阶段不会广义改写普通正文、参数内容、CDATA 里的示例文本或其他非工具 XML。CDATA 开头也使用同一类扫描式容错,`` wrapper,并会容错若干 DSML 标签变体,包括短横线形式 `` / `` / ``、下划线形式 `` / `` / ``,以及其他前缀分隔形态如 `` / `` / ``;标签壳扫描还会把全角 ASCII 漂移归一化,例如 `<dSML|tool_calls>` 与全角 `>` 结束符,也会容错 CJK 尖括号、全角感叹号或顿号分隔符、弯引号属性值、PascalCase 本地名和属性尾部分隔符漂移,例如 `...〈/DSM|parameter〉`、`<!DSML!invoke name=“Bash”>`、`<、DSML、tool_calls>`、``、``。更一般地,Go / Node tag 扫描以固定本地标签名 `tool_calls` / `invoke` / `parameter` 为准,标签名前或标签名后的非结构性协议分隔符都会在解析入口剥离,例如 ``、`` 这类控制符或非 ASCII 分隔符漂移也会归一化回现有 XML 标签后继续走同一套 parser;结构性字符如 `<` / `>` / `/` / `=` / 引号、空白和 ASCII 字母数字不会被当作这类分隔符。进入现有 DSML rewrite / XML parse 之前,Go / Node 还会先对“已经识别成工具标签壳的 candidate span”做一次窄 canonicalization:只折叠 wrapper / `invoke` / `parameter` / `name` / `CDATA` / `DSML` 及其壳层分隔符里的 confusable 字符,清理零宽 / BOM / 控制类干扰,并把引号、空白、dash / underscore 变体等统一回可解析的工具语法。这个阶段不会广义改写普通正文、参数内容、Markdown 行内 code span、CDATA 里的示例文本或其他非工具 XML。CDATA 开头也使用同一类扫描式容错,`...` 子节点表示;当某个参数体只包含 item 子节点时,Go / Node 解析器会把它还原成数组,避免 `questions` / `options` 这类 schema 中要求 array 的参数被误解析成 `{ "item": ... }` 对象。除此之外,解析器还会回收一些更松散的列表写法,例如 JSON array 字面量或逗号分隔的 JSON 项序列,只要它们足够明确;但 `` 仍然是首选形态。若模型把完整结构化 XML fragment 误包进 CDATA,兼容层会在保护 `content` / `command` 等原文字段的前提下,尝试把非原文字段中的 CDATA XML fragment 还原成 object / array。不过,如果 CDATA 只是单个平面的 XML/HTML 标签,例如 `urgent` 这种行内标记,兼容层会保留原始字符串,不会强行升成 object / array;只有明显表示结构的 CDATA 片段,例如多兄弟节点、嵌套子节点或 `item` 列表,才会触发结构化恢复。对 `command` / `content` 等长文本参数,CDATA 内部的 Markdown fenced DSML / XML 示例会作为原文保护;示例里的 `]]>` 或 `` 不会截断外层工具调用,解析器会继续等待围栏外真正的参数 / wrapper 结束标签。 Go 侧读取 DeepSeek SSE 时不再依赖 `bufio.Scanner` 的固定 2MiB 单行上限;当写文件类工具把很长的 `content` 放在单个 `data:` 行里返回时,非流式收集、流式解析和 auto-continue 透传都会保留完整行,再进入同一套工具解析与序列化流程。 在 assistant 最终回包阶段,如果某个 tool 参数在声明 schema 中明确是 `string`,兼容层会在把解析后的 `tool_calls` / `function_call` 重新序列化成 OpenAI / Responses / Claude 可见参数前,递归把该路径上的 number / bool / object / array 统一转成字符串;其中 object / array 会压成紧凑 JSON 字符串。这个保护只对 schema 明确声明为 string 的路径生效,不会改写本来就是 `number` / `boolean` / `object` / `array` 的参数。这样可以兼容 DeepSeek 输出了结构化片段、但上游客户端工具 schema 又严格要求字符串参数的场景(例如 `content`、`prompt`、`path`、`taskId` 等)。 diff --git a/docs/toolcall-semantics.md b/docs/toolcall-semantics.md index 7988d5a..fb21b61 100644 --- a/docs/toolcall-semantics.md +++ b/docs/toolcall-semantics.md @@ -62,12 +62,12 @@ - 已识别成功的工具调用不会再次回流到普通文本 - 不符合新格式的块不会执行,并继续按原样文本透传 - 如果一个 confusable / 漂移过的工具壳在 candidate-span canonicalization + repair 后仍能形成有效工具调用,wrapper 后面的 suffix prose 会继续按普通文本输出;如果 canonicalization 后仍不满足 wrapper-confidence 或 XML 语义,整块就作为普通文本释放,不会半吞半漏。 -- fenced code block(反引号 `` ``` `` 和波浪线 `~~~`)中的 XML 示例始终按普通文本处理 +- fenced code block(反引号 `` ``` `` 和波浪线 `~~~`)以及 Markdown inline code span(例如 `` `...` ``)中的 XML 示例始终按普通文本处理 - 支持嵌套围栏(如 4 反引号嵌套 3 反引号)和 CDATA 内围栏保护 - 对 `command` / `content` 等长文本参数,CDATA 内部如果包含 Markdown fenced DSML / XML 示例,即使示例里出现 `]]>` / `` 这类看起来像外层结束标签的片段,也会继续按参数原文保留,直到真正位于围栏外的外层结束标签 - CDATA 开头也按扫描式识别,除了标准 `` 或 Markdown inline code 里的 `<|DSML|tool_calls>`)而后面紧跟真正工具调用时,sieve 会跳过不可解析的 mention 候选并继续匹配后续真实工具块,不会因 mention 导致工具调用丢失,也不会截断 mention 后的正文 +- 当文本中 mention 了某种标签名(如 `` 或 Markdown inline code 里的 `<|DSML|tool_calls>`)而后面紧跟真正工具调用时,sieve 会跳过不可解析的 mention 候选并继续匹配后续真实工具块;行内 code span 中即使出现完整 `...` 示例也不会执行,不会因 mention 导致工具调用丢失,也不会截断 mention 后的正文 - Go 侧 SSE 读取不再使用 `bufio.Scanner` 的固定 token 上限;单个 `data:` 行中包含很长的写文件参数时,非流式收集、流式解析与 auto-continue 透传都应保留完整行,再交给 tool parser 处理 另外,`` 的值如果本身是合法 JSON 字面量,也会按结构化值解析,而不是一律保留为字符串。例如 `123`、`true`、`null`、`[1,2]`、`{"a":1}` 都会还原成对应的 number / boolean / null / array / object。 @@ -111,6 +111,7 @@ go test -v -run 'TestParseToolCalls|TestProcessToolSieve' ./internal/toolcall ./ - 混搭标签(DSML wrapper + canonical inner)归一化后正常解析 - 波浪线围栏 `~~~` 内的示例不执行 - 嵌套围栏(4 反引号嵌套 3 反引号)内的示例不执行 +- Markdown 行内 code span 内的完整工具调用示例不执行 - 文本 mention 标签名后紧跟真正工具调用的场景(含同一 wrapper 变体) - 空参数结构化保留,malformed executable-looking XML wrapper 作为文本释放 - 非兼容内容按普通文本透传 diff --git a/internal/js/helpers/stream-tool-sieve/parse_payload.js b/internal/js/helpers/stream-tool-sieve/parse_payload.js index e9fc02f..380a0f3 100644 --- a/internal/js/helpers/stream-tool-sieve/parse_payload.js +++ b/internal/js/helpers/stream-tool-sieve/parse_payload.js @@ -2,8 +2,6 @@ const CDATA_PATTERN = /^(?:<|〈)(?:!|!)\[CDATA\[([\s\S]*?)]](?:>|>|〉)$/i; const XML_ATTR_PATTERN = /\b([a-z0-9_:-]+)\s*=\s*("([^"]*)"|'([^']*)')/gi; -const XML_TOOL_CALLS_CLOSE_PATTERN = /[<<][\//]tool_calls\s*[>>]/gi; -const XML_INVOKE_START_PATTERN = /[<<]invoke\b[^>>]*\bname\s*[==]\s*(?:"([^"]*)"|'([^']*)'|“([^”]*)”|‘([^’]*)’|"([^"]*)"|'([^']*)')/i; const TOOL_MARKUP_NAMES = [ { raw: 'tool_calls', canonical: 'tool_calls' }, { raw: 'tool-calls', canonical: 'tool_calls', dsmlOnly: true }, @@ -71,6 +69,66 @@ function stripFencedCodeBlocks(text) { return out.join(''); } +function stripMarkdownCodeSpans(text) { + const raw = toStringSafe(text); + if (!raw) { + return ''; + } + let out = ''; + for (let i = 0; i < raw.length;) { + const skipped = skipXmlIgnoredSection(raw, i); + if (skipped.blocked) { + out += raw.slice(i); + break; + } + if (skipped.advanced) { + out += raw.slice(i, skipped.next); + i = skipped.next; + continue; + } + const spanEnd = markdownCodeSpanEnd(raw, i); + if (spanEnd.ok) { + i = spanEnd.end; + continue; + } + out += raw[i]; + i += 1; + } + return out; +} + +function markdownCodeSpanEnd(text, start) { + const raw = toStringSafe(text); + if (start < 0 || start >= raw.length || raw[start] !== '`') { + return { ok: false, end: start }; + } + const count = countLeadingChars(raw, start, '`'); + if (!count) { + return { ok: false, end: start }; + } + let search = start + count; + while (search < raw.length) { + if (raw[search] !== '`') { + search += 1; + continue; + } + const run = countLeadingChars(raw, search, '`'); + if (run === count) { + return { ok: true, end: search + run }; + } + search += run; + } + return { ok: false, end: start }; +} + +function countLeadingChars(text, start, ch) { + let count = 0; + while (start + count < text.length && text[start + count] === ch) { + count += 1; + } + return count; +} + function parseFenceOpenLine(trimmed) { if (trimmed.length < 3) return null; const ch = trimmed[0]; @@ -136,12 +194,12 @@ function parseMarkupToolCalls(text) { if (!raw) { return []; } - let wrappers = findXmlElementBlocks(raw, 'tool_calls'); + let wrappers = findToolCallElementBlocksOutsideIgnored(raw); if (wrappers.length === 0 && hasRepairableXMLToolCallsWrapper(raw)) { const repaired = repairMissingXMLToolCallsOpeningWrapper(raw); if (repaired !== raw) { raw = repaired; - wrappers = findXmlElementBlocks(raw, 'tool_calls'); + wrappers = findToolCallElementBlocksOutsideIgnored(raw); } } const out = []; @@ -157,6 +215,36 @@ function parseMarkupToolCalls(text) { return out; } +function findToolCallElementBlocksOutsideIgnored(text) { + const raw = toStringSafe(text); + const out = []; + for (let searchFrom = 0; searchFrom < raw.length;) { + const tag = findToolMarkupTagOutsideIgnored(raw, searchFrom); + if (!tag) { + break; + } + if (tag.closing || tag.name !== 'tool_calls') { + searchFrom = tag.end + 1; + continue; + } + const closeTag = findMatchingToolMarkupClose(raw, tag); + if (!closeTag) { + searchFrom = tag.end + 1; + continue; + } + const endDelim = xmlTagEndDelimiterLenEndingAt(raw, tag.end); + const attrsEnd = endDelim > 0 ? tag.end + 1 - endDelim : tag.end + 1; + out.push({ + attrs: raw.slice(tag.nameEnd, attrsEnd), + body: raw.slice(tag.end + 1, closeTag.start), + start: tag.start, + end: closeTag.end + 1, + }); + searchFrom = closeTag.end + 1; + } + return out; +} + function normalizeDSMLToolCallMarkup(text) { const raw = toStringSafe(text); if (!raw) { @@ -196,6 +284,11 @@ function containsToolCallWrapperSyntaxOutsideIgnored(text) { i = skipped.next; continue; } + const spanEnd = markdownCodeSpanEnd(raw, i); + if (spanEnd.ok) { + i = spanEnd.end; + continue; + } const tag = scanToolMarkupTagAt(raw, i); if (tag) { if (tag.name !== 'tool_calls') { @@ -232,6 +325,11 @@ function containsToolMarkupSyntaxOutsideIgnored(text) { i = skipped.next; continue; } + const spanEnd = markdownCodeSpanEnd(raw, i); + if (spanEnd.ok) { + i = spanEnd.end; + continue; + } const tag = scanToolMarkupTagAt(raw, i); if (tag) { if (tag.dsmlLike) { @@ -267,6 +365,12 @@ function replaceDSMLToolMarkupOutsideIgnored(text) { i = skipped.next; continue; } + const spanEnd = markdownCodeSpanEnd(raw, i); + if (spanEnd.ok) { + out += raw.slice(i, spanEnd.end); + i = spanEnd.end; + continue; + } const tag = scanToolMarkupTagAt(raw, i); if (tag) { out += `<${tag.closing ? '/' : ''}${tag.name}${raw.slice(tag.nameEnd, tag.end)}>`; @@ -553,6 +657,11 @@ function findToolMarkupTagOutsideIgnored(text, from) { i = skipped.next; continue; } + const spanEnd = markdownCodeSpanEnd(raw, i); + if (spanEnd.ok) { + i = spanEnd.end; + continue; + } const tag = scanToolMarkupTagAt(raw, i); if (tag) { return tag; @@ -987,6 +1096,12 @@ function canonicalizeToolCallCandidateSpans(text) { i = skipped.next; continue; } + const spanEnd = markdownCodeSpanEnd(raw, i); + if (spanEnd.ok) { + out += raw.slice(i, spanEnd.end); + i = spanEnd.end; + continue; + } const tag = scanToolMarkupTagAt(raw, i); if (!tag) { out += raw[i]; @@ -2249,30 +2364,62 @@ function sanitizeLooseCDATA(text) { function hasRepairableXMLToolCallsWrapper(text) { const raw = toStringSafe(text).trim(); - if (!raw || raw.toLowerCase().includes('${raw.slice(invoke.index, close.index)}${raw.slice(close.index + close[0].length)}`; + const invoke = firstToolMarkupTagByName(raw, 'invoke', false); + const close = lastToolMarkupTagByName(raw, 'tool_calls', true); + if (!invoke || !close || invoke.start >= close.start) { + return raw; + } + return `${raw.slice(0, invoke.start)}${raw.slice(invoke.start, close.start)}${raw.slice(close.end + 1)}`; +} + +function firstToolMarkupTagByName(text, name, closing) { + const raw = toStringSafe(text); + for (let searchFrom = 0; searchFrom < raw.length;) { + const tag = findToolMarkupTagOutsideIgnored(raw, searchFrom); + if (!tag) { + break; + } + if (tag.name === name && tag.closing === closing) { + return tag; + } + searchFrom = tag.end + 1; + } + return null; +} + +function lastToolMarkupTagByName(text, name, closing) { + const raw = toStringSafe(text); + let last = null; + for (let searchFrom = 0; searchFrom < raw.length;) { + const tag = findToolMarkupTagOutsideIgnored(raw, searchFrom); + if (!tag) { + break; + } + if (tag.name === name && tag.closing === closing) { + last = tag; + } + searchFrom = tag.end + 1; + } + return last; } function rawNameForTag(tag) { @@ -2494,6 +2641,7 @@ function isOnlyRawValue(obj) { module.exports = { stripFencedCodeBlocks, + stripMarkdownCodeSpans, parseMarkupToolCalls, normalizeDSMLToolCallMarkup, containsToolMarkupSyntaxOutsideIgnored, diff --git a/internal/js/helpers/stream-tool-sieve/sieve.js b/internal/js/helpers/stream-tool-sieve/sieve.js index 0e2d0aa..18898d8 100644 --- a/internal/js/helpers/stream-tool-sieve/sieve.js +++ b/internal/js/helpers/stream-tool-sieve/sieve.js @@ -3,6 +3,7 @@ const { resetIncrementalToolState, noteText, insideCodeFenceWithState, + insideMarkdownCodeSpanWithState, } = require('./state'); const { trimWrappingJSONFence } = require('./jsonscan'); const { @@ -161,7 +162,7 @@ function splitSafeContentForToolDetection(state, s) { // Only hold back partial XML tool tags. const xmlIdx = findPartialXMLToolTagStart(text); if (xmlIdx >= 0) { - if (insideCodeFenceWithState(state, text.slice(0, xmlIdx))) { + if (insideCodeFenceWithState(state, text.slice(0, xmlIdx)) || insideMarkdownCodeSpanWithState(state, text.slice(0, xmlIdx))) { return [text, '']; } if (xmlIdx > 0) { @@ -182,7 +183,7 @@ function findToolSegmentStart(state, s) { if (!tag) { return -1; } - if (!insideCodeFenceWithState(state, s.slice(0, tag.start))) { + if (!insideCodeFenceWithState(state, s.slice(0, tag.start)) && !insideMarkdownCodeSpanWithState(state, s.slice(0, tag.start))) { return tag.start; } offset = tag.end + 1; diff --git a/internal/js/helpers/stream-tool-sieve/state.js b/internal/js/helpers/stream-tool-sieve/state.js index f9fb2b5..8282f2e 100644 --- a/internal/js/helpers/stream-tool-sieve/state.js +++ b/internal/js/helpers/stream-tool-sieve/state.js @@ -9,6 +9,7 @@ function createToolSieveState() { codeFencePendingTicks: 0, codeFencePendingTildes: 0, codeFenceLineStart: true, + markdownCodeSpanTicks: 0, pendingToolRaw: '', pendingToolCalls: [], disableDeltas: false, @@ -35,6 +36,7 @@ function noteText(state, text) { if (!state || !hasMeaningfulText(text)) { return; } + updateMarkdownCodeSpanState(state, text); updateCodeFenceState(state, text); } @@ -64,6 +66,68 @@ function insideCodeFenceWithState(state, text) { return simulated.stack.length > 0; } +function insideMarkdownCodeSpanWithState(state, text) { + if (!state) { + return simulateMarkdownCodeSpanTicks(null, 0, text) > 0; + } + const ticks = Number.isInteger(state.markdownCodeSpanTicks) ? state.markdownCodeSpanTicks : 0; + return simulateMarkdownCodeSpanTicks(state, ticks, text) > 0; +} + +function updateMarkdownCodeSpanState(state, text) { + if (!state || !hasMeaningfulText(text)) { + return; + } + const ticks = Number.isInteger(state.markdownCodeSpanTicks) ? state.markdownCodeSpanTicks : 0; + state.markdownCodeSpanTicks = simulateMarkdownCodeSpanTicks(state, ticks, text); +} + +function simulateMarkdownCodeSpanTicks(state, initialTicks, text) { + const raw = typeof text === 'string' ? text : ''; + let ticks = Number.isInteger(initialTicks) ? initialTicks : 0; + for (let i = 0; i < raw.length;) { + if (raw[i] !== '`') { + i += 1; + continue; + } + const run = countBacktickRun(raw, i); + if (ticks === 0) { + if (run >= 3 && atMarkdownFenceLineStart(raw, i)) { + i += run; + continue; + } + if (state && insideCodeFenceWithState(state, raw.slice(0, i))) { + i += run; + continue; + } + ticks = run; + } else if (run === ticks) { + ticks = 0; + } + i += run; + } + return ticks; +} + +function countBacktickRun(text, start) { + let count = 0; + while (start + count < text.length && text[start + count] === '`') { + count += 1; + } + return count; +} + +function atMarkdownFenceLineStart(text, idx) { + for (let i = idx - 1; i >= 0; i -= 1) { + const ch = text[i]; + if (ch === ' ' || ch === '\t') { + continue; + } + return ch === '\n' || ch === '\r'; + } + return true; +} + function updateCodeFenceState(state, text) { if (!state) { return; @@ -188,7 +252,9 @@ module.exports = { looksLikeToolExampleContext, insideCodeFence, insideCodeFenceWithState, + insideMarkdownCodeSpanWithState, updateCodeFenceState, + updateMarkdownCodeSpanState, hasMeaningfulText, toStringSafe, }; diff --git a/internal/server/router.go b/internal/server/router.go index b8a78b4..ef6afbf 100644 --- a/internal/server/router.go +++ b/internal/server/router.go @@ -185,8 +185,8 @@ func redactSensitiveQueryParams(u *url.URL) (string, bool) { return "", false } values, err := url.ParseQuery(u.RawQuery) - if err != nil && len(values) == 0 { - return "", false + if err != nil { + return redactSensitiveRawQueryParams(u.RawQuery) } changed := false for name, vals := range values { @@ -205,6 +205,57 @@ func redactSensitiveQueryParams(u *url.URL) (string, bool) { return values.Encode(), true } +func redactSensitiveRawQueryParams(rawQuery string) (string, bool) { + if rawQuery == "" { + return "", false + } + var b strings.Builder + b.Grow(len(rawQuery)) + changed := false + start := 0 + for i := 0; i <= len(rawQuery); i++ { + if i < len(rawQuery) && rawQuery[i] != '&' && rawQuery[i] != ';' { + continue + } + segment := rawQuery[start:i] + b.WriteString(redactSensitiveRawQuerySegment(segment, &changed)) + if i < len(rawQuery) { + b.WriteByte(rawQuery[i]) + } + start = i + 1 + } + if !changed { + return "", false + } + return b.String(), true +} + +func redactSensitiveRawQuerySegment(segment string, changed *bool) string { + if segment == "" { + return segment + } + name := segment + valueStart := -1 + if eq := strings.IndexByte(segment, '='); eq >= 0 { + name = segment[:eq] + valueStart = eq + 1 + } + decodedName, err := url.QueryUnescape(name) + if err != nil { + decodedName = name + } + if !isSensitiveQueryParam(decodedName) { + return segment + } + if changed != nil { + *changed = true + } + if valueStart < 0 { + return name + "=REDACTED" + } + return segment[:valueStart] + "REDACTED" +} + func isSensitiveQueryParam(name string) bool { return strings.EqualFold(name, "key") || strings.EqualFold(name, "api_key") } diff --git a/internal/server/router_log_test.go b/internal/server/router_log_test.go index e5f43df..5ecde48 100644 --- a/internal/server/router_log_test.go +++ b/internal/server/router_log_test.go @@ -45,3 +45,60 @@ func TestFilteredLogFormatterRedactsSensitiveQueryParams(t *testing.T) { t.Fatalf("request was mutated, RawQuery = %q", req.URL.RawQuery) } } + +func TestFilteredLogFormatterRedactsSensitiveQueryParamsWhenMalformed(t *testing.T) { + tests := []struct { + name string + target string + secrets []string + redacted []string + preserved []string + }{ + { + name: "semicolon separator", + target: "/v1beta/models/gemini-2.5-pro:generateContent?key=caller-secret;alt=sse", + secrets: []string{"caller-secret"}, + redacted: []string{"key=REDACTED"}, + preserved: []string{"alt=sse"}, + }, + { + name: "bad escape in sensitive value", + target: "/v1beta/models/gemini-2.5-pro:generateContent?api_key=second-secret%ZZ", + secrets: []string{"second-secret"}, + redacted: []string{"api_key=REDACTED"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var buf bytes.Buffer + formatter := &filteredLogFormatter{ + base: &middleware.DefaultLogFormatter{ + Logger: log.New(&buf, "", 0), + NoColor: true, + }, + } + req := httptest.NewRequest(http.MethodPost, tt.target, nil) + + entry := formatter.NewLogEntry(req) + entry.Write(http.StatusOK, 0, http.Header{}, time.Millisecond, nil) + + got := buf.String() + for _, secret := range tt.secrets { + if strings.Contains(got, secret) { + t.Fatalf("log line contains sensitive query value %q: %s", secret, got) + } + } + for _, want := range tt.redacted { + if !strings.Contains(got, want) { + t.Fatalf("log line missing redacted query %q: %s", want, got) + } + } + for _, want := range tt.preserved { + if !strings.Contains(got, want) { + t.Fatalf("log line missing preserved query %q: %s", want, got) + } + } + }) + } +} diff --git a/internal/toolcall/fence_edge_test.go b/internal/toolcall/fence_edge_test.go index 5faff0b..f09455a 100644 --- a/internal/toolcall/fence_edge_test.go +++ b/internal/toolcall/fence_edge_test.go @@ -64,3 +64,44 @@ func TestStripFencedCodeBlocks_InlineBackticksNotFence(t *testing.T) { t.Fatalf("expected Before/After, got %q", got) } } + +func TestParseToolCalls_IgnoresMarkdownDocumentationExamples(t *testing.T) { + text := "解析器支持多种工具调用格式。\n\n" + + "入口函数 `ParseToolCalls(text, availableToolNames)` 会返回调用列表。\n\n" + + "核心流程会解析 XML 格式的 `` / `` 标记。\n\n" + + "### 标准 XML 结构\n" + + "```xml\n" + + "\n" + + " \n" + + " config.json\n" + + " \n" + + "\n" + + "```\n\n" + + "DSML 风格形如 `...`,也可能提到 `` 包裹。\n" + + got := ParseToolCallsDetailed(text, []string{"read_file"}) + if len(got.Calls) != 0 { + t.Fatalf("markdown documentation examples should not parse as tool calls, got %#v", got.Calls) + } +} + +func TestParseToolCalls_IgnoresInlineMarkdownToolCallExample(t *testing.T) { + text := "示例:`README.md`" + + got := ParseToolCallsDetailed(text, []string{"read_file"}) + if len(got.Calls) != 0 { + t.Fatalf("inline markdown tool example should not parse as tool calls, got %#v", got.Calls) + } +} + +func TestParseToolCalls_PreservesBackticksInsideToolParameters(t *testing.T) { + text := "echo `date`" + + got := ParseToolCallsDetailed(text, []string{"Bash"}) + if len(got.Calls) != 1 { + t.Fatalf("expected one tool call, got %#v", got.Calls) + } + if got.Calls[0].Input["command"] != "echo `date`" { + t.Fatalf("expected command backticks preserved, got %#v", got.Calls[0].Input["command"]) + } +} diff --git a/internal/toolcall/toolcalls_candidates.go b/internal/toolcall/toolcalls_candidates.go index 187d61a..f9b3cbb 100644 --- a/internal/toolcall/toolcalls_candidates.go +++ b/internal/toolcall/toolcalls_candidates.go @@ -28,6 +28,11 @@ func canonicalizeToolCallCandidateSpans(text string) string { i = next continue } + if end, ok := markdownCodeSpanEnd(text, i); ok { + b.WriteString(text[i:end]) + i = end + continue + } tag, ok := scanToolMarkupTagAt(text, i) if !ok { b.WriteByte(text[i]) @@ -619,19 +624,18 @@ func hasRepairableXMLToolCallsWrapper(text string) bool { if strings.TrimSpace(text) == "" { return false } - if strings.Contains(strings.ToLower(text), "= len(text) || text[start] != '`' { + return start, false + } + count := countLeadingFenceChars(text[start:], '`') + if count == 0 { + return start, false + } + search := start + count + for search < len(text) { + if text[search] != '`' { + search++ + continue + } + run := countLeadingFenceChars(text[search:], '`') + if run == count { + return search + run, true + } + search += run + } + return start, false +} + func cdataStartsBeforeFence(line string) bool { cdataIdx := indexToolCDATAOpen(line, 0) if cdataIdx < 0 { diff --git a/internal/toolcall/toolcalls_parse_markup.go b/internal/toolcall/toolcalls_parse_markup.go index 0d222bd..eab642e 100644 --- a/internal/toolcall/toolcalls_parse_markup.go +++ b/internal/toolcall/toolcalls_parse_markup.go @@ -10,16 +10,14 @@ import ( ) var xmlAttrPattern = regexp.MustCompile(`(?is)\b([a-z0-9_:-]+)\s*=\s*("([^"]*)"|'([^']*)')`) -var xmlToolCallsClosePattern = regexp.MustCompile(`(?is)`) -var xmlInvokeStartPattern = regexp.MustCompile(`(?is)]*\bname\s*=\s*("([^"]*)"|'([^']*)')`) var cdataBRSeparatorPattern = regexp.MustCompile(`(?i)`) func parseXMLToolCalls(text string) []ParsedToolCall { - wrappers := findXMLElementBlocks(text, "tool_calls") + wrappers := findToolCallElementBlocksOutsideIgnored(text) if len(wrappers) == 0 { repaired := repairMissingXMLToolCallsOpeningWrapper(text) if repaired != text { - wrappers = findXMLElementBlocks(repaired, "tool_calls") + wrappers = findToolCallElementBlocksOutsideIgnored(repaired) } } if len(wrappers) == 0 { @@ -41,26 +39,89 @@ func parseXMLToolCalls(text string) []ParsedToolCall { return out } +func findToolCallElementBlocksOutsideIgnored(text string) []xmlElementBlock { + if text == "" { + return nil + } + var out []xmlElementBlock + for searchFrom := 0; searchFrom < len(text); { + tag, ok := FindToolMarkupTagOutsideIgnored(text, searchFrom) + if !ok { + break + } + if tag.Closing || tag.Name != "tool_calls" { + searchFrom = tag.End + 1 + continue + } + closeTag, ok := FindMatchingToolMarkupClose(text, tag) + if !ok { + searchFrom = tag.End + 1 + continue + } + attrsEnd := tag.End + 1 + if delimLen := xmlTagEndDelimiterLenEndingAt(text, tag.End); delimLen > 0 { + attrsEnd = tag.End + 1 - delimLen + } + out = append(out, xmlElementBlock{ + Attrs: text[tag.NameEnd:attrsEnd], + Body: text[tag.End+1 : closeTag.Start], + Start: tag.Start, + End: closeTag.End + 1, + }) + searchFrom = closeTag.End + 1 + } + return out +} + func repairMissingXMLToolCallsOpeningWrapper(text string) string { - lower := strings.ToLower(text) - if strings.Contains(lower, "= closeLoc[0] { + closeTag, ok := lastToolMarkupTagByName(text, "tool_calls", true) + if !ok || invokeTag.Start >= closeTag.Start { return text } - return text[:invokeLoc[0]] + "" + text[invokeLoc[0]:closeLoc[0]] + "" + text[closeLoc[1]:] + return text[:invokeTag.Start] + "" + text[invokeTag.Start:closeTag.Start] + "" + text[closeTag.End+1:] +} + +func firstToolMarkupTagByName(text, name string, closing bool) (ToolMarkupTag, bool) { + for searchFrom := 0; searchFrom < len(text); { + tag, ok := FindToolMarkupTagOutsideIgnored(text, searchFrom) + if !ok { + break + } + if tag.Name == name && tag.Closing == closing { + return tag, true + } + searchFrom = tag.End + 1 + } + return ToolMarkupTag{}, false +} + +func lastToolMarkupTagByName(text, name string, closing bool) (ToolMarkupTag, bool) { + var last ToolMarkupTag + found := false + for searchFrom := 0; searchFrom < len(text); { + tag, ok := FindToolMarkupTagOutsideIgnored(text, searchFrom) + if !ok { + break + } + if tag.Name == name && tag.Closing == closing { + last = tag + found = true + } + searchFrom = tag.End + 1 + } + if !found { + return ToolMarkupTag{}, false + } + return last, true } func parseSingleXMLToolCall(block xmlElementBlock) (ParsedToolCall, bool) { diff --git a/internal/toolcall/toolcalls_scan.go b/internal/toolcall/toolcalls_scan.go index f14ca15..363e1ea 100644 --- a/internal/toolcall/toolcalls_scan.go +++ b/internal/toolcall/toolcalls_scan.go @@ -42,6 +42,10 @@ func ContainsToolMarkupSyntaxOutsideIgnored(text string) (hasDSML, hasCanonical i = next continue } + if end, ok := markdownCodeSpanEnd(text, i); ok { + i = end + continue + } if tag, ok := scanToolMarkupTagAt(text, i); ok { if tag.DSMLLike { hasDSML = true @@ -69,6 +73,10 @@ func ContainsToolCallWrapperSyntaxOutsideIgnored(text string) (hasDSML, hasCanon i = next continue } + if end, ok := markdownCodeSpanEnd(text, i); ok { + i = end + continue + } if tag, ok := scanToolMarkupTagAt(text, i); ok { if tag.Name != "tool_calls" { i = tag.End + 1 @@ -100,6 +108,10 @@ func FindToolMarkupTagOutsideIgnored(text string, start int) (ToolMarkupTag, boo i = next continue } + if end, ok := markdownCodeSpanEnd(text, i); ok { + i = end + continue + } if tag, ok := scanToolMarkupTagAt(text, i); ok { return tag, true } diff --git a/internal/toolstream/fence_edge_sieve_test.go b/internal/toolstream/fence_edge_sieve_test.go index d56335f..81e6f94 100644 --- a/internal/toolstream/fence_edge_sieve_test.go +++ b/internal/toolstream/fence_edge_sieve_test.go @@ -57,3 +57,68 @@ func TestProcessToolSieveNestedFourBacktickFenceDoesNotTrigger(t *testing.T) { t.Fatalf("expected 4-backtick fenced example to stay text, got %d tool calls", toolCalls) } } + +func TestProcessToolSieveMarkdownDocumentationExamplesDoNotTrigger(t *testing.T) { + var state State + chunks := []string{ + "解析器支持多种工具调用格式。\n\n", + "入口函数 `ParseToolCalls(text, availableToolNames)` 会返回调用列表。\n\n", + "核心流程会解析 XML 格式的 `` / `` 标记。\n\n", + "### 标准 XML 结构\n", + "```xml\n", + "\n", + " \n", + " config.json\n", + " \n", + "\n", + "```\n\n", + "DSML 风格形如 `...`,也可能提到 `` 包裹。\n", + } + var events []Event + for _, c := range chunks { + events = append(events, ProcessChunk(&state, c, []string{"read_file"})...) + } + events = append(events, Flush(&state, []string{"read_file"})...) + + var textContent strings.Builder + toolCalls := 0 + for _, evt := range events { + textContent.WriteString(evt.Content) + toolCalls += len(evt.ToolCalls) + } + + if toolCalls != 0 { + t.Fatalf("expected markdown documentation examples to stay text, got %d tool calls", toolCalls) + } + if !strings.Contains(textContent.String(), "标准 XML 结构") || !strings.Contains(textContent.String(), "DSML 风格") { + t.Fatalf("expected documentation text preserved, got %q", textContent.String()) + } +} + +func TestProcessToolSieveInlineMarkdownToolCallSplitAcrossChunksDoesNotTrigger(t *testing.T) { + var state State + chunks := []string{ + "示例:`", + "README.md", + "` 完毕。", + } + var events []Event + for _, c := range chunks { + events = append(events, ProcessChunk(&state, c, []string{"read_file"})...) + } + events = append(events, Flush(&state, []string{"read_file"})...) + + var textContent strings.Builder + toolCalls := 0 + for _, evt := range events { + textContent.WriteString(evt.Content) + toolCalls += len(evt.ToolCalls) + } + + if toolCalls != 0 { + t.Fatalf("expected split inline markdown tool example to stay text, got %d tool calls", toolCalls) + } + if !strings.Contains(textContent.String(), "") || !strings.Contains(textContent.String(), "完毕") { + t.Fatalf("expected inline example text preserved, got %q", textContent.String()) + } +} diff --git a/internal/toolstream/tool_sieve_core.go b/internal/toolstream/tool_sieve_core.go index afe7c7f..3be88ef 100644 --- a/internal/toolstream/tool_sieve_core.go +++ b/internal/toolstream/tool_sieve_core.go @@ -155,7 +155,7 @@ func splitSafeContentForToolDetection(state *State, s string) (safe, hold string return "", "" } if xmlIdx := findPartialXMLToolTagStart(s); xmlIdx >= 0 { - if insideCodeFenceWithState(state, s[:xmlIdx]) { + if insideCodeFenceWithState(state, s[:xmlIdx]) || insideMarkdownCodeSpanWithState(state, s[:xmlIdx]) { return s, "" } if xmlIdx > 0 { @@ -177,7 +177,7 @@ func findToolSegmentStart(state *State, s string) int { return -1 } start := includeDuplicateLeadingLessThan(s, tag.Start) - if !insideCodeFenceWithState(state, s[:start]) { + if !insideCodeFenceWithState(state, s[:start]) && !insideMarkdownCodeSpanWithState(state, s[:start]) { return start } offset = tag.End + 1 diff --git a/internal/toolstream/tool_sieve_state.go b/internal/toolstream/tool_sieve_state.go index cdc70d5..e3d407b 100644 --- a/internal/toolstream/tool_sieve_state.go +++ b/internal/toolstream/tool_sieve_state.go @@ -13,6 +13,7 @@ type State struct { codeFencePendingTicks int codeFencePendingTildes int codeFenceNotLineStart bool // inverted: zero-value false means "at line start" + markdownCodeSpanTicks int pendingToolRaw string pendingToolCalls []toolcall.ParsedToolCall disableDeltas bool @@ -50,6 +51,7 @@ func (s *State) noteText(content string) { if !hasMeaningfulText(content) { return } + updateMarkdownCodeSpanState(s, content) updateCodeFenceState(s, content) } @@ -78,6 +80,68 @@ func insideCodeFence(text string) bool { return len(simulateCodeFenceState(nil, 0, 0, true, text).stack) > 0 } +func insideMarkdownCodeSpanWithState(state *State, text string) bool { + if state == nil { + return simulateMarkdownCodeSpanTicks(nil, 0, text) > 0 + } + return simulateMarkdownCodeSpanTicks(state, state.markdownCodeSpanTicks, text) > 0 +} + +func updateMarkdownCodeSpanState(state *State, text string) { + if state == nil || !hasMeaningfulText(text) { + return + } + state.markdownCodeSpanTicks = simulateMarkdownCodeSpanTicks(state, state.markdownCodeSpanTicks, text) +} + +func simulateMarkdownCodeSpanTicks(state *State, initialTicks int, text string) int { + ticks := initialTicks + for i := 0; i < len(text); { + if text[i] != '`' { + i++ + continue + } + run := countBacktickRun(text, i) + if ticks == 0 { + if run >= 3 && atMarkdownFenceLineStart(text, i) { + i += run + continue + } + if state != nil && insideCodeFenceWithState(state, text[:i]) { + i += run + continue + } + ticks = run + } else if run == ticks { + ticks = 0 + } + i += run + } + return ticks +} + +func countBacktickRun(text string, start int) int { + count := 0 + for start+count < len(text) && text[start+count] == '`' { + count++ + } + return count +} + +func atMarkdownFenceLineStart(text string, idx int) bool { + for i := idx - 1; i >= 0; i-- { + switch text[i] { + case ' ', '\t': + continue + case '\n', '\r': + return true + default: + return false + } + } + return true +} + func updateCodeFenceState(state *State, text string) { if state == nil || !hasMeaningfulText(text) { return diff --git a/tests/node/stream-tool-sieve.test.js b/tests/node/stream-tool-sieve.test.js index b989fdb..f23777d 100644 --- a/tests/node/stream-tool-sieve.test.js +++ b/tests/node/stream-tool-sieve.test.js @@ -568,6 +568,19 @@ test('parseToolCalls skips prose mention of same wrapper variant', () => { assert.equal(calls[0].input.command, 'git status'); }); +test('parseToolCalls ignores inline markdown tool example', () => { + const payload = '示例:`README.md`'; + const calls = parseToolCalls(payload, ['read_file']); + assert.equal(calls.length, 0); +}); + +test('parseToolCalls preserves backticks inside tool parameters', () => { + const payload = 'echo `date`'; + const calls = parseToolCalls(payload, ['Bash']); + assert.equal(calls.length, 1); + assert.equal(calls[0].input.command, 'echo `date`'); +}); + test('sieve emits tool_calls after prose mentions same wrapper variant', () => { const events = runSieve([ 'Summary: support canonical and DSML <|DSML|tool_calls> wrappers.\n\n', @@ -584,6 +597,53 @@ test('sieve emits tool_calls after prose mentions same wrapper variant', () => { assert.equal(collectText(events).includes('Summary:'), true); }); +test('sieve ignores markdown documentation examples', () => { + const events = runSieve([ + '解析器支持多种工具调用格式。\n\n', + '入口函数 `ParseToolCalls(text, availableToolNames)` 会返回调用列表。\n\n', + '核心流程会解析 XML 格式的 `` / `` 标记。\n\n', + '### 标准 XML 结构\n', + '```xml\n', + '\n', + ' \n', + ' config.json\n', + ' \n', + '\n', + '```\n\n', + 'DSML 风格形如 `...`,也可能提到 `` 包裹。\n', + ], ['read_file']); + const finalCalls = events.filter((evt) => evt.type === 'tool_calls').flatMap((evt) => evt.calls || []); + const text = collectText(events); + assert.equal(finalCalls.length, 0); + assert.equal(text.includes('标准 XML 结构'), true); + assert.equal(text.includes('DSML 风格'), true); +}); + +test('sieve ignores inline markdown tool example split across chunks', () => { + const events = runSieve([ + '示例:`', + 'README.md', + '` 完毕。', + ], ['read_file']); + const finalCalls = events.filter((evt) => evt.type === 'tool_calls').flatMap((evt) => evt.calls || []); + const text = collectText(events); + assert.equal(finalCalls.length, 0); + assert.equal(text.includes(''), true); + assert.equal(text.includes('完毕'), true); +}); + +test('sieve emits real tool after split inline markdown tool example closes', () => { + const events = runSieve([ + '示例:`', + 'README.md', + '` ', + 'real.md', + ], ['read_file']); + const finalCalls = events.filter((evt) => evt.type === 'tool_calls').flatMap((evt) => evt.calls || []); + assert.equal(finalCalls.length, 1); + assert.equal(finalCalls[0].input.path, 'real.md'); +}); + test('sieve emits tool_calls for DSML space-separator typo', () => { const events = runSieve([ '准备读取文件。\n',