refactor: Improve chat stream content and tool call parsing with a new recursive extraction function and dedicated tests.

This commit is contained in:
CJACK
2026-02-17 14:35:24 +08:00
parent 5106773573
commit 05422b2449
5 changed files with 349 additions and 25 deletions

View File

@@ -23,6 +23,10 @@ DS2API 提供两个层级的测试:
go test ./...
```
```bash
node --test api/helpers/stream-tool-sieve.test.js api/chat-stream.test.js
```
### 端到端测试 | End-to-End Tests
```bash
@@ -35,6 +39,7 @@ go test ./...
- `go test ./... -count=1`(单元测试)
- `node --check api/chat-stream.js`(语法检查)
- `node --check api/helpers/stream-tool-sieve.js`(语法检查)
- `node --test api/helpers/stream-tool-sieve.test.js api/chat-stream.test.js`(Node 流式拦截单测)
- `npm run build --prefix webui`(WebUI 构建检查)
2. **隔离启动**:复制 `config.json` 到临时目录,启动独立服务进程

View File

@@ -517,32 +517,11 @@ function parseChunkForContent(chunk, thinkingEnabled, currentType) {
}
if (Array.isArray(val)) {
for (const entry of val) {
if (typeof entry === 'string') {
if (entry) {
parts.push({ text: entry, type: partType });
}
continue;
}
if (!entry || typeof entry !== 'object') {
continue;
}
if (asString(entry.p) === 'status' && asString(entry.v) === 'FINISHED') {
return { parts: [], finished: true, newType };
}
const content = asString(entry.content);
if (!content) {
continue;
}
const t = asString(entry.type).toUpperCase();
if (t === 'THINK' || t === 'THINKING') {
parts.push({ text: content, type: 'thinking' });
} else if (t === 'RESPONSE') {
parts.push({ text: content, type: 'text' });
} else {
parts.push({ text: content, type: partType });
}
const extracted = extractContentRecursive(val, partType);
if (extracted.finished) {
return { parts: [], finished: true, newType };
}
parts.push(...extracted.parts);
return { parts, finished: false, newType };
}
@@ -573,6 +552,80 @@ function parseChunkForContent(chunk, thinkingEnabled, currentType) {
return { parts, finished: false, newType };
}
// Walk an array of stream entries and collect displayable parts.
// Mirrors the Go parser: entries lacking an own `v` key are ignored,
// a `status: FINISHED` entry aborts the whole extraction, and content
// type is resolved from the entry's explicit `type` first, then its path.
// Returns { parts: [{ text, type }], finished: boolean }.
function extractContentRecursive(items, defaultType) {
  // Map an explicit entry type (THINK/THINKING/RESPONSE) to a part type,
  // falling back to the caller-supplied default for unknown values.
  const classify = (rawType, fallback) => {
    const t = asString(rawType).toUpperCase();
    if (t === 'THINK' || t === 'THINKING') {
      return 'thinking';
    }
    return t === 'RESPONSE' ? 'text' : fallback;
  };
  const parts = [];
  for (const item of items) {
    if (item === null || typeof item !== 'object') {
      continue;
    }
    // Go-parity: only entries that actually carry a `v` field participate.
    if (!Object.prototype.hasOwnProperty.call(item, 'v')) {
      continue;
    }
    const path = asString(item.p);
    const value = item.v;
    // A nested status update terminates extraction; drop buffered parts.
    if (path === 'status' && asString(value) === 'FINISHED') {
      return { parts: [], finished: true };
    }
    if (shouldSkipPath(path)) {
      continue;
    }
    // Entries with inline `content` take their type from the `type` field.
    const directText = asString(item.content);
    if (directText) {
      parts.push({ text: directText, type: classify(item.type, defaultType) });
      continue;
    }
    // No inline content: infer a part type from the entry's path.
    let inferredType = defaultType;
    if (path.includes('thinking')) {
      inferredType = 'thinking';
    } else if (path.includes('content') || path === 'response' || path === 'fragments') {
      inferredType = 'text';
    }
    if (typeof value === 'string') {
      // Plain string payload; a bare 'FINISHED' marker is never emitted as text.
      if (value && value !== 'FINISHED') {
        parts.push({ text: value, type: inferredType });
      }
      continue;
    }
    if (!Array.isArray(value)) {
      continue;
    }
    // One level of nesting: arrays may mix raw strings and content objects.
    for (const nested of value) {
      if (typeof nested === 'string') {
        if (nested) {
          parts.push({ text: nested, type: inferredType });
        }
        continue;
      }
      if (!nested || typeof nested !== 'object') {
        continue;
      }
      const nestedText = asString(nested.content);
      if (nestedText) {
        parts.push({ text: nestedText, type: classify(nested.type, inferredType) });
      }
    }
  }
  return { parts, finished: false };
}
function shouldSkipPath(pathValue) {
if (pathValue === 'response/search_status') {
return true;
@@ -708,3 +761,10 @@ function asString(v) {
}
return String(v).trim();
}
// Test-only hooks: expose internal parser helpers for the unit tests in
// api/chat-stream.test.js without widening the module's public API.
module.exports.__test = {
  parseChunkForContent,
  extractContentRecursive,
  shouldSkipPath,
  asString,
};

128
api/chat-stream.test.js Normal file
View File

@@ -0,0 +1,128 @@
'use strict';
// Unit tests for the internal chunk parser of api/chat-stream.js, including
// its interaction with the tool-call sieve. Run via `node --test`.
const test = require('node:test');
const assert = require('node:assert/strict');
const handler = require('./chat-stream');
const {
  createToolSieveState,
  processToolSieveChunk,
  flushToolSieve,
} = require('./helpers/stream-tool-sieve');
// Internal parser entry point, exported for tests only via __test hooks.
const { parseChunkForContent } = handler.__test;
test('chat-stream exposes parser test hooks', () => {
  assert.equal(typeof parseChunkForContent, 'function');
});
// Tool-call JSON split across two response/content fragments must survive as
// two ordered text parts so the sieve can reassemble and intercept it.
test('parseChunkForContent keeps split response/content fragments inside response array', () => {
  const chunk = {
    p: 'response',
    v: [
      { p: 'response/content', v: '{"' },
      { p: 'response/content', v: 'tool_calls":[{"name":"read_file","input":{"path":"README.MD"}}]}' },
    ],
  };
  const parsed = parseChunkForContent(chunk, false, 'text');
  assert.equal(parsed.finished, false);
  assert.equal(parsed.newType, 'text');
  assert.equal(parsed.parts.length, 2);
  const combined = parsed.parts.map((p) => p.text).join('');
  assert.equal(combined, '{"tool_calls":[{"name":"read_file","input":{"path":"README.MD"}}]}');
});
// End-to-end through the sieve: the split tool JSON must become a tool_calls
// event and no fragment of the JSON may leak into plain-text output.
test('parseChunkForContent + sieve does not leak suspicious prefix in split tool json case', () => {
  const chunk = {
    p: 'response',
    v: [
      { p: 'response/content', v: '{"' },
      { p: 'response/content', v: 'tool_calls":[{"name":"read_file","input":{"path":"README.MD"}}]}' },
    ],
  };
  const parsed = parseChunkForContent(chunk, false, 'text');
  const state = createToolSieveState();
  const events = [];
  for (const part of parsed.parts) {
    events.push(...processToolSieveChunk(state, part.text, ['read_file']));
  }
  events.push(...flushToolSieve(state, ['read_file']));
  const hasToolCalls = events.some((evt) => evt.type === 'tool_calls' && evt.calls && evt.calls.length > 0);
  const leakedText = events
    .filter((evt) => evt.type === 'text' && evt.text)
    .map((evt) => evt.text)
    .join('');
  assert.equal(hasToolCalls, true);
  assert.equal(leakedText.includes('{'), false);
  assert.equal(leakedText.toLowerCase().includes('tool_calls'), false);
});
// Nested array payloads (raw strings and typed content objects) are flattened.
test('parseChunkForContent consumes nested item.v array payloads', () => {
  const chunk = {
    p: 'response',
    v: [
      { p: 'response/content', v: ['A', 'B'] },
      { p: 'response/content', v: [{ content: 'C', type: 'RESPONSE' }] },
    ],
  };
  const parsed = parseChunkForContent(chunk, false, 'text');
  assert.equal(parsed.finished, false);
  assert.equal(parsed.parts.map((p) => p.text).join(''), 'ABC');
});
// A status FINISHED entry inside the array terminates the chunk with no parts.
test('parseChunkForContent detects nested status FINISHED in array payload', () => {
  const chunk = {
    p: 'response',
    v: [{ p: 'status', v: 'FINISHED' }],
  };
  const parsed = parseChunkForContent(chunk, false, 'text');
  assert.equal(parsed.finished, true);
  assert.deepEqual(parsed.parts, []);
});
// Go-parity rule: entries without a `v` field are ignored entirely.
test('parseChunkForContent ignores items without v to match Go parser behavior', () => {
  const chunk = {
    p: 'response',
    v: [{ type: 'RESPONSE', content: 'no-v-content' }],
  };
  const parsed = parseChunkForContent(chunk, false, 'text');
  assert.equal(parsed.finished, false);
  assert.deepEqual(parsed.parts, []);
});
// Fragment entries carry explicit THINK/RESPONSE types; the parser must emit
// the transition (newType flips from thinking to text).
test('parseChunkForContent handles response/fragments APPEND with thinking and response transitions', () => {
  const chunk = {
    p: 'response/fragments',
    o: 'APPEND',
    v: [
      { type: 'THINK', content: '思考中' },
      { type: 'RESPONSE', content: '结论' },
    ],
  };
  const parsed = parseChunkForContent(chunk, true, 'thinking');
  assert.equal(parsed.finished, false);
  assert.equal(parsed.newType, 'text');
  assert.deepEqual(parsed.parts, [
    { text: '思考中', type: 'thinking' },
    { text: '结论', type: 'text' },
  ]);
});
// Some upstream chunks wrap fragments inside a response object; both shapes parse.
test('parseChunkForContent supports wrapped response.fragments object shape', () => {
  const chunk = {
    p: 'response',
    v: {
      response: {
        fragments: [
          { type: 'RESPONSE', content: 'A' },
          { type: 'RESPONSE', content: 'B' },
        ],
      },
    },
  };
  const parsed = parseChunkForContent(chunk, false, 'text');
  assert.equal(parsed.finished, false);
  assert.equal(parsed.parts.map((p) => p.text).join(''), 'AB');
});

View File

@@ -0,0 +1,130 @@
'use strict';
const test = require('node:test');
const assert = require('node:assert/strict');
const {
extractToolNames,
createToolSieveState,
processToolSieveChunk,
flushToolSieve,
parseToolCalls,
} = require('./stream-tool-sieve');
// Drive a fresh sieve over every text chunk in order, then flush it,
// returning the full ordered event stream the sieve produced.
function runSieve(chunks, toolNames) {
  const sieveState = createToolSieveState();
  const collected = [];
  for (const piece of chunks) {
    for (const evt of processToolSieveChunk(sieveState, piece, toolNames)) {
      collected.push(evt);
    }
  }
  // Finalize: any buffered tail is resolved into its last events.
  for (const evt of flushToolSieve(sieveState, toolNames)) {
    collected.push(evt);
  }
  return collected;
}
// Concatenate the text of every non-empty text event, preserving order.
function collectText(events) {
  let combined = '';
  for (const evt of events) {
    if (evt.type === 'text' && evt.text) {
      combined += evt.text;
    }
  }
  return combined;
}
// Tools without a usable name fall back to 'unknown' so tool mode stays on.
test('extractToolNames keeps tool mode enabled with unknown fallback', () => {
  const names = extractToolNames([
    { function: { description: 'no name tool' } },
    { function: { name: ' read_file ' } },
    {},
  ]);
  assert.deepEqual(names, ['unknown', 'read_file', 'unknown']);
});
// Argument strings that parse to non-objects are wrapped as { _raw } — Go parity.
test('parseToolCalls keeps non-object argument strings as _raw (Go parity)', () => {
  const payload = JSON.stringify({
    tool_calls: [
      { name: 'read_file', input: '123' },
      { name: 'list_dir', input: '[1,2,3]' },
    ],
  });
  const calls = parseToolCalls(payload, ['read_file', 'list_dir']);
  assert.deepEqual(calls, [
    { name: 'read_file', input: { _raw: '123' } },
    { name: 'list_dir', input: { _raw: '[1,2,3]' } },
  ]);
});
// Names outside the declared schema are still intercepted rather than leaked as text.
test('parseToolCalls still intercepts unknown schema names to avoid leaks', () => {
  const payload = JSON.stringify({
    tool_calls: [{ name: 'not_in_schema', input: { q: 'go' } }],
  });
  const calls = parseToolCalls(payload, ['search']);
  assert.equal(calls.length, 1);
  assert.equal(calls[0].name, 'not_in_schema');
});
// Fenced ```json blocks and stringified function.arguments payloads both parse.
test('parseToolCalls supports fenced json and function.arguments string payload', () => {
  const text = [
    'I will call a tool now.',
    '```json',
    '{"tool_calls":[{"function":{"name":"read_file","arguments":"{\\"path\\":\\"README.md\\"}"}}]}',
    '```',
  ].join('\n');
  const calls = parseToolCalls(text, ['read_file']);
  assert.equal(calls.length, 1);
  assert.equal(calls[0].name, 'read_file');
  assert.deepEqual(calls[0].input, { path: 'README.md' });
});
// The telltale '{"' prefix arrives alone; the key only converges in chunk two.
// The sieve must still emit tool_calls and withhold the JSON from text output.
test('sieve emits tool_calls and does not leak suspicious prefix on late key convergence', () => {
  const events = runSieve(
    [
      '{"',
      'tool_calls":[{"name":"read_file","input":{"path":"README.MD"}}]}',
      '后置正文C。',
    ],
    ['read_file'],
  );
  const leakedText = collectText(events);
  const hasToolCall = events.some((evt) => evt.type === 'tool_calls' && Array.isArray(evt.calls) && evt.calls.length > 0);
  assert.equal(hasToolCall, true);
  assert.equal(leakedText.includes('{'), false);
  assert.equal(leakedText.toLowerCase().includes('tool_calls'), false);
  assert.equal(leakedText.includes('后置正文C。'), true);
});
// Single-quoted pseudo-JSON is invalid: no tool call, body dropped, text kept.
test('sieve drops invalid tool json body while preserving surrounding text', () => {
  const events = runSieve(
    [
      '前置正文D。',
      "{'tool_calls':[{'name':'read_file','input':{'path':'README.MD'}}]}",
      '后置正文E。',
    ],
    ['read_file'],
  );
  const leakedText = collectText(events);
  const hasToolCall = events.some((evt) => evt.type === 'tool_calls');
  assert.equal(hasToolCall, false);
  assert.equal(leakedText.includes('前置正文D。'), true);
  assert.equal(leakedText.includes('后置正文E。'), true);
  assert.equal(leakedText.toLowerCase().includes('tool_calls'), false);
});
// A stream ending mid-capture must not flush the partial JSON into text.
test('sieve suppresses incomplete captured tool json on stream finalize', () => {
  const events = runSieve(
    ['前置正文F。', '{"tool_calls":[{"name":"read_file"'],
    ['read_file'],
  );
  const leakedText = collectText(events);
  assert.equal(leakedText.includes('前置正文F。'), true);
  assert.equal(leakedText.toLowerCase().includes('tool_calls'), false);
  assert.equal(leakedText.includes('{'), false);
});
// Plain prose in tool mode passes through untouched.
test('sieve keeps plain text intact in tool mode when no tool call appears', () => {
  const events = runSieve(
    ['你好,', '这是普通文本回复。', '请继续。'],
    ['read_file'],
  );
  const leakedText = collectText(events);
  const hasToolCall = events.some((evt) => evt.type === 'tool_calls');
  assert.equal(hasToolCall, false);
  assert.equal(leakedText, '你好,这是普通文本回复。请继续。');
});

View File

@@ -327,6 +327,7 @@ func (r *Runner) runPreflight(ctx context.Context) error {
{"go", "test", "./...", "-count=1"},
{"node", "--check", "api/chat-stream.js"},
{"node", "--check", "api/helpers/stream-tool-sieve.js"},
{"node", "--test", "api/helpers/stream-tool-sieve.test.js", "api/chat-stream.test.js"},
{"npm", "run", "build", "--prefix", "webui"},
}
f, err := os.OpenFile(r.preflightLog, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o644)