refactor: implement robust think tag stripping and CDATA handling for SSE stream parsing

This commit is contained in:
CJACK
2026-04-19 18:35:56 +08:00
parent a1ce954ad5
commit 790a8ca980
3 changed files with 97 additions and 6 deletions

View File

@@ -7,6 +7,53 @@ const {
SKIP_EXACT_PATHS,
} = require('../shared/deepseek-constants');
/**
 * Remove any `<think>` / `</think>` marker tags from a string.
 * Matching is case-insensitive and tolerates whitespace inside the tag
 * (e.g. `</ think >`). Non-string or empty inputs are returned untouched.
 *
 * @param {*} text - Candidate text; only non-empty strings are processed.
 * @returns {*} The cleaned string, or the original value when not a
 *   non-empty string.
 */
function stripThinkTags(text) {
  const isNonEmptyString = typeof text === 'string' && text.length > 0;
  if (!isNonEmptyString) {
    return text;
  }
  return text.replace(/<\/?\s*think\s*>/gi, '');
}
/**
 * Walk a list of `{ text, type }` content parts and detect the point where
 * thinking output ends (the first `</think>` close tag inside a 'thinking'
 * part). The part containing the close tag is split: text before it stays
 * 'thinking', text after it (with any residual tags stripped) becomes 'text'.
 * Every thinking part seen after the transition is remapped to 'text', and
 * all non-thinking parts have leaked think tags stripped. Falsy parts and
 * parts whose cleaned text is empty are dropped.
 *
 * @param {Array<{text: string, type: string}>} parts - Incoming parts.
 * @returns {{parts: Array<{text: string, type: string}>, transitioned: boolean}}
 *   Cleaned parts plus whether a thinking→text transition was detected.
 */
function splitThinkingParts(parts) {
  const result = [];
  let transitioned = false;
  for (const part of parts) {
    if (!part) {
      continue;
    }
    const isThinking = part.type === 'thinking';
    if (!isThinking || transitioned) {
      // Non-thinking parts, and thinking parts arriving after the
      // transition, are emitted as cleaned content (the latter as 'text').
      const cleaned = stripThinkTags(part.text);
      if (cleaned) {
        result.push({ text: cleaned, type: isThinking ? 'text' : part.type });
      }
      continue;
    }
    // Still inside thinking output: look for the close tag.
    const close = /<\/\s*think\s*>/i.exec(part.text);
    if (close === null) {
      result.push(part);
      continue;
    }
    transitioned = true;
    const head = part.text.slice(0, close.index);
    const tail = stripThinkTags(part.text.slice(close.index + close[0].length));
    if (head) {
      result.push({ text: head, type: 'thinking' });
    }
    if (tail) {
      result.push({ text: tail, type: 'text' });
    }
  }
  return { parts: result, transitioned };
}
function parseChunkForContent(chunk, thinkingEnabled, currentType, stripReferenceMarkers = true) {
if (!chunk || typeof chunk !== 'object') {
return {
@@ -147,7 +194,11 @@ function parseChunkForContent(chunk, thinkingEnabled, currentType, stripReferenc
let partType = 'text';
if (pathValue === 'response/thinking_content') {
partType = 'thinking';
if (newType === 'text') {
partType = 'text';
} else {
partType = 'thinking';
}
} else if (pathValue === 'response/content') {
partType = 'text';
} else if (pathValue.includes('response/fragments') && pathValue.includes('/content')) {
@@ -186,9 +237,16 @@ function parseChunkForContent(chunk, thinkingEnabled, currentType, stripReferenc
if (content) {
parts.push({ text: content, type: partType });
}
let resolvedParts = filterLeakedContentFilterParts(parts);
const splitResult = splitThinkingParts(resolvedParts);
if (splitResult.transitioned) {
newType = 'text';
}
return {
parsed: true,
parts: filterLeakedContentFilterParts(parts),
parts: splitResult.parts,
finished: false,
contentFilter: false,
errorMessage: '',
@@ -213,9 +271,16 @@ function parseChunkForContent(chunk, thinkingEnabled, currentType, stripReferenc
};
}
parts.push(...extracted.parts);
let resolvedParts = filterLeakedContentFilterParts(parts);
const splitResult = splitThinkingParts(resolvedParts);
if (splitResult.transitioned) {
newType = 'text';
}
return {
parsed: true,
parts: filterLeakedContentFilterParts(parts),
parts: splitResult.parts,
finished: false,
contentFilter: false,
errorMessage: '',
@@ -249,9 +314,16 @@ function parseChunkForContent(chunk, thinkingEnabled, currentType, stripReferenc
}
}
}
let resolvedParts = filterLeakedContentFilterParts(parts);
const splitResult = splitThinkingParts(resolvedParts);
if (splitResult.transitioned) {
newType = 'text';
}
return {
parsed: true,
parts: filterLeakedContentFilterParts(parts),
parts: splitResult.parts,
finished: false,
contentFilter: false,
errorMessage: '',
@@ -546,4 +618,5 @@ module.exports = {
isFragmentStatusPath,
isCitation,
stripReferenceMarkers: stripReferenceMarkersText,
stripThinkTags,
};

View File

@@ -273,7 +273,10 @@ func splitThinkingParts(parts []ContentPart) ([]ContentPart, bool) {
continue
}
if p.Type != "thinking" {
out = append(out, p)
cleaned := stripThinkTags(p.Text)
if cleaned != "" {
out = append(out, ContentPart{Text: cleaned, Type: p.Type})
}
continue
}
loc := thinkClosePattern.FindStringIndex(p.Text)
@@ -294,7 +297,8 @@ func splitThinkingParts(parts []ContentPart) ([]ContentPart, bool) {
}
}
if !thinkingDone {
return parts, false
// Return 'out' instead of 'parts' because text parts might have been cleaned via stripThinkTags
return out, false
}
return out, true
}

View File

@@ -149,3 +149,17 @@ func TestParseSSEChunkForContentAutoTransitionsState(t *testing.T) {
t.Fatalf("expected nextType2 to remain text, got %q", nextType2)
}
}
// TestParseSSEChunkForContentStripsLeakedThinkTagsFromText verifies that a
// chunk routed through the "response/content" path (which produces a part of
// type "text") has any leaked <think>...</think> tags removed from its text.
func TestParseSSEChunkForContentStripsLeakedThinkTagsFromText(t *testing.T) {
	chunk := map[string]any{
		"p": "response/content",
		"v": "normal text <think>leaked</think> end",
	}
	parts, _, _ := ParseSSEChunkForContent(chunk, true, "text")
	if got := len(parts); got != 1 {
		t.Fatalf("expected 1 part, got %d: %#v", got, parts)
	}
	if p := parts[0]; p.Type != "text" || p.Text != "normal text leaked end" {
		t.Fatalf("expected leaked think tag to be stripped, got %#v", p)
	}
}