mirror of
https://github.com/CJackHwang/ds2api.git
synced 2026-05-05 08:55:28 +08:00
refactor: implement robust think tag stripping and CDATA handling for SSE stream parsing
This commit is contained in:
@@ -7,6 +7,53 @@ const {
|
||||
SKIP_EXACT_PATHS,
|
||||
} = require('../shared/deepseek-constants');
|
||||
|
||||
|
||||
|
||||
/**
 * Strip literal <think> / </think> markers (case-insensitive, tolerating
 * internal whitespace such as "</ think >") from a string.
 *
 * @param {*} text - candidate string; any non-string or empty value is
 *   returned unchanged.
 * @returns {*} the scrubbed string, or the original value when there was
 *   nothing to scrub.
 */
function stripThinkTags(text) {
  const isScrubbable = typeof text === 'string' && text.length > 0;
  if (!isScrubbable) {
    return text;
  }
  // Matches "<think>", "</think>", "< think >", "</ THINK >", etc.
  return text.replace(/<\/?\s*think\s*>/gi, '');
}
|
||||
|
||||
/**
 * Walk an ordered list of content parts and, once a closing </think> tag is
 * found inside a "thinking" part, split that part and reclassify everything
 * after the tag as "text". Leaked think tags are also scrubbed out of every
 * non-thinking part.
 *
 * @param {Array<{text: string, type: string}>} parts - incoming parts; falsy
 *   entries are skipped.
 * @returns {{parts: Array<{text: string, type: string}>, transitioned: boolean}}
 *   cleaned parts plus whether the thinking-to-text transition occurred.
 */
function splitThinkingParts(parts) {
  const result = [];
  let transitioned = false;

  // Push a part only if scrubbing leaves non-empty text behind.
  const pushScrubbed = (text, type) => {
    const cleaned = stripThinkTags(text);
    if (cleaned) {
      result.push({ text: cleaned, type });
    }
  };

  for (const part of parts) {
    if (!part) {
      continue;
    }

    if (part.type !== 'thinking') {
      pushScrubbed(part.text, part.type);
      continue;
    }

    // After the close tag has been consumed, residual "thinking" parts are
    // really answer text that leaked with the wrong label.
    if (transitioned) {
      pushScrubbed(part.text, 'text');
      continue;
    }

    const close = /<\/\s*think\s*>/i.exec(part.text);
    if (close === null) {
      // Still inside the thinking phase; pass the part through untouched.
      result.push(part);
      continue;
    }

    transitioned = true;
    const before = part.text.substring(0, close.index);
    if (before) {
      result.push({ text: before, type: 'thinking' });
    }
    const after = stripThinkTags(part.text.substring(close.index + close[0].length));
    if (after) {
      result.push({ text: after, type: 'text' });
    }
  }

  return { parts: result, transitioned };
}
|
||||
|
||||
function parseChunkForContent(chunk, thinkingEnabled, currentType, stripReferenceMarkers = true) {
|
||||
if (!chunk || typeof chunk !== 'object') {
|
||||
return {
|
||||
@@ -147,7 +194,11 @@ function parseChunkForContent(chunk, thinkingEnabled, currentType, stripReferenc
|
||||
|
||||
let partType = 'text';
|
||||
if (pathValue === 'response/thinking_content') {
|
||||
partType = 'thinking';
|
||||
if (newType === 'text') {
|
||||
partType = 'text';
|
||||
} else {
|
||||
partType = 'thinking';
|
||||
}
|
||||
} else if (pathValue === 'response/content') {
|
||||
partType = 'text';
|
||||
} else if (pathValue.includes('response/fragments') && pathValue.includes('/content')) {
|
||||
@@ -186,9 +237,16 @@ function parseChunkForContent(chunk, thinkingEnabled, currentType, stripReferenc
|
||||
if (content) {
|
||||
parts.push({ text: content, type: partType });
|
||||
}
|
||||
|
||||
let resolvedParts = filterLeakedContentFilterParts(parts);
|
||||
const splitResult = splitThinkingParts(resolvedParts);
|
||||
if (splitResult.transitioned) {
|
||||
newType = 'text';
|
||||
}
|
||||
|
||||
return {
|
||||
parsed: true,
|
||||
parts: filterLeakedContentFilterParts(parts),
|
||||
parts: splitResult.parts,
|
||||
finished: false,
|
||||
contentFilter: false,
|
||||
errorMessage: '',
|
||||
@@ -213,9 +271,16 @@ function parseChunkForContent(chunk, thinkingEnabled, currentType, stripReferenc
|
||||
};
|
||||
}
|
||||
parts.push(...extracted.parts);
|
||||
|
||||
let resolvedParts = filterLeakedContentFilterParts(parts);
|
||||
const splitResult = splitThinkingParts(resolvedParts);
|
||||
if (splitResult.transitioned) {
|
||||
newType = 'text';
|
||||
}
|
||||
|
||||
return {
|
||||
parsed: true,
|
||||
parts: filterLeakedContentFilterParts(parts),
|
||||
parts: splitResult.parts,
|
||||
finished: false,
|
||||
contentFilter: false,
|
||||
errorMessage: '',
|
||||
@@ -249,9 +314,16 @@ function parseChunkForContent(chunk, thinkingEnabled, currentType, stripReferenc
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let resolvedParts = filterLeakedContentFilterParts(parts);
|
||||
const splitResult = splitThinkingParts(resolvedParts);
|
||||
if (splitResult.transitioned) {
|
||||
newType = 'text';
|
||||
}
|
||||
|
||||
return {
|
||||
parsed: true,
|
||||
parts: filterLeakedContentFilterParts(parts),
|
||||
parts: splitResult.parts,
|
||||
finished: false,
|
||||
contentFilter: false,
|
||||
errorMessage: '',
|
||||
@@ -546,4 +618,5 @@ module.exports = {
|
||||
isFragmentStatusPath,
|
||||
isCitation,
|
||||
stripReferenceMarkers: stripReferenceMarkersText,
|
||||
stripThinkTags,
|
||||
};
|
||||
|
||||
@@ -273,7 +273,10 @@ func splitThinkingParts(parts []ContentPart) ([]ContentPart, bool) {
|
||||
continue
|
||||
}
|
||||
if p.Type != "thinking" {
|
||||
out = append(out, p)
|
||||
cleaned := stripThinkTags(p.Text)
|
||||
if cleaned != "" {
|
||||
out = append(out, ContentPart{Text: cleaned, Type: p.Type})
|
||||
}
|
||||
continue
|
||||
}
|
||||
loc := thinkClosePattern.FindStringIndex(p.Text)
|
||||
@@ -294,7 +297,8 @@ func splitThinkingParts(parts []ContentPart) ([]ContentPart, bool) {
|
||||
}
|
||||
}
|
||||
if !thinkingDone {
|
||||
return parts, false
|
||||
// Return 'out' instead of 'parts' because text parts might have been cleaned via stripThinkTags
|
||||
return out, false
|
||||
}
|
||||
return out, true
|
||||
}
|
||||
|
||||
@@ -149,3 +149,17 @@ func TestParseSSEChunkForContentAutoTransitionsState(t *testing.T) {
|
||||
t.Fatalf("expected nextType2 to remain text, got %q", nextType2)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSSEChunkForContentStripsLeakedThinkTagsFromText(t *testing.T) {
|
||||
chunk := map[string]any{
|
||||
"p": "response/content", // This makes the part type "text"
|
||||
"v": "normal text <think>leaked</think> end",
|
||||
}
|
||||
parts, _, _ := ParseSSEChunkForContent(chunk, true, "text")
|
||||
if len(parts) != 1 {
|
||||
t.Fatalf("expected 1 part, got %d: %#v", len(parts), parts)
|
||||
}
|
||||
if parts[0].Type != "text" || parts[0].Text != "normal text leaked end" {
|
||||
t.Fatalf("expected leaked think tag to be stripped, got %#v", parts[0])
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user