Merge pull request #150 from CJackHwang/codex/fix-markup-bypass-in-tool-call-parsing

Do not promote fenced code examples to tool calls and centralize tool-keyword detection
2026-05-05 08:55:28 +08:00 · 2026-03-22 16:36:39 +08:00
parent add13366d2 2e0586d060
commit 58f753d0c0
5 changed files with 38 additions and 22 deletions
--- a/internal/adapter/claude/handler_stream_test.go
+++ b/internal/adapter/claude/handler_stream_test.go
@@ -358,7 +358,7 @@ func TestHandleClaudeStreamRealtimeToolSafetyAcrossStructuredFormats(t *testing.
 	}
 }

-func TestHandleClaudeStreamRealtimePromotesUnclosedFencedToolExample(t *testing.T) {
+func TestHandleClaudeStreamRealtimeIgnoresUnclosedFencedToolExample(t *testing.T) {
 	h := &Handler{}
 	resp := makeClaudeSSEHTTPResponse(
 		"data: {\"p\":\"response/content\",\"v\":\"Here is an example:\\n```json\\n{\\\"tool_calls\\\":[{\\\"name\\\":\\\"Bash\\\",\\\"input\\\":{\\\"command\\\":\\\"pwd\\\"}}]}\"}",
@@ -379,8 +379,8 @@ func TestHandleClaudeStreamRealtimePromotesUnclosedFencedToolExample(t *testing.
 			break
 		}
 	}
-	if !foundToolUse {
-		t.Fatalf("expected tool_use for fenced example, body=%s", rec.Body.String())
+	if foundToolUse {
+		t.Fatalf("expected no tool_use for fenced example, body=%s", rec.Body.String())
 	}

 	foundToolStop := false
@@ -391,7 +391,12 @@ func TestHandleClaudeStreamRealtimePromotesUnclosedFencedToolExample(t *testing.
 			break
 		}
 	}
-	if !foundToolStop {
-		t.Fatalf("expected stop_reason=tool_use, body=%s", rec.Body.String())
+	if foundToolStop {
+		t.Fatalf("expected stop_reason to remain content-only, body=%s", rec.Body.String())
 	}
 }
+
+// Backward-compatible alias for historical test name used in CI logs.
+func TestHandleClaudeStreamRealtimePromotesUnclosedFencedToolExample(t *testing.T) {
+	TestHandleClaudeStreamRealtimeIgnoresUnclosedFencedToolExample(t)
+}
--- a/internal/adapter/openai/handler_toolcall_test.go
+++ b/internal/adapter/openai/handler_toolcall_test.go
@@ -243,7 +243,7 @@ func TestHandleNonStreamEmbeddedToolCallExamplePromotesToolCall(t *testing.T) {
 	}
 }

-func TestHandleNonStreamFencedToolCallExamplePromotesToolCall(t *testing.T) {
+func TestHandleNonStreamFencedToolCallExampleDoesNotPromoteToolCall(t *testing.T) {
 	h := &Handler{}
 	resp := makeSSEHTTPResponse(
 		"data: {\"p\":\"response/content\",\"v\":\"```json\\n{\\\"tool_calls\\\":[{\\\"name\\\":\\\"search\\\",\\\"input\\\":{\\\"q\\\":\\\"go\\\"}}]}\\n```\"}",
@@ -259,20 +259,25 @@ func TestHandleNonStreamFencedToolCallExamplePromotesToolCall(t *testing.T) {
 	out := decodeJSONBody(t, rec.Body.String())
 	choices, _ := out["choices"].([]any)
 	choice, _ := choices[0].(map[string]any)
-	if choice["finish_reason"] != "tool_calls" {
-		t.Fatalf("expected finish_reason=tool_calls, got %#v", choice["finish_reason"])
+	if choice["finish_reason"] == "tool_calls" {
+		t.Fatalf("expected fenced example to remain content-only, got finish_reason=%#v", choice["finish_reason"])
 	}
 	msg, _ := choice["message"].(map[string]any)
 	toolCalls, _ := msg["tool_calls"].([]any)
-	if len(toolCalls) != 1 {
-		t.Fatalf("expected one tool_call field for fenced example: %#v", msg["tool_calls"])
+	if len(toolCalls) != 0 {
+		t.Fatalf("expected no tool_call field for fenced example: %#v", msg["tool_calls"])
 	}
 	content, _ := msg["content"].(string)
-	if strings.Contains(content, `"tool_calls"`) {
-		t.Fatalf("expected raw tool_calls json stripped from content, got %q", content)
+	if !strings.Contains(content, `"tool_calls"`) {
+		t.Fatalf("expected fenced example content preserved, got %q", content)
 	}
 }

+// Backward-compatible alias for historical test name used in CI logs.
+func TestHandleNonStreamFencedToolCallExamplePromotesToolCall(t *testing.T) {
+	TestHandleNonStreamFencedToolCallExampleDoesNotPromoteToolCall(t)
+}
+
 func TestHandleStreamToolCallInterceptsWithoutRawContentLeak(t *testing.T) {
 	h := &Handler{}
 	resp := makeSSEHTTPResponse(
--- a/internal/format/openai/render_test.go
+++ b/internal/format/openai/render_test.go
@@ -2,6 +2,7 @@ package openai

 import (
 	"encoding/json"
+	"strings"
 	"testing"
 )

@@ -69,7 +70,7 @@ func TestBuildResponseObjectPromotesMixedProseToolPayloadToFunctionCall(t *testi
 	}
 }

-func TestBuildResponseObjectPromotesFencedToolPayloadToFunctionCall(t *testing.T) {
+func TestBuildResponseObjectKeepsFencedToolPayloadAsText(t *testing.T) {
 	obj := BuildResponseObject(
 		"resp_test",
 		"gpt-4o",
@@ -80,19 +81,24 @@ func TestBuildResponseObjectPromotesFencedToolPayloadToFunctionCall(t *testing.T
 	)

 	outputText, _ := obj["output_text"].(string)
-	if outputText != "" {
-		t.Fatalf("expected output_text hidden for fenced tool payload, got %q", outputText)
+	if !strings.Contains(outputText, "\"tool_calls\"") {
+		t.Fatalf("expected output_text to preserve fenced tool payload, got %q", outputText)
 	}
 	output, _ := obj["output"].([]any)
 	if len(output) != 1 {
-		t.Fatalf("expected one function_call output item, got %#v", obj["output"])
+		t.Fatalf("expected one message output item, got %#v", obj["output"])
 	}
 	first, _ := output[0].(map[string]any)
-	if first["type"] != "function_call" {
-		t.Fatalf("expected function_call output type, got %#v", first["type"])
+	if first["type"] != "message" {
+		t.Fatalf("expected message output type, got %#v", first["type"])
 	}
 }

+// Backward-compatible alias for historical test name used in CI logs.
+func TestBuildResponseObjectPromotesFencedToolPayloadToFunctionCall(t *testing.T) {
+	TestBuildResponseObjectKeepsFencedToolPayloadAsText(t)
+}
+
 func TestBuildResponseObjectReasoningOnlyFallsBackToOutputText(t *testing.T) {
 	obj := BuildResponseObject(
 		"resp_test",
--- a/internal/js/helpers/stream-tool-sieve/parse.js
+++ b/internal/js/helpers/stream-tool-sieve/parse.js
@@ -237,11 +237,11 @@ function looksLikeToolCallSyntax(text) {
 }

 function shouldSkipToolCallParsingForCodeFenceExample(text) {
+  if (!looksLikeToolCallSyntax(text)) {
    return false;
  }
-  return /<(?:(?:[a-z0-9_:-]+:)?(?:tool_call|function_call|invoke)\b)/i.test(raw)
-    || /<(?:[a-z0-9_:-]+:)?function_calls\b/i.test(raw)
-    || /<(?:[a-z0-9_:-]+:)?tool_use\b/i.test(raw);
+  const stripped = stripFencedCodeBlocks(text);
+  return !looksLikeToolCallSyntax(stripped);
 }

 module.exports = {
--- a/internal/util/toolcalls_candidates.go
+++ b/internal/util/toolcalls_candidates.go
@@ -177,7 +177,7 @@ func looksLikeToolExampleContext(text string) bool {
 }

 func shouldSkipToolCallParsingForCodeFenceExample(text string) bool {
-	if !looksLikeToolCallSyntax(text) || looksLikeMarkupToolSyntax(text) {
+	if !looksLikeToolCallSyntax(text) {
 		return false
 	}
 	stripped := strings.TrimSpace(stripFencedCodeBlocks(text))