Files
ds2api/internal/toolcall/toolcalls_scan.go
waiwai 1e00e482a6 fix(toolcall): eliminate strings.ToLower panics from Unicode case folding
Replace all strings.ToLower usage with ASCII case-insensitive matching
(hasASCIIPrefixFoldAt, indexASCIIFold, hasDSMLPrefix) to prevent slice
bounds errors when Unicode characters change byte length after case
folding (e.g., Turkish İ U+0130 → i + combining dot: 2 bytes → 3 bytes).

Root cause: code created a strings.ToLower(text) copy, found byte
positions in that copy, then used those positions to slice the
original text — byte offsets that were valid in the lowercased copy
became out-of-bounds in the original when case folding changed byte
lengths.

Files changed:
- toolcalls_scan.go: remove 5 lower usages, add hasDSMLPrefix
- toolcalls_parse_markup.go: remove 3 lower usages, add indexASCIIFold
- toolcalls_markup.go: SanitizeLooseCDATA lower removal
- toolcalls_parse.go: updateCDATAStateForStrip lower removal
- tool_prompt.go: align DSML pipe characters with tool call spec
- tool_prompt_test.go: fix pre-existing test character mismatch
2026-05-09 15:05:51 +08:00

321 lines
6.8 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package toolcall
import "strings"
type toolMarkupNameAlias struct {
raw string
canonical string
dsmlOnly bool
}
var toolMarkupNames = []toolMarkupNameAlias{
{raw: "tool_calls", canonical: "tool_calls"},
{raw: "tool-calls", canonical: "tool_calls", dsmlOnly: true},
{raw: "invoke", canonical: "invoke"},
{raw: "parameter", canonical: "parameter"},
}
type ToolMarkupTag struct {
Start int
End int
NameStart int
NameEnd int
Name string
Closing bool
SelfClosing bool
DSMLLike bool
Canonical bool
}
func ContainsToolMarkupSyntaxOutsideIgnored(text string) (hasDSML, hasCanonical bool) {
for i := 0; i < len(text); {
next, advanced, blocked := skipXMLIgnoredSection(text, i)
if blocked {
return hasDSML, hasCanonical
}
if advanced {
i = next
continue
}
if tag, ok := scanToolMarkupTagAt(text, i); ok {
if tag.DSMLLike {
hasDSML = true
} else {
hasCanonical = true
}
if hasDSML && hasCanonical {
return true, true
}
i = tag.End + 1
continue
}
i++
}
return hasDSML, hasCanonical
}
func ContainsToolCallWrapperSyntaxOutsideIgnored(text string) (hasDSML, hasCanonical bool) {
for i := 0; i < len(text); {
next, advanced, blocked := skipXMLIgnoredSection(text, i)
if blocked {
return hasDSML, hasCanonical
}
if advanced {
i = next
continue
}
if tag, ok := scanToolMarkupTagAt(text, i); ok {
if tag.Name != "tool_calls" {
i = tag.End + 1
continue
}
if tag.DSMLLike {
hasDSML = true
} else {
hasCanonical = true
}
if hasDSML && hasCanonical {
return true, true
}
i = tag.End + 1
continue
}
i++
}
return hasDSML, hasCanonical
}
func FindToolMarkupTagOutsideIgnored(text string, start int) (ToolMarkupTag, bool) {
for i := maxInt(start, 0); i < len(text); {
next, advanced, blocked := skipXMLIgnoredSection(text, i)
if blocked {
return ToolMarkupTag{}, false
}
if advanced {
i = next
continue
}
if tag, ok := scanToolMarkupTagAt(text, i); ok {
return tag, true
}
i++
}
return ToolMarkupTag{}, false
}
func FindMatchingToolMarkupClose(text string, open ToolMarkupTag) (ToolMarkupTag, bool) {
if text == "" || open.Name == "" || open.Closing || open.End >= len(text) {
return ToolMarkupTag{}, false
}
depth := 1
for pos := open.End + 1; pos < len(text); {
tag, ok := FindToolMarkupTagOutsideIgnored(text, pos)
if !ok {
return ToolMarkupTag{}, false
}
if tag.Name != open.Name {
pos = tag.End + 1
continue
}
if tag.Closing {
depth--
if depth == 0 {
return tag, true
}
} else if !tag.SelfClosing {
depth++
}
pos = tag.End + 1
}
return ToolMarkupTag{}, false
}
func scanToolMarkupTagAt(text string, start int) (ToolMarkupTag, bool) {
if start < 0 || start >= len(text) || text[start] != '<' {
return ToolMarkupTag{}, false
}
i := start + 1
for i < len(text) && text[i] == '<' {
i++
}
closing := false
if i < len(text) && text[i] == '/' {
closing = true
i++
}
i, dsmlLike := consumeToolMarkupNamePrefix(text, i)
name, nameLen := matchToolMarkupName(text, i, dsmlLike)
if nameLen == 0 {
return ToolMarkupTag{}, false
}
nameEnd := i + nameLen
nameEndBeforePipes := nameEnd
for next, ok := consumeToolMarkupPipe(text, nameEnd); ok; next, ok = consumeToolMarkupPipe(text, nameEnd) {
nameEnd = next
}
hasTrailingPipe := nameEnd > nameEndBeforePipes
if !hasToolMarkupBoundary(text, nameEnd) {
return ToolMarkupTag{}, false
}
end := findXMLTagEnd(text, nameEnd)
if end < 0 {
if !hasTrailingPipe {
return ToolMarkupTag{}, false
}
end = nameEnd - 1
}
if hasTrailingPipe {
if nextLT := strings.IndexByte(text[nameEnd:], '<'); nextLT >= 0 && end >= nameEnd+nextLT {
end = nameEnd - 1
}
}
trimmed := strings.TrimSpace(text[start : end+1])
return ToolMarkupTag{
Start: start,
End: end,
NameStart: i,
NameEnd: nameEnd,
Name: name,
Closing: closing,
SelfClosing: strings.HasSuffix(trimmed, "/>"),
DSMLLike: dsmlLike,
Canonical: !dsmlLike,
}, true
}
func IsPartialToolMarkupTagPrefix(text string) bool {
if text == "" || text[0] != '<' || strings.Contains(text, ">") {
return false
}
i := 1
for i < len(text) && text[i] == '<' {
i++
}
if i >= len(text) {
return true
}
if text[i] == '/' {
i++
}
for i <= len(text) {
if i == len(text) {
return true
}
if hasToolMarkupNamePrefix(text, i) {
return true
}
if hasDSMLPrefix(text, i) {
return true
}
next, ok := consumeToolMarkupNamePrefixOnce(text, i)
if !ok {
return false
}
i = next
}
return false
}
func consumeToolMarkupNamePrefix(text string, idx int) (int, bool) {
dsmlLike := false
for {
next, ok := consumeToolMarkupNamePrefixOnce(text, idx)
if !ok {
return idx, dsmlLike
}
idx = next
dsmlLike = true
}
}
func consumeToolMarkupNamePrefixOnce(text string, idx int) (int, bool) {
if next, ok := consumeToolMarkupPipe(text, idx); ok {
return next, true
}
if idx < len(text) && (text[idx] == ' ' || text[idx] == '\t' || text[idx] == '\r' || text[idx] == '\n') {
return idx + 1, true
}
if hasASCIIPrefixFoldAt(text, idx, "dsml") {
next := idx + len("dsml")
if next < len(text) && (text[next] == '-' || text[next] == '_') {
next++
}
return next, true
}
return idx, false
}
// hasDSMLPrefix checks if "dsml" starts with text[start:] (case-insensitive).
func hasDSMLPrefix(text string, start int) bool {
const dsml = "dsml"
remain := len(text) - start
if remain <= 0 || remain > len(dsml) {
return false
}
for j := 0; j < remain; j++ {
if asciiLower(text[start+j]) != dsml[j] {
return false
}
}
return true
}
func hasToolMarkupNamePrefix(text string, start int) bool {
for _, name := range toolMarkupNames {
if hasASCIIPrefixFoldAt(text, start, name.raw) {
return true
}
tailLen := len(text) - start
if tailLen > 0 && tailLen <= len(name.raw) {
match := true
for j := 0; j < tailLen; j++ {
if asciiLower(text[start+j]) != asciiLower(name.raw[j]) {
match = false
break
}
}
if match {
return true
}
}
}
return false
}
func matchToolMarkupName(text string, start int, dsmlLike bool) (string, int) {
for _, name := range toolMarkupNames {
if name.dsmlOnly && !dsmlLike {
continue
}
if hasASCIIPrefixFoldAt(text, start, name.raw) {
return name.canonical, len(name.raw)
}
}
return "", 0
}
func consumeToolMarkupPipe(text string, idx int) (int, bool) {
if idx >= len(text) {
return idx, false
}
if text[idx] == '|' {
return idx + 1, true
}
if strings.HasPrefix(text[idx:], "") {
return idx + len(""), true
}
return idx, false
}
func hasToolMarkupBoundary(text string, idx int) bool {
if idx >= len(text) {
return true
}
switch text[idx] {
case ' ', '\t', '\n', '\r', '>', '/':
return true
default:
return false
}
}