fix(toolcall): eliminate strings.ToLower panics from Unicode case folding

Replace all strings.ToLower usage with ASCII case-insensitive matching
(hasASCIIPrefixFoldAt, indexASCIIFold, hasDSMLPrefix) to prevent slice
bounds errors when Unicode characters change byte length after case
folding (e.g., Turkish İ U+0130 → i + combining dot: 2 bytes → 3 bytes).

Root cause: code created a strings.ToLower(text) copy, found byte
positions in that copy, then used those positions to slice the
original text — byte offsets that were valid in the lowercased copy
became out-of-bounds in the original when case folding changed byte
lengths.

Files changed:
- toolcalls_scan.go: remove 5 lower usages, add hasDSMLPrefix
- toolcalls_parse_markup.go: remove 3 lower usages, add indexASCIIFold
- toolcalls_markup.go: SanitizeLooseCDATA lower removal
- toolcalls_parse.go: updateCDATAStateForStrip lower removal
- tool_prompt.go: align DSML pipe characters with tool call spec
- tool_prompt_test.go: fix pre-existing test character mismatch
This commit is contained in:
waiwai
2026-05-09 15:05:51 +08:00
parent 7ab5a0e66d
commit 1e00e482a6
8 changed files with 97 additions and 61 deletions

View File

@@ -141,7 +141,6 @@ func findXMLElementBlocks(text, tag string) []xmlElementBlock {
}
func findXMLStartTagOutsideCDATA(text, tag string, from int) (start, bodyStart int, attrs string, ok bool) {
lower := strings.ToLower(text)
target := "<" + strings.ToLower(tag)
for i := maxInt(from, 0); i < len(text); {
next, advanced, blocked := skipXMLIgnoredSection(text, i)
@@ -152,7 +151,7 @@ func findXMLStartTagOutsideCDATA(text, tag string, from int) (start, bodyStart i
i = next
continue
}
if strings.HasPrefix(lower[i:], target) && hasXMLTagBoundary(text, i+len(target)) {
if hasASCIIPrefixFoldAt(text, i, target) && hasXMLTagBoundary(text, i+len(target)) {
end := findXMLTagEnd(text, i+len(target))
if end < 0 {
return -1, -1, "", false
@@ -165,7 +164,6 @@ func findXMLStartTagOutsideCDATA(text, tag string, from int) (start, bodyStart i
}
func findMatchingXMLEndTagOutsideCDATA(text, tag string, from int) (closeStart, closeEnd int, ok bool) {
lower := strings.ToLower(text)
openTarget := "<" + strings.ToLower(tag)
closeTarget := "</" + strings.ToLower(tag)
depth := 1
@@ -178,7 +176,7 @@ func findMatchingXMLEndTagOutsideCDATA(text, tag string, from int) (closeStart,
i = next
continue
}
if strings.HasPrefix(lower[i:], closeTarget) && hasXMLTagBoundary(text, i+len(closeTarget)) {
if hasASCIIPrefixFoldAt(text, i, closeTarget) && hasXMLTagBoundary(text, i+len(closeTarget)) {
end := findXMLTagEnd(text, i+len(closeTarget))
if end < 0 {
return -1, -1, false
@@ -190,7 +188,7 @@ func findMatchingXMLEndTagOutsideCDATA(text, tag string, from int) (closeStart,
i = end + 1
continue
}
if strings.HasPrefix(lower[i:], openTarget) && hasXMLTagBoundary(text, i+len(openTarget)) {
if hasASCIIPrefixFoldAt(text, i, openTarget) && hasXMLTagBoundary(text, i+len(openTarget)) {
end := findXMLTagEnd(text, i+len(openTarget))
if end < 0 {
return -1, -1, false
@@ -247,6 +245,23 @@ func asciiLower(b byte) byte {
return b
}
// indexASCIIFold returns the absolute byte position in s where substr (ASCII-only) is
// found case-insensitively, scanning forward from start. Returns -1 if not found.
// Unlike strings.Index on a lowercased copy, this does not allocate or risk byte-length
// mismatch when non-ASCII runes change width under case folding.
func indexASCIIFold(s string, start int, substr string) int {
if start < 0 || len(s)-start < len(substr) {
return -1
}
end := len(s) - len(substr) + 1
for i := start; i < end; i++ {
if hasASCIIPrefixFoldAt(s, i, substr) {
return i
}
}
return -1
}
func findToolCDATAEnd(text string, from int) int {
if from < 0 || from >= len(text) {
return -1