mirror of
https://github.com/CJackHwang/ds2api.git
synced 2026-05-19 07:27:43 +08:00
refactor: generalize DSML tag parsing to tolerate model noise; split tiktoken by build tags
Replace hardcoded DSML typo variant lists in Go/Node tool call parsers with generalized prefix consumption that tolerates repeated leading <, repeated DSML prefix noise, and trailing pipe terminators. Split tiktoken-dependent token counting into a build-tagged file for non-cgo platform compatibility. Add /data directory to Dockerfile for bind-mount permissions. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,11 +1,5 @@
|
||||
package util
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
tiktoken "github.com/hupe1980/go-tiktoken"
|
||||
)
|
||||
|
||||
const (
|
||||
defaultTokenizerModel = "gpt-4o"
|
||||
claudeTokenizerModel = "claude"
|
||||
@@ -33,41 +27,6 @@ func CountOutputTokens(text, model string) int {
|
||||
return base
|
||||
}
|
||||
|
||||
func countWithTokenizer(text, model string) int {
|
||||
text = strings.TrimSpace(text)
|
||||
if text == "" {
|
||||
return 0
|
||||
}
|
||||
encoding, err := tiktoken.NewEncodingForModel(tokenizerModelForCount(model))
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
ids, _, err := encoding.Encode(text, nil, nil)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return len(ids)
|
||||
}
|
||||
|
||||
func tokenizerModelForCount(model string) string {
|
||||
model = strings.ToLower(strings.TrimSpace(model))
|
||||
if model == "" {
|
||||
return defaultTokenizerModel
|
||||
}
|
||||
switch {
|
||||
case strings.HasPrefix(model, "claude"):
|
||||
return claudeTokenizerModel
|
||||
case strings.HasPrefix(model, "gpt-4"), strings.HasPrefix(model, "gpt-5"), strings.HasPrefix(model, "o1"), strings.HasPrefix(model, "o3"), strings.HasPrefix(model, "o4"):
|
||||
return defaultTokenizerModel
|
||||
case strings.HasPrefix(model, "deepseek-v4"):
|
||||
return defaultTokenizerModel
|
||||
case strings.HasPrefix(model, "deepseek"):
|
||||
return defaultTokenizerModel
|
||||
default:
|
||||
return defaultTokenizerModel
|
||||
}
|
||||
}
|
||||
|
||||
func conservativePromptPadding(base int) int {
|
||||
padding := base / 50
|
||||
if padding < 4 {
|
||||
|
||||
7
internal/util/token_count_heuristic.go
Normal file
7
internal/util/token_count_heuristic.go
Normal file
@@ -0,0 +1,7 @@
|
||||
//go:build 386 || arm || mips || mipsle || wasm
|
||||
|
||||
package util
|
||||
|
||||
// countWithTokenizer is the tokenizer-free variant of the counter: it ignores
// both the text and the model name and always reports 0.
//
// NOTE(review): presumably callers treat a 0 result as "no tokenizer count
// available" and fall back to a heuristic estimate — confirm against the
// call sites.
func countWithTokenizer(_ string, _ string) int {
	const noTokenizerCount = 0
	return noTokenizerCount
}
|
||||
44
internal/util/token_count_tiktoken.go
Normal file
44
internal/util/token_count_tiktoken.go
Normal file
@@ -0,0 +1,44 @@
|
||||
//go:build !386 && !arm && !mips && !mipsle && !wasm
|
||||
|
||||
package util
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
tiktoken "github.com/hupe1980/go-tiktoken"
|
||||
)
|
||||
|
||||
func countWithTokenizer(text, model string) int {
|
||||
text = strings.TrimSpace(text)
|
||||
if text == "" {
|
||||
return 0
|
||||
}
|
||||
encoding, err := tiktoken.NewEncodingForModel(tokenizerModelForCount(model))
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
ids, _, err := encoding.Encode(text, nil, nil)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return len(ids)
|
||||
}
|
||||
|
||||
func tokenizerModelForCount(model string) string {
|
||||
model = strings.ToLower(strings.TrimSpace(model))
|
||||
if model == "" {
|
||||
return defaultTokenizerModel
|
||||
}
|
||||
switch {
|
||||
case strings.HasPrefix(model, "claude"):
|
||||
return claudeTokenizerModel
|
||||
case strings.HasPrefix(model, "gpt-4"), strings.HasPrefix(model, "gpt-5"), strings.HasPrefix(model, "o1"), strings.HasPrefix(model, "o3"), strings.HasPrefix(model, "o4"):
|
||||
return defaultTokenizerModel
|
||||
case strings.HasPrefix(model, "deepseek-v4"):
|
||||
return defaultTokenizerModel
|
||||
case strings.HasPrefix(model, "deepseek"):
|
||||
return defaultTokenizerModel
|
||||
default:
|
||||
return defaultTokenizerModel
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user