feat: tiktoken tokenizer — accurate BPE token counting with provider-aware encoding

This commit is contained in:
2026-04-05 21:46:43 +02:00
parent fbb28de0b8
commit f7782215dc
4 changed files with 115 additions and 0 deletions

View File

@@ -0,0 +1,64 @@
package tokenizer
import (
"log/slog"
"sync"
tiktoken "github.com/pkoukk/tiktoken-go"
)
// Tokenizer reports how many tokens a piece of text occupies under a named
// tiktoken BPE encoding. When that encoding cannot be loaded, counts are
// estimated from the byte length instead (one token per four bytes, rounded up).
type Tokenizer struct {
	// encoding is the tiktoken encoding name handed to New.
	encoding string

	// mu guards loaded and enc.
	mu sync.Mutex
	// loaded records that a load of the encoding has already been attempted,
	// whether or not it succeeded.
	loaded bool
	// enc is the loaded encoding; it stays nil when loading failed.
	enc *tiktoken.Tiktoken

	// warnOnce limits the "encoding unavailable" warning to a single log line.
	warnOnce sync.Once
}
// New builds a Tokenizer bound to the given tiktoken encoding name
// (for example "cl100k_base"). New only records the name; it does not load
// the encoding itself.
func New(encoding string) *Tokenizer {
	tok := new(Tokenizer)
	tok.encoding = encoding
	return tok
}
// ForProvider selects an encoding by provider name and returns a Tokenizer
// for it: "anthropic" and "openai" map to cl100k_base, and every other name
// (including unrecognized ones) maps to o200k_base.
func ForProvider(providerName string) *Tokenizer {
	// Default covers mistral, google, ollama, llamacpp, and unknown providers.
	encodingName := "o200k_base"
	if providerName == "anthropic" || providerName == "openai" {
		encodingName = "cl100k_base"
	}
	return New(encodingName)
}
// Count reports the number of tokens in text under the configured encoding.
// If the encoding is unavailable, it returns the estimate (len(text)+3)/4,
// i.e. the byte length divided by four and rounded up.
func (t *Tokenizer) Count(text string) int {
	bpe := t.getEncoding()
	if bpe == nil {
		// Heuristic fallback: roughly one token per four bytes.
		return (len(text) + 3) / 4
	}
	return len(bpe.Encode(text, nil, nil))
}
// getEncoding returns the tiktoken encoding for t, loading it on the first
// call. The load is attempted at most once: a failure is logged as a warning
// and every call, including later ones, then returns nil.
func (t *Tokenizer) getEncoding() *tiktoken.Tiktoken {
	t.mu.Lock()
	defer t.mu.Unlock()

	if !t.loaded {
		// Flag the attempt before loading so a failed load is never retried.
		t.loaded = true

		loadedEnc, loadErr := tiktoken.GetEncoding(t.encoding)
		if loadErr == nil {
			t.enc = loadedEnc
		} else {
			t.warnOnce.Do(func() {
				slog.Warn("tiktoken encoding unavailable, falling back to heuristic",
					"encoding", t.encoding, "error", loadErr)
			})
		}
	}

	// Still nil if the load failed, which tells the caller to use the heuristic.
	return t.enc
}

View File

@@ -0,0 +1,47 @@
package tokenizer_test
import (
"testing"
"somegit.dev/Owlibou/gnoma/internal/tokenizer"
)
// TestTokenizer_CountKnownText checks that a short, well-known phrase yields
// a small, plausible token count under cl100k_base.
func TestTokenizer_CountKnownText(t *testing.T) {
	// "Hello world" is 2 tokens in cl100k_base; the accepted range is kept
	// deliberately loose.
	got := tokenizer.New("cl100k_base").Count("Hello world")
	if got >= 1 && got <= 5 {
		return
	}
	t.Errorf("unexpected token count for 'Hello world': %d", got)
}
// TestTokenizer_FallbackOnBadEncoding checks that an unknown encoding name
// does not panic and still produces a positive count via the heuristic.
func TestTokenizer_FallbackOnBadEncoding(t *testing.T) {
	const badEncoding = "nonexistent_encoding_xyz"

	fallback := tokenizer.New(badEncoding)
	if got := fallback.Count("some text here"); got <= 0 {
		t.Errorf("expected positive count, got %d", got)
	}
}
// TestForProvider_KnownProviders checks that every provider name, including
// an unrecognized one, yields a Tokenizer that returns a positive count.
func TestForProvider_KnownProviders(t *testing.T) {
	providers := []string{
		"anthropic",
		"openai",
		"mistral",
		"google",
		"ollama",
		"llamacpp",
		"unknown",
	}

	for i := range providers {
		name := providers[i]
		got := tokenizer.ForProvider(name).Count("test input")
		if got > 0 {
			continue
		}
		t.Errorf("provider %q: expected positive count, got %d", name, got)
	}
}
// TestTokenizer_CodeCountsReasonably checks that a one-line Go snippet is
// counted within a plausible token range under cl100k_base.
func TestTokenizer_CodeCountsReasonably(t *testing.T) {
	const (
		snippet   = `func main() { fmt.Println("hello") }`
		minTokens = 5
		maxTokens = 20
	)

	got := tokenizer.New("cl100k_base").Count(snippet)
	if inRange := got >= minTokens && got <= maxTokens; !inRange {
		t.Errorf("code token count out of expected range: %d", got)
	}
}