-
Notifications
You must be signed in to change notification settings - Fork 1
/
claude.go
57 lines (46 loc) · 1.29 KB
/
claude.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
package tiktoken
import (
	_ "embed"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"strconv"
	"strings"
)
// claude holds the raw contents of resource/claude.json, embedded into the
// binary at build time. NewClaude unmarshals it into a claudeJSON value.
//
//go:embed resource/claude.json
var claude string
// claudeJSON mirrors the top-level JSON object of the embedded claude
// resource. NewClaude unmarshals the resource into this type and builds a
// Codec from its fields.
type claudeJSON struct {
// ExplicitNVocab is read from "explicit_n_vocab" and copied unchanged into
// Codec.ExplicitNVocab.
ExplicitNVocab int `json:"explicit_n_vocab"`
// PatStr is read from "pat_str" and copied unchanged into Codec.PatStr.
// NOTE(review): presumably the regular expression used to pre-split text
// before BPE merging — confirm against Codec.
PatStr string `json:"pat_str"`
// BPERanks is read from "bpe_ranks". NewClaude treats it as a single
// space-separated string: the first field is ignored, the second is parsed
// as an integer offset, and every remaining field is a base64 (standard
// encoding) token.
BPERanks string `json:"bpe_ranks"`
// SpecialTokens is read from "special_tokens" and copied unchanged into
// Codec.SpecialTokens.
SpecialTokens map[string]uint `json:"special_tokens"`
}
// NewClaude creates a new Codec instance for the claude tokenization scheme.
//
// It unmarshals the embedded claude resource, expands the mergeable ranks from
// its bpe_ranks field, and passes the explicit vocabulary size, pattern string
// and special tokens through to the Codec unchanged.
//
// It returns an error if the embedded JSON cannot be unmarshalled or if
// bpe_ranks is malformed (fewer than three space-separated fields, a
// non-numeric or negative offset, or a token that is not valid standard
// base64). On error the returned *Codec is nil.
func NewClaude() (*Codec, error) {
	var c claudeJSON
	if err := json.Unmarshal([]byte(claude), &c); err != nil {
		return nil, fmt.Errorf("decoding claude resource: %w", err)
	}

	mergeableRanks, err := parseClaudeBPERanks(c.BPERanks)
	if err != nil {
		return nil, fmt.Errorf("decoding claude bpe_ranks: %w", err)
	}

	return &Codec{
		Name:           "claude",
		ExplicitNVocab: c.ExplicitNVocab,
		PatStr:         c.PatStr,
		MergeableRanks: mergeableRanks,
		SpecialTokens:  c.SpecialTokens,
	}, nil
}

// parseClaudeBPERanks expands a compressed rank table of the form
// "<ignored> <offset> <tok0> <tok1> ..." into a map from decoded token bytes
// (as a string) to rank. Tokens are base64 in the standard encoding.
func parseClaudeBPERanks(ranks string) (map[string]uint, error) {
	// Split off the two header fields only; the remainder is the token list.
	parts := strings.SplitN(ranks, " ", 3)
	if len(parts) < 3 {
		// Guard the parts[1]/parts[2] accesses below: a short string must be
		// reported as an error, not cause an index-out-of-range panic.
		return nil, fmt.Errorf("expected \"<marker> <offset> <tokens...>\", got %d field(s)", len(parts))
	}

	offset, err := strconv.Atoi(parts[1])
	if err != nil {
		return nil, fmt.Errorf("parsing offset: %w", err)
	}
	if offset < 0 {
		// A negative offset would wrap around when converted to uint.
		return nil, fmt.Errorf("negative offset %d", offset)
	}

	tokens := strings.Split(parts[2], " ")
	mergeableRanks := make(map[string]uint, len(tokens))
	for i, token := range tokens {
		raw, err := base64.StdEncoding.DecodeString(token)
		if err != nil {
			return nil, fmt.Errorf("decoding token %d: %w", i, err)
		}
		// Ranks are consecutive IDs starting at offset: the i-th token gets
		// offset+i. Multiplying instead would map every token to 0 when the
		// offset is 0 and leave gaps between IDs otherwise.
		mergeableRanks[string(raw)] = uint(offset + i)
	}
	return mergeableRanks, nil
}