Skip to content

Commit

Permalink
Add claude
Browse files Browse the repository at this point in the history
  • Loading branch information
hupe1980 committed Jul 21, 2023
1 parent a4f79c6 commit a5b52d2
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 5 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ For more example usage, see [_examples](./_examples).
- ✅ p50k_edit
- ✅ r50k_base
- ✅ gpt2
- ✅ claude

## License
[MIT](LICENCE)
57 changes: 57 additions & 0 deletions claude.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package tiktoken

import (
	_ "embed"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"strconv"
	"strings"
)

// claude holds the contents of resource/claude.json, embedded into the
// binary at build time. NewClaude unmarshals it as JSON.
//
//go:embed resource/claude.json
var claude string

// claudeJSON is the shape NewClaude unmarshals the embedded claude resource into.
type claudeJSON struct {
// ExplicitNVocab is read from "explicit_n_vocab" and copied to Codec.ExplicitNVocab.
ExplicitNVocab int `json:"explicit_n_vocab"`
// PatStr is read from "pat_str" and copied to Codec.PatStr.
PatStr string `json:"pat_str"`
// BPERanks is read from "bpe_ranks". NewClaude splits it on spaces into
// three parts: a leading field it does not use, an integer offset, and a
// space-separated list of base64-encoded tokens.
BPERanks string `json:"bpe_ranks"`
// SpecialTokens is read from "special_tokens" and copied to Codec.SpecialTokens.
SpecialTokens map[string]uint `json:"special_tokens"`
}

// NewClaude creates a new Codec instance for the claude tokenization scheme.
//
// It decodes the embedded claude JSON resource and expands its compact
// "bpe_ranks" string into the mergeable-ranks table.
//
// It returns an error if the resource is not valid JSON or if "bpe_ranks"
// cannot be decoded (see decodeClaudeBPERanks).
func NewClaude() (*Codec, error) {
	var c claudeJSON
	if err := json.Unmarshal([]byte(claude), &c); err != nil {
		return nil, fmt.Errorf("decoding claude resource: %w", err)
	}

	mergeableRanks, err := decodeClaudeBPERanks(c.BPERanks)
	if err != nil {
		return nil, err
	}

	return &Codec{
		Name:           "claude",
		ExplicitNVocab: c.ExplicitNVocab,
		PatStr:         c.PatStr,
		MergeableRanks: mergeableRanks,
		SpecialTokens:  c.SpecialTokens,
	}, nil
}

// decodeClaudeBPERanks expands the compact rank string stored in the claude
// resource. The string is a space-separated list laid out as
//
//	<prefix> <offset> <base64 token> <base64 token> ...
//
// The prefix field is not used. The token at zero-based position i is
// base64-decoded and assigned rank offset+i.
//
// It returns an error if fewer than three fields are present, if the offset
// is not a non-negative integer, or if a token is not valid base64.
func decodeClaudeBPERanks(bpeRanks string) (map[string]uint, error) {
	parts := strings.SplitN(bpeRanks, " ", 3)
	if len(parts) < 3 {
		// Guard the parts[1]/parts[2] accesses below against a truncated resource.
		return nil, fmt.Errorf("claude bpe_ranks: expected prefix, offset and tokens, got %d field(s)", len(parts))
	}

	offset, err := strconv.Atoi(parts[1])
	if err != nil {
		return nil, fmt.Errorf("claude bpe_ranks: parsing offset: %w", err)
	}

	if offset < 0 {
		// A negative offset would wrap around when converted to uint.
		return nil, fmt.Errorf("claude bpe_ranks: negative offset %d", offset)
	}

	tokens := strings.Split(parts[2], " ")
	ranks := make(map[string]uint, len(tokens))

	for i, token := range tokens {
		raw, err := base64.StdEncoding.DecodeString(token)
		if err != nil {
			return nil, fmt.Errorf("claude bpe_ranks: decoding token %d: %w", i, err)
		}

		// Ranks are consecutive, starting at offset.
		ranks[string(raw)] = uint(offset + i)
	}

	return ranks, nil
}
40 changes: 40 additions & 0 deletions claude_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package tiktoken

import (
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestClaude builds an Encoding from the claude codec and checks the number
// of tokens produced for a few representative inputs.
func TestClaude(t *testing.T) {
	// Named codec rather than claude so the package-level embedded resource
	// variable is not shadowed.
	codec, err := NewClaude()
	require.NoError(t, err)

	encoding, err := NewEncoding(codec)
	require.NoError(t, err)

	t.Run("small text", func(t *testing.T) {
		idx, _ := encoding.EncodeOrdinary("hello world!")
		require.Len(t, idx, 3)
	})

	t.Run("text normalising", func(t *testing.T) {
		idx, _ := encoding.EncodeOrdinary("™")
		assert.Len(t, idx, 1)

		idx, _ = encoding.EncodeOrdinary("ϰ")
		assert.Len(t, idx, 1)
	})

	t.Run("allows special tokens", func(t *testing.T) {
		idx, _, err := encoding.Encode("<EOT>", AllSpecial, nil)
		require.NoError(t, err)
		require.Len(t, idx, 1)
	})
}
10 changes: 5 additions & 5 deletions codec.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ const (

// Codec represents a token encoding codec.
type Codec struct {
Name string
ExplicitNVocab int
PatStr string
MergeableRanks map[string]uint
SpecialTokens map[string]uint
Name string `json:"name"`
ExplicitNVocab int `json:"explicit_n_vocab"`
PatStr string `json:"pat_str"`
MergeableRanks map[string]uint `json:"mergeable_ranks"`
SpecialTokens map[string]uint `json:"special_tokens"`
}

// CovertVocabBPEAndEncoderJSONToMergeableBPERanks converts the vocabulary BPE and encoder JSON
Expand Down
2 changes: 2 additions & 0 deletions encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ func (enc *Encoding) EncodeOrdinary(text string) ([]uint, []string) {
return enc.coreBPE.EncodeOrdinary(text)
}

// AllSpecial is the single-element list {"all"}, intended to be passed as the
// allowedSpecial argument of Encode.
// NOTE(review): presumably Encode interprets it as "allow every special
// token"; that handling is not visible here — confirm against Encode's body.
var AllSpecial = []string{"all"}

// Encode encodes the given text with the specified allowed and disallowed special tokens.
func (enc *Encoding) Encode(text string, allowedSpecial, disallowedSpecial []string) ([]uint, []string, error) {
var allowedSpecialSet map[string]any
Expand Down
12 changes: 12 additions & 0 deletions resource/claude.json

Large diffs are not rendered by default.

0 comments on commit a5b52d2

Please sign in to comment.