From 93e112559d2b1b64180ebe487e83073bcaf0bba3 Mon Sep 17 00:00:00 2001 From: HavenDV Date: Sun, 19 May 2024 03:20:09 +0400 Subject: [PATCH] feat: Added ability to get encoder/encoding by model name. --- README.md | 3 +- src/libs/Directory.Build.props | 2 +- .../{EncoderHelpers.cs => Encoders.cs} | 30 +++++++++++-------- 3 files changed, 19 insertions(+), 16 deletions(-) rename src/libs/Tiktoken/{EncoderHelpers.cs => Encoders.cs} (67%) diff --git a/README.md b/README.md index 933c9a3..e7a58dc 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,7 @@ We will be happy to accept any PR. using Tiktoken.Encodings; using Tiktoken; -var encoding = new O200KBase(); -var encoder = new Encoder(encoding); +var encoder = Encoders.ForModel("gpt-4o"); // or explicitly new Encoder(new O200KBase()) var tokens = encoder.Encode("hello world"); // [15339, 1917] var text = encoder.Decode(tokens); // hello world var numberOfTokens = encoder.CountTokens(text); // 2 diff --git a/src/libs/Directory.Build.props b/src/libs/Directory.Build.props index 498ef5e..a882e96 100644 --- a/src/libs/Directory.Build.props +++ b/src/libs/Directory.Build.props @@ -9,7 +9,7 @@ - 2.0.0 + 2.0.1 The fastest tokenizer for GPT-3.5 and GPT-4 inspired by Tiktoken. chatgpt;openai;tiktoken;tokens;gpt-4;gpt-3.5-turbo;cl100k_base;p50k_base true diff --git a/src/libs/Tiktoken/EncoderHelpers.cs b/src/libs/Tiktoken/Encoders.cs similarity index 67% rename from src/libs/Tiktoken/EncoderHelpers.cs rename to src/libs/Tiktoken/Encoders.cs index 25390ed..9662248 100644 --- a/src/libs/Tiktoken/EncoderHelpers.cs +++ b/src/libs/Tiktoken/Encoders.cs @@ -2,66 +2,70 @@ namespace Tiktoken; -// ReSharper disable InconsistentNaming - -internal static class EncoderHelpers +/// +/// +/// +public static class Encoders { /// - /// Returns encoding by model name. + /// Returns encoder by model name. /// /// gpt-3.5-turbo /// public static Encoder ForModel(string modelName) { - return new Encoder(GetNameByModel(modelName)); + return new Encoder(GetEncodingByModel(modelName)); } /// - /// Returns encoding by model name or null. + /// Returns encoder by model name or null. /// /// gpt-3.5-turbo /// public static Encoder? TryForModel(string modelName) { - var encoding = TryGetNameByModel(modelName); + var encoding = TryGetEncodingByModel(modelName); return encoding == null ? null : new Encoder(encoding); } - private static Dictionary ModelToEncoding { get; } = new() + private static Dictionary ModelToEncoding { get; } = new() { // chat { "gpt-4o", new O200KBase() }, { "gpt-4", new Cl100KBase() }, { "gpt-3.5-turbo", new Cl100KBase() }, { "gpt-35-turbo", new Cl100KBase() }, // Azure deployment name + // embeddings { "text-embedding-ada-002", new Cl100KBase() }, + { "text-embedding-3-small", new Cl100KBase() }, + { "text-embedding-3-large", new Cl100KBase() }, }; /// - /// Returns encoding name by model name or null. + /// Returns encoding by model name or null. /// /// gpt-4 gpt-3.5-turbo ... /// /// - public static Encoding? TryGetNameByModel(string modelName) + public static Encoding? TryGetEncodingByModel(string modelName) { return ModelToEncoding .FirstOrDefault(a => modelName.StartsWith(a.Key, StringComparison.Ordinal)).Value; } /// - /// Returns encoding name by model name or throws exception. + /// Returns encoding by model name or throws exception. /// /// gpt-4 gpt-3.5-turbo ... /// /// - public static Encoding GetNameByModel(string modelName) + public static Encoding GetEncodingByModel(string modelName) { - return TryGetNameByModel(modelName) ?? + return TryGetEncodingByModel(modelName) ?? throw new ArgumentException($"Model name {modelName} is not supported."); } } \ No newline at end of file