Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a proof of concept for using GetAlternateLookup #62

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions global.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"sdk": {
"version": "9.0.100-rc.2.24474.11",
"rollForward": "latestMajor",
"allowPrerelease": false
}
Expand Down
69 changes: 67 additions & 2 deletions src/libs/Tiktoken.Core/CoreBPE.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.Collections.Concurrent;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Text;
using System.Text.RegularExpressions;
Expand All @@ -20,7 +21,11 @@ public class CoreBpe

internal bool EnableCache { get; set; } = true;
private ConcurrentDictionary<string, IReadOnlyCollection<int>> FastCache { get; set; } = new();
private ConcurrentDictionary<string, int> FastCacheCounts { get; set; } = new();
private ConcurrentDictionary<string, int> FastCacheCounts { get; set; } = new(
#if NET9_0_OR_GREATER
new AlternateStringComparer()
#endif
);

private Regex SpecialRegex { get; set; }
private Regex Regex { get; set; }
Expand Down Expand Up @@ -59,7 +64,11 @@ public CoreBpe(
#else
static x => new string(x.Key.Select(static y => (char) y).ToArray()),
#endif
static x => x.Value);
static x => x.Value
#if NET9_0_OR_GREATER
, new AlternateStringComparer()
#endif
);
SpecialTokensEncoder = specialTokensEncoder;

Regex = new Regex(pattern, RegexOptions.Compiled);
Expand Down Expand Up @@ -89,23 +98,40 @@ public int CountTokensNative(string text)
var textSpan = text.AsSpan();
Span<byte> pieceBytes = stackalloc byte[128];
#endif
#if NET9_0_OR_GREATER
var fastEncoderLookup = FastEncoder.GetAlternateLookup<ReadOnlySpan<char>>();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here you use FastEncoder in addition to FastCacheCounts, but you didn't pass new AlternateStringComparer() to it.
Is alternate lookup supported by default for a regular Dictionary, so that the custom comparer is only needed for ConcurrentDictionary?
P.S. I'm a bit out of context right now; I saw the links on your blog and will look into this over the next few days.

var fastCacheCountLookup = FastCacheCounts.GetAlternateLookup<ReadOnlySpan<char>>();
#endif

#if NET7_0_OR_GREATER
foreach (var match in Regex.EnumerateMatches(textSpan))
{
#if NET9_0_OR_GREATER
var fastKey = textSpan.Slice(match.Index, match.Length);
#else
var fastKey = new string(textSpan.Slice(match.Index, match.Length));
#endif
#else
foreach (Match match in Regex.Matches(text))
{
var matchValue = match.Value;
var fastKey = matchValue;
#endif

#if NET9_0_OR_GREATER
if (fastEncoderLookup.ContainsKey(fastKey))
#else
if (FastEncoder.ContainsKey(fastKey))
#endif
{
tokens++;
continue;
}
#if NET9_0_OR_GREATER
if (EnableCache && fastCacheCountLookup.TryGetValue(fastKey, out var fastNumberOfTokens))
#else
if (EnableCache && FastCacheCounts.TryGetValue(fastKey, out var fastNumberOfTokens))
#endif
{
tokens += fastNumberOfTokens;
continue;
Expand All @@ -127,7 +153,11 @@ public int CountTokensNative(string text)

if (EnableCache)
{
#if NET9_0_OR_GREATER
fastCacheCountLookup[fastKey] = numberOfTokens;
#else
FastCacheCounts[fastKey] = numberOfTokens;
#endif
}
}

Expand Down Expand Up @@ -569,4 +599,39 @@ private static byte[] GetUtf8Bytes(ReadOnlySpan<char> text, Span<byte> scratch)
}
}
#endif

#if NET9_0_OR_GREATER
/// <summary>
/// Ordinal string comparer that additionally accepts a <see cref="ReadOnlySpan{T}"/> of
/// <see cref="char"/> as an alternate key, so that collections constructed with it can be
/// queried through <c>GetAlternateLookup&lt;ReadOnlySpan&lt;char&gt;&gt;()</c> without first
/// allocating a string for the lookup key.
/// </summary>
/// <remarks>
/// The string and span overloads must agree: equal character sequences compare equal and
/// produce the same hash code regardless of which overload is used.
/// </remarks>
private sealed class AlternateStringComparer : IEqualityComparer<string>,
    IAlternateEqualityComparer<ReadOnlySpan<char>, string>
{
    /// <summary>Creates a string key that this comparer considers equal to <paramref name="alternate"/>.</summary>
    /// <param name="alternate">The characters of the key.</param>
    /// <returns>A new string containing the same characters.</returns>
    public string Create(ReadOnlySpan<char> alternate) => new string(alternate);

    /// <summary>Compares two strings ordinally; two nulls are equal.</summary>
    public bool Equals(string? x, string? y) => string.Equals(x, y, StringComparison.Ordinal);

    /// <summary>Compares a span key against a string key character by character.</summary>
    /// <param name="alternate">The span key.</param>
    /// <param name="other">The string key; a null value never matches.</param>
    /// <returns><c>true</c> when both hold the same character sequence.</returns>
    public bool Equals(ReadOnlySpan<char> alternate, string other)
        => other is not null && other.AsSpan().SequenceEqual(alternate);

    /// <summary>Hashes a string key; delegates to the span overload so both key kinds agree.</summary>
    /// <param name="str">The string key. A null value hashes to 0 instead of throwing.</param>
    public int GetHashCode([DisallowNull] string str)
        => str is null ? 0 : GetHashCode(str.AsSpan());

    /// <summary>Hashes a span key.</summary>
    /// <param name="alternate">The span key.</param>
    /// <returns>The hash code of the character sequence.</returns>
    public int GetHashCode(ReadOnlySpan<char> alternate)
    {
        // Use the framework's span hash instead of a hand-rolled djb2. The keys are pieces of
        // caller-supplied text, and a fixed, easily-collided hash lets crafted input pile up in
        // one bucket. The framework call also has no arithmetic that could throw when the
        // project is compiled with overflow checking enabled.
        return string.GetHashCode(alternate);
    }
}
#endif
}
2 changes: 1 addition & 1 deletion src/libs/Tiktoken.Core/Tiktoken.Core.csproj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFrameworks>net4.6.2;netstandard2.0;netstandard2.1;net6.0;net8.0</TargetFrameworks>
<TargetFrameworks>net4.6.2;netstandard2.0;netstandard2.1;net6.0;net8.0;net9.0</TargetFrameworks>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<NoWarn>$(NoWarn);CA1724</NoWarn>
<RootNamespace>Tiktoken</RootNamespace>
Expand Down
Loading