Merge pull request #6 from pytorch-labs/add_tiktoken
Add tiktoken
larryliu0820 authored Dec 5, 2024
2 parents 874f624 + d30bf92 commit b3660e1
Showing 9 changed files with 544 additions and 30 deletions.
1 change: 1 addition & 0 deletions .cmakelintrc
@@ -0,0 +1 @@
filter=-convention/filename,-linelength,-package/consistency,-readability/logic,+readability/mixedcase,-readability/wonkycase,-syntax,-whitespace/eol,+whitespace/extra,-whitespace/indent,-whitespace/mismatch,-whitespace/newline,-whitespace/tabs
6 changes: 6 additions & 0 deletions .gitmodules
@@ -1,3 +1,9 @@
[submodule "third-party/sentencepiece"]
path = third-party/sentencepiece
url = https://github.com/google/sentencepiece.git
[submodule "third-party/re2"]
path = third-party/re2
url = https://github.com/google/re2.git
[submodule "third-party/abseil-cpp"]
path = third-party/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
21 changes: 14 additions & 7 deletions CMakeLists.txt
@@ -28,33 +28,40 @@ set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(third-party/abseil-cpp)
add_subdirectory(third-party/re2)
add_subdirectory(third-party/sentencepiece)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

add_library(tokenizers STATIC src/sentencepiece.cpp)
add_library(tokenizers STATIC src/sentencepiece.cpp src/tiktoken.cpp)

# Using abseil from sentencepiece/third_party
target_include_directories(tokenizers PUBLIC third-party/sentencepiece/src
third-party/sentencepiece include)
target_include_directories(
tokenizers PUBLIC third-party/sentencepiece/src third-party/sentencepiece
include third-party/re2)

target_link_libraries(tokenizers PUBLIC sentencepiece-static)
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)

# Build test
if(TOKENIZERS_BUILD_TEST)
include(FetchContent)
# CMAKE
FetchContent_Declare(
googletest
# Specify the commit you depend on and update it regularly.
URL https://github.com/google/googletest/archive/5376968f6948923e2411081fd9372e71a59d8e77.zip
)
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
set(gtest_force_shared_crt
ON
CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/test/resources)
add_executable(sentencepiece_test test/test_sentencepiece.cpp)
target_include_directories(
sentencepiece_test PUBLIC third-party/sentencepiece/src
third-party/sentencepiece include GTEST_INCLUDE_PATH)
sentencepiece_test
PUBLIC third-party/sentencepiece/src third-party/sentencepiece include
GTEST_INCLUDE_PATH)
target_link_libraries(sentencepiece_test PUBLIC tokenizers gtest_main)

# tiktoken tests
8 changes: 7 additions & 1 deletion include/error.h
@@ -41,14 +41,20 @@ enum class Error : error_code_t {
/// Token out of range.
OutOfRange = 0x03,

/// Artifact load failure.
/// Tokenizer artifact load failure.
LoadFailure = 0x04,

/// Encode failure.
EncodeFailure = 0x05,

/// Base64 decode failure.
Base64DecodeFailure = 0x06,

/// Failed to parse tokenizer artifact.
ParseFailure = 0x07,

/// Decode failure.
DecodeFailure = 0x08,
};

} // namespace tokenizers
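
For orientation, here is a minimal, hedged sketch (not part of this commit) of how the new LoadFailure and ParseFailure codes might be surfaced from a loader. The readTokenizerFile helper is hypothetical, and it assumes Result<T> from include/result.h (diffed below) can be constructed from either an Error or a value.

#include <fstream>
#include <iterator>
#include <string>

#include "error.h"
#include "result.h"

using namespace tokenizers;

// Hypothetical loader illustrating the new error codes; not from the commit.
Result<std::string> readTokenizerFile(const std::string &path) {
  std::ifstream file(path);
  if (!file) {
    // Tokenizer artifact could not be opened.
    return Error::LoadFailure;
  }
  std::string contents{std::istreambuf_iterator<char>(file),
                       std::istreambuf_iterator<char>()};
  if (contents.empty()) {
    // Nothing to parse from the artifact.
    return Error::ParseFailure;
  }
  return contents;
}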
53 changes: 53 additions & 0 deletions include/result.h
@@ -177,3 +177,56 @@ template <typename T> T *Result<T>::operator->() {
}

} // namespace tokenizers

/**
* Unwrap a Result to obtain its value. If the Result contains an error,
 * propagate the error via trivial function return.
*
* Note: A function using TK_UNWRAP should itself return a Result or Error.
*
* @param[in] result__ Expression yielding the result to unwrap.
* @param[in] ... Optional format string for the log error message and its
* arguments.
*/
#define TK_UNWRAP(result__, ...) TK_INTERNAL_UNWRAP(result__, ##__VA_ARGS__)

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP(...) \
TK_INTERNAL_UNWRAP_SELECT(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1) \
(__VA_ARGS__)

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP_SELECT(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, \
...) \
TK_INTERNAL_UNWRAP_##N

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP_1(result__) \
({ \
auto et_result__ = (result__); \
if (!et_result__.ok()) { \
return et_result__.error(); \
} \
std::move(*et_result__); \
})

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP_2(result__, message__, ...) \
({ \
auto et_result__ = (result__); \
if (!et_result__.ok()) { \
TK_LOG(Error, message__, ##__VA_ARGS__); \
return et_result__.error(); \
} \
std::move(*et_result__); \
})

// Internal only: Use TK_UNWRAP() instead.
#define TK_INTERNAL_UNWRAP_3 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_4 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_5 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_6 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_7 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_8 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_9 TK_INTERNAL_UNWRAP_2
#define TK_INTERNAL_UNWRAP_10 TK_INTERNAL_UNWRAP_2
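
To make the intended usage concrete, a small hedged sketch (not part of this commit): countTokens is a hypothetical helper returning Result<size_t>, and the enclosing function itself returns a Result so the macro's early return is well-formed. The two-argument variant assumes TK_LOG accepts a printf-style format.

#include <string>

#include "result.h"

using namespace tokenizers;

// Hypothetical helper: parse a vocabulary artifact and count its tokens.
Result<size_t> countTokens(const std::string &path);

Result<size_t> doubledTokenCount(const std::string &path) {
  // If countTokens fails, its Error is returned from doubledTokenCount;
  // otherwise the unwrapped size_t is assigned to count.
  size_t count = TK_UNWRAP(countTokens(path));

  // Same unwrap, logging a message on failure (assumes printf-style TK_LOG).
  size_t recheck =
      TK_UNWRAP(countTokens(path), "failed to parse %s", path.c_str());
  (void)recheck;

  return count * 2;
}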
65 changes: 43 additions & 22 deletions include/tiktoken.h
@@ -9,6 +9,7 @@
// Tiktoken header
// Used by OpenAI, adapted from https://github.com/sewenew/tokenizer
#include "re2/re2.h"
#include "result.h"
#include "tokenizer.h"
#include <cstdint>

@@ -20,9 +21,24 @@ using Re2UPtr = std::unique_ptr<re2::RE2>;

namespace tokenizers {

static constexpr int32_t kSpecialTokensSize = 256;
static constexpr size_t kBOSTokenIndex = 0;
static constexpr size_t kEOSTokenIndex = 1;

class Tiktoken : public Tokenizer {
public:
explicit Tiktoken();
explicit Tiktoken(std::unique_ptr<std::vector<std::string>> special_tokens,
size_t bos_token_index, size_t eos_token_index)
: _special_tokens(std::move(special_tokens)),
_bos_token_index(bos_token_index), _eos_token_index(eos_token_index) {
assert(_bos_token_index < _special_tokens->size());
assert(_eos_token_index < _special_tokens->size());
};

explicit Tiktoken()
: _special_tokens(_get_default_special_tokens()),
_bos_token_index(kBOSTokenIndex), _eos_token_index(kEOSTokenIndex){};

~Tiktoken() override;

Error load(const std::string &tokenizer_path) override;
@@ -34,37 +50,42 @@ class Tiktoken : public Tokenizer {
uint64_t token) const override;

private:
static inline const Encoder _get_special_tokens(ssize_t num_base_tokens) {
Encoder special_tokens;
special_tokens.emplace("<|begin_of_text|>", num_base_tokens++);
special_tokens.emplace("<|end_of_text|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_0|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_1|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_2|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_3|>", num_base_tokens++);
special_tokens.emplace("<|start_header_id|>", num_base_tokens++);
special_tokens.emplace("<|end_header_id|>", num_base_tokens++);
special_tokens.emplace("<|reserved_special_token_4|>", num_base_tokens++);
special_tokens.emplace("<|eot_id|>", num_base_tokens++);
for (auto i = 5; i < 251; ++i) {
special_tokens.emplace("<|reserved_special_token_" + std::to_string(i) +
"|>",
num_base_tokens++);
static inline std::unique_ptr<std::vector<std::string>>
_get_default_special_tokens() {
auto special_tokens =
std::make_unique<std::vector<std::string>>(std::vector<std::string>{
"<|begin_of_text|>", "<|end_of_text|>",
"<|reserved_special_token_0|>", "<|reserved_special_token_1|>",
"<|finetune_right_pad_id|>", "<|step_id|>", "<|start_header_id|>",
"<|end_header_id|>", "<|eom_id|>", "<|eot_id|>", "<|python_tag|>"});
// pad the rest of the special tokens with reserved tokens
ssize_t reserved_special_token_num = 2;
while (special_tokens->size() < kSpecialTokensSize) {
special_tokens->emplace_back(
"<|reserved_special_token_" +
std::to_string(reserved_special_token_num++) + "|>");
}
return special_tokens;
}

template <typename T>
std::pair<std::optional<std::string>, re2::StringPiece>
_split_with_allowed_special_token(re2::StringPiece &input,
const T &allowed_special);
const T &allowed_special) const;

void _encode(re2::StringPiece &input, std::vector<uint64_t> &ret,
uint64_t &last_piece_token_len);
Error _encode(re2::StringPiece &input, std::vector<uint64_t> &ret,
uint64_t &last_piece_token_len) const;

template <typename T>
std::pair<std::vector<uint64_t>, uint64_t>
_encode_with_special_token(const std::string &text, const T &allowed_special);
Result<std::pair<std::vector<uint64_t>, uint64_t>>
_encode_with_special_token(const std::string &text,
const T &allowed_special) const;

Encoder _build_special_token_encoder(ssize_t num_base_tokens) const;

std::unique_ptr<std::vector<std::string>> _special_tokens;
size_t _bos_token_index;
size_t _eos_token_index;

// Removed negative lookahead \s+(?!\S) since it's not supported by RE2.
const std::string _pattern =
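
Finally, a hedged sketch (not from this commit) of how the new Tiktoken constructors might be exercised, based only on the header shown above. The token strings, the artifact path, and the use of Error::Ok are illustrative assumptions.

#include <memory>
#include <string>
#include <vector>

#include "tiktoken.h"

int main() {
  // Custom special tokens; the indices select which entries act as BOS/EOS.
  auto special_tokens = std::make_unique<std::vector<std::string>>(
      std::vector<std::string>{"<|begin_of_text|>", "<|end_of_text|>"});
  tokenizers::Tiktoken tokenizer(std::move(special_tokens),
                                 /*bos_token_index=*/0,
                                 /*eos_token_index=*/1);

  // The default constructor falls back to _get_default_special_tokens(),
  // which pads the Llama-style list out to kSpecialTokensSize entries.
  tokenizers::Tiktoken default_tokenizer;

  // Hypothetical artifact path; load() reports failures via the Error codes.
  tokenizers::Error err = tokenizer.load("path/to/tokenizer.model");
  return err == tokenizers::Error::Ok ? 0 : 1;  // Error::Ok is assumed here.
}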