Skip to content

Commit

Permalink
Merge pull request #7 from pytorch-labs/add_tiktoken
Browse the repository at this point in the history
Add tiktoken tests
  • Loading branch information
larryliu0820 authored Dec 5, 2024
2 parents b3660e1 + 260a7e1 commit 0ef1562
Show file tree
Hide file tree
Showing 6 changed files with 128,087 additions and 9 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,7 @@ jobs:
cmake --build build -j9 --config Debug
# Run unit tests
- RESOURCES_PATH=test/resources build/sentencepiece_test
+ export RESOURCES_PATH=test/resources
+ build/sentencepiece_test
+ build/tiktoken_test
10 changes: 7 additions & 3 deletions .github/workflows/trunk.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
name: trunk

on:
pull_request:
tags:
- ciflow/trunk/*
push:
branches:
- main
tags:
- ciflow/trunk/*
workflow_dispatch:

concurrency:
Expand All @@ -29,4 +30,7 @@ jobs:
cmake --build build -j9 --config Debug
# Run unit tests
- RESOURCES_PATH=test/resources build/sentencepiece_test
+ export RESOURCES_PATH=test/resources
+ build/sentencepiece_test
+ build/tiktoken_test
8 changes: 5 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ if(TOKENIZERS_BUILD_TEST)
target_link_libraries(sentencepiece_test PUBLIC tokenizers gtest_main)

# tiktoken tests
- add_executable(tiktoken_test test/test_base64.cpp)
- target_include_directories(tiktoken_test PUBLIC include GTEST_INCLUDE_PATH)
- target_link_libraries(tiktoken_test PUBLIC gtest_main)
+ add_executable(tiktoken_test test/test_base64.cpp test/test_tiktoken.cpp)
+ target_include_directories(
+ tiktoken_test PUBLIC third-party/re2 third-party/abseil-cpp include
+ GTEST_INCLUDE_PATH)
+ target_link_libraries(tiktoken_test PUBLIC tokenizers gtest_main)
endif()
2 changes: 0 additions & 2 deletions include/tiktoken.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,6 @@ class Tiktoken : public Tokenizer {
: _special_tokens(_get_default_special_tokens()),
_bos_token_index(kBOSTokenIndex), _eos_token_index(kEOSTokenIndex){};

- ~Tiktoken() override;

Error load(const std::string &tokenizer_path) override;

Result<std::vector<uint64_t>> encode(const std::string &input, int8_t bos,
Expand Down
Loading

0 comments on commit 0ef1562

Please sign in to comment.