diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index ab6a767f1..a0ac1d6a3 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -329,6 +329,8 @@ jobs:
wheel_name=$(find . -name 'openvino_tokenizers*.whl')
python3 -m pip install $wheel_name[dev]
popd
+ env:
+ PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu
- name: Tokenizers regression tests (using openvino python modules)
if: needs.openvino_download.outputs.status == 'success'
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 8dbaead32..bddc7e4bb 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -319,6 +319,8 @@ jobs:
wheel_name=$(find . -name 'openvino_tokenizers*.whl')
python3 -m pip install $wheel_name[dev]
popd
+ env:
+ PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu
- name: Tokenizers regression tests (using openvino python modules)
if: needs.openvino_download.outputs.status == 'success'
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 47922670a..c2ae6ffbd 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -323,6 +323,8 @@ jobs:
# Find and install wheel
$ovCoreWheelPath=Get-ChildItem -Path "${{ env.INSTALL_DIR }}\\ov_tokenizers" -Filter openvino_tokenizers*.whl | % { $_.FullName }
python3 -m pip install "$ovCoreWheelPath[all]"
+ env:
+ PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu
- name: Tokenizers regression tests (using openvino python modules)
if: needs.openvino_download.outputs.status == 'success'
diff --git a/README.md b/README.md
index 2994d1b06..21fbfe90d 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
# OpenVINO Tokenizers
+[![Downloads](https://static.pepy.tech/badge/openvino-tokenizers)](https://pepy.tech/project/openvino-tokenizers)
+
OpenVINO Tokenizers adds text processing operations to OpenVINO.
## Features
@@ -59,6 +61,9 @@ This command is the equivalent of minimal installation. Install tokenizers conve
```bash
pip install transformers[sentencepiece] tiktoken
```
+:warning: Latest commit of OpenVINO Tokenizers might rely on features that are not present in the release OpenVINO version.
+Use [a nightly build](https://docs.openvino.ai/2024/get-started/install-openvino.html?VERSION=NIGHTLY) of OpenVINO or build
+OpenVINO Tokenizers from a release branch if you have issues with the build process.
### Build and install for development
```bash
@@ -279,6 +284,29 @@ tf_result = tf_embed(sentences)
assert np.all(np.isclose(ov_result, tf_result, atol=1e-4))
```
+### RWKV Tokenizer
+
+```python
+from urllib.request import urlopen
+
+from openvino import compile_model
+from openvino_tokenizers import build_rwkv_tokenizer
+
+
+rwkv_vocab_url = (
+ "https://raw.githubusercontent.com/BlinkDL/ChatRWKV/main/tokenizer/rwkv_vocab_v20230424.txt"
+)
+
+with urlopen(rwkv_vocab_url) as vocab_file:
+ vocab = map(bytes.decode, vocab_file)
+ tokenizer, detokenizer = build_rwkv_tokenizer(vocab)
+
+tokenizer, detokenizer = compile_model(tokenizer), compile_model(detokenizer)
+
+print(tokenized := tokenizer(["Test string"])["input_ids"]) # [[24235 47429]]
+print(detokenizer(tokenized)["string_output"]) # ['Test string']
+```
+
## Supported Tokenizer Types
| Huggingface
Tokenizer Type | Tokenizer Model Type | Tokenizer | Detokenizer |
@@ -288,6 +316,7 @@ assert np.all(np.isclose(ov_result, tf_result, atol=1e-4))
| | Unigram | ❌ | ❌ |
| Legacy | SentencePiece .model | ✅ | ✅ |
| Custom | tiktoken | ✅ | ✅ |
+| RWKV | Trie | ✅ | ✅ |
## Test Results
diff --git a/python/openvino_tokenizers/build_tokenizer.py b/python/openvino_tokenizers/build_tokenizer.py
index 4226675ff..bbbea7030 100644
--- a/python/openvino_tokenizers/build_tokenizer.py
+++ b/python/openvino_tokenizers/build_tokenizer.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple
+from typing import Iterable, Tuple
from openvino import Model, PartialShape, Type
from openvino.runtime import op
@@ -16,7 +16,7 @@
def build_rwkv_tokenizer(
- rwkv_vocab: List[str],
+ rwkv_vocab: Iterable[str],
clean_up_tokenization_spaces: bool = False,
tokenizer_output_type: Type = Type.i64,
detokenizer_input_type: Type = Type.i64,
diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py
index f74d1d523..3e328673b 100644
--- a/python/openvino_tokenizers/tokenizer_pipeline.py
+++ b/python/openvino_tokenizers/tokenizer_pipeline.py
@@ -9,7 +9,7 @@
from dataclasses import dataclass, field
from functools import singledispatchmethod
from itertools import chain, islice
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
from openvino.runtime import Model, Output, PartialShape, Type, op
@@ -382,7 +382,7 @@ def fill_vocab(vocab: List[str], indices: List[int]) -> Tuple[List[str], List[in
return new_vocab, new_indices
@classmethod
- def from_rwkv_vocab(cls, vocab_file_strings: Iterator[str]) -> TrieTokenizerStep:
+ def from_rwkv_vocab(cls, vocab_file_strings: Iterable[str]) -> TrieTokenizerStep:
vocab = []
indices = []
for line in vocab_file_strings:
diff --git a/src/vocab_encoder.cpp b/src/vocab_encoder.cpp
index ade59fdba..45ab98679 100644
--- a/src/vocab_encoder.cpp
+++ b/src/vocab_encoder.cpp
@@ -13,12 +13,6 @@
using namespace ov;
-VocabEncoder::VocabEncoder (const ov::OutputVector& arguments) :
- ov::op::Op(arguments) {
- constructor_validate_and_infer_types();
-}
-
-
void VocabEncoder::validate_and_infer_types() {
// main string input
check_string_input(this, 0);
@@ -44,19 +38,21 @@ bool VocabEncoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
auto ends = inputs[1].data();
auto chars = inputs[2].data();
- // vocab string keys
- auto vocab_begins = inputs[3].data();
- auto vocab_ends = inputs[4].data();
- auto vocab_chars = inputs[5].data();
+ if (m_vocab == nullptr) {
+ // vocab string keys
+ auto vocab_begins = inputs[3].data();
+ auto vocab_ends = inputs[4].data();
+ auto vocab_chars = inputs[5].data();
- auto vocab_values = inputs[6].data();
- auto vocab_size = inputs[6].get_size();
+ auto vocab_values = inputs[6].data();
+ auto vocab_size = inputs[6].get_size();
- std::map, int32_t> vocab;
- for (size_t i = 0; i < vocab_size; ++i) {
- std::vector token = std::vector(vocab_chars + vocab_begins[i], vocab_chars + vocab_ends[i]);
- vocab[token] = vocab_values[i];
- };
+ m_vocab = std::make_shared, int32_t>>();
+ for (size_t i = 0; i < vocab_size; ++i) {
+ std::vector token = std::vector(vocab_chars + vocab_begins[i], vocab_chars + vocab_ends[i]);
+ m_vocab->insert(std::pair{token, vocab_values[i]});
+ };
+ }
auto default_value = *inputs[7].data();
const size_t num_elements = inputs[0].get_size();
@@ -66,8 +62,8 @@ bool VocabEncoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
auto token_ids = outputs[0].data();
for (size_t element_idx = 0; element_idx < num_elements; ++element_idx) {
- auto element = vocab.find(std::vector(chars + begins[element_idx], chars + ends[element_idx]));
- if (element == vocab.end()) {
+ auto element = m_vocab->find(std::vector(chars + begins[element_idx], chars + ends[element_idx]));
+ if (element == m_vocab->end()) {
token_ids[element_idx] = default_value;
} else {
token_ids[element_idx] = element->second;
diff --git a/src/vocab_encoder.hpp b/src/vocab_encoder.hpp
index b433c4bbf..c1089d96f 100644
--- a/src/vocab_encoder.hpp
+++ b/src/vocab_encoder.hpp
@@ -5,10 +5,8 @@
#pragma once
#include
#include
-#include "openvino/opsets/opset13.hpp"
using namespace ov;
-using namespace ov::opset13;
class VocabEncoder : public ov::op::Op {
@@ -16,14 +14,21 @@ class VocabEncoder : public ov::op::Op {
OPENVINO_OP("VocabEncoder");
VocabEncoder () = default;
- VocabEncoder(
- const ov::OutputVector& arguments
- );
+
+ VocabEncoder(const ov::OutputVector& arguments) :
+ ov::op::Op(arguments) {
+ constructor_validate_and_infer_types();
+ }
+
+ VocabEncoder(const ov::OutputVector& arguments, std::shared_ptr, int32_t>> vocab) :
+ ov::op::Op(arguments), m_vocab(vocab) {
+ constructor_validate_and_infer_types();
+ }
void validate_and_infer_types() override;
std::shared_ptr clone_with_new_inputs(const ov::OutputVector& inputs) const override {
- return std::make_shared(inputs);
+ return std::make_shared(inputs, m_vocab);
}
bool visit_attributes(ov::AttributeVisitor& visitor) override {
@@ -35,4 +40,6 @@ class VocabEncoder : public ov::op::Op {
bool has_evaluate() const override {
return true;
}
+private:
+ mutable std::shared_ptr, int32_t>> m_vocab;
};