Fix edge case in Mistral tokenizer (vllm-project#10152)
Signed-off-by: Jee Jee Li <[email protected]>
patrickvonplaten authored and jeejeelee committed Nov 11, 2024
1 parent 22495c7 commit 2320543
Showing 1 changed file with 9 additions and 9 deletions.
vllm/transformers_utils/tokenizers/mistral.py
@@ -72,11 +72,12 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
         self.instruct = tokenizer.instruct_tokenizer
 
         tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
-        if isinstance(tokenizer_, Tekkenizer):
+        self.is_tekken = isinstance(tokenizer_, Tekkenizer)
+        self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer)
+        if self.is_tekken:
             # Make sure special tokens will not raise
             tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE
-
-        elif isinstance(tokenizer_, SentencePieceTokenizer):
+        elif self.is_spm:
             pass
         else:
             raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
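
For context, this first hunk computes the tokenizer-type checks once at construction time and stores them as boolean flags, instead of repeating `isinstance` calls in every method. A minimal sketch of the resulting pattern, with the Mistral tokenizer classes stubbed out as placeholders (not the real `mistral_common` API):

```python
class Tekkenizer:              # stand-in for mistral_common's Tekkenizer
    pass

class SentencePieceTokenizer:  # stand-in for the SentencePiece backend
    pass

class MistralTokenizer:
    def __init__(self, tokenizer_) -> None:
        # Classify the backend once; later methods branch on these
        # flags instead of re-running isinstance() checks.
        self.is_tekken = isinstance(tokenizer_, Tekkenizer)
        self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer)
        if not (self.is_tekken or self.is_spm):
            raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
        self.tokenizer = tokenizer_
```

Caching the checks also lets `convert_ids_to_tokens` assert on the flags directly, as the later hunks show.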
@@ -218,7 +219,7 @@ def apply_chat_template(self,
         return encoded.tokens
 
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        if isinstance(self.tokenizer, Tekkenizer):
+        if self.is_tekken:
             tokens = [
                 t for t in tokens
                 if t not in self.tokenizer._all_special_tokens
@@ -270,21 +271,20 @@ def convert_ids_to_tokens(
             skip_special_tokens
         ), "skip_special_tokens=False is not supported for Mistral tokenizers."
 
-        assert isinstance(self.tokenizer,
-                          (Tekkenizer, SentencePieceTokenizer)), type(
-                              self.tokenizer)
+        assert self.is_tekken or self.is_spm, type(self.tokenizer)
 
-        if isinstance(self.tokenizer, Tekkenizer):
+        if self.is_tekken:
             # skip special tokens
             ids = [i for i in ids if i > self.tokenizer.num_special_tokens]
 
         tokens = [self.tokenizer.id_to_piece(id) for id in ids]
 
-        if any("�" in t for t in tokens):
+        if any("�" in t for t in tokens) and self.is_tekken:
             # if a decoded token contains the replacement character, then the
             # token has an incomplete UTF-8 character so we must use bytes
             # See: https://github.com/vllm-project/vllm/pull/8640
             # https://github.com/vllm-project/vllm/pull/9625
+            # if underlying tokenizer is sentencepiece, we just add "�"
             tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
 
         return tokens
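
This last hunk is the actual edge-case fix. Previously, the byte-level fallback ran whenever any decoded token contained the replacement character "�", but `id_to_byte_piece` is a Tekken-only method; with a SentencePiece backend, a legitimately decoded "�" would hit that path. The fix gates the fallback on `self.is_tekken`, so SentencePiece output keeps "�" as-is. A minimal sketch of the gated fallback, with a toy stand-in class rather than the real tokenizer API:

```python
from typing import List

REPLACEMENT_CHAR = "\ufffd"  # "�"

class FakeTekken:
    """Toy stand-in for mistral_common's Tekkenizer (illustration only)."""

    def id_to_piece(self, i: int) -> str:
        # Pretend every id decodes to a fragment of a multi-byte
        # character, which surfaces as the replacement character.
        return REPLACEMENT_CHAR

    def id_to_byte_piece(self, i: int) -> bytes:
        return bytes([i])

def convert_ids_to_tokens(tokenizer, ids: List[int], is_tekken: bool):
    tokens = [tokenizer.id_to_piece(i) for i in ids]
    # Byte-level fallback is Tekken-only: there, "�" signals an
    # incomplete UTF-8 sequence. For SentencePiece, "�" can be a
    # legitimate piece and must be kept as-is (no id_to_byte_piece()).
    if is_tekken and any(REPLACEMENT_CHAR in t for t in tokens):
        tokens = [tokenizer.id_to_byte_piece(i) for i in ids]
    return tokens

print(convert_ids_to_tokens(FakeTekken(), [0xE2, 0x82], is_tekken=True))
# -> [b'\xe2', b'\x82']: raw byte pieces instead of "�"
```

The function signature here is illustrative, not vLLM's actual method; in the real class the flags live on `self` and the ids are filtered for special tokens first.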
