From 8520eb8544aa1443a0b43c0fba6bba8f30b1f80f Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Fri, 13 Sep 2024 11:43:20 +0200
Subject: [PATCH] Update tokenize.py for common >= 1.4.0

---
 finetune/data/tokenize.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/finetune/data/tokenize.py b/finetune/data/tokenize.py
index 669bc33..0b8d450 100644
--- a/finetune/data/tokenize.py
+++ b/finetune/data/tokenize.py
@@ -311,6 +311,10 @@ def tokenize_instruct(
                 is_first=msg_idx == first_user_idx,
                 system_prompt=sample.system_prompt,
             )
+            if isinstance(curr_tokens, tuple):
+                # Versions of mistral_common>1.3.4 return a tuple of tokens (text), tokens (image), spans (image)
+                curr_tokens = curr_tokens[0]
+
             curr_masks = [False] * len(curr_tokens)  # only predict bot answers
         elif isinstance(message, ToolMessage):
             curr_tokens = instruct_tokenizer.encode_tool_message(