diff --git a/hezar/models/sequence_labeling/roberta/roberta_sequence_labeling.py b/hezar/models/sequence_labeling/roberta/roberta_sequence_labeling.py index 9f04ece9..dccfd22c 100644 --- a/hezar/models/sequence_labeling/roberta/roberta_sequence_labeling.py +++ b/hezar/models/sequence_labeling/roberta/roberta_sequence_labeling.py @@ -74,7 +74,16 @@ def preprocess(self, inputs: Union[str, List[str]], **kwargs): normalizer = self.preprocessor["text_normalizer"] inputs = normalizer(inputs) tokenizer = self.preprocessor[self.tokenizer_name] - inputs = tokenizer(inputs, return_tensors="pt", device=self.device) + inputs = tokenizer( + inputs, + return_word_ids=True, + return_tokens=True, + return_offsets_mapping=True, + padding=True, + truncation=True, + return_tensors="pt", + device=self.device, + ) return inputs def post_process(self, inputs, **kwargs):