diff --git a/laser_encoders/models.py b/laser_encoders/models.py
index 678efc5d..e2a81ef9 100644
--- a/laser_encoders/models.py
+++ b/laser_encoders/models.py
@@ -95,6 +95,10 @@ def __call__(self, sentences):
         if self.spm_model:
             sentences = self.tokenizer(sentences)
             return self.encode_sentences(sentences)
+        else:
+            raise ValueError(
+                "Either initialize the encoder with an spm_model or pre-tokenize and use the encode_sentences method."
+            )
 
     def _process_batch(self, batch):
         tokens = batch.tokens
diff --git a/laser_encoders/test_laser_tokenizer.py b/laser_encoders/test_laser_tokenizer.py
index cd36182d..867111cf 100644
--- a/laser_encoders/test_laser_tokenizer.py
+++ b/laser_encoders/test_laser_tokenizer.py
@@ -173,5 +173,5 @@ def test_sentence_encoder(
     sentence_embedding = sentence_encoder.encode_sentences([tokenized_text])
 
     assert isinstance(sentence_embedding, np.ndarray)
-    # assert sentence_embedding.shape == (1, 1024)
+    assert sentence_embedding.shape == (1, 1024)
     assert np.allclose(expected_array, sentence_embedding[:, :10], atol=1e-3)
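
For reference, a minimal sketch of the behavior this change enforces. Only `__call__`, `encode_sentences`, `spm_model`, and the `(1, 1024)` output shape come from the diff itself; the constructor call and the tokenized input below are assumptions for illustration, not the exact `laser_encoders` API.

```python
import numpy as np

from laser_encoders.models import SentenceEncoder

# Hypothetical setup: the constructor argument is assumed for illustration;
# consult laser_encoders/models.py for the real signature. No spm_model is
# supplied here.
encoder = SentenceEncoder("laser2.pt")

# With this change, calling the encoder directly without an spm_model fails
# fast instead of silently encoding raw, untokenized text.
try:
    encoder(["Hello world."])
except ValueError as err:
    print(err)  # "Either initialize the encoder with an spm_model or ..."

# The supported path for pre-tokenized input remains encode_sentences.
embedding = encoder.encode_sentences(["▁hello ▁world"])  # assumed SPM tokens
assert isinstance(embedding, np.ndarray)
assert embedding.shape == (1, 1024)  # the shape the re-enabled test asserts
```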