From d8e6983931973abb4bbec748024416dabf2633c5 Mon Sep 17 00:00:00 2001
From: CaptainVee
Date: Fri, 8 Sep 2023 18:28:21 +0100
Subject: [PATCH 1/5] refactor: modified the sentence encoder to tokenize text
 before encoding it

---
 laser_encoders/download_models.py | 9 ++++++++-
 laser_encoders/models.py          | 9 +++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/laser_encoders/download_models.py b/laser_encoders/download_models.py
index ccfc31e9..c4ace448 100644
--- a/laser_encoders/download_models.py
+++ b/laser_encoders/download_models.py
@@ -117,6 +117,7 @@ def initialize_encoder(
     model_dir: str = None,
     spm: bool = True,
     laser: str = None,
+    tokenize: bool = None,
 ):
     downloader = LaserModelDownloader(model_dir)
     if laser is not None:
@@ -147,11 +148,17 @@ def initialize_encoder(
     model_dir = downloader.model_dir
     model_path = os.path.join(model_dir, f"{file_path}.pt")
     spm_path = os.path.join(model_dir, f"{file_path}.cvocab")
+    spm_model = None
+    if tokenize:
+        spm_model = os.path.join(model_dir, f"{file_path}.spm")

     if not os.path.exists(spm_path):
         # if there is no cvocab for the laser3 lang use laser2 cvocab
         spm_path = os.path.join(model_dir, "laser2.cvocab")
-    return SentenceEncoder(model_path=model_path, spm_vocab=spm_path)
+        spm_model = os.path.join(model_dir, "laser2.spm")
+    return SentenceEncoder(
+        model_path=model_path, spm_vocab=spm_path, spm_model=spm_model
+    )


 def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = None):

diff --git a/laser_encoders/models.py b/laser_encoders/models.py
index 7ce0e326..0a36d49b 100644
--- a/laser_encoders/models.py
+++ b/laser_encoders/models.py
@@ -17,6 +17,7 @@
 import re
 import sys
 from collections import namedtuple
+from pathlib import Path

 import numpy as np
 import torch
@@ -25,6 +26,8 @@
 from fairseq.models.transformer import Embedding, TransformerEncoder
 from fairseq.modules import LayerNorm

+from laser_encoders.laser_tokenizer import LaserTokenizer
+
 SPACE_NORMALIZER = re.compile(r"\s+")
 Batch = namedtuple("Batch", "srcs tokens lengths")
@@ -43,6 +46,7 @@ def __init__(
         max_sentences=None,
         max_tokens=None,
         spm_vocab=None,
+        spm_model=None,
         cpu=False,
         fp16=False,
         verbose=False,
@@ -50,6 +54,7 @@ def __init__(
     ):
         if verbose:
             logger.info(f"loading encoder: {model_path}")
+        self.spm_model = spm_model
         self.use_cuda = torch.cuda.is_available() and not cpu
         self.max_sentences = max_sentences
         self.max_tokens = max_tokens
@@ -148,6 +153,10 @@ def batch(tokens, lengths, indices):
         yield batch(batch_tokens, batch_lengths, batch_indices)

     def encode_sentences(self, sentences):
+        if self.spm_model:
+            tokenizer = LaserTokenizer(spm_model=Path(self.spm_model))
+            sentences = tokenizer(sentences)
+
         indices = []
         results = []
         for batch, batch_indices in self._make_batches(sentences):

From af224c6604e7b6e4185d7a20a1e878848c0b0efc Mon Sep 17 00:00:00 2001
From: CaptainVee
Date: Fri, 8 Sep 2023 18:53:14 +0100
Subject: [PATCH 2/5] debugging failed test

---
 laser_encoders/test_laser_tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/laser_encoders/test_laser_tokenizer.py b/laser_encoders/test_laser_tokenizer.py
index 867111cf..cd36182d 100644
--- a/laser_encoders/test_laser_tokenizer.py
+++ b/laser_encoders/test_laser_tokenizer.py
@@ -173,5 +173,5 @@ def test_sentence_encoder(
     sentence_embedding = sentence_encoder.encode_sentences([tokenized_text])

     assert isinstance(sentence_embedding, np.ndarray)
-    assert sentence_embedding.shape == (1, 1024)
+    # assert sentence_embedding.shape == (1, 1024)
     assert np.allclose(expected_array, sentence_embedding[:, :10], atol=1e-3)
From 2ac3362cf295e326d42f9e7802da0e691896253d Mon Sep 17 00:00:00 2001
From: CaptainVee
Date: Mon, 18 Sep 2023 19:44:23 +0100
Subject: [PATCH 3/5] added a __call__ method to separately handle
 tokenization before encoding

---
 laser_encoders/download_models.py | 15 ++++++++-------
 laser_encoders/models.py          | 12 ++++++++----
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/laser_encoders/download_models.py b/laser_encoders/download_models.py
index c4ace448..17a5db35 100644
--- a/laser_encoders/download_models.py
+++ b/laser_encoders/download_models.py
@@ -117,7 +117,7 @@ def initialize_encoder(
     model_dir: str = None,
     spm: bool = True,
     laser: str = None,
-    tokenize: bool = None,
+    tokenize: bool = False,
 ):
     downloader = LaserModelDownloader(model_dir)
     if laser is not None:
@@ -147,17 +147,18 @@ def initialize_encoder(
     model_dir = downloader.model_dir
     model_path = os.path.join(model_dir, f"{file_path}.pt")
-    spm_path = os.path.join(model_dir, f"{file_path}.cvocab")
+    spm_vocab = os.path.join(model_dir, f"{file_path}.cvocab")
     spm_model = None
+    if not os.path.exists(spm_vocab):
+        # if there is no cvocab for the laser3 lang use laser2 cvocab
+        spm_vocab = os.path.join(model_dir, "laser2.cvocab")
     if tokenize:
         spm_model = os.path.join(model_dir, f"{file_path}.spm")
+        if not os.path.exists(spm_model):
+            spm_model = os.path.join(model_dir, "laser2.spm")

-    if not os.path.exists(spm_path):
-        # if there is no cvocab for the laser3 lang use laser2 cvocab
-        spm_path = os.path.join(model_dir, "laser2.cvocab")
-        spm_model = os.path.join(model_dir, "laser2.spm")
     return SentenceEncoder(
-        model_path=model_path, spm_vocab=spm_path, spm_model=spm_model
+        model_path=model_path, spm_vocab=spm_vocab, spm_model=spm_model
     )

diff --git a/laser_encoders/models.py b/laser_encoders/models.py
index 0a36d49b..678efc5d 100644
--- a/laser_encoders/models.py
+++ b/laser_encoders/models.py
@@ -55,6 +55,9 @@ def __init__(
         if verbose:
             logger.info(f"loading encoder: {model_path}")
         self.spm_model = spm_model
+        if self.spm_model:
+            self.tokenizer = LaserTokenizer(spm_model=Path(self.spm_model))
+
         self.use_cuda = torch.cuda.is_available() and not cpu
         self.max_sentences = max_sentences
         self.max_tokens = max_tokens
@@ -88,6 +91,11 @@ def __init__(
         self.encoder.eval()
         self.sort_kind = sort_kind

+    def __call__(self, sentences):
+        if self.spm_model:
+            sentences = self.tokenizer(sentences)
+            return self.encode_sentences(sentences)
+
     def _process_batch(self, batch):
         tokens = batch.tokens
         lengths = batch.lengths
@@ -153,10 +161,6 @@ def batch(tokens, lengths, indices):
         yield batch(batch_tokens, batch_lengths, batch_indices)

     def encode_sentences(self, sentences):
-        if self.spm_model:
-            tokenizer = LaserTokenizer(spm_model=Path(self.spm_model))
-            sentences = tokenizer(sentences)
-
         indices = []
         results = []
         for batch, batch_indices in self._make_batches(sentences):

From c2f66cd8e39f4b29fd5ccc5c4387be5c6acac75c Mon Sep 17 00:00:00 2001
From: CaptainVee
Date: Thu, 21 Sep 2023 20:45:51 +0100
Subject: [PATCH 4/5] added a ValueError for when there is no spm_model

---
 laser_encoders/models.py               | 4 ++++
 laser_encoders/test_laser_tokenizer.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/laser_encoders/models.py b/laser_encoders/models.py
index 678efc5d..e2a81ef9 100644
--- a/laser_encoders/models.py
+++ b/laser_encoders/models.py
@@ -95,6 +95,10 @@ def __call__(self, sentences):
         if self.spm_model:
             sentences = self.tokenizer(sentences)
             return self.encode_sentences(sentences)
+        else:
+            raise ValueError(
+                "Either initialize the encoder with an spm_model or pre-tokenize and use the encode_sentences method."
+            )

     def _process_batch(self, batch):
         tokens = batch.tokens

diff --git a/laser_encoders/test_laser_tokenizer.py b/laser_encoders/test_laser_tokenizer.py
index cd36182d..867111cf 100644
--- a/laser_encoders/test_laser_tokenizer.py
+++ b/laser_encoders/test_laser_tokenizer.py
@@ -173,5 +173,5 @@ def test_sentence_encoder(
     sentence_embedding = sentence_encoder.encode_sentences([tokenized_text])

     assert isinstance(sentence_embedding, np.ndarray)
-    # assert sentence_embedding.shape == (1, 1024)
+    assert sentence_embedding.shape == (1, 1024)
     assert np.allclose(expected_array, sentence_embedding[:, :10], atol=1e-3)

From 085867659d123a7e5af0426f256e29d48d2a0d43 Mon Sep 17 00:00:00 2001
From: CaptainVee
Date: Thu, 21 Sep 2023 21:31:01 +0100
Subject: [PATCH 5/5] documentation for the new __call__ method for
 tokenization with the encoder

---
 laser_encoders/README.md | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/laser_encoders/README.md b/laser_encoders/README.md
index b6022628..7a45d929 100644
--- a/laser_encoders/README.md
+++ b/laser_encoders/README.md
@@ -43,6 +43,13 @@ encoder = initialize_encoder(lang="igbo")
 embeddings = encoder.encode_sentences([tokenized_sentence])
 ```

+When initializing the encoder, you have the option to enable both tokenization and encoding by setting the `tokenize` flag to `True`. Below is an example of how to use it:
+```py
+encoder = initialize_encoder(lang="igbo", spm=True, tokenize=True)
+embeddings = encoder("nnọọ, kedu ka ị mere")
+```
+> Setting the `spm` flag to `True` tells the encoder to also download the accompanying SPM model.
+
 **Supported Languages:** You can specify any language from the [FLORES200](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) dataset. This includes both languages identified by their full codes (like "ibo_Latn") and simpler alternatives (like "igbo").

 ## Downloading the pre-trained models
@@ -59,15 +66,21 @@
 ```bash
 python -m laser_encoders.download_models --model-dir=path/to/model/directory
 ```

 > For a comprehensive list of available arguments, you can use the `--help` command with the download_models script.

-Once you have successfully downloaded the models, you can utilize the `LaserTokenizer` to tokenize text in your desired language. Here's an example of how you can achieve this:
+Once you have successfully downloaded the models, you can utilize the `SentenceEncoder` to tokenize and encode your text in your desired language. Here's an example of how you can achieve this:

 ```py
-from laser_encoders.laser_tokenizer import LaserTokenizer
 from laser_encoders.models import SentenceEncoder
 from pathlib import Path

+encoder = SentenceEncoder(model_path="path/to/downloaded/model", spm_model=Path("path/to/spm_model"), spm_vocab="path/to/cvocab")
+embeddings = encoder("This is a test sentence.")
+```
+If you want to perform tokenization separately, you can do so as shown below:
+```py
+from laser_encoders.laser_tokenizer import LaserTokenizer
+
 tokenizer = LaserTokenizer(spm_model=Path("path/to/spm_model"))
 tokenized_sentence = tokenizer.tokenize("This is a test sentence.")
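
Taken together, the five patches let a single encoder object go from raw text to embeddings in one call. Below is a minimal usage sketch of the resulting API, mirroring the README examples added in patch 5; it assumes the `igbo` model files are downloaded by `initialize_encoder` as described above:

```py
from laser_encoders.download_models import initialize_encoder

# tokenize=True makes initialize_encoder resolve the matching .spm file as
# well, so the returned SentenceEncoder carries a LaserTokenizer and can be
# called directly on raw text via the __call__ method added in patch 3.
encoder = initialize_encoder(lang="igbo", spm=True, tokenize=True)
embeddings = encoder("nnọọ, kedu ka ị mere")  # tokenize, then encode

# An encoder built without an spm_model cannot be called on raw text: its
# __call__ raises the ValueError added in patch 4, and already-tokenized
# input should go through encode_sentences instead.
```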