diff --git a/laser_encoders/README.md b/laser_encoders/README.md
index a85cdef5..3020ea6b 100644
--- a/laser_encoders/README.md
+++ b/laser_encoders/README.md
@@ -25,10 +25,21 @@ You can install laser_encoders using pip:
 
 ## Usage
 
-Here's a simple example of how you can download and initialise the tokenizer and encoder with just one step.
+Here's a simple example of how to obtain embeddings for sentences using the `LaserEncoderPipeline`:
 
-**Note:** By default, the models will be downloaded to the `~/.cache/laser_encoders` directory. To specify a different download location, you can provide the argument `model_dir=path/to/model/directory` to the initialize_tokenizer and initialize_encoder functions
+>**Note:** By default, the models will be downloaded to the `~/.cache/laser_encoders` directory. To specify a different download location, you can provide the argument `model_dir=path/to/model/directory`.
 
+```py
+from laser_encoders import LaserEncoderPipeline
+
+# Initialize the LASER encoder pipeline
+encoder = LaserEncoderPipeline(lang="igbo")
+
+# Encode sentences into embeddings
+embeddings = encoder.encode_sentences(["nnọọ, kedu ka ị mere"])
+```
+
+If you prefer more control over the tokenization and encoding process, you can initialize the tokenizer and encoder separately:
 ```py
 from laser_encoders import initialize_encoder, initialize_tokenizer
 
@@ -39,16 +50,10 @@ tokenized_sentence = tokenizer.tokenize("nnọọ, kedu ka ị mere")
 
 # Initialize the LASER sentence encoder
 encoder = initialize_encoder(lang="igbo")
-# Encode sentences into embeddings
+# Encode tokenized sentences into embeddings
 embeddings = encoder.encode_sentences([tokenized_sentence])
 ```
-
-When initializing the encoder, you have the option to enable both tokenization and encoding by setting the `tokenize` flag to `True`. Below is an example of how to use it:
-```py
-encoder = initialize_encoder(lang="igbo", spm=True, tokenize=True)
-embeddings = encoder("nnọọ, kedu ka ị mere")
-```
->setting the `spm` flag to `True` tells the encoder to also download the accompanying spm model
+>By default, the `spm` flag is set to `True` when initializing the encoder, ensuring that the accompanying spm model is also downloaded.
 
 **Supported Languages:** You can specify any language from the [FLORES200](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) dataset. This includes both languages identified by their full codes (like "ibo_Latn") and simpler alternatives (like "igbo").
 
diff --git a/laser_encoders/__init__.py b/laser_encoders/__init__.py
index 75264c55..05b46186 100644
--- a/laser_encoders/__init__.py
+++ b/laser_encoders/__init__.py
@@ -12,4 +12,5 @@
 #
 # -------------------------------------------------------
 
-from laser_encoders.download_models import initialize_encoder, initialize_tokenizer
+from laser_encoders.laser_tokenizer import initialize_tokenizer
+from laser_encoders.models import LaserEncoderPipeline, initialize_encoder
diff --git a/laser_encoders/download_models.py b/laser_encoders/download_models.py
index 452501d3..1167d7c1 100644
--- a/laser_encoders/download_models.py
+++ b/laser_encoders/download_models.py
@@ -26,8 +26,6 @@
 from tqdm import tqdm
 
 from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE, SPM_LANGUAGE
-from laser_encoders.laser_tokenizer import LaserTokenizer
-from laser_encoders.models import SentenceEncoder
 
 logging.basicConfig(
     stream=sys.stdout,
@@ -121,88 +119,6 @@ def main(self, args):
         )
 
 
-def initialize_encoder(
-    lang: str = None,
-    model_dir: str = None,
-    spm: bool = True,
-    laser: str = None,
-    tokenize: bool = False,
-):
-    downloader = LaserModelDownloader(model_dir)
-    if laser is not None:
-        if laser == "laser3":
-            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
-            downloader.download_laser3(lang=lang, spm=spm)
-            file_path = f"laser3-{lang}.v1"
-        elif laser == "laser2":
-            downloader.download_laser2()
-            file_path = "laser2"
-        else:
-            raise ValueError(
-                f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
-            )
-    else:
-        lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
-        if lang in LASER3_LANGUAGE:
-            downloader.download_laser3(lang=lang, spm=spm)
-            file_path = f"laser3-{lang}.v1"
-        elif lang in LASER2_LANGUAGE:
-            downloader.download_laser2()
-            file_path = "laser2"
-        else:
-            raise ValueError(
-                f"Unsupported language name: {lang}. Please specify a supported language name."
-            )
-
-    model_dir = downloader.model_dir
-    model_path = os.path.join(model_dir, f"{file_path}.pt")
-    spm_vocab = os.path.join(model_dir, f"{file_path}.cvocab")
-    spm_model = None
-    if not os.path.exists(spm_vocab):
-        # if there is no cvocab for the laser3 lang use laser2 cvocab
-        spm_vocab = os.path.join(model_dir, "laser2.cvocab")
-    if tokenize:
-        spm_model = os.path.join(model_dir, f"{file_path}.spm")
-        if not os.path.exists(spm_model):
-            spm_model = os.path.join(model_dir, "laser2.spm")
-
-    return SentenceEncoder(
-        model_path=model_path, spm_vocab=spm_vocab, spm_model=spm_model
-    )
-
-
-def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = None):
-    downloader = LaserModelDownloader(model_dir)
-    if laser is not None:
-        if laser == "laser3":
-            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
-            if lang in SPM_LANGUAGE:
-                filename = f"laser3-{lang}.v1.spm"
-            else:
-                filename = "laser2.spm"
-        elif laser == "laser2":
-            filename = "laser2.spm"
-        else:
-            raise ValueError(
-                f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
-            )
-    else:
-        if lang in LASER3_LANGUAGE or lang in LASER2_LANGUAGE:
-            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
-            if lang in SPM_LANGUAGE:
-                filename = f"laser3-{lang}.v1.spm"
-            else:
-                filename = "laser2.spm"
-        else:
-            raise ValueError(
-                f"Unsupported language name: {lang}. Please specify a supported language name."
-            )
-
-    downloader.download(filename)
-    model_path = os.path.join(downloader.model_dir, filename)
-    return LaserTokenizer(spm_model=Path(model_path))
-
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="LASER: Download Laser models")
     parser.add_argument(
diff --git a/laser_encoders/laser_tokenizer.py b/laser_encoders/laser_tokenizer.py
index c180844b..0488cb2c 100644
--- a/laser_encoders/laser_tokenizer.py
+++ b/laser_encoders/laser_tokenizer.py
@@ -16,6 +16,7 @@
 
 import gzip
 import logging
+import os
 import re
 import sys
 from pathlib import Path
@@ -24,6 +25,9 @@
 import sentencepiece as spm
 from sacremoses import MosesDetokenizer, MosesPunctNormalizer
 
+from laser_encoders.download_models import LaserModelDownloader
+from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE, SPM_LANGUAGE
+
 SPACE_NORMALIZER = re.compile(r"\s+")
 
 logging.basicConfig(
@@ -131,3 +135,35 @@ def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
             ids.extend(token_ids)
 
         return ids
+
+
+def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = None):
+    downloader = LaserModelDownloader(model_dir)
+    if laser is not None:
+        if laser == "laser3":
+            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
+            if lang in SPM_LANGUAGE:
+                filename = f"laser3-{lang}.v1.spm"
+            else:
+                filename = "laser2.spm"
+        elif laser == "laser2":
+            filename = "laser2.spm"
+        else:
+            raise ValueError(
+                f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
+            )
+    else:
+        if lang in LASER3_LANGUAGE or lang in LASER2_LANGUAGE:
+            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
+            if lang in SPM_LANGUAGE:
+                filename = f"laser3-{lang}.v1.spm"
+            else:
+                filename = "laser2.spm"
+        else:
+            raise ValueError(
+                f"Unsupported language name: {lang}. Please specify a supported language name."
+            )
+
+    downloader.download(filename)
+    model_path = os.path.join(downloader.model_dir, filename)
+    return LaserTokenizer(spm_model=Path(model_path))
diff --git a/laser_encoders/models.py b/laser_encoders/models.py
index e2a81ef9..037a4f9f 100644
--- a/laser_encoders/models.py
+++ b/laser_encoders/models.py
@@ -14,6 +14,7 @@
 
 
 import logging
+import os
 import re
 import sys
 from collections import namedtuple
@@ -26,7 +27,9 @@
 from fairseq.models.transformer import Embedding, TransformerEncoder
 from fairseq.modules import LayerNorm
 
-from laser_encoders.laser_tokenizer import LaserTokenizer
+from laser_encoders.download_models import LaserModelDownloader
+from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE
+from laser_encoders.laser_tokenizer import LaserTokenizer, initialize_tokenizer
 
 SPACE_NORMALIZER = re.compile(r"\s+")
 Batch = namedtuple("Batch", "srcs tokens lengths")
@@ -325,3 +328,73 @@ def combine_bidir(outs):
             if encoder_padding_mask.any()
             else None,
         }
+
+
+def initialize_encoder(
+    lang: str = None,
+    model_dir: str = None,
+    spm: bool = True,
+    laser: str = None,
+):
+    downloader = LaserModelDownloader(model_dir)
+    if laser is not None:
+        if laser == "laser3":
+            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
+            downloader.download_laser3(lang=lang, spm=spm)
+            file_path = f"laser3-{lang}.v1"
+        elif laser == "laser2":
+            downloader.download_laser2()
+            file_path = "laser2"
+        else:
+            raise ValueError(
+                f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
+            )
+    else:
+        lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
+        if lang in LASER3_LANGUAGE:
+            downloader.download_laser3(lang=lang, spm=spm)
+            file_path = f"laser3-{lang}.v1"
+        elif lang in LASER2_LANGUAGE:
+            downloader.download_laser2()
+            file_path = "laser2"
+        else:
+            raise ValueError(
+                f"Unsupported language name: {lang}. Please specify a supported language name."
+            )
+
+    model_dir = downloader.model_dir
+    model_path = os.path.join(model_dir, f"{file_path}.pt")
+    spm_vocab = os.path.join(model_dir, f"{file_path}.cvocab")
+
+    if not os.path.exists(spm_vocab):
+        # if there is no cvocab for the laser3 lang use laser2 cvocab
+        spm_vocab = os.path.join(model_dir, "laser2.cvocab")
+
+    return SentenceEncoder(model_path=model_path, spm_vocab=spm_vocab, spm_model=None)
+
+
+class LaserEncoderPipeline:
+    def __init__(
+        self, lang: str, model_dir: str = None, spm: bool = True, laser: str = None
+    ):
+        self.tokenizer = initialize_tokenizer(
+            lang=lang, model_dir=model_dir, laser=laser
+        )
+        self.encoder = initialize_encoder(
+            lang=lang, model_dir=model_dir, spm=spm, laser=laser
+        )
+
+    def encode_sentences(self, sentences: list) -> list:
+        """
+        Tokenizes and encodes a list of sentences.
+
+        Args:
+        - sentences (list of str): List of sentences to tokenize and encode.
+
+        Returns:
+        - List of embeddings for each sentence.
+        """
+        tokenized_sentences = [
+            self.tokenizer.tokenize(sentence) for sentence in sentences
+        ]
+        return self.encoder.encode_sentences(tokenized_sentences)
diff --git a/laser_encoders/test_laser_tokenizer.py b/laser_encoders/test_laser_tokenizer.py
index 867111cf..1155f8d2 100644
--- a/laser_encoders/test_laser_tokenizer.py
+++ b/laser_encoders/test_laser_tokenizer.py
@@ -21,7 +21,11 @@
 import numpy as np
 import pytest
 
-from laser_encoders import initialize_encoder, initialize_tokenizer
+from laser_encoders import (
+    LaserEncoderPipeline,
+    initialize_encoder,
+    initialize_tokenizer,
+)
 
 
 @pytest.fixture
@@ -35,6 +39,27 @@ def input_text() -> str:
     return "This is a test sentence."
 
 
+@pytest.fixture
+def test_readme_params() -> dict:
+    return {
+        "lang": "igbo",
+        "input_sentences": ["nnọọ, kedu ka ị mere"],
+        "expected_embedding_shape": (1, 1024),
+        "expected_array": [
+            0.3807628,
+            -0.27941525,
+            -0.17819545,
+            0.44144684,
+            -0.38985375,
+            0.04719935,
+            0.20238206,
+            -0.03934783,
+            0.0118901,
+            0.28986093,
+        ],
+    }
+
+
 def test_tokenize(tokenizer, input_text: str):
     expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
     assert tokenizer.tokenize(input_text) == expected_output
@@ -175,3 +200,36 @@ def test_sentence_encoder(
     assert isinstance(sentence_embedding, np.ndarray)
     assert sentence_embedding.shape == (1, 1024)
     assert np.allclose(expected_array, sentence_embedding[:, :10], atol=1e-3)
+
+
+def test_laser_encoder_pipeline(tmp_path: Path, test_readme_params: dict):
+    lang = test_readme_params["lang"]
+    input_sentences = test_readme_params["input_sentences"]
+    expected_embedding_shape = test_readme_params["expected_embedding_shape"]
+    expected_array = test_readme_params["expected_array"]
+
+    encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
+    embeddings = encoder.encode_sentences(input_sentences)
+
+    assert isinstance(embeddings, np.ndarray)
+    assert embeddings.shape == expected_embedding_shape
+    assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)
+
+
+def test_separate_initialization_and_encoding(
+    tmp_path, tokenizer, test_readme_params: dict
+):
+    lang = test_readme_params["lang"]
+    input_sentences = test_readme_params["input_sentences"]
+    expected_embedding_shape = test_readme_params["expected_embedding_shape"]
+    expected_array = test_readme_params["expected_array"]
+
+    tokenized_sentence = tokenizer.tokenize(input_sentences[0])
+    sentence_encoder = initialize_encoder(model_dir=tmp_path, lang=lang)
+
+    # Encode tokenized sentences into embeddings
+    embeddings = sentence_encoder.encode_sentences([tokenized_sentence])
+
+    assert isinstance(embeddings, np.ndarray)
+    assert embeddings.shape == expected_embedding_shape
+    assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)
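
For anyone trying this change locally, here is a minimal usage sketch of the API introduced by the patch. It only relies on names that appear above (`LaserEncoderPipeline`, its `lang` and `model_dir` arguments, and `encode_sentences`); the `./laser_models` path is an arbitrary example location, and the `(1, 1024)` shape is the one asserted in the new tests, not a separately verified result.

```py
from laser_encoders import LaserEncoderPipeline

# Cache the downloaded models under a custom directory instead of the default
# ~/.cache/laser_encoders. "./laser_models" is just an illustrative path.
encoder = LaserEncoderPipeline(lang="igbo", model_dir="./laser_models")

# encode_sentences tokenizes internally and returns a numpy array of sentence
# embeddings, one 1024-dimensional row per input sentence.
embeddings = encoder.encode_sentences(["nnọọ, kedu ka ị mere"])
print(embeddings.shape)  # (1, 1024), matching the shape checked in the tests
```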