Refactor initialize_encoder to LaserEncoderPipeline (#256)
* Remove 'tokenize' argument from initialize_encoder function

* Add LaserEncoderPipeline for streamlined tokenization and encoding

* docs: Update README to show use of LaserEncoderPipeline

* style: Reformat code using black

* refactor: move encoder and tokenizer initialization into respective files

* style: run black

* test: Add test for LaserEncoderPipeline
Paulooh007 authored Oct 31, 2023
1 parent e3257c1 commit e6f4805
Showing 6 changed files with 186 additions and 97 deletions.
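In short, this commit replaces the removed `tokenize=True` mode of `initialize_encoder` with a dedicated `LaserEncoderPipeline` class. A minimal before/after sketch of the API change (the old call is quoted from the README section removed below; running either requires the Igbo models to be downloadable):

```py
from laser_encoders import LaserEncoderPipeline

# Old API, removed by this commit:
#   encoder = initialize_encoder(lang="igbo", spm=True, tokenize=True)
#   embeddings = encoder("nnọọ, kedu ka ị mere")

# New API: one object owns both tokenization and encoding.
encoder = LaserEncoderPipeline(lang="igbo")
embeddings = encoder.encode_sentences(["nnọọ, kedu ka ị mere"])  # (1, 1024) array
```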
laser_encoders/README.md: 15 additions, 10 deletions

@@ -25,10 +25,21 @@ You can install laser_encoders using pip:

## Usage

-Here's a simple example of how you can download and initialise the tokenizer and encoder with just one step.
+Here's a simple example of how to obtain embeddings for sentences using the `LaserEncoderPipeline`:

-**Note:** By default, the models will be downloaded to the `~/.cache/laser_encoders` directory. To specify a different download location, you can provide the argument `model_dir=path/to/model/directory` to the initialize_tokenizer and initialize_encoder functions
+>**Note:** By default, the models will be downloaded to the `~/.cache/laser_encoders` directory. To specify a different download location, you can provide the argument `model_dir=path/to/model/directory`
+```py
+from laser_encoders import LaserEncoderPipeline
+
+# Initialize the LASER encoder pipeline
+encoder = LaserEncoderPipeline(lang="igbo")
+
+# Encode sentences into embeddings
+embeddings = encoder.encode_sentences(["nnọọ, kedu ka ị mere"])
+```

+If you prefer more control over the tokenization and encoding process, you can initialize the tokenizer and encoder separately:
+```py
from laser_encoders import initialize_encoder, initialize_tokenizer

@@ -39,16 +50,10 @@ tokenized_sentence = tokenizer.tokenize("nnọọ, kedu ka ị mere")
# Initialize the LASER sentence encoder
encoder = initialize_encoder(lang="igbo")

-# Encode sentences into embeddings
+# Encode tokenized sentences into embeddings
embeddings = encoder.encode_sentences([tokenized_sentence])
```

-When initializing the encoder, you have the option to enable both tokenization and encoding by setting the `tokenize` flag to `True`. Below is an example of how to use it:
-```py
-encoder = initialize_encoder(lang="igbo", spm=True, tokenize=True)
-embeddings = encoder("nnọọ, kedu ka ị mere")
-```
->setting the `spm` flag to `True` tells the encoder to also download the accompanying spm model
+>By default, the `spm` flag is set to `True` when initializing the encoder, ensuring the accompanying spm model is downloaded.
**Supported Languages:** You can specify any language from the [FLORES200](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) dataset. This includes both languages identified by their full codes (like "ibo_Latn") and simpler alternatives (like "igbo").

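As a concrete illustration of the language-name flexibility noted above, both spellings below should resolve to the same underlying model (a sketch, not part of the committed README):

```py
from laser_encoders import LaserEncoderPipeline

# FLORES200 code and plain-English name map to the same LASER3 model.
encoder_full_code = LaserEncoderPipeline(lang="ibo_Latn")
encoder_simple_name = LaserEncoderPipeline(lang="igbo")
```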
laser_encoders/__init__.py: 2 additions, 1 deletion

@@ -12,4 +12,5 @@
#
# -------------------------------------------------------

-from laser_encoders.download_models import initialize_encoder, initialize_tokenizer
+from laser_encoders.laser_tokenizer import initialize_tokenizer
+from laser_encoders.models import LaserEncoderPipeline, initialize_encoder
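The effect of this hunk is that the existing helpers stay importable from the package root while the new pipeline class joins them, even though their definitions moved; the updated test imports later in this commit rely on exactly this. A quick sanity sketch:

```py
# All three public entry points resolve from the package root, even though
# initialize_tokenizer and initialize_encoder now live in laser_tokenizer.py
# and models.py rather than download_models.py.
from laser_encoders import (
    LaserEncoderPipeline,
    initialize_encoder,
    initialize_tokenizer,
)
```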
laser_encoders/download_models.py: 0 additions, 84 deletions

@@ -26,8 +26,6 @@
from tqdm import tqdm

from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE, SPM_LANGUAGE
-from laser_encoders.laser_tokenizer import LaserTokenizer
-from laser_encoders.models import SentenceEncoder

logging.basicConfig(
    stream=sys.stdout,
@@ -121,88 +119,6 @@ def main(self, args):
)


-def initialize_encoder(
-    lang: str = None,
-    model_dir: str = None,
-    spm: bool = True,
-    laser: str = None,
-    tokenize: bool = False,
-):
-    downloader = LaserModelDownloader(model_dir)
-    if laser is not None:
-        if laser == "laser3":
-            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
-            downloader.download_laser3(lang=lang, spm=spm)
-            file_path = f"laser3-{lang}.v1"
-        elif laser == "laser2":
-            downloader.download_laser2()
-            file_path = "laser2"
-        else:
-            raise ValueError(
-                f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
-            )
-    else:
-        lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
-        if lang in LASER3_LANGUAGE:
-            downloader.download_laser3(lang=lang, spm=spm)
-            file_path = f"laser3-{lang}.v1"
-        elif lang in LASER2_LANGUAGE:
-            downloader.download_laser2()
-            file_path = "laser2"
-        else:
-            raise ValueError(
-                f"Unsupported language name: {lang}. Please specify a supported language name."
-            )
-
-    model_dir = downloader.model_dir
-    model_path = os.path.join(model_dir, f"{file_path}.pt")
-    spm_vocab = os.path.join(model_dir, f"{file_path}.cvocab")
-    spm_model = None
-    if not os.path.exists(spm_vocab):
-        # if there is no cvocab for the laser3 lang use laser2 cvocab
-        spm_vocab = os.path.join(model_dir, "laser2.cvocab")
-    if tokenize:
-        spm_model = os.path.join(model_dir, f"{file_path}.spm")
-        if not os.path.exists(spm_model):
-            spm_model = os.path.join(model_dir, "laser2.spm")
-
-    return SentenceEncoder(
-        model_path=model_path, spm_vocab=spm_vocab, spm_model=spm_model
-    )
-
-
-def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = None):
-    downloader = LaserModelDownloader(model_dir)
-    if laser is not None:
-        if laser == "laser3":
-            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
-            if lang in SPM_LANGUAGE:
-                filename = f"laser3-{lang}.v1.spm"
-            else:
-                filename = "laser2.spm"
-        elif laser == "laser2":
-            filename = "laser2.spm"
-        else:
-            raise ValueError(
-                f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
-            )
-    else:
-        if lang in LASER3_LANGUAGE or lang in LASER2_LANGUAGE:
-            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
-            if lang in SPM_LANGUAGE:
-                filename = f"laser3-{lang}.v1.spm"
-            else:
-                filename = "laser2.spm"
-        else:
-            raise ValueError(
-                f"Unsupported language name: {lang}. Please specify a supported language name."
-            )
-
-    downloader.download(filename)
-    model_path = os.path.join(downloader.model_dir, filename)
-    return LaserTokenizer(spm_model=Path(model_path))


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="LASER: Download Laser models")
parser.add_argument(
Expand Down
laser_encoders/laser_tokenizer.py: 36 additions, 0 deletions

@@ -16,6 +16,7 @@

import gzip
import logging
+import os
import re
import sys
from pathlib import Path
@@ -24,6 +25,9 @@
import sentencepiece as spm
from sacremoses import MosesDetokenizer, MosesPunctNormalizer

+from laser_encoders.download_models import LaserModelDownloader
+from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE, SPM_LANGUAGE

SPACE_NORMALIZER = re.compile(r"\s+")

logging.basicConfig(
@@ -131,3 +135,35 @@ def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
            ids.extend(token_ids)

        return ids


+def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = None):
+    downloader = LaserModelDownloader(model_dir)
+    if laser is not None:
+        if laser == "laser3":
+            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
+            if lang in SPM_LANGUAGE:
+                filename = f"laser3-{lang}.v1.spm"
+            else:
+                filename = "laser2.spm"
+        elif laser == "laser2":
+            filename = "laser2.spm"
+        else:
+            raise ValueError(
+                f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
+            )
+    else:
+        if lang in LASER3_LANGUAGE or lang in LASER2_LANGUAGE:
+            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
+            if lang in SPM_LANGUAGE:
+                filename = f"laser3-{lang}.v1.spm"
+            else:
+                filename = "laser2.spm"
+        else:
+            raise ValueError(
+                f"Unsupported language name: {lang}. Please specify a supported language name."
+            )
+
+    downloader.download(filename)
+    model_path = os.path.join(downloader.model_dir, filename)
+    return LaserTokenizer(spm_model=Path(model_path))
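The relocated `initialize_tokenizer` keeps its previous behavior: it maps a language name to the matching SentencePiece model (falling back to `laser2.spm` when no language-specific `.spm` file exists), downloads it, and wraps it in a `LaserTokenizer`. A usage sketch, assuming the first call can download the model:

```py
from laser_encoders import initialize_tokenizer

tokenizer = initialize_tokenizer(lang="igbo")

# tokenize() returns the SentencePiece pieces as one space-joined string,
# e.g. "▁this ▁is ▁a ▁test ▁sent ence ." for English input (per the tests).
tokens = tokenizer.tokenize("nnọọ, kedu ka ị mere")
```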
laser_encoders/models.py: 74 additions, 1 deletion

@@ -14,6 +14,7 @@


import logging
+import os
import re
import sys
from collections import namedtuple
@@ -26,7 +27,9 @@
from fairseq.models.transformer import Embedding, TransformerEncoder
from fairseq.modules import LayerNorm

-from laser_encoders.laser_tokenizer import LaserTokenizer
+from laser_encoders.download_models import LaserModelDownloader
+from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE
+from laser_encoders.laser_tokenizer import LaserTokenizer, initialize_tokenizer

SPACE_NORMALIZER = re.compile(r"\s+")
Batch = namedtuple("Batch", "srcs tokens lengths")
@@ -325,3 +328,73 @@ def combine_bidir(outs):
            if encoder_padding_mask.any()
            else None,
        }


+def initialize_encoder(
+    lang: str = None,
+    model_dir: str = None,
+    spm: bool = True,
+    laser: str = None,
+):
+    downloader = LaserModelDownloader(model_dir)
+    if laser is not None:
+        if laser == "laser3":
+            lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
+            downloader.download_laser3(lang=lang, spm=spm)
+            file_path = f"laser3-{lang}.v1"
+        elif laser == "laser2":
+            downloader.download_laser2()
+            file_path = "laser2"
+        else:
+            raise ValueError(
+                f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
+            )
+    else:
+        lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
+        if lang in LASER3_LANGUAGE:
+            downloader.download_laser3(lang=lang, spm=spm)
+            file_path = f"laser3-{lang}.v1"
+        elif lang in LASER2_LANGUAGE:
+            downloader.download_laser2()
+            file_path = "laser2"
+        else:
+            raise ValueError(
+                f"Unsupported language name: {lang}. Please specify a supported language name."
+            )
+
+    model_dir = downloader.model_dir
+    model_path = os.path.join(model_dir, f"{file_path}.pt")
+    spm_vocab = os.path.join(model_dir, f"{file_path}.cvocab")
+
+    if not os.path.exists(spm_vocab):
+        # if there is no cvocab for the laser3 lang use laser2 cvocab
+        spm_vocab = os.path.join(model_dir, "laser2.cvocab")
+
+    return SentenceEncoder(model_path=model_path, spm_vocab=spm_vocab, spm_model=None)
+
+
+class LaserEncoderPipeline:
+    def __init__(
+        self, lang: str, model_dir: str = None, spm: bool = True, laser: str = None
+    ):
+        self.tokenizer = initialize_tokenizer(
+            lang=lang, model_dir=model_dir, laser=laser
+        )
+        self.encoder = initialize_encoder(
+            lang=lang, model_dir=model_dir, spm=spm, laser=laser
+        )
+
+    def encode_sentences(self, sentences: list) -> list:
+        """
+        Tokenizes and encodes a list of sentences.
+        Args:
+        - sentences (list of str): List of sentences to tokenize and encode.
+        Returns:
+        - List of embeddings for each sentence.
+        """
+        tokenized_sentences = [
+            self.tokenizer.tokenize(sentence) for sentence in sentences
+        ]
+        return self.encoder.encode_sentences(tokenized_sentences)
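Because the relocated `initialize_encoder` now always constructs the `SentenceEncoder` with `spm_model=None`, a bare encoder expects pre-tokenized input, and `LaserEncoderPipeline` restores end-to-end behavior by tokenizing first. The two paths should therefore agree, which is what the new tests below check against a shared reference embedding. A sketch of that equivalence:

```py
import numpy as np

from laser_encoders import (
    LaserEncoderPipeline,
    initialize_encoder,
    initialize_tokenizer,
)

sentence = "nnọọ, kedu ka ị mere"

# Path 1: the new one-step pipeline.
via_pipeline = LaserEncoderPipeline(lang="igbo").encode_sentences([sentence])

# Path 2: explicit tokenize-then-encode.
tokenizer = initialize_tokenizer(lang="igbo")
encoder = initialize_encoder(lang="igbo")
via_separate = encoder.encode_sentences([tokenizer.tokenize(sentence)])

# Both are (1, 1024) arrays and should match to the tests' tolerance.
assert np.allclose(via_pipeline, via_separate, atol=1e-3)
```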
laser_encoders/test_laser_tokenizer.py: 59 additions, 1 deletion

@@ -21,7 +21,11 @@
import numpy as np
import pytest

-from laser_encoders import initialize_encoder, initialize_tokenizer
+from laser_encoders import (
+    LaserEncoderPipeline,
+    initialize_encoder,
+    initialize_tokenizer,
+)


@pytest.fixture
@@ -35,6 +39,27 @@ def input_text() -> str:
return "This is a test sentence."


+@pytest.fixture
+def test_readme_params() -> dict:
+    return {
+        "lang": "igbo",
+        "input_sentences": ["nnọọ, kedu ka ị mere"],
+        "expected_embedding_shape": (1, 1024),
+        "expected_array": [
+            0.3807628,
+            -0.27941525,
+            -0.17819545,
+            0.44144684,
+            -0.38985375,
+            0.04719935,
+            0.20238206,
+            -0.03934783,
+            0.0118901,
+            0.28986093,
+        ],
+    }


def test_tokenize(tokenizer, input_text: str):
    expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
    assert tokenizer.tokenize(input_text) == expected_output
@@ -175,3 +200,36 @@ def test_sentence_encoder(
    assert isinstance(sentence_embedding, np.ndarray)
    assert sentence_embedding.shape == (1, 1024)
    assert np.allclose(expected_array, sentence_embedding[:, :10], atol=1e-3)


+def test_laser_encoder_pipeline(tmp_path: Path, test_readme_params: dict):
+    lang = test_readme_params["lang"]
+    input_sentences = test_readme_params["input_sentences"]
+    expected_embedding_shape = test_readme_params["expected_embedding_shape"]
+    expected_array = test_readme_params["expected_array"]
+
+    encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
+    embeddings = encoder.encode_sentences(input_sentences)
+
+    assert isinstance(embeddings, np.ndarray)
+    assert embeddings.shape == expected_embedding_shape
+    assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)
+
+
+def test_separate_initialization_and_encoding(
+    tmp_path, tokenizer, test_readme_params: dict
+):
+    lang = test_readme_params["lang"]
+    input_sentences = test_readme_params["input_sentences"]
+    expected_embedding_shape = test_readme_params["expected_embedding_shape"]
+    expected_array = test_readme_params["expected_array"]
+
+    tokenized_sentence = tokenizer.tokenize(input_sentences[0])
+    sentence_encoder = initialize_encoder(model_dir=tmp_path, lang=lang)
+
+    # Encode tokenized sentences into embeddings
+    embeddings = sentence_encoder.encode_sentences([tokenized_sentence])
+
+    assert isinstance(embeddings, np.ndarray)
+    assert embeddings.shape == expected_embedding_shape
+    assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)
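Assuming a standard pytest setup, the two new tests can be run in isolation with pytest's `-k` name filter (the first run downloads models into the test's `tmp_path`, so it needs network access):

```py
import pytest

# Select tests whose names contain the given substrings; returns a nonzero
# exit code on failure.
pytest.main(["-k", "pipeline or separate_initialization", "laser_encoders/test_laser_tokenizer.py"])
```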
