From ea7691cab801743670218048245141d86db0536b Mon Sep 17 00:00:00 2001
From: Kevin Heffernan
Date: Fri, 17 Nov 2023 04:05:26 -0800
Subject: [PATCH 1/2] resolve parity with MOSES-4.0 release

---
 laser_encoders/laser_tokenizer.py | 7 ++++++-
 pyproject.toml                    | 3 ++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/laser_encoders/laser_tokenizer.py b/laser_encoders/laser_tokenizer.py
index 915b70b2..5cbd2a4e 100644
--- a/laser_encoders/laser_tokenizer.py
+++ b/laser_encoders/laser_tokenizer.py
@@ -24,11 +24,13 @@
 
 import sentencepiece as spm
 from sacremoses import MosesDetokenizer, MosesPunctNormalizer
+from unicategories import categories
 
 from laser_encoders.download_models import LaserModelDownloader
 from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE, SPM_LANGUAGE
 
 SPACE_NORMALIZER = re.compile(r"\s+")
+NON_PRINT_CHARS = set(c for c in categories["C"].characters())
 
 logging.basicConfig(
     stream=sys.stdout,
@@ -59,6 +61,9 @@ def __init__(
         assert spm_model.exists(), f"spm model file: {spm_model} does not exist"
 
         self.moses_punct_normalizer = MosesPunctNormalizer(self.lang, perl_parity=True)
+        # add parity with MOSES release-4.0
+        self.moses_punct_normalizer.substitutions[21] = ("‘", r'"')
+        self.moses_punct_normalizer.substitutions[22] = ("‚", r'"')
         self.moses_detokenizer = MosesDetokenizer()
         self.spm_encoder = spm.SentencePieceProcessor(model_file=str(self.spm_model))
 
@@ -75,7 +80,7 @@ def log(self, message: str) -> None:
 
     def tokenize(self, text: str) -> str:
         # Preprocessing
-        sentence_text = "".join(c for c in text if c.isprintable)
+        sentence_text = "".join([c if c not in NON_PRINT_CHARS else " " for c in text])
         if self.normalize_punct:
             sentence_text = self.moses_punct_normalizer.normalize(sentence_text)
         if self.descape:
diff --git a/pyproject.toml b/pyproject.toml
index 74650610..6e82f8bd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,8 @@
 readme = "laser_encoders/README.md"
 requires-python = ">=3.8"
 dependencies = [
-    'sacremoses>=0.1.0',
+    'sacremoses==0.1.0',
+    'unicategories>=0.1.2',
     'sentencepiece>=0.1.99',
     'numpy>=1.21.3',
     'torch>=1.10.0',

From 77bf7fb0d23106f255645e6ae90c3d6ed6f7cf09 Mon Sep 17 00:00:00 2001
From: Kevin Heffernan
Date: Fri, 17 Nov 2023 04:25:21 -0800
Subject: [PATCH 2/2] update test

---
 laser_encoders/test_laser_tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/laser_encoders/test_laser_tokenizer.py b/laser_encoders/test_laser_tokenizer.py
index 08ad789e..666be959 100644
--- a/laser_encoders/test_laser_tokenizer.py
+++ b/laser_encoders/test_laser_tokenizer.py
@@ -100,7 +100,7 @@ def test_lowercase(tokenizer):
 
 
 def test_is_printable(tokenizer):
     test_data = "Hello, \tWorld! ABC\x1f123"
-    expected_output = "▁hel lo , ▁world ! ▁ab c 12 3"
+    expected_output = "▁hel lo , ▁world ! ▁ab c ▁12 3"
     assert tokenizer.tokenize(test_data) == expected_output
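
A note on the tokenize() fix in PATCH 1/2: the old filter `c for c in text if c.isprintable` never calls isprintable -- it tests the bound method object, which is always truthy -- so no character was ever dropped. The replacement instead maps every Unicode category-"C" character (control, format, surrogate, private-use, unassigned) to a space, which is what Moses' remove-non-printing-char.perl does. A minimal standalone sketch of the behaviour change, using the same unicategories API the patch imports and the test string from test_is_printable (the expected values below are derived by hand, not captured from a run):

    from unicategories import categories

    # All characters in Unicode general category "C", as in the patch.
    NON_PRINT_CHARS = set(c for c in categories["C"].characters())

    text = "Hello, \tWorld! ABC\x1f123"

    # Old filter: `c.isprintable` (uncalled) is always truthy,
    # so every character survives, including "\x1f".
    old = "".join(c for c in text if c.isprintable)
    assert old == text

    # New behaviour: "\t" and "\x1f" (both category Cc) become spaces.
    new = "".join([c if c not in NON_PRINT_CHARS else " " for c in text])
    assert new == "Hello,  World! ABC 123"

This is also why PATCH 2/2 adjusts the expected SPM output: "\x1f" now turns into a space, so "123" starts a fresh word and picks up the "▁" boundary marker ("▁12 3" instead of "12 3").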
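
On the substitutions override: sacremoses builds MosesPunctNormalizer.substitutions as an ordered list of (pattern, replacement) rules, and the patch overwrites entries 21 and 22 so that U+2018 (‘) and U+201A (‚) normalize to a plain double quote, matching MOSES release-4.0's normalize-punctuation.perl. Those list indices are only meaningful for one specific layout of the rule list, which is plausibly why pyproject.toml tightens sacremoses from >=0.1.0 to an exact ==0.1.0 pin in the same commit. A hedged usage sketch (the sample string and the resulting mapping are illustrative, not a captured transcript):

    from sacremoses import MosesPunctNormalizer

    norm = MosesPunctNormalizer("en", perl_parity=True)

    # Same override as the patch; assumes sacremoses==0.1.0,
    # where rules 21 and 22 sit at these positions.
    norm.substitutions[21] = ("‘", r'"')
    norm.substitutions[22] = ("‚", r'"')

    # U+2018 and U+201A now map to '"' instead of their old targets.
    print(norm.normalize("‘quoted‚"))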