sacrebleu[feat]: Add ko-mecab tokenizer
stancld committed Sep 10, 2023
1 parent c1a10cc commit 2bd8c6e
Showing 4 changed files with 49 additions and 4 deletions.
2 changes: 2 additions & 0 deletions requirements/text.txt
@@ -6,4 +6,6 @@ tqdm >=4.41.0, <=4.66.1
regex >=2021.9.24, <=2023.8.8
transformers >4.4.0, <4.30.3
mecab-python3 >= 1.0.6, <1.1.0
mecab-ko >= 1.0.0, <1.1.0
mecab-ko-dic >= 1.0.0, <1.1.0
ipadic >= 1.0.0, <1.1.0
40 changes: 36 additions & 4 deletions src/torchmetrics/functional/text/sacre_bleu.py
@@ -46,10 +46,16 @@
from typing_extensions import Literal

from torchmetrics.functional.text.bleu import _bleu_score_compute, _bleu_score_update
from torchmetrics.utilities.imports import _IPADIC_AVAILABLE, _MECAB_AVAILABLE, _REGEX_AVAILABLE
from torchmetrics.utilities.imports import (
_IPADIC_AVAILABLE,
_MECAB_AVAILABLE,
_MECAB_KO_AVAILABLE,
_MECAB_KO_DIC_AVAILABLE,
_REGEX_AVAILABLE,
)

AVAILABLE_TOKENIZERS = ("none", "13a", "zh", "intl", "char", "ja-mecab")
Tokenizers = Literal["none", "13a", "zh", "intl", "char", "ja-mecab"]
AVAILABLE_TOKENIZERS = ("none", "13a", "zh", "intl", "char", "ja-mecab", "ko-mecab")
Tokenizers = Literal["none", "13a", "zh", "intl", "char", "ja-mecab", "ko-mecab"]

_UCODE_RANGES = (
("\u3400", "\u4db5"), # CJK Unified Ideographs Extension A, release 3.0
@@ -118,6 +124,7 @@ class _SacreBLEUTokenizer:
"intl": "_tokenize_international",
"char": "_tokenize_char",
"ja-mecab": "_tokenize_ja_mecab",
"ko-mecab": "_tokenize_ko_mecab",
}

def __init__(self, tokenize: Tokenizers, lowercase: bool = False) -> None:
@@ -297,6 +304,25 @@ def _tokenize_ja_mecab(cls, line: str) -> str:
line = line.strip()
return tagger.parse(line).strip()

@classmethod
def _tokenize_ko_mecab(cls, line: str) -> str:
"""Tokenizes a Korean string line using MeCab-korean morphological analyzer.
Args:
line: the input string to tokenize.
Return:
The tokenized string.
"""
import mecab_ko
import mecab_ko_dic

tagger = mecab_ko.Tagger(mecab_ko_dic.MECAB_ARGS + " -Owakati")

line = line.strip()
return tagger.parse(line).strip()
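
For context, a minimal sketch of what this method produces when run on its own, reusing the `mecab_ko_dic.MECAB_ARGS + " -Owakati"` setup from the diff; the sample sentence is taken from the new test below, and the segmentation shown in the comment is illustrative rather than verified output:

import mecab_ko
import mecab_ko_dic

# -Owakati asks MeCab to emit space-separated morphemes instead of full POS annotations.
tagger = mecab_ko.Tagger(mecab_ko_dic.MECAB_ARGS + " -Owakati")
print(tagger.parse("이 책은 정말 재미있어요.").strip())
# e.g. "이 책 은 정말 재미있 어요 ." (illustrative)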

@staticmethod
def _lower(line: str, lowercase: bool) -> str:
if lowercase:
@@ -325,6 +351,12 @@ def _check_tokenizers_validity(cls, tokenize: Tokenizers) -> None:
" Use `pip install mecab-python3 ipadic` or `pip install torchmetrics[text]`."
)

if tokenize == "ko-mecab" and not (_MECAB_KO_AVAILABLE and _MECAB_KO_DIC_AVAILABLE):
raise ModuleNotFoundError(
"`'ko-mecab'` tokenization requires that `mecab_ko` and `mecab_ko_dic` are installed."
" Use `pip install mecab_ko mecab_ko_dic` or `pip install torchmetrics[text]`."
)


def sacre_bleu_score(
preds: Sequence[str],
@@ -345,7 +377,7 @@
n_gram: Gram value ranged from 1 to 4
smooth: Whether to apply smoothing - see [2]
tokenize: Tokenization technique to be used.
Supported tokenization: ['none', '13a', 'zh', 'intl', 'char', 'ja-mecab']
Supported tokenization: ['none', '13a', 'zh', 'intl', 'char', 'ja-mecab', 'ko-mecab']
lowercase: If ``True``, BLEU score over lowercased text is calculated.
weights:
Weights used for unigrams, bigrams, etc. to calculate BLEU score.
2 changes: 2 additions & 0 deletions src/torchmetrics/utilities/imports.py
@@ -59,6 +59,8 @@
_PIQ_GREATER_EQUAL_0_8: Optional[bool] = compare_version("piq", operator.ge, "0.8.0")
_FASTER_COCO_EVAL_AVAILABLE: bool = package_available("faster_coco_eval")
_MECAB_AVAILABLE: bool = package_available("MeCab")
_MECAB_KO_AVAILABLE: bool = package_available("mecab_ko")
_MECAB_KO_DIC_AVAILABLE: bool = package_available("mecab_ko_dic")
_IPADIC_AVAILABLE: bool = package_available("ipadic")

_LATEX_AVAILABLE: bool = shutil.which("latex") is not None
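
A small sketch of how these availability flags gate the new tokenizer (mirroring the `_check_tokenizers_validity` guard added above); the flag names are taken from this diff, while the snippet itself is illustrative:

from torchmetrics.utilities.imports import _MECAB_KO_AVAILABLE, _MECAB_KO_DIC_AVAILABLE

# Both the binding and its dictionary must be importable before "ko-mecab" can be used.
if not (_MECAB_KO_AVAILABLE and _MECAB_KO_DIC_AVAILABLE):
    raise ModuleNotFoundError("Install `mecab_ko` and `mecab_ko_dic` to use tokenize='ko-mecab'.")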
9 changes: 9 additions & 0 deletions tests/unittests/text/test_sacre_bleu.py
@@ -115,3 +115,12 @@ def test_tokenize_ja_mecab():
preds = ["これは美しい花です。"]
targets = [["これは美しい花です。", "おいしい寿司を食べたい。"]]
assert sacrebleu(preds, targets) == _sacrebleu_fn(preds, targets, tokenize="ja-mecab", lowercase=False)


def test_tokenize_ko_mecab():
"""Test that `ja-mecab` tokenizer works on a Japanese text in alignment with the SacreBleu implementation."""
sacrebleu = SacreBLEUScore(tokenize="ko-mecab")

preds = ["이 책은 정말 재미있어요."]
targets = [["이 책은 정말 재미있어요.", "고마워요, 너무 도와줘서."]]
assert sacrebleu(preds, targets) == _sacrebleu_fn(preds, targets, tokenize="ko-mecab", lowercase=False)
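
For reference, a hedged end-to-end sketch of the new tokenizer through the public functional API; `sacre_bleu_score` is torchmetrics' existing entry point, and the perfect score assumes the prediction exactly matches the reference:

from torchmetrics.functional.text import sacre_bleu_score

preds = ["이 책은 정말 재미있어요."]
target = [["이 책은 정말 재미있어요."]]
# An identical prediction and reference should yield a perfect score.
print(sacre_bleu_score(preds, target, tokenize="ko-mecab"))  # tensor(1.)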
