sacrebleu[feat]: Add ko-mecab tokenizer
stancld committed Sep 10, 2023
1 parent c1a10cc commit 2bd8c6e
Showing 4 changed files with 49 additions and 4 deletions.
2 changes: 2 additions & 0 deletions requirements/text.txt
@@ -6,4 +6,6 @@ tqdm >=4.41.0, <=4.66.1
regex >=2021.9.24, <=2023.8.8
transformers >4.4.0, <4.30.3
mecab-python3 >= 1.0.6, <1.1.0
mecab-ko >= 1.0.0, <1.1.0
mecab-ko-dic >= 1.0.0, <1.1.0
ipadic >= 1.0.0, <1.1.0
40 changes: 36 additions & 4 deletions src/torchmetrics/functional/text/sacre_bleu.py
@@ -46,10 +46,16 @@
from typing_extensions import Literal

from torchmetrics.functional.text.bleu import _bleu_score_compute, _bleu_score_update
from torchmetrics.utilities.imports import _IPADIC_AVAILABLE, _MECAB_AVAILABLE, _REGEX_AVAILABLE
from torchmetrics.utilities.imports import (
_IPADIC_AVAILABLE,
_MECAB_AVAILABLE,
_MECAB_KO_AVAILABLE,
_MECAB_KO_DIC_AVAILABLE,
_REGEX_AVAILABLE,
)

AVAILABLE_TOKENIZERS = ("none", "13a", "zh", "intl", "char", "ja-mecab")
Tokenizers = Literal["none", "13a", "zh", "intl", "char", "ja-mecab"]
AVAILABLE_TOKENIZERS = ("none", "13a", "zh", "intl", "char", "ja-mecab", "ko-mecab")
Tokenizers = Literal["none", "13a", "zh", "intl", "char", "ja-mecab", "ko-mecab"]

_UCODE_RANGES = (
("\u3400", "\u4db5"), # CJK Unified Ideographs Extension A, release 3.0
@@ -118,6 +124,7 @@ class _SacreBLEUTokenizer:
"intl": "_tokenize_international",
"char": "_tokenize_char",
"ja-mecab": "_tokenize_ja_mecab",
"ko-mecab": "_tokenize_ko_mecab",
}

def __init__(self, tokenize: Tokenizers, lowercase: bool = False) -> None:
@@ -297,6 +304,25 @@ def _tokenize_ja_mecab(cls, line: str) -> str:
line = line.strip()
return tagger.parse(line).strip()

@classmethod
def _tokenize_ko_mecab(cls, line: str) -> str:
"""Tokenizes a Korean string line using MeCab-korean morphological analyzer.
Args:
line: the input string to tokenize.
Return:
The tokenized string.
"""
import mecab_ko
import mecab_ko_dic

tagger = mecab_ko.Tagger(mecab_ko_dic.MECAB_ARGS + " -Owakati")

line = line.strip()
return tagger.parse(line).strip()
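
For context, a minimal sketch of what this method produces when run on its own, reusing the `mecab_ko_dic.MECAB_ARGS + " -Owakati"` setup from the diff; the sample sentence is taken from the new test below, and the segmentation shown in the comment is illustrative rather than verified output:

import mecab_ko
import mecab_ko_dic

# -Owakati asks MeCab to emit space-separated morphemes instead of full POS annotations.
tagger = mecab_ko.Tagger(mecab_ko_dic.MECAB_ARGS + " -Owakati")
print(tagger.parse("이 책은 정말 재미있어요.").strip())
# e.g. "이 책 은 정말 재미있 어요 ." (illustrative)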

@staticmethod
def _lower(line: str, lowercase: bool) -> str:
if lowercase:
@@ -325,6 +351,12 @@ def _check_tokenizers_validity(cls, tokenize: Tokenizers) -> None:
" Use `pip install mecab-python3 ipadic` or `pip install torchmetrics[text]`."
)

if tokenize == "ko-mecab" and not (_MECAB_KO_AVAILABLE and _MECAB_KO_DIC_AVAILABLE):
raise ModuleNotFoundError(
"`'ko-mecab'` tokenization requires that `mecab_ko` and `mecab_ko_dic` are installed."
" Use `pip install mecab_ko mecab_ko_dic` or `pip install torchmetrics[text]`."
)


def sacre_bleu_score(
preds: Sequence[str],
@@ -345,7 +377,7 @@
n_gram: Gram value ranged from 1 to 4
smooth: Whether to apply smoothing - see [2]
tokenize: Tokenization technique to be used.
Supported tokenization: ['none', '13a', 'zh', 'intl', 'char', 'ja-mecab']
Supported tokenization: ['none', '13a', 'zh', 'intl', 'char', 'ja-mecab', 'ko-mecab']
lowercase: If ``True``, BLEU score over lowercased text is calculated.
weights:
Weights used for unigrams, bigrams, etc. to calculate BLEU score.
2 changes: 2 additions & 0 deletions src/torchmetrics/utilities/imports.py
@@ -59,6 +59,8 @@
_PIQ_GREATER_EQUAL_0_8: Optional[bool] = compare_version("piq", operator.ge, "0.8.0")
_FASTER_COCO_EVAL_AVAILABLE: bool = package_available("faster_coco_eval")
_MECAB_AVAILABLE: bool = package_available("MeCab")
_MECAB_KO_AVAILABLE: bool = package_available("mecab_ko")
_MECAB_KO_DIC_AVAILABLE: bool = package_available("mecab_ko_dic")
_IPADIC_AVAILABLE: bool = package_available("ipadic")

_LATEX_AVAILABLE: bool = shutil.which("latex") is not None
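
A small sketch of how these availability flags gate the new tokenizer (mirroring the `_check_tokenizers_validity` guard added above); the flag names are taken from this diff, while the snippet itself is illustrative:

from torchmetrics.utilities.imports import _MECAB_KO_AVAILABLE, _MECAB_KO_DIC_AVAILABLE

# Both the binding and its dictionary must be importable before "ko-mecab" can be used.
if not (_MECAB_KO_AVAILABLE and _MECAB_KO_DIC_AVAILABLE):
    raise ModuleNotFoundError("Install `mecab_ko` and `mecab_ko_dic` to use tokenize='ko-mecab'.")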
9 changes: 9 additions & 0 deletions tests/unittests/text/test_sacre_bleu.py
@@ -115,3 +115,12 @@ def test_tokenize_ja_mecab():
preds = ["これは美しい花です。"]
targets = [["これは美しい花です。", "おいしい寿司を食べたい。"]]
assert sacrebleu(preds, targets) == _sacrebleu_fn(preds, targets, tokenize="ja-mecab", lowercase=False)


def test_tokenize_ko_mecab():
"""Test that `ja-mecab` tokenizer works on a Japanese text in alignment with the SacreBleu implementation."""
sacrebleu = SacreBLEUScore(tokenize="ko-mecab")

preds = ["이 책은 정말 재미있어요."]
targets = [["이 책은 정말 재미있어요.", "고마워요, 너무 도와줘서."]]
assert sacrebleu(preds, targets) == _sacrebleu_fn(preds, targets, tokenize="ko-mecab", lowercase=False)
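
For reference, a hedged end-to-end sketch of the new tokenizer through the public functional API; `sacre_bleu_score` is torchmetrics' existing entry point, and the perfect score assumes the prediction exactly matches the reference:

from torchmetrics.functional.text import sacre_bleu_score

preds = ["이 책은 정말 재미있어요."]
target = [["이 책은 정말 재미있어요."]]
# An identical prediction and reference should yield a perfect score.
print(sacre_bleu_score(preds, target, tokenize="ko-mecab"))  # tensor(1.)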
