diff --git a/utils/src/sentence_split.py b/utils/src/sentence_split.py index d128dbb1..2b96baa5 100644 --- a/utils/src/sentence_split.py +++ b/utils/src/sentence_split.py @@ -9,30 +9,239 @@ import typing as tp from pathlib import Path +from botok.tokenizers import sentencetokenizer as bod_sent_tok # Indicp NLP from indicnlp import common as indic_common from indicnlp import loader as indic_loader from indicnlp.normalize.indic_normalize import IndicNormalizerFactory from indicnlp.tokenize import sentence_tokenize as indic_sent_tok - -# --- sentence splitters -# Moses-style -from sentence_splitter import SentenceSplitter - -INDIC_NLP_RESOURCES = None # apparently not needed for splitting and normalization -from botok.tokenizers import sentencetokenizer as bod_sent_tok from khmernltk import sentence_tokenize as khm_sent_tok - # pythainlp for Thai # Seahorse for Indonesian, Thai, Vietnamese # botok for tibetan # Spacy for # various tool-kits from laonlp.tokenize import sent_tokenize as lao_sent_tok +# --- sentence splitters +# Moses-style +from sentence_splitter import SentenceSplitter + +INDIC_NLP_RESOURCES = None # apparently not needed for splitting and normalization + logger = logging.getLogger("sentence_split") +# Translates "lang_Script" codes into the language codes get_split_algo works with; codes not listed pass through unchanged. +split_lang_code_map = { + "ace_Arab" : "ace_Arab", + "ace_Latn" : "ace_Latn", + "acm_Arab" : "acm", + "acq_Arab" : "acq", + "aeb_Arab" : "aeb", + "afr_Latn" : "afr", + "ajp_Arab" : "ajp", + "aka_Latn" : "aka", + "amh_Ethi" : "amh", + "apc_Arab" : "apc", + "arb_Arab" : "ara_Arab", + "arb_Latn" : "ara_Latn", + "ars_Arab" : "ars", + "ary_Arab" : "ary", + "arz_Arab" : "arz", + "asm_Beng" : "asm", + "ast_Latn" : "ast", + "awa_Deva" : "awa", + "ayr_Latn" : "ayr", + "azb_Arab" : "azb", + "azj_Latn" : "azj", + "bak_Cyrl" : "bak", + "bam_Latn" : "bam", + "ban_Latn" : "ban", + "bel_Cyrl" : "bel", + "bem_Latn" : "bem", + "ben_Beng" : "ben", + "bho_Deva" : "bho", + "bjn_Arab" : "bjn_Arab", + "bjn_Latn" : "bjn_Latn", + "bod_Tibt" : "bod", + 
"bos_Latn" : "bos", + "bug_Latn" : "bug", + "bul_Cyrl" : "bul", + "cat_Latn" : "cat", + "ceb_Latn" : "ceb", + "ces_Latn" : "ces", + "cjk_Latn" : "cjk", + "ckb_Arab" : "ckb", + "crh_Latn" : "crh_Latn", + "cym_Latn" : "cym", + "dan_Latn" : "dan", + "deu_Latn" : "deu", + "dik_Latn" : "dik", + "diq_Latn" : "diq", + "dyu_Latn" : "dyu", + "dzo_Tibt" : "dzo", + "ell_Grek" : "ell", + "eng_Latn" : "eng", + "epo_Latn" : "epo", + "est_Latn" : "est", + "eus_Latn" : "eus", + "ewe_Latn" : "ewe", + "fao_Latn" : "fao", + "pes_Arab" : "fas", + "fij_Latn" : "fij", + "fin_Latn" : "fin", + "fon_Latn" : "fon", + "fra_Latn" : "fra", + "fur_Latn" : "fur", + "fuv_Latn" : "fuv", + "gla_Latn" : "gla", + "gle_Latn" : "gle", + "glg_Latn" : "glg", + "grn_Latn" : "grn", + "guj_Gujr" : "guj", + "hat_Latn" : "hat", + "hau_Latn" : "hau", + "heb_Hebr" : "heb", + "hin_Deva" : "hin", + "hne_Deva" : "hne", + "hrv_Latn" : "hrv", + "hun_Latn" : "hun", + "hye_Armn" : "hye", + "ibo_Latn" : "ibo", + "ilo_Latn" : "ilo", + "ind_Latn" : "ind", + "isl_Latn" : "isl", + "ita_Latn" : "ita", + "jav_Latn" : "jav", + "jpn_Jpan" : "jpn", + "kab_Latn" : "kab", + "kac_Latn" : "kac", + "kam_Latn" : "kam", + "kan_Knda" : "kan", + "kas_Arab" : "kas_Arab", + "kas_Deva" : "kas_Deva", + "kat_Geor" : "kat", + "knc_Arab" : "kau_Arab", + "knc_Latn" : "kau_Latn", + "kaz_Cyrl" : "kaz", + "kbp_Latn" : "kbp", + "kea_Latn" : "kea", + "khm_Khmr" : "khm", + "kik_Latn" : "kik", + "kin_Latn" : "kin", + "kir_Cyrl" : "kir", + "kmb_Latn" : "kmb", + "kon_Latn" : "kon", + "kor_Hang" : "kor", + "kmr_Latn" : "kur", + "lao_Laoo" : "lao", + "lvs_Latn" : "lav", + "lij_Latn" : "lij", + "lim_Latn" : "lim", + "lin_Latn" : "lin", + "lit_Latn" : "lit", + "lmo_Latn" : "lmo", + "ltg_Latn" : "ltg", + "ltz_Latn" : "ltz", + "lua_Latn" : "lua", + "lug_Latn" : "lug", + "luo_Latn" : "luo", + "lus_Latn" : "lus", + "mag_Deva" : "mag", + "mai_Deva" : "mai", + "mal_Mlym" : "mal", + "mar_Deva" : "mar", + "min_Arab" : "min_Arab", + "min_Latn" : "min_Latn", + 
"mkd_Cyrl" : "mkd", + "plt_Latn" : "mlg", + "mlt_Latn" : "mlt", + "khk_Cyrl" : "mon", + "mos_Latn" : "mos", + "mri_Latn" : "mri", + "zsm_Latn" : "msa", + "mya_Mymr" : "mya", + "nld_Latn" : "nld", + "nno_Latn" : "nno", + "nob_Latn" : "nob", + "npi_Deva" : "npi", + "nso_Latn" : "nso", + "nus_Latn" : "nus", + "nya_Latn" : "nya", + "oci_Latn" : "oci", + "gaz_Latn" : "orm", + "ory_Orya" : "ory", + "pag_Latn" : "pag", + "pan_Guru" : "pan", + "pap_Latn" : "pap", + "pol_Latn" : "pol", + "por_Latn" : "por", + "prs_Arab" : "prs", + "pbt_Arab" : "pus", + "quy_Latn" : "que", + "ron_Latn" : "ron", + "run_Latn" : "run", + "rus_Cyrl" : "rus", + "sag_Latn" : "sag", + "san_Deva" : "san", + "sat_Olck" : "sat", + "scn_Latn" : "scn", + "shn_Mymr" : "shn", + "sin_Sinh" : "sin", + "slk_Latn" : "slk", + "slv_Latn" : "slv", + "smo_Latn" : "smo", + "sna_Latn" : "sna", + "snd_Arab" : "snd", + "som_Latn" : "som", + "sot_Latn" : "sot", + "spa_Latn" : "spa", + "als_Latn" : "sqi", + "srd_Latn" : "srd", + "srp_Cyrl" : "srp_Cyrl", + "ssw_Latn" : "ssw", + "sun_Latn" : "sun", + "swe_Latn" : "swe", + "swh_Latn" : "swh", + "szl_Latn" : "szl", + "tam_Taml" : "tam", + "tat_Cyrl" : "tat_Cyrl", + "tel_Telu" : "tel", + "tgk_Cyrl" : "tgk", + "tgl_Latn" : "tgl", + "tha_Thai" : "tha", + "tir_Ethi" : "tir", + "taq_Latn" : "tmh_Latn", + "taq_Tfng" : "tmh_Tfng", + "ton_Latn" : "ton", + "tpi_Latn" : "tpi", + "tsn_Latn" : "tsn", + "tso_Latn" : "tso", + "tuk_Latn" : "tuk", + "tum_Latn" : "tum", + "tur_Latn" : "tur", + "twi_Latn" : "twi", + "tzm_Tfng" : "tzm", + "uig_Arab" : "uig", + "ukr_Cyrl" : "ukr", + "umb_Latn" : "umb", + "urd_Arab" : "urd", + "uzn_Latn" : "uzb", + "vec_Latn" : "vec", + "vie_Latn" : "vie", + "war_Latn" : "war", + "wol_Latn" : "wol", + "xho_Latn" : "xho", + "ydd_Hebr" : "yid", + "yor_Latn" : "yor", + "yue_Hant" : "yue", + "zho_Hans" : "zho_Hans", + "zho_Hant" : "zho_Hant", + "zul_Latn" : "zul" +} + + # ---------------------------------- # Supported tokenization algorithms # List of supported 
languages and mapping ISO3 - > ISO2 @@ -74,14 +283,18 @@ # ---------------------------------------------- LANGS_INDIC = { "asm": "as", + "awa": "aw", "ben": "bn", "brx": "bD", "gom": "xx", "guj": "gu", "hin": "hi", "kan": "kn", + "kas": "ka", "kok": "kK", "mni": "bn", # our meitei is in bengali script, so swapped it to bengali here + "mag": "mg", + "mai": "mi", "mal": "ml", "mar": "mr", "npi": "ne", @@ -142,6 +355,9 @@ def split_burmese(line: str) -> tp.Iterable[str]: def get_split_algo(lang: str, split_algo: str) -> tp.Callable[[str], tp.Iterable[str]]: + if lang in split_lang_code_map: + lang = split_lang_code_map[lang] + # get default algorithm if requested if split_algo == "default": # use best algorithm in function of language