diff --git a/nisaba/scripts/natural_translit/brahmic/deromanize/BUILD.bazel b/nisaba/scripts/natural_translit/brahmic/deromanize/BUILD.bazel new file mode 100644 index 00000000..1c8dd53f --- /dev/null +++ b/nisaba/scripts/natural_translit/brahmic/deromanize/BUILD.bazel @@ -0,0 +1,87 @@ +# Copyright 2023 Nisaba Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load("@rules_python//python:py_library.bzl", "py_library") +load( + "//nisaba/scripts/utils:grammars.bzl", + "nisaba_compile_multi_grm_py", +) + +package(default_applicable_licenses = [ +]) + +py_library( + name = "typ2brh", + srcs = ["typ2brh.py"], + deps = [ + "//nisaba/scripts/natural_translit/brahmic:iso_inventory", + "//nisaba/scripts/natural_translit/utils:log_op", + "@org_opengrm_pynini//pynini", + ], +) + +py_library( + name = "ltn2brh", + srcs = ["ltn2brh.py"], + deps = [ + "//nisaba/scripts/natural_translit/brahmic:iso_inventory", + "//nisaba/scripts/natural_translit/latin:ltn_inventory", + "//nisaba/scripts/natural_translit/utils:inventory2", + "@org_opengrm_pynini//pynini", + ], +) + +py_library( + name = "fst_builder", + srcs = ["fst_builder.py"], + deps = [ + ":ltn2brh", + ":typ2brh", + "//nisaba/scripts/natural_translit/latin:ltn_inventory", + "//nisaba/scripts/natural_translit/script:char", + "//nisaba/scripts/natural_translit/utils:inventory2", + "//nisaba/scripts/natural_translit/utils:list_op", + "//nisaba/scripts/natural_translit/utils:rewrite_functions", + 
"//nisaba/scripts/utils:rewrite", + "@org_opengrm_pynini//pynini", + ], +) + +nisaba_compile_multi_grm_py( + name = "hi", + outs = { + "byte": "hi.far", + "utf8": "hi_utf8.far", + }, + visibility = ["//visibility:public"], + deps = [ + ":fst_builder", + ":ltn2brh", + "@org_opengrm_pynini//pynini", + ], +) + +nisaba_compile_multi_grm_py( + name = "ta", + outs = { + "byte": "ta.far", + "utf8": "ta_utf8.far", + }, + visibility = ["//visibility:public"], + deps = [ + ":fst_builder", + ":ltn2brh", + "@org_opengrm_pynini//pynini", + ], +) diff --git a/nisaba/scripts/natural_translit/brahmic/deromanize/fst_builder.py b/nisaba/scripts/natural_translit/brahmic/deromanize/fst_builder.py new file mode 100644 index 00000000..ace2aa47 --- /dev/null +++ b/nisaba/scripts/natural_translit/brahmic/deromanize/fst_builder.py @@ -0,0 +1,400 @@ +# Copyright 2023 Nisaba Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Brahmic deromanizer.""" + +import pynini as pyn +from nisaba.scripts.natural_translit.brahmic import iso_inventory +from nisaba.scripts.natural_translit.brahmic.deromanize import ltn2brh as l2b +from nisaba.scripts.natural_translit.brahmic.deromanize import typ2brh as t2b +from nisaba.scripts.natural_translit.latin import ltn_inventory +from nisaba.scripts.natural_translit.script import char as c +from nisaba.scripts.natural_translit.utils import inventory2 +from nisaba.scripts.natural_translit.utils import list_op as ls +from nisaba.scripts.natural_translit.utils import rewrite_functions as rw +from nisaba.scripts.natural_translit.utils import type_op as ty +from nisaba.scripts.utils import rewrite + +ltn = ltn_inventory.GRAPHEME_INVENTORY +iso = iso_inventory.TRANSLIT_INVENTORY +pan = l2b.MAPPING_INVENTORY +_FST0 = pyn.intersect(pyn.accep('a'), pyn.accep('b')) + + +class RuleList(ty.Thing): + """A thing whose value is a list of fsts.""" + + def __init__(self, alias: str, *fsts): + super().__init__() + self.set_alias(alias) + self.value = [] + self.add(*fsts) + + def add(self, *fsts): + fst_list = [] + for fst in fsts: + if ty.is_instance(fst, pyn.Fst): fst_list.append(fst) + if ty.is_instance(fst, list): fst_list.extend(fst) + if ty.is_instance(fst, RuleList): fst_list.extend(fst.value) + self.value.extend([f for f in fst_list if ty.is_instance(f, pyn.Fst)]) + + +class Deromanizer(inventory2.Inventory): + """Fst inventory for Brahmic deromanization.""" + + def __init__(self): + super().__init__() + self.script = ty.UNASSIGNED + self.schwa_deletion = False + self.init_items() + self.init_supps() + + def make_rule(self, alias: str, *fsts) -> pyn.Fst: + self.add_item(RuleList(alias, *fsts)) + + def init_items(self) -> None: + ls.apply_foreach(self.make_rule, [ + ['ltn2typ', self.rw_ltn2typ()], + ['typ_ops'], + ['anusvara'], + ['cons_first'], + ['cons_nukta'], + ['cons_asp'], + ['cons_drop_asp'], + ['cons_drop_gem'], + ['cons_gem_only'], + ['cons_base'], + 
['mono_long'], + ['diph_base'], + ['mono_base_as_long'], + ['mono_base'], + ['schwa_as_long_wf'], + ['cluster_wi'], + ['cluster_wf'], + ['typ2brh'], + ['typ2iso', self.rw_typ2iso()], + ]) + + def init_supps(self) -> None: + ls.apply_foreach(self.make_supp, [ + ['group_vowel', {}], + ['group_mono', {}], + ['group_base_as_long', {}], + ['group_diph', {}], + ['group_consonant', {}], + ['group_has_aspirated', {}], + ['group_no_aspirated', {}], + ['group_drop_aspirated', {}], + ['group_drop_gem', {}], + ['group_gem_only', {}], + ['group_nukta', {}], + ['group_substring', {}], + ['ltn2brh', _FST0], + ['ltn2iso', _FST0], + ]) + + @classmethod + def params( + cls, + script: str = '', + schwa_deletion: bool = False, + schwa_deletion_wf: bool = False, + monophthong: l2b.Ltn2Brh.STAR = ty.UNSPECIFIED, + base_as_long: l2b.Ltn2Brh.STAR = ty.UNSPECIFIED, + diphthong: l2b.Ltn2Brh.STAR = ty.UNSPECIFIED, + has_aspirated: l2b.Ltn2Brh.STAR = ty.UNSPECIFIED, + drop_aspirated: l2b.Ltn2Brh.STAR = ty.UNSPECIFIED, + no_aspirated: l2b.Ltn2Brh.STAR = ty.UNSPECIFIED, + drop_gem: l2b.Ltn2Brh.STAR = ty.UNSPECIFIED, + gem_only: l2b.Ltn2Brh.STAR = ty.UNSPECIFIED, + nukta: l2b.Ltn2Brh.STAR = ty.UNSPECIFIED, + anusvara_labial: bool = False, + anusvara_n: bool = False, + substring: l2b.Ltn2Brh.STAR = ty.UNSPECIFIED, + ) -> 'Deromanizer': + new = cls() + new.set_script(script) + new.set_schwa_deletion(schwa_deletion, schwa_deletion_wf) + new.set_anusvara(anusvara_labial, anusvara_n) + if ty.is_specified(monophthong): new.add_monophthong(monophthong) + if ty.is_specified(base_as_long): new.add_base_as_long(base_as_long) + if ty.is_specified(diphthong): new.add_diphthong(diphthong) + if ty.is_specified(has_aspirated): new.add_has_aspirated(has_aspirated) + if ty.is_specified(drop_aspirated): new.add_drop_aspirated(drop_aspirated) + if ty.is_specified(no_aspirated): new.add_no_aspirated(no_aspirated) + if ty.is_specified(drop_gem): new.add_drop_gem(drop_gem) + if ty.is_specified(gem_only): 
new.add_gem_only(gem_only) + if ty.is_specified(nukta): new.add_nukta(nukta) + if ty.is_specified(substring): new.add_substring(substring) + new.set_group_rules() + new.set_typ_ops() + new.set_e2e() + return new + + def set_script(self, script: str) -> None: + if script: + self.script = script + self.typ2brh.add(self.rw_typ2brh()) + else: + self.typ2brh = self.typ2iso + + def set_schwa_deletion( + self, + schwa_deletion: bool, + schwa_deletion_wf: bool, + ) -> None: + self.schwa_deletion = schwa_deletion + if not schwa_deletion: return + self.cluster_wi.add(self.rw_cluster_wi()) + if schwa_deletion_wf: + self.schwa_as_long_wf.add(self.rw_schwa_as_long_wf()) + self.cluster_wf.add(self.rw_cluster_wf()) + + def set_anusvara(self, anusvara_labial: bool, anusvara_n: bool) -> None: + if anusvara_n: + if anusvara_labial: + self.anusvara.add(self.rw_ans_labial()) + self.anusvara.add(self.rw_ans_n()) + + def set_group_rules(self) -> None: + for gl in self.group_lists(self.group_mono): + self.mono_base.add(self.rw_vowel(gl)) + self.mono_long.add(self.rw_vowel(gl, 'long', 'long')) + for gl in self.group_lists(self.group_base_as_long): + self.mono_base_as_long.add(self.rw_vowel(gl, 'short', 'long')) + for gl in self.group_lists(self.group_diph): + self.diph_base.add(self.rw_vowel(gl)) + for gl in self.group_lists(self.group_nukta): + self.cons_nukta.add(self.rw_cons(gl, new='nkt')) + for gl in self.group_lists(self.group_drop_aspirated): + self.cons_drop_asp.add(self.rw_drop_aspirated(gl)) + for gl in self.group_lists(self.group_has_aspirated): + self.cons_asp.add(self.rw_aspirated(gl)) + for gl in self.group_lists(self.group_drop_gem): + self.cons_drop_gem.add(self.rw_cons(gl)) + for gl in self.group_lists(self.group_gem_only): + self.cons_gem_only.add(self.rw_cons(gl, rw_one=False)) + for gl in self.group_lists(self.group_consonant): + self.cons_base.add(self.rw_cons(gl)) + if pan.zh_lr in gl: + self.cons_first.add(self.rw_cons([pan.zh_lr])) + + def set_typ_ops(self) -> 
None: + rules = ( + self.anusvara, + self.cons_first, + self.cons_nukta, + self.cons_drop_asp, + self.cons_drop_gem, + self.cons_asp, + self.cons_gem_only, + self.cons_base, + self.mono_long, + self.diph_base, + self.mono_base_as_long, + self.mono_base, + self.schwa_as_long_wf, + self.cluster_wi, + self.cluster_wf + ) + self.typ_ops.add(*[rule.value for rule in rules]) + + def set_e2e(self) -> None: + self.ltn2brh = rewrite.ComposeFsts( + self.ltn2typ.value + self.typ_ops.value + self.typ2brh.value + ) + self.ltn2iso = rewrite.ComposeFsts( + self.ltn2typ.value + self.typ_ops.value + self.typ2iso.value + ) + + def add_to_group( + self, + group: str, + member: l2b.Ltn2Brh, + priority: ty.IntOrNothing = ty.UNSPECIFIED + ) -> None: + if group not in self.supp_aliases: self.make_supp(group, {}) + supp = ty.enforce_dict(self.get(group)) + k = priority if isinstance(priority, int) else len(member.grs) + m_list = supp.get(k, []) + if member not in m_list: m_list.append(member) + supp[k] = m_list + + def add_to_groups(self, member, *groups) -> None: + for group in groups: + self.add_to_group(group, member) + + def group_lists( + self, group: dict[int, list[l2b.Ltn2Brh]] + ) -> list[list[l2b.Ltn2Brh]]: + return [group[k] for k in sorted(group.keys(), reverse=True)] + + def add_monophthong(self, *args) -> None: + for m in l2b.Ltn2Brh.as_list(*args): + self.add_to_groups(m, 'group_vowel', 'group_mono') + + def add_base_as_long(self, *args) -> None: + for m in l2b.Ltn2Brh.as_list(*args): + self.add_to_groups( + m, 'group_vowel', 'group_mono', 'group_base_as_long' + ) + + def add_diphthong(self, *args) -> None: + for m in l2b.Ltn2Brh.as_list(*args): + self.add_to_groups(m, 'group_vowel', 'group_diph') + + def add_has_aspirated(self, *args) -> None: + for m in l2b.Ltn2Brh.as_list(*args): + self.add_to_groups(m, 'group_consonant', 'group_has_aspirated') + + def add_no_aspirated(self, *args) -> None: + for m in l2b.Ltn2Brh.as_list(*args): + self.add_to_groups(m, 'group_consonant', 
'group_no_aspirated') + + def add_drop_aspirated(self, *args) -> None: + for m in l2b.Ltn2Brh.as_list(*args): + self.add_to_groups(m, 'group_consonant', 'group_drop_aspirated') + + def add_drop_gem(self, *args) -> None: + for m in l2b.Ltn2Brh.as_list(*args): + self.add_to_groups(m, 'group_consonant', 'group_drop_gem') + + def add_gem_only(self, *args) -> None: + for m in l2b.Ltn2Brh.as_list(*args): + self.add_to_groups(m, 'group_gem_only') + + def add_nukta(self, *args) -> None: + for m in l2b.Ltn2Brh.as_list(*args): + self.add_to_groups(m, 'group_consonant', 'group_nukta') + + def add_substring(self, *args) -> None: + for m in l2b.Ltn2Brh.as_list(*args): + self.add_to_groups(m, 'group_substring') + + def rw_ltn2typ(self) -> pyn.Fst: + return c.read_glyph(ltn_inventory.ASCII_LC) + + def rw_typ2iso(self) -> pyn.Fst: + return [ + rw.insert(iso.A, iso.SCH_CONS), + rw.delete(iso.A, following=pyn.union(iso.VWL_S, iso.VIR)), + c.print_glyph(iso_inventory.CHAR + iso_inventory.VIRAMA), + rw.delete('.', rw.al.BOW) + ] + + def rw_typ2brh(self) -> pyn.Fst: + return rw.rewrite_ls(t2b.cross(self.script)) + + def rw_vowel( + self, args: list[l2b.Ltn2Brh], old: str = 'short', new: str = 'short' + ) -> pyn.Fst: + rom = 'ltn_l' if old == 'long' else 'ltn' + sgn = 'iso_l' if new == 'long' else 'iso' + ind = 'iso_l_i' if new == 'long' else 'iso_i' + return [ + rw.rewrite_ls( + [(arg.get(rom), arg.get(sgn)) for arg in args], + iso.SCH_CONS + ), + rw.rewrite_ls( + [(arg.get(rom), arg.get(ind)) for arg in args], + ) + ] + + def rw_schwa_as_long_wf(self) -> pyn.Fst: + return rw.rewrite_word_final(iso.A, iso.AA) + + def rw_cluster_wi(self) -> pyn.Fst: + return rw.insert(iso.VIR, rw.al.BOW + iso.SCH_CONS, iso.ONSET_CONS) + + def rw_cluster_wf(self) -> pyn.Fst: + return rw.insert(iso.VIR, iso.SCH_CONS, iso.ONSET_CONS + rw.al.EOW) + + def rw_drop_aspirated(self, args: list[l2b.Ltn2Brh]) -> pyn.Fst: + if self.schwa_deletion: + return [ + rw.rewrite_ls(( + (arg.ltn_l_h | arg.ltn_h_l | 
arg.ltn_h + arg.ltn), + arg.gem + ) for arg in args), + rw.rewrite_ls((arg.ltn_h, arg.iso) for arg in args), + ] + return [ + rw.rewrite_ls( + [(arg.ltn_h, arg.iso) for arg in args], following=ltn.VOWEL + ), + rw.rewrite_ls([(arg.ltn_h, arg.vir) for arg in args]), + ] + + def rw_aspirated(self, args: list[l2b.Ltn2Brh]) -> pyn.Fst: + if self.schwa_deletion: + return [ + rw.rewrite_ls((arg.ltn_l_h, arg.gem_asp) for arg in args), + rw.rewrite_ls((arg.ltn_h_l, arg.asp_gem) for arg in args), + rw.rewrite_ls((arg.ltn_h, arg.asp) for arg in args), + ] + return [ + rw.rewrite_ls( + [(arg.ltn_h, arg.asp) for arg in args], following=ltn.VOWEL + ), + rw.rewrite_ls([(arg.ltn_h, arg.asp_vir) for arg in args]), + ] + + def rw_drop_gem( + self, args: list[l2b.Ltn2Brh], new: str = 'iso', + ) -> list[pyn.Fst]: + gem_cross = [(arg.ltn_l, arg.get(new)) for arg in args] + if self.schwa_deletion: + gem_rw = [rw.rewrite_ls(gem_cross)] + else: + vir = 'vir' if new == 'iso' else new + '_vir' + gem_rw = [ + rw.rewrite_ls(gem_cross, following=ltn.VOWEL), + rw.rewrite_ls((arg.ltn_l, arg.get(vir)) for arg in args) + ] + return gem_rw + + def rw_cons( + self, args: list[l2b.Ltn2Brh], old: str = 'ltn', new: str = 'iso', + rw_one: bool = True, rw_gem: bool = True + ) -> list[pyn.Fst]: + gem = 'gem' if new == 'iso' else new + '_gem' + gem_cross = [(arg.ltn_l, arg.get(gem)) for arg in args] + one_cross = [(arg.get(old), arg.get(new)) for arg in args] + if self.schwa_deletion: + gem_rw = [rw.rewrite_ls(gem_cross)] + one_rw = [rw.rewrite_ls(one_cross)] + else: + vir = 'vir' if new == 'iso' else new + '_vir' + gem_rw = [ + rw.rewrite_ls(gem_cross, following=ltn.VOWEL), + rw.rewrite_ls((arg.ltn_l, arg.get(gem) + iso.VIR) for arg in args) + ] + one_rw = [ + rw.rewrite_ls(one_cross, following=ltn.VOWEL), + rw.rewrite_ls((arg.get(old), arg.get(vir)) for arg in args) + ] + rws = [] + if rw_gem: rws.extend(gem_rw) + if rw_one: rws.extend(one_rw) + return rws + + def rw_ans_labial(self) -> pyn.Fst: + return 
rw.rewrite(ltn.M, iso.ANS, ltn.VOWEL, pyn.union(ltn.B, ltn.P)) + + def rw_ans_n(self) -> pyn.Fst: + return rw.rewrite( + ltn.N, iso.ANS, + ltn.VOWEL, pyn.union((ltn.CONS - (ltn.N | ltn.M)), rw.al.EOW) + ) diff --git a/nisaba/scripts/natural_translit/brahmic/deromanize/hi.py b/nisaba/scripts/natural_translit/brahmic/deromanize/hi.py new file mode 100644 index 00000000..5168335b --- /dev/null +++ b/nisaba/scripts/natural_translit/brahmic/deromanize/hi.py @@ -0,0 +1,52 @@ +# Copyright 2023 Nisaba Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Rule based deromanizer for hi_deva."""
import pynini as pyn
from pynini.export import multi_grm
from nisaba.scripts.natural_translit.brahmic.deromanize import fst_builder
from nisaba.scripts.natural_translit.brahmic.deromanize import ltn2brh

pan = ltn2brh.MAPPING_INVENTORY

# Hindi/Devanagari configuration: schwa deletion on (including word-final),
# full aspiration contrast, nukta consonants, and both anusvara rules.
hi = fst_builder.Deromanizer.params(
    script='deva',
    schwa_deletion=True,
    schwa_deletion_wf=True,
    monophthong=(pan.a, pan.e, pan.i, pan.o, pan.u),
    base_as_long=(pan.e, pan.o),
    diphthong=(pan.ai, pan.au),
    has_aspirated=(pan.b, pan.ch, pan.d, pan.g, pan.j, pan.k, pan.p, pan.t),
    no_aspirated=(
        pan.c, pan.h, pan.l, pan.m, pan.n, pan.q,
        pan.r, pan.s, pan.sh, pan.v, pan.w, pan.x, pan.y
    ),
    nukta=(pan.f, pan.z),
    anusvara_labial=True,
    anusvara_n=True,
)


def generator_main(exporter_map: multi_grm.ExporterMapping):
  """Exports the hi deromanization fsts as byte and utf8 FARs."""
  for ttype in ('byte', 'utf8'):
    with pyn.default_token_type(ttype):
      fars = exporter_map[ttype]
      fars['ISO'] = hi.ltn2iso
      fars['DEVA'] = hi.ltn2brh


if __name__ == '__main__':
  multi_grm.run(generator_main)
"""Latin Brahmic character mappings."""

from typing import Any, Callable, Iterable, Union
import pynini as pyn
from nisaba.scripts.natural_translit.brahmic import iso_inventory
from nisaba.scripts.natural_translit.latin import ltn_inventory
from nisaba.scripts.natural_translit.utils import inventory2
from nisaba.scripts.natural_translit.utils import type_op as ty

_FstOrList = Union[pyn.FstLike, list[pyn.FstLike]]

ltn = ltn_inventory.GRAPHEME_INVENTORY
iso = iso_inventory.TRANSLIT_INVENTORY


def _rep(fst: pyn.FstLike) -> pyn.FstLike:
  """Doubles an fst: the repeated (geminate) romanization."""
  return fst + fst


class Ltn2Brh(ty.Thing):
  """Latin to Brahmic rewrite mapping."""

  _HAS_DYNAMIC_ATTRIBUTES = True

  def __init__(
      self,
      alias: str, lta: _FstOrList, isa: pyn.FstLike
  ) -> None:
    """Stores the Latin side (`ltn`, `grs`) and ISO side (`iso`).

    Args:
      alias: alias of the mapping.
      lta: a single Latin grapheme or a list forming a substring.
      isa: the corresponding ISO transliteration.
    """
    super().__init__()
    self.set_alias(alias)
    if isinstance(lta, list):
      self.grs = lta
      self.ltn = lta[0]
      for l in lta[1:]:
        self.ltn = self.ltn + l
    else:
      self.ltn = lta
      self.grs = [lta]
    self.iso = isa

  STAR = Union[ty.Nothing, 'Ltn2Brh', Iterable['Ltn2Brh']]

  @classmethod
  def as_list(cls, *args):
    """Flattens args (mappings and nested iterables) into a flat list."""
    result = []
    for arg in args:
      if isinstance(arg, Ltn2Brh): result.append(arg)
      if isinstance(arg, Iterable): result.extend(cls.as_list(*arg))
    return result

  @classmethod
  def monophthong(
      cls, alias: str, lta: _FstOrList, isa: pyn.FstLike,
      iso_i: pyn.FstLike, iso_l: pyn.FstLike, iso_l_i: pyn.FstLike
  ) -> 'Ltn2Brh':
    """Vowel mapping with independent, long, and long-independent forms."""
    new = cls(alias, lta, isa)
    new.add_fields([
        ['ltn_l', _rep(new.ltn)],
        ['iso_i', iso_i], ['iso_l', iso_l], ['iso_l_i', iso_l_i],
    ])
    return new

  @classmethod
  def diphthong(
      cls, alias: str, lta: _FstOrList, isa: pyn.FstLike, iso_i: pyn.FstLike
  ) -> 'Ltn2Brh':
    """Diphthong mapping: only sign and independent forms."""
    new = cls(alias, lta, isa)
    new.add_field('iso_i', iso_i)
    return new

  @classmethod
  def consonant(
      cls, alias: str, lta: _FstOrList, isa: pyn.FstLike,
  ) -> 'Ltn2Brh':
    """Consonant mapping with virama and geminate forms."""
    new = cls(alias, lta, isa)
    vir = new.iso + iso.VIR
    rep = _rep(new.ltn)
    # For digraphs (e.g. 'sh'), also accept first-letter doubling ('ssh').
    if new.ltn != new.grs[0]: rep = pyn.union(rep, new.grs[0] + new.ltn)
    new.add_fields([
        ['vir', vir],
        ['ltn_l', rep],
        ['gem', vir + new.iso],
        ['gem_vir', vir + vir],
    ])
    return new

  @classmethod
  def aspirated(
      cls, alias: str, lta: _FstOrList, isa: pyn.FstLike, asp: pyn.FstLike,
  ) -> 'Ltn2Brh':
    """Consonant mapping extended with aspirated (h-digraph) forms."""
    new = cls.consonant(alias, lta, isa)
    ltn_h = new.ltn + ltn.H
    asp_vir = asp + iso.VIR
    new.add_fields([
        ['ltn_h', ltn_h],
        ['ltn_l_h', new.ltn_l + ltn.H],
        ['ltn_h_l', ltn_h + ltn_h],
        ['asp', asp],
        ['asp_vir', asp_vir],
        ['gem_asp', new.vir + asp],
        ['asp_gem', asp_vir + asp],
        ['asp_unasp', asp_vir + new.iso],
    ])
    return new

  @classmethod
  def foreign(
      cls, alias: str, lta: _FstOrList, isa: pyn.FstLike, nkt: pyn.FstLike,
  ) -> 'Ltn2Brh':
    """Consonant mapping extended with nukta (foreign sound) forms."""
    new = cls.consonant(alias, lta, isa)
    nkt_vir = nkt + iso.VIR
    new.add_fields([
        ['nkt', nkt],
        ['nkt_vir', nkt_vir],
        ['nkt_gem', nkt_vir + nkt],
    ])
    return new

  # TODO: substring mappings, e.g.:
  # @classmethod
  # def substring(
  #     cls, alias: str, lta: _FstOrList, isa: pyn.FstLike,
  # ) -> 'Ltn2Brh':
  #   new = cls(alias, lta, isa)
  #   if new.trs[0] in ltn.VOWEL: add ind condition
  #   if new.trs[-1] not in ltn.VOWEL: add vir condition

  def add_field(self, field: str, value: pyn.FstLike = '') -> None:
    """Adds a dynamic attribute unless it already exists."""
    if not hasattr(self, field): self.__dict__[field] = value

  def add_fields(self, args_list: list[list[pyn.FstLike]]) -> None:
    for args in args_list:
      self.add_field(*args)

  def get(self, attr: str) -> pyn.FstLike:
    return getattr(self, attr)


class _Ltn2BrhInventory(inventory2.Inventory):
  """Latin to Brahmic rewrite inventory."""

  def __init__(self) -> None:
    super().__init__()
    self.make_inventory()

  def make_maps(
      self,
      maker: Callable[..., Ltn2Brh],
      args_list: list[list[Any]]
  ) -> None:
    """Builds each mapping once and adds it if it has a Latin side."""
    for args in args_list:
      mapping = maker(*args)
      # Bug fix: previously called maker(*args) a second time, registering
      # a freshly built object instead of the one just validated.
      if mapping.ltn: self.add_item(mapping)

  def make_inventory(self):
    """Populates the inventory with all language-agnostic mappings."""
    self.make_maps(Ltn2Brh.monophthong, [
        ['a', ltn.A, iso.A, iso.A_I, iso.AA, iso.AA_I],
        ['e', ltn.E, iso.E, iso.E_I, iso.EE, iso.EE_I],
        ['i', ltn.I, iso.I, iso.I_I, iso.II, iso.II_I],
        ['o', ltn.O, iso.O, iso.O_I, iso.OO, iso.OO_I],
        ['u', ltn.U, iso.U, iso.U_I, iso.UU, iso.UU_I],
    ])
    self.make_maps(Ltn2Brh.diphthong, [
        ['ai', [ltn.A, ltn.I], iso.AI, iso.AI_I],
        ['au', [ltn.A, ltn.U], iso.AU, iso.AU_I],
        ['ae_ee', [ltn.A, ltn.E], iso.EE, iso.EE_I],
        ['oa_oo', [ltn.O, ltn.A], iso.OO, iso.OO_I],
    ])
    self.make_maps(Ltn2Brh.aspirated, [
        ['b', ltn.B, iso.B, iso.BH],
        ['b_p', ltn.B, iso.P, iso.PH],
        ['ch', [ltn.C, ltn.H], iso.C, iso.CH],
        ['d', ltn.D, iso.D, iso.DH],
        ['d_t', ltn.D, iso.T, iso.TH],
        ['g', ltn.G, iso.G, iso.GH],
        ['g_k', ltn.G, iso.K, iso.KH],
        ['j', ltn.J, iso.J, iso.JH],
        ['k', ltn.K, iso.K, iso.KH],
        ['p', ltn.P, iso.P, iso.PH],
        ['t', ltn.T, iso.T, iso.TH],
    ])
    self.make_maps(Ltn2Brh.consonant, [
        ['c', ltn.C, iso.C],
        ['h', ltn.H, iso.H],
        ['l', ltn.L, iso.L],
        ['m', ltn.M, iso.M],
        ['n', ltn.N, iso.N],
        ['q', ltn.Q, iso.K],
        ['r', ltn.R, iso.R],
        ['s', ltn.S, iso.S],
        ['sh', [ltn.S, ltn.H], iso.SH],
        ['v', ltn.V, iso.V],
        ['w', ltn.W, iso.V],
        ['x', ltn.X, iso.K + iso.VIR + iso.S],
        ['y', ltn.Y, iso.Y],
        ['tr_rr', [ltn.T, ltn.R], iso.RR],
        ['zh_lr', [ltn.Z, ltn.H], iso.LR],
    ])
    self.make_maps(Ltn2Brh.foreign, [
        ['f', ltn.F, iso.PH, iso.F],
        ['z', ltn.Z, iso.J, iso.Z],
    ])
    self.make_maps(Ltn2Brh, [
        ['ndr_narr', [ltn.N, ltn.D, ltn.R], iso.NA + iso.RR],
    ])
    # Voiced → voiceless correspondences.
    self.make_supp('vcd_vcl', {
        self.b: self.p, self.d: self.t, self.g: self.k
    })

MAPPING_INVENTORY = _Ltn2BrhInventory()
"""Rule based deromanizer for ta_taml."""

import pynini as pyn
from pynini.export import multi_grm
from nisaba.scripts.natural_translit.brahmic.deromanize import fst_builder
from nisaba.scripts.natural_translit.brahmic.deromanize import ltn2brh

pan = ltn2brh.MAPPING_INVENTORY

# Tamil configuration: no schwa deletion; Tamil has no aspiration contrast,
# so aspirated romanizations are dropped to their unaspirated letters.
ta = fst_builder.Deromanizer.params(
    script='taml',
    monophthong=(pan.a, pan.e, pan.i, pan.o, pan.u),
    diphthong=(pan.ai, pan.au, pan.ae_ee, pan.oa_oo),
    drop_aspirated=(
        pan.b_p, pan.ch, pan.d_t, pan.g_k, pan.j, pan.k, pan.p, pan.t
    ),
    no_aspirated=(
        pan.c, pan.h, pan.l, pan.m, pan.n, pan.q, pan.r, pan.s,
        pan.sh, pan.v, pan.w, pan.x, pan.y, pan.zh_lr
    ),
    # Fix: `(pan.tr_rr)` without a comma is not a tuple — it only worked
    # because as_list() accepts a bare mapping. Made a real one-tuple for
    # consistency with the sibling parameters.
    gem_only=(pan.tr_rr,),
    nukta=(pan.f, pan.z),
)


def generator_main(exporter_map: multi_grm.ExporterMapping):
  """Generates FAR for natural transliteration."""
  for token_type in ('byte', 'utf8'):
    with pyn.default_token_type(token_type):
      exporter = exporter_map[token_type]
      exporter['ISO'] = ta.ltn2iso
      exporter['TAML'] = ta.ltn2brh


if __name__ == '__main__':
  multi_grm.run(generator_main)
"""Typ tr to Brahmic dict for the subset of ISO Chars used by deromanizers."""

from nisaba.scripts.natural_translit.brahmic import iso_inventory
from nisaba.scripts.natural_translit.utils import log_op as log

iso = iso_inventory.TRANSLIT_INVENTORY
ISO = 'iso'
DEVA = 'deva'
TAML = 'taml'

# Sentinel for a tr that has no entry for the requested script.
_FAIL = '!'
_DICT = {
    log.text_of(iso.VIR): {
        DEVA: '्', TAML: '்',
    },
    log.text_of(iso.A): {
        DEVA: '', TAML: '',
    },
    log.text_of(iso.A_I): {
        DEVA: 'अ', TAML: 'அ',
    },
    log.text_of(iso.AA): {
        DEVA: 'ा', TAML: 'ா',
    },
    log.text_of(iso.AA_I): {
        DEVA: 'आ', TAML: 'ஆ',
    },
    log.text_of(iso.E): {
        DEVA: 'ॆ', TAML: 'ெ',
    },
    log.text_of(iso.E_I): {
        DEVA: 'ऎ', TAML: 'எ',
    },
    log.text_of(iso.EE): {
        DEVA: 'े', TAML: 'ே',
    },
    log.text_of(iso.EE_I): {
        DEVA: 'ए', TAML: 'ஏ',
    },
    log.text_of(iso.I): {
        DEVA: 'ि', TAML: 'ி',
    },
    log.text_of(iso.I_I): {
        DEVA: 'इ', TAML: 'இ',
    },
    log.text_of(iso.II): {
        DEVA: 'ी', TAML: 'ீ',
    },
    log.text_of(iso.II_I): {
        DEVA: 'ई', TAML: 'ஈ',
    },
    log.text_of(iso.O): {
        DEVA: 'ॊ', TAML: 'ொ',
    },
    log.text_of(iso.O_I): {
        DEVA: 'ऒ', TAML: 'ஒ',
    },
    log.text_of(iso.OO): {
        DEVA: 'ो', TAML: 'ோ',
    },
    log.text_of(iso.OO_I): {
        DEVA: 'ओ', TAML: 'ஓ',
    },
    log.text_of(iso.U): {
        DEVA: 'ु', TAML: 'ு',
    },
    log.text_of(iso.U_I): {
        DEVA: 'उ', TAML: 'உ',
    },
    log.text_of(iso.UU): {
        DEVA: 'ू', TAML: 'ூ',
    },
    log.text_of(iso.UU_I): {
        DEVA: 'ऊ', TAML: 'ஊ',
    },
    log.text_of(iso.AI): {
        DEVA: 'ै', TAML: 'ை',
    },
    log.text_of(iso.AI_I): {
        DEVA: 'ऐ', TAML: 'ஐ',
    },
    log.text_of(iso.AU): {
        DEVA: 'ौ', TAML: 'ௌ',
    },
    log.text_of(iso.AU_I): {
        DEVA: 'औ', TAML: 'ஔ',
    },
    log.text_of(iso.B): {
        DEVA: 'ब',
    },
    log.text_of(iso.BH): {
        DEVA: 'भ',
    },
    log.text_of(iso.C): {
        DEVA: 'च', TAML: 'ச',
    },
    log.text_of(iso.CH): {
        DEVA: 'छ',
    },
    log.text_of(iso.D): {
        DEVA: 'द',
    },
    log.text_of(iso.DH): {
        DEVA: 'ध',
    },
    log.text_of(iso.G): {
        DEVA: 'ग',
    },
    log.text_of(iso.GH): {
        DEVA: 'घ',
    },
    log.text_of(iso.H): {
        DEVA: 'ह', TAML: 'ஹ',
    },
    log.text_of(iso.J): {
        DEVA: 'ज', TAML: 'ஜ',
    },
    log.text_of(iso.Z): {
        DEVA: 'ज़', TAML: 'ஃஜ'
    },
    log.text_of(iso.JH): {
        DEVA: 'झ',
    },
    log.text_of(iso.K): {
        DEVA: 'क', TAML: 'க',
    },
    log.text_of(iso.KH): {
        DEVA: 'ख',
    },
    log.text_of(iso.L): {
        DEVA: 'ल', TAML: 'ல',
    },
    log.text_of(iso.LR): {
        TAML: 'ழ',
    },
    log.text_of(iso.M): {
        DEVA: 'म', TAML: 'ம',
    },
    log.text_of(iso.N): {
        DEVA: 'न', TAML: 'ந',
    },
    log.text_of(iso.NA): {
        TAML: 'ன',
    },
    log.text_of(iso.P): {
        DEVA: 'प', TAML: 'ப',
    },
    log.text_of(iso.PH): {
        DEVA: 'फ',
    },
    log.text_of(iso.F): {
        DEVA: 'फ़', TAML: 'ஃப',
    },
    log.text_of(iso.R): {
        DEVA: 'र', TAML: 'ர',
    },
    log.text_of(iso.RR): {
        TAML: 'ற',
    },
    log.text_of(iso.S): {
        DEVA: 'स', TAML: 'ஸ',
    },
    log.text_of(iso.SH): {
        DEVA: 'श', TAML: 'ஶ',
    },
    log.text_of(iso.T): {
        DEVA: 'त', TAML: 'த',
    },
    log.text_of(iso.TH): {
        DEVA: 'थ',
    },
    log.text_of(iso.V): {
        DEVA: 'व', TAML: 'வ',
    },
    log.text_of(iso.Y): {
        DEVA: 'य', TAML: 'ய',
    },
    log.text_of(iso.ANS): {
        DEVA: 'ं', TAML: '',
    },
}


def _br_of_tr(tr: str, script: str) -> str:
  """Returns the script glyph(s) for a typ tr, or '' with a debug log.

  Bug fix: the `log.dbg_return('', msg)` calls previously discarded their
  return value and fell through, so a missing entry silently produced the
  _FAIL sentinel '!' in the cross table. They now return the logged ''.
  """
  tr_str = log.text_of(tr)
  # 'Fst_' is presumably log.text_of's placeholder for unnamed fsts —
  # TODO(review): confirm against log_op.
  if not tr_str or tr_str == 'Fst_': return ''
  if script == ISO: return tr_str[1:-1]
  br_dict = _DICT.get(tr_str, {})
  if not br_dict:
    return log.dbg_return('', 'No dict entry for %s' % tr_str)
  br = br_dict.get(script, _FAIL)
  if br == _FAIL:
    return log.dbg_return('', 'No %s entry for %s' % (script, tr_str))
  return br


def cross(script: str) -> list[list[str]]:
  """Returns [tr, glyph] pairs for every tr in the dict."""
  return [[tr, _br_of_tr(tr, script)] for tr in _DICT]
entry for %s' % tr_str) + br = br_dict.get(script, _FAIL) + if br == _FAIL: log.dbg_return('', 'No %s entry for %s' % (script, tr_str)) + return br + + +def cross(script: str) -> list[list[str]]: + return [[tr, _br_of_tr(tr, script)] for tr in _DICT] diff --git a/nisaba/scripts/natural_translit/brahmic/iso_inventory.py b/nisaba/scripts/natural_translit/brahmic/iso_inventory.py index 047fc099..f684a5ee 100644 --- a/nisaba/scripts/natural_translit/brahmic/iso_inventory.py +++ b/nisaba/scripts/natural_translit/brahmic/iso_inventory.py @@ -123,6 +123,8 @@ ['eye', '̆', ph.SIL], ]) +VIRAMA = [c.make_char('vir', '', ph.SIL)] + SYMBOL = ls.apply_foreach(c.make_char, [ ['ind', '.', ph.SIL], ['zwj', '+', ph.SIL], @@ -161,6 +163,7 @@ VOWEL_SIGN = SIMPLE_VOWEL + TWO_POINT_SIGN + LONG_VOCALIC VOWEL_S = c.store_gr_union('VOWEL_S', VOWEL_SIGN) +VOWEL_S_TR = c.store_tr_union('VWL_S', VOWEL_SIGN) def _independent(sign: c.Char): @@ -207,7 +210,12 @@ def _make_aspirated(char: c.Char) -> c.Char: [[sp.R, sp.EYE], 'r_eye', ph.RT] ]) +SCHWA_BEARING = SIMPLE_CONSONANT + ASPIRATED_CONSONANT +SCHWA_BEARING_TR = c.store_tr_union('SCH_CONS', SCHWA_BEARING) +DEAD_CONSONANT_TR = c.store_tr_union('DEAD_CONS', DEAD_CONSONANT) COMPOSITE_CONSONANT = ASPIRATED_CONSONANT + DEAD_CONSONANT +ONSET_CONSONANT = SIMPLE_CONSONANT + COMPOSITE_CONSONANT +ONSET_CONSONANT_TR = c.store_tr_union('ONSET_CONS', ONSET_CONSONANT) CND = [c.make_composite_char([sp.M, sp.CND_DIA], 'cnd', ph.NSL)] CODA = c.store_gr_union('CODA', SIMPLE_CODA + CND) @@ -219,5 +227,9 @@ def _make_aspirated(char: c.Char) -> c.Char: CHAR = (SINGLE_POINT + TWO_POINT + LONG_VOCALIC) STORES = [VOWEL_S, VOWEL_I, CODA, VOCALIC] +TR_STORES = [ + VOWEL_S_TR, SCHWA_BEARING_TR, DEAD_CONSONANT_TR, ONSET_CONSONANT_TR +] GRAPHEME_INVENTORY = c.gr_inventory(CHAR, STORES) +TRANSLIT_INVENTORY = c.tr_inventory(CHAR + VIRAMA, TR_STORES) diff --git a/nisaba/scripts/natural_translit/latin/ltn_inventory.py b/nisaba/scripts/natural_translit/latin/ltn_inventory.py 
index a2705072..44c0d086 100644 --- a/nisaba/scripts/natural_translit/latin/ltn_inventory.py +++ b/nisaba/scripts/natural_translit/latin/ltn_inventory.py @@ -31,12 +31,17 @@ def make_ascii_char(glyph: str) -> c.Char: return c.make_char(glyph, glyph) -ASCII_LC = ls.apply_foreach(make_ascii_char, [ - ['a'], ['b'], ['c'], ['d'], ['e'], ['f'], ['g'], ['h'], ['i'], - ['j'], ['k'], ['l'], ['m'], ['n'], ['o'], ['p'], ['q'], ['r'], - ['s'], ['t'], ['u'], ['v'], ['w'], ['x'], ['y'], ['z'], +ASCII_VOWEL = ls.apply_foreach(make_ascii_char, [ + ['a'], ['e'], ['i'], ['o'], ['u'] ]) +ASCII_CONS = ls.apply_foreach(make_ascii_char, [ + ['b'], ['c'], ['d'], ['f'], ['g'], ['h'], ['j'], + ['k'], ['l'], ['m'], ['n'], ['p'], ['q'], ['r'], + ['s'], ['t'], ['v'], ['w'], ['x'], ['y'], ['z'], +]) + +ASCII_LC = ASCII_VOWEL + ASCII_CONS ASCII_UC = c.uppercase_list(ASCII_LC) DOUBLE_SUBSTRING, DOUBLE_DICT = c.ls_double_substring(ASCII_LC) @@ -56,7 +61,11 @@ def double_substring_tr(tr: pyn.Fst) -> pyn.Fst: EN_LETTERS = c.store_tr_star('EN_LETTERS', ASCII_UC) +VOWEL_GR = c.store_gr_union('VOWEL', ASCII_VOWEL) +CONS_GR = c.store_gr_union('CONS', ASCII_CONS) + CHARS = ASCII_LC + ASCII_UC + SUBSTRING + DEL +GRAPHEME_INVENTORY = c.gr_inventory(CHARS, [VOWEL_GR, CONS_GR]) TRANSLIT_INVENTORY = c.tr_inventory(CHARS, [EN_LETTERS])