Default to "hi" for missing mappings in indic-nlp-library

facebookresearch · Jul 22, 2022 · 8b73cb8
1 parent 2c4bbb6
commit 8b73cb8
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 5 deletions.
diff --git a/utils/src/cleaner_splitter.py b/utils/src/cleaner_splitter.py
@@ -154,7 +154,7 @@ def split_clean():
             print(f"{line_stripped}\t{sentence}")
         else:
             print(
-                f"Couldn't match sentence for paragraph {paragraph_digest}",
+                f"Couldn't match sentence for paragraph: {paragraph_digest} sentence: {sentence_digest} lang: {lang}",
                 file=sys.stderr,
             )
 

diff --git a/utils/src/sentence_split.py b/utils/src/sentence_split.py
@@ -283,18 +283,21 @@
 # ----------------------------------------------
 LANGS_INDIC = {
     "asm": "as",
-    "awa": "aw",
+    "awa": "hi",
     "ben": "bn",
+    "bho": "hi",
     "brx": "bD",
     "gom": "xx",
     "guj": "gu",
     "hin": "hi",
+    "hne": "hi",
     "kan": "kn",
-    "kas": "ka",
+    "kas": "hi",
+    "kas_Deva": "hi",
     "kok": "kK",
     "mni": "bn",  # our meitei is in bengali script, so swapped it to bengali here
-    "mag": "mg",
-    "mai": "mi",
+    "mag": "hi",
+    "mai": "hi",
     "mal": "ml",
     "mar": "mr",
     "npi": "ne",