Skip to content

Commit

Permalink
Default to "hi" for missing mappings in indic-nlp-library
Browse files Browse the repository at this point in the history
  • Loading branch information
Celebio committed Jul 22, 2022
1 parent 2c4bbb6 commit 8b73cb8
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
2 changes: 1 addition & 1 deletion utils/src/cleaner_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def split_clean():
print(f"{line_stripped}\t{sentence}")
else:
print(
- f"Couldn't match sentence for paragraph {paragraph_digest}",
+ f"Couldn't match sentence for paragraph: {paragraph_digest} sentence: {sentence_digest} lang: {lang}",
file=sys.stderr,
)

Expand Down
11 changes: 7 additions & 4 deletions utils/src/sentence_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,18 +283,21 @@
# ----------------------------------------------
LANGS_INDIC = {
"asm": "as",
- "awa": "aw",
+ "awa": "hi",
"ben": "bn",
+ "bho": "hi",
"brx": "bD",
"gom": "xx",
"guj": "gu",
"hin": "hi",
+ "hne": "hi",
"kan": "kn",
- "kas": "ka",
+ "kas": "hi",
+ "kas_Deva": "hi",
"kok": "kK",
"mni": "bn", # our meitei is in bengali script, so swapped it to bengali here
- "mag": "mg",
- "mai": "mi",
+ "mag": "hi",
+ "mai": "hi",
"mal": "ml",
"mar": "mr",
"npi": "ne",
Expand Down

0 comments on commit 8b73cb8

Please sign in to comment.