-
Notifications
You must be signed in to change notification settings - Fork 2
/
khmer_phonemizer.py
45 lines (40 loc) · 1.39 KB
/
khmer_phonemizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
r"""
Khmer Phonemizer - a free, standalone, open-source Khmer grapheme-to-phoneme (G2P) converter.
"""
import os
import csv
from g2p import PhonetisaurusGraph
def _read_lexicon_file(file):
    """Load a tab-separated pronunciation lexicon into a dictionary.

    Every data row must have exactly two tab-separated columns: the word,
    then its phonemes separated by whitespace. Blank rows are ignored.

    Args:
        file: Path of the TSV lexicon to read.

    Returns:
        A dict mapping each stripped word to its list of phoneme strings.
        If a word appears on several rows, the last row wins.

    Raises:
        ValueError: If a non-blank row does not have exactly two columns.
    """
    lexicon = {}
    # Decode explicitly as UTF-8 rather than with the platform default codec,
    # which cannot be relied on for Khmer text. newline="" is the mode the csv
    # module documents for files handed to csv.reader.
    with open(file, encoding="utf-8", newline="") as infile:
        reader = csv.reader(infile, delimiter="\t")
        for row in reader:
            # csv.reader yields [] for an empty line; treat whitespace-only
            # rows the same way instead of failing on tuple unpacking.
            if not any(cell.strip() for cell in row):
                continue
            if len(row) != 2:
                raise ValueError(
                    "{}: line {}: expected 2 tab-separated columns, got {}".format(
                        file, reader.line_num, len(row)
                    )
                )
            word, phonemes = row
            lexicon[word.strip()] = phonemes.strip().split()
    return lexicon
# Both data files are looked up in the directory that contains this module.
_graph_file, _lexicon_file = (
    os.path.join(os.path.dirname(__file__), data_name)
    for data_name in ("km_phonemizer.npz", "km_lexicon.tsv")
)

# Module-level resources, loaded once at import time: the lexicon first,
# then the G2P graph (with preload disabled).
_lexicon_dict = _read_lexicon_file(_lexicon_file)
_graph = PhonetisaurusGraph.load(_graph_file, preload=False)
def _phoneticize(word: str, beam: int, min_beam: int, beam_scale: float):
    """Return the first pronunciation the G2P graph produces for ``word``.

    Args:
        word: Spelling passed unchanged to ``_graph.g2p_one``.
        beam: Forwarded to ``_graph.g2p_one`` as ``beam``.
        min_beam: Forwarded to ``_graph.g2p_one`` as ``min_beam``.
        beam_scale: Forwarded to ``_graph.g2p_one`` as ``beam_scale``.

    Returns:
        The first item yielded by ``_graph.g2p_one`` (presumably the
        best-scoring candidate - confirm against the graph implementation),
        or ``None`` when it yields nothing.
    """
    candidates = _graph.g2p_one(
        word, beam=beam, min_beam=min_beam, beam_scale=beam_scale
    )
    # Only the first candidate is ever used, so take just that one instead of
    # materializing every result into a list first.
    return next(iter(candidates), None)
def phonemize_single(
    word,
    beam: int = 500,
    min_beam: int = 100,
    beam_scale: float = 0.6,
    use_lexicon: bool = True,
):
    r"""
    Phonemize a single word. The word must match [a-zA-Z\u1780-\u17dd]+

    The word is lowercased, then looked up in the lexicon (when
    ``use_lexicon`` is true). Words not found there, or all words when
    ``use_lexicon`` is false, are passed to the G2P graph.

    Args:
        word: The word to phonemize, or ``None``.
        beam: Passed through to the G2P graph search.
        min_beam: Passed through to the G2P graph search.
        beam_scale: Passed through to the G2P graph search.
        use_lexicon: Whether to consult the lexicon before the G2P graph.

    Returns:
        ``None`` if ``word`` is ``None``; a list of phonemes for a lexicon
        hit; otherwise whatever ``_phoneticize`` returns, which is ``None``
        when the graph produces no result.
    """
    if word is None:
        return None

    word = word.lower()
    if use_lexicon:
        known = _lexicon_dict.get(word)
        if known is not None:
            # Return a copy: handing out the stored list would let a caller's
            # in-place edits corrupt the shared lexicon for later lookups.
            return list(known)

    return _phoneticize(word, beam=beam, min_beam=min_beam, beam_scale=beam_scale)