-
Notifications
You must be signed in to change notification settings - Fork 0
/
retone_srs.py
89 lines (77 loc) · 2.87 KB
/
retone_srs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# s = unicodedata.normalize('NFKC', s) # canonical, precomposed form
import re
import unicodedata
_toned_vowels = 'áāaàíīiìóōoòúūuùÁĀAÀÍĪIÌÓŌOÒÚŪUÙ'
_toneless_vowels = 'aaaaiiiioooouuuuAAAAIIIIOOOOUUUU'
_detone_vowels = ''.maketrans(_toned_vowels, _toneless_vowels)
_tone_and_vowel_to_toned_vowel = {
('H', 'a'): 'á',
('M', 'a'): 'a',
('L', 'a'): 'à',
('H', 'i'): 'í',
('M', 'i'): 'i',
('L', 'i'): 'ì',
('H', 'o'): 'ó',
('M', 'o'): 'o',
('L', 'o'): 'ò',
('H', 'u'): 'ú',
('M', 'u'): 'u',
('L', 'u'): 'ù',
('H', 'A'): 'Á',
('M', 'A'): 'A',
('L', 'A'): 'À',
('H', 'I'): 'Í',
('M', 'I'): 'I',
('L', 'I'): 'Ì',
('H', 'O'): 'Ó',
('M', 'O'): 'O',
('L', 'O'): 'Ò',
('H', 'U'): 'Ú',
('M', 'U'): 'U',
('L', 'U'): 'Ù',
}
_toned_vowel_to_tone_and_vowel = dict((v, k) for k, v in \
_tone_and_vowel_to_toned_vowel.items())
def retone(text, tones):
# Remove all tone marking from the source text and turn the target tones
# into a list (e..g, ['LL', 'M', 'HH'])
text = text.translate(_detone_vowels)
tones = tones.split(" ")
# If the number of vowels in the original text doesn't match the number
# of vowels recognized by the auto-toner, then return the original text
# without any changes, except for a prepended "*".
vowels = re.findall('([%s]+)' % _toned_vowels, text)
if len(vowels) != len(tones):
return "*" + text
def rewrite_tone(v):
t = tones.pop(0) # e.g., 'HH'
v = v * len(t) # e.g., 'a' --> 'aa'
return ''.join(_tone_and_vowel_to_toned_vowel[tv] for tv in zip(t, v))
# For each vowel (short or long) in the original text, hand over its
# first character to the inner function 'rewrite_tone' to adjust the
# vowel length and tone to match the corresponding tone in 'tones'
# (e.g., 'a' in first match, 'HH' in tones[0] --> 'áá' as output).
return re.sub('([%s]+)' % _toned_vowels, lambda m: \
rewrite_tone(m.group(0)[0]), text)
# Turn a string of orthographic Tsuut'ina text into a space-separated string
# of of "MM L HH"-style tone labels (for converting sentences in the original
# training data into tone sequences for the model to use as training data for
# tone recognition)
def to_tones(text):
# This code has "turn me into an ugly Python list comprehension one-liner"
# written all over it. Fighting temptation...
tones = []
vowels = re.findall('([%s]+)' % _toned_vowels, text)
for vowel in vowels:
tone_seq = ''
for v in vowel:
tone_seq += _toned_vowel_to_tone_and_vowel[v][0]
tones.append(tone_seq)
return ' '.join(tones)
# test = "dosak'a"
# tones = "MM MH ML"
#
# print(retone(test, tones))