-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_data.py
83 lines (74 loc) · 3.27 KB
/
convert_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
###
# Script converting the Phonocolor data to the format used in TELL in Motion.
# Output row example (word,phonemes,graphemes): mot,m|O,m|ot
###
import pandas as pd
INPUT_FILE = "word_to_phoneme.csv"
OUTPUT_FILE = "dict.csv"

# Read the CSV export of the Phonocolor SQL dump.
# The table to export is `word_to_phoneme`, with the following parameters:
#   - only export 3 columns: word, phonemes, colors
#   - columns separated with ;
#   - columns enclosed with "
#   - columns escaped with "
#   - lines terminated with AUTO
#   - replace NULL with NULL
#   - encoding is utf-8
df = pd.read_csv(INPUT_FILE, sep=";", header=None)
df.columns = ["word", "phonemes", "colors"]

# Rows without colors or phonemes cannot be aligned, so drop them.
# (pandas parses the literal NULL written by the export as a missing value.)
df.dropna(subset=['colors', 'phonemes'], inplace=True)

# Normalise the colors column: strip the quotes and turn {...} objects into
# [...] lists so that every per-letter entry uses the same bracket style.
# regex=False is explicit because "{" and "}" are regex metacharacters and the
# default of `regex` differs between pandas versions.
df["colors"] = (
    df["colors"]
    .str.replace('"', '', regex=False)
    .str.replace('{', '[', regex=False)
    .str.replace('}', ']', regex=False)
)
# define the cleaning function
def _letter_colors(letter):
    """Return the color codes (fields containing "#") of one per-letter entry, in order."""
    fields = letter.split(",")
    if ":" in letter:
        # The entry was an object ("0:c,1:col,..."); its keys are sometimes
        # swapped, so sort to restore key order before dropping the keys.
        fields = sorted(fields)
    fields = [field.split(":")[-1] for field in fields]  # remove parts before ":"
    if len(fields) == 4:  # [c, col, col, col] or [0:c, 1:col, 2:col, 3:col]
        fields = fields[1:3]
    elif len(fields) in (2, 3):  # [c, col(, col)] or [0:c, 1:col(, 3:col)]
        fields = fields[1:]
    # otherwise keep the fields as they are: [0:col] or [c] or whatever
    return [field for field in fields if "#" in field]  # keep only colors


def clean(row):
    """Align the phonemes of one word with its graphemes using the letter colors.

    Letters are walked in order together with their color entries. A letter
    whose color differs from the previous letter's starts a new grapheme and
    consumes one phoneme per distinct color (only one if the next phoneme is
    "N" or "J"). A letter with the same color as the previous one is appended
    to the current grapheme, unless the next phoneme repeats the last emitted
    one, in which case it starts a new grapheme (double phonemes). Letters
    without color are appended to the current grapheme.

    Args:
        row: object with string attributes ``word``, ``phonemes``
            (space-separated) and ``colors`` (a ``[[...],[...]]`` string with
            one entry per character of ``word``).

    Returns:
        A list ``[phonemes, graphemes, excess]`` where the first two are
        "|"-separated groups and ``excess`` holds the "|"-joined phonemes that
        could not be assigned to any grapheme.
    """
    word = row.word
    phonemes = row.phonemes.split(" ")
    letters = row.colors[2:-2].split("],[")  # strip outer "[[" / "]]"
    grapheme_out = ""
    last = ""
    phoneme_list = []
    for c, letter in zip(word, letters):
        if c.isalpha():  # skip apostrophe
            parts = _letter_colors(letter)
            color = ",".join(parts)  # order matters for double colors
            if parts and color != last:
                # new color: start a new grapheme
                if grapheme_out:
                    grapheme_out += "|"
                nb_colors = len(set(parts))
                # Guard against exhausted phonemes, and compare against the
                # exact symbols rather than substring-matching in "NJ".
                if nb_colors == 2 and phonemes and phonemes[0] in ("N", "J"):
                    nb_colors = 1
                head, phonemes = phonemes[:nb_colors], phonemes[nb_colors:]
                phoneme_list.append(" ".join(head))
            elif parts:
                # same color as the previous letter: split only for double phonemes
                if phoneme_list and phonemes and phoneme_list[-1] == phonemes[0]:
                    head, phonemes = phonemes[:1], phonemes[1:]
                    phoneme_list.append(" ".join(head))
                    grapheme_out += "|"
            # if no color, the letter simply joins the current grapheme
            last = color
        grapheme_out += c
    # handle at least some trailing "e" phonemes: merge it into a final "j"
    if len(phonemes) == 1 and phonemes[0] == "e":
        if phoneme_list and phoneme_list[-1] == "j":
            phoneme_list[-1] = "j e"
            phonemes = []
    phoneme_out = "|".join(phoneme_list)
    # TODO: check if still some phonemes left
    return [phoneme_out, grapheme_out, "|".join(phonemes)]
# run clean() on every row; each result list is expanded into three columns:
# aligned phonemes, aligned graphemes and the phonemes left unassigned
cleaned = df.apply(clean, axis=1, result_type='expand')
df[['clean_phonemes', 'graphemes', 'excess']] = cleaned
# write the word with its aligned phonemes and graphemes (excess is not exported)
exported = df[['word','clean_phonemes','graphemes']]
exported.to_csv(OUTPUT_FILE, index=False)