-
Notifications
You must be signed in to change notification settings - Fork 1
/
generate_common_db_data.py
116 lines (102 loc) · 4.22 KB
/
generate_common_db_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import json
import numpy as np
import pandas as pd
import tqdm
from generate_pokorny_scraped_data_OLD import remove_non_english_chars
def main():
    """Build the common Pokorny/LIV table and tag both source tables with its ids.

    Reads ``data_pokorny/table_pokorny.json``, ``data_liv/table_liv.json`` and
    ``data_common/matchup.csv``. Writes ``data_common/table_common.json`` and
    rewrites the two source tables in place with a ``"common_id"`` field set
    on every entry.

    Every match-up row yields one common entry rooted at the Pokorny root and
    listing the LIV roots cross-referenced by that row. Every LIV root that no
    row references yields its own common entry with no Pokorny side.

    Raises:
        ValueError: if two entries of a source table share the same ``"root"``.
        KeyError: if a match-up root is missing from the Pokorny table, or a
            Pokorny entry has no common entry to point at.
    """
    pokorny_filename = "data_pokorny/table_pokorny.json"
    liv_filename = "data_liv/table_liv.json"
    common_filename = "data_common/table_common.json"

    # JSON is UTF-8 by specification; be explicit so reading the roots does not
    # depend on the platform's default locale encoding.
    with open(pokorny_filename, 'r', encoding="utf-8") as fp:
        pokorny_data_list = json.load(fp)
    with open(liv_filename, 'r', encoding="utf-8") as fp:
        liv_data_list = json.load(fp)

    # Index both tables by root. The dict values are the very same objects as
    # the list items, so setting "common_id" through the dicts below also
    # updates the lists that are written back at the end.
    pokorny_data = {entry["root"]: entry for entry in pokorny_data_list}
    liv_data = {entry["root"]: entry for entry in liv_data_list}
    # Explicit raise rather than assert: this check must survive `python -O`.
    if len(pokorny_data) != len(pokorny_data_list):
        raise ValueError(f"duplicate roots in {pokorny_filename}")
    if len(liv_data) != len(liv_data_list):
        raise ValueError(f"duplicate roots in {liv_filename}")

    match_df = pd.read_csv("data_common/matchup.csv")
    # Rows without a LIV counterpart come back as NaN; use "" so .split works.
    match_df["liv: cross-reference"] = match_df["liv: cross-reference"].fillna("")

    used_liv_roots = set()
    common_data = []
    liv_to_pokorny = {}
    counter = 0

    # One common entry per match-up row. iterrows() has no length, so give
    # tqdm the total explicitly to get a real progress bar.
    for _, row in tqdm.tqdm(match_df.iterrows(), total=len(match_df)):
        pokorny_root = row["root"]
        # Comma-separated LIV roots. Empty pieces (blank cell, stray or
        # trailing comma) are dropped here so "" is never treated as a root.
        liv_roots = [
            piece.strip()
            for piece in row["liv: cross-reference"].split(",")
            if piece.strip()
        ]
        pokorny_data_entry = pokorny_data[pokorny_root]
        common_data.append({
            "root": pokorny_root,
            "dictionary": [
                {"pokorny_entries": [pokorny_root]},
                {"liv_entries": liv_roots},
            ],
            "meaning": pokorny_data_entry["meaning"],
            "common_id": str(counter),
        })
        counter += 1
        for root in liv_roots:
            used_liv_roots.add(root)
            # If several rows reference the same LIV root, the last row wins.
            liv_to_pokorny[root] = pokorny_root

    # LIV roots never referenced by the match-up get their own entry. Walk them
    # in LIV-file order (not set order) so the assigned ids are reproducible
    # from run to run.
    unused_liv = [root for root in liv_data if root not in used_liv_roots]
    for root in tqdm.tqdm(unused_liv):
        common_data.append({
            "root": root,
            "dictionary": [
                {"pokorny_entries": []},
                {"liv_entries": [root]},
            ],
            "meaning": liv_data[root]["meaning"],
            "common_id": str(counter),
        })
        counter += 1

    # Look up common entries by their root to copy the ids back.
    common_data_dict = {entry["root"]: entry for entry in common_data}
    for entry in pokorny_data.values():
        entry["common_id"] = common_data_dict[entry["root"]]["common_id"]
    for entry in liv_data.values():
        root = entry["root"]
        # A matched LIV root lives under its Pokorny root's common entry;
        # an unmatched one is the root of its own common entry.
        common_root = liv_to_pokorny.get(root, root)
        entry["common_id"] = common_data_dict[common_root]["common_id"]

    # Sort the common table by root, ignoring non-English characters and case.
    common_data = sorted(
        common_data,
        key=lambda x: remove_non_english_chars(x["root"]).lower(),
    )

    print("writing common")
    with open(common_filename, 'w', encoding="utf-8") as fp:
        json.dump(common_data, fp, indent=4)
    print("writing pokorny")
    with open(pokorny_filename, 'w', encoding="utf-8") as fp:
        json.dump(pokorny_data_list, fp, indent=4)
    print("writing liv")
    with open(liv_filename, 'w', encoding="utf-8") as fp:
        json.dump(liv_data_list, fp, indent=4)
# Run the generation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()