# generate_liv_db_data.py
import json
import os

import regex as re
import pandas as pd
from tqdm import tqdm

import translate_meaning_gpt
# expected to provide query_gpt, extract_code_block, process_gpt, and delete_response
from gpt_utils import *


def process_language_form_and_translation(line):
    entry = {"lang": {"abbr": "", "questionable": False}, "reflex": "", "meaning": None, "notes": []}
    # todo: the whole thing needs to be extracted from the gpt-corrected csv
    return entry
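

# A possible fleshed-out version of the stub above, as a minimal sketch: it assumes the
# gpt-corrected csv written by gpt_fix_fields() ("data_liv/liv_gpt.csv") has been loaded
# into gpt_df. The function name and lookup are hypothetical and not wired in anywhere.
def process_language_form_and_translation_gpt(line, gpt_df):
    entries = []
    # the gpt csv has one row per reflex, so a single source line can yield several entries
    for _, row in gpt_df[gpt_df.original_text == line].iterrows():
        entries.append({
            "lang": {"abbr": row.language, "questionable": str(row.questionable).lower() == "true"},
            "reflex": row.reflex,
            "meaning": row.meaning,
            "notes": [row.expanded_notes] if str(row.expanded_notes) not in ("", "None", "nan") else [],
        })
    return entries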


def get_reflex_entries(group):
    entries = []
    for i, row in group.iterrows():
        if row.category == "root":
            continue
        # temporary call: the result is not used yet (see the todo in the function)
        process_language_form_and_translation(row.language_form_and_translation)
        language = {
            'language_name': 'NYI',
            'language_abbr': 'NYI',
            'language_aka': '',
            'family_name': 'NYI',
            'family_name_override': '',
            # todo: this is probably not the right field, but the branch data needs to go somewhere
            'sub_family_name': row.branch,
            'is_sub_branch': False,
            'is_sub_sub_branch': False
        }
        entry = {
            # language/family
            "language": language,
            # the reflex itself (may contain multiple), stored as a list
            "reflexes": [row.language_form_and_translation],
            # part of speech
            "pos": "NYI",
            # the meaning (gloss)
            "gloss": "NYI",
            # source (static because we only take it from one place)
            "source": {
                "text_sources": [{'code': 'LIV', 'display': 'Helmut Rix, Martin Kümmel et al.: Lexikon der indogermanischen Verben (1998)'}],
                'db_sources': [{'code': 'TOL', 'display': 'LIV as compiled by Thomas Olander'}]
            },
        }
        entries.append(entry)
    return entries


def create_lineup_csv():
    with open("data_pokorny/table_pokorny.json", "r") as fp:
        pokorny_json = json.load(fp)
    liv_data_path = "data_liv/Olander_LIV_-_resultat_af_script.csv"
    root_match_up = pd.DataFrame([{"root": entry["root"], "searchables": entry["searchable_roots"], "canonical_id": "", "liv": ""} for entry in pokorny_json])
    root_match_up.to_csv("data_liv/pokorny_liv_root_matchup.csv")
    df = pd.read_csv(liv_data_path)
    df["iew_num"] = df["iew_reference"].str.extract(r"IEW (\d*)")
    df["iew_num"] = pd.to_numeric(df["iew_num"], downcast='integer')
    df[["lemma", "iew_num", "iew_reference"]].drop_duplicates().sort_values(by="iew_num").to_csv("data_liv/all_liv_roots.csv", index=False)


def load_olander():
    liv_data_path = "data_liv/Olander_LIV_-_resultat_af_script.csv"
    df = pd.read_csv(liv_data_path)
    # fixme: this behavior might be confusing; check here if something looks strange.
    # if english_meanings.txt doesn't already exist
    if not os.path.exists("data_liv/english_meanings.txt"):
        # write the translations out to file for auto-translating
        with open("data_liv/german_meanings.txt", "w", encoding='utf8') as fp:
            fp.writelines([
                # remove any citation of the form {num}, then strip the possible quotation marks ’‘
                re.sub(r'\{\d+\}', '', line).strip(' ’‘') + "\n"
                for line in df.translation.to_list()
            ])
        translate_meaning_gpt.main("data_liv/german_meanings.txt", "data_liv/english_meanings.txt")
    # load the english
    with open("data_liv/english_meanings.txt", "r", encoding='utf8') as fp:
        english_meanings = [line.strip(" \n") for line in fp.readlines()]
    df["english_meaning"] = english_meanings
    return df


def main():
    df = load_olander()
    root_groups = df.groupby("lemma")
    liv_entries = []
    for root, group in tqdm(list(root_groups), ncols=150):
        gloss = group[group.category == "root"].iloc[0].english_meaning
        # todo: make this actually match the entry_id that exists in the common version. I have no clue how to do this.
        entry_id = f'{root}_{gloss}'
        reflex_entries = get_reflex_entries(group)
        entry = {
            "entry_id": entry_id,
            "root": root,
            "meaning": gloss,
            "reflexes": reflex_entries,
            # todo: part of speech
            "pos": ["NYI"],
            "semantic": "NYI",
            # todo: figure out how to make roots actually searchable from the root we have (one sketch below)
            "searchable_roots": root
        }
        liv_entries.append(entry)
    with open("data_liv/table_liv.json", "w", encoding="utf-8") as fp:
        json.dump(liv_entries, fp, indent=2)
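

# One possible way to tackle the searchable_roots todo above, sketched under the
# assumption that stripping combining diacritics is enough to make a root matchable
# from plain input; this hypothetical helper is not hooked into main() yet.
def searchable_forms(root):
    import unicodedata
    # decompose, drop the combining marks, and keep the plain form next to the original
    plain = "".join(c for c in unicodedata.normalize("NFD", root) if not unicodedata.combining(c))
    return sorted({root, plain})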


def gpt_fix_fields():
    df = load_olander()
    # get rid of all rows where category is root
    df = df[df.category != "root"]
    df["notes"] = df["notes"].fillna("")
    # iterate through all the rows
    df_list = []
    for i, (_, row) in enumerate(df.iterrows()):
        headers = ["language", "reflex", "questionable", "meaning", "notes"]
        headers_str = str(headers)[1:-1]
        prompt_ex = (
            "Here are some examples and their outputs:\n"
            "Information: [gr. ἔφαγον ‘aß (auf)’{3}\n"
            f"{headers_str}\n"
            "'gr.', 'ἔφαγον', 'false', 'aß (auf)', '3'\n"
            "\nInformation: ved. Konj. bhájati ‘teilt zu’, bhájate ‘bekommt Anteil’\n"
            f"{headers_str}\n"
            "'ved.', 'bhájati', 'false', 'teilt zu', 'None'\n"
            "'ved.', 'bhájate', 'false', 'bekommt Anteil', 'None'\n"
            "\nInformation: alit. bė́gmi, [lit. bė́gu, (bė́gti) ‘laufen, fliehen’\n"
            f"{headers_str}\n"
            "'alit.', 'bė́gmi', 'false', 'laufen, fliehen', 'None'\n"
            "'lit.', 'bė́gu', 'false', 'laufen, fliehen', 'None'\n"
            "\nInformation: ?[arm. Inj. Med. bekanem ‘breche’{3}\n"
            f"{headers_str}\n"
            "'arm.', 'bekanem', 'true', 'breche', '3'\n"
        )
        # build a prompt for gpt to separate out all the parts that seem to be stuffed into language_form_and_translation
        prompt = (
            f'"{row.language_form_and_translation}" is information (in german) about the reflexes for the PIE root "{row.lemma}" described as "{row.english_meaning}". '
            f'It should be about a {row.category} in {row.branch}. '
            f'I need to extract the language abbreviation, the reflex (there may be multiple), whether it is questionable, the german meaning, and any note markers. '
            f'I want this in comma separated format with the following headers: {headers_str}. '
            f'\n\n{prompt_ex}\n'
            f'Give these in CSV format, without any other text, placed in a code block. '
            f'Remember to quote every field to preserve commas and to put each reflex on its own line. '
            f'Also make sure you only put the reflex in the reflex column, without any other information. \n\n'
            f'Information: "{row.language_form_and_translation}"\n'
            f'{headers_str}'
        )
        # row.notes has the form "{int} note text {int} note text"; turn it into a {"int": "note text"} dict with regex
        notes = re.findall(r'{(\d+)}\s+(.*?)(?=\s*{\d+}|\s*$)', row.notes)
        notes = {note[0]: note[1] for note in notes}
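        # e.g. "{1} first note {2} second note" -> {"1": "first note", "2": "second note"}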
# print(f"~~~~~~~~~~~~~~~{i+1}/{len(df)}~~~~~~~~~~~~~~~")
# print("Original:", row.language_form_and_translation)
for _ in range(3):
try:
response = query_gpt(prompt)
csv_text = extract_code_block(response["choices"][0]["message"]["content"])
# print("CSV'd:", *[f"\n\t{text}" for text in csv_text.split("\n")])
response_df = process_gpt(csv_text, headers, quote_char="'")
# print(response_df)
response_df["root"] = row.lemma
response_df["branch"] = row.branch
response_df["category"] = row.category
response_df["original_text"] = row.language_form_and_translation
response_df["expanded_notes"] = [f"{note}: {notes[note]}" if note in notes else note for note in response_df.notes]
df_list.append(response_df)
break
except pd.errors.ParserError as e:
breakpoint()
print(e)
delete_response(prompt)
continue
# merge all the dfs in the list, ignoring the index
combined_df = pd.concat(df_list, ignore_index=True)
# reorder the columns
combined_df = combined_df[["root", "category", "branch", "language", "questionable", "reflex", "meaning", "notes", "expanded_notes", "original_text"]]
# save to file
combined_df.to_csv("data_liv/liv_gpt.csv")
# breakpoint()
pass
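

# Pipeline order, as the commented-out calls below suggest: gpt_fix_fields() writes
# data_liv/liv_gpt.csv, create_lineup_csv() writes the pokorny/liv matchup csvs, and
# main() emits data_liv/table_liv.json from the Olander csv.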
if __name__ == '__main__':
    # gpt_fix_fields()
    # create_lineup_csv()
    main()