-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenerate_pokorny_scraped_data_OLD.py
246 lines (196 loc) · 10.3 KB
/
generate_pokorny_scraped_data_OLD.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
import json
import re
from collections import Counter
from unidecode import unidecode
import pandas as pd
import pyperclip
from io import StringIO
"""
author - anton vinogradov
The only purpose of this file is to process the scraped pokorny site: https://indo-european.info/pokorny-etymological-dictionary/
That site is no longer used as a source of Pokorny data, so this script should not be run; it exists only to document what was done.
Seriously, don't use this.
"""
def process_pos(processed_pokorny, pos_file):
    """Attach part-of-speech tags read from ``pos_file`` to each entry.

    Every non-blank line of ``pos_file`` is expected to look like
    ``"<number>: <tags>"``, where ``<number>`` is the 1-based position of an
    entry in ``processed_pokorny`` and ``<tags>`` is a list of tags separated
    by ``;``, ``,`` or ``/``. Tags are lower-cased, stripped, passed through a
    small abbreviation table, de-duplicated and sorted.

    Args:
        processed_pokorny: list of entry dicts.
        pos_file: path of the UTF-8 text file holding the tags.

    Returns:
        A new list of new dicts. An entry listed in the file gets the tags
        from the file; any other entry keeps its existing ``"pos"`` value, or
        gets ``[]`` if it had none. The input dicts are not modified.

    Raises:
        ValueError: if a non-blank line does not start with an integer.
    """
    corrections = {
        "adv": "adverb",
        "adj": "adjective",
        "interjections": "interjection",
        "prep": "preposition",
    }

    index_to_pos = {}
    with open(pos_file, "r", encoding="utf-8") as fp:
        for line in fp:
            # Blank lines (e.g. a trailing newline) carry no tag information.
            if not line.strip():
                continue
            fields = line.split(":")
            raw_tags = fields[-1].lower().replace(",", ";").replace("/", ";").split(";")
            # Normalise first and sort afterwards; sorting the raw pieces would
            # let the whitespace after a separator decide the final order.
            tags = {corrections.get(tag.strip(), tag.strip()) for tag in raw_tags}
            tags.discard("")
            # The file numbers entries from 1, the list is indexed from 0.
            index_to_pos[int(fields[0]) - 1] = sorted(tags)

    # For each entry use the new pos; fall back to the old one, then to blank.
    return [
        {**entry, "pos": index_to_pos.get(i, entry.get("pos", []))}
        for i, entry in enumerate(processed_pokorny)
    ]
# Module-level abbreviation -> full part-of-speech name mapping. It holds the
# same pairs as the `corrections` table inside process_pos plus
# "number" -> "numeral".
# NOTE(review): the one-letter name gives no hint of its purpose; it looks like
# a scratch variable — confirm whether it is still needed.
a = {
    "adv": "adverb",
    "adj": "adjective",
    "interjections": "interjection",
    "prep": "preposition",
    "number": "numeral",
}
def remove_non_english_chars(text, allowed_chars=""):
    """Transliterate ``text`` with unidecode and keep only letters.

    Args:
        text: the string to clean.
        allowed_chars: extra characters that are kept in addition to
            alphabetic ones. Unless it contains "ə", every "ə" in ``text`` is
            turned into "e" before transliteration.

    Returns:
        The transliterated string with every character dropped that is
        neither alphabetic nor listed in ``allowed_chars``.
    """
    if "ə" not in allowed_chars:
        text = text.replace("ə", "e")

    kept = []
    for char in unidecode(text):
        if char.isalpha() or char in allowed_chars:
            kept.append(char)
    return "".join(kept)
def output_pos(processed_pokorny):
    """Dump a listing of every entry for manual checking of the POS tags.

    Writes one block per entry (1-based number, roots, English meaning,
    German meaning, tagged POS) to ``data_pokorny/pokorny_pos_output.txt``.

    Args:
        processed_pokorny: list of entry dicts with the keys ``"root"``,
            ``"meaning"``, ``"other_meaning"`` (holding ``"german"``) and
            ``"pos"``, each a list of strings.
    """
    blocks = []
    for number, entry in enumerate(processed_pokorny, start=1):
        # Roots can contain line breaks; flatten them so the header stays on one line.
        roots = "; ".join(entry["root"]).replace("\n", " ")
        english = "; ".join(entry["meaning"])
        german = "; ".join(entry["other_meaning"]["german"])
        tags = "; ".join(entry["pos"])
        blocks.append(
            f"{number} - {roots}"
            f"\n\tEnglish Meaning:\t{english}"
            f"\n\t German Meaning:\t{german}"
            f"\n\t Tagged POS:\t{tags}\n\n"
        )

    with open("data_pokorny/pokorny_pos_output.txt", "w", encoding="utf-8") as fp:
        fp.writelines(blocks)
# the prompt used
"""
I am going to give you a numbered set of word definitions in german. Tell me the part of speech that the word is in english, based solely on the definition.
If there are multiple that you think are reasonable, give all that are applicable. Give your answer in the form "number: part of speech".
Do not include any other text, do not include the meaning either.
"""
def clipboard(processed_pokorny):
    """Copy the numbered German definitions to the clipboard, 100 at a time.

    Entries whose English meaning joins to an empty string are left out, but
    the numbering still follows the 1-based position in ``processed_pokorny``.
    After each batch is copied the debugger is entered, so the batch can be
    pasted elsewhere before continuing.

    Args:
        processed_pokorny: list of entry dicts with ``"meaning"`` and
            ``"other_meaning"`` (holding ``"german"``) keys.
    """
    numbered = []
    for number, entry in enumerate(processed_pokorny, start=1):
        if "; ".join(entry["meaning"]) == "":
            continue
        german = "; ".join(entry["other_meaning"]["german"])
        numbered.append(f"{number}: {german}")

    batch_size = 100
    for start in range(0, len(numbered), batch_size):
        pyperclip.copy("\n".join(numbered[start:start + batch_size]))
        # Pause here until the current batch has been pasted.
        breakpoint()
def compare_pos(pokorny_1, pokorny_2):
    """Tabulate the positions where two tagged lists disagree on POS.

    Two entries disagree when both have at least one tag and their tag sets
    differ; entries where either side is untagged are ignored.

    Args:
        pokorny_1: list of entry dicts with a ``"pos"`` list.
        pokorny_2: list of entry dicts with a ``"pos"`` list.

    Returns:
        A pandas DataFrame with the columns ``"index"`` (0-based position),
        ``1`` (tags from ``pokorny_1``) and ``2`` (tags from ``pokorny_2``).
    """
    tags_1 = [entry["pos"] for entry in pokorny_1]
    tags_2 = [entry["pos"] for entry in pokorny_2]

    rows = []
    for position, (first, second) in enumerate(zip(tags_1, tags_2)):
        if set(first) == set(second):
            continue
        if len(first) == 0 or len(second) == 0:
            continue
        rows.append({"index": position, 1: first, 2: second})
    return pd.DataFrame(rows)
def merge_pos(pokorny_1, pokorny_2):
    """Combine the POS tags of two parallel entry lists.

    Args:
        pokorny_1: list of entry dicts with a ``"pos"`` list; all other
            fields of the result are taken from these entries.
        pokorny_2: list of entry dicts with a ``"pos"`` list.

    Returns:
        A new list, as long as the shorter input, of copies of the
        ``pokorny_1`` entries whose ``"pos"`` is the sorted union of both
        entries' tags.
    """
    merged = []
    for first, second in zip(pokorny_1, pokorny_2):
        combined = set(first["pos"] + second["pos"])
        merged.append({**first, "pos": sorted(combined)})
    return merged
def generate_ids(pokorny):
    """Give every entry a human-readable ``"entry_id"``.

    The id has the form ``<root>_<meaning>_<n>``:
      * ``<root>`` is the first root with everything but plain letters removed
        (the special characters display fine in a browser but are awkward to
        paste or put in a URL),
      * ``<meaning>`` is the alphabetically first meaning with spaces turned
        into ``-`` and other non-letters removed, or ``unknown`` when the
        entry has no meaning,
      * ``<n>`` counts from 1 how many entries so far share the same
        ``<root>_<meaning>``, which keeps the ids unique.

    Args:
        pokorny: list of entry dicts with ``"root"`` and ``"meaning"`` lists.

    Returns:
        A new list of copies of the entries with ``"entry_id"`` added.
    """
    # Running tally per proto-id; avoids rescanning all earlier ids for each entry.
    seen = Counter()
    processed_pokorny = []
    for entry in pokorny:
        root_part = remove_non_english_chars(entry["root"][0])
        if len(entry["meaning"]):
            first_meaning = sorted(entry["meaning"])[0].replace(" ", "-")
            meaning_part = remove_non_english_chars(first_meaning, allowed_chars="-")
        else:
            meaning_part = "unknown"

        proto_id = f"{root_part}_{meaning_part}"
        seen[proto_id] += 1
        processed_pokorny.append({**entry, "entry_id": f"{proto_id}_{seen[proto_id]}"})
    return processed_pokorny
def common():
    """Build ``data_pokorny/pokorny_processed.json`` from the scraped data.

    Steps: load ``data_pokorny/pokorny_scraped.json``, lay each entry out as
    root / meaning / other_meaning lists, attach part-of-speech tags, generate
    entry ids and dump the result as JSON.
    """
    with open("data_pokorny/pokorny_scraped.json", "r", encoding="utf-8") as fp:
        pokorny = json.load(fp)

    def split_on_commas(chunks):
        # Every scraped field is a list of strings that may each hold several
        # comma separated values.
        return [piece.strip() for chunk in chunks for piece in chunk.split(",")]

    # For each entry in the raw pokorny, build an entry that is well laid out.
    common_pokorny = []
    for entry in pokorny:
        # Quote marks are dropped from the German meanings before splitting.
        german_chunks = [
            chunk.replace("`", "'").replace("'", "") for chunk in entry["German meaning"]
        ]
        common_pokorny.append({
            "root": split_on_commas(entry["root"]),
            "meaning": split_on_commas(entry["English meaning"]),
            "other_meaning": {"german": split_on_commas(german_chunks)},
        })

    # How the tag files were produced: clipboard(...) was used to copy the
    # definitions in batches, ChatGPT was prompted with
    # 'I am going to give you a numbered set of word definitions. Tell me the part of speech that the word is, based solely on the definition. If there are multiple that you think are reasonable, give all that are applicable. Give your answer in the form "number: part of speech". Do not include any other text, do not include the meaning either.'
    # and the chat output was saved to the gpt_*.txt files. The output of gpt is
    # not deterministic afaik, so processing a new output may require changing
    # process_pos. output_pos(...) produced the listing a human double-checked,
    # and compare_pos(...) can be used to compare two tagged lists.

    # Merge the two automated taggings, then overwrite with the human one
    # (only on the entries that the human corrected).
    gpt_english = process_pos(common_pokorny, "data_pokorny/gpt_pos.txt")
    gpt_german = process_pos(common_pokorny, "data_pokorny/gpt_german_pos.txt")
    common_pokorny = merge_pos(gpt_english, gpt_german)
    common_pokorny = process_pos(common_pokorny, "data_pokorny/human_pos.txt")

    # Ids are human-readable ways to refer to a word that can be put into a
    # url and other things.
    common_pokorny = generate_ids(common_pokorny)

    # Output the processed entries to json, so they can be imported into a db
    # or something.
    with open("data_pokorny/pokorny_processed.json", "w", encoding="utf-8") as fp:
        json.dump(common_pokorny, fp)
def extract_material_forms(material_line_splits):
    """Pull the cited forms and their definition out of each segment.

    In the scraped text an italicised word is wrapped in a pair of doubled
    backslashes. Every such word in a segment is taken as a form, and the text
    following the first italicised word is taken as the definition.

    Args:
        material_line_splits: the ``;``-separated segments of one material
            line, as strings.

    Returns:
        A list with one ``{"forms": [...], "definition": "..."}`` dict per
        non-blank segment. A segment without any italicised word is recorded
        with an empty ``"forms"`` list and its whole stripped text as the
        definition, so it can be found and reviewed later.
    """
    extracted_forms = []
    for split in material_line_splits:
        # We often get empty segments at the end; skip those.
        if not split.strip():
            continue

        # Most forms are italicised, but not all.
        forms = re.findall(r'\\\\(.*?)\\\\', split)
        if forms:
            # Most definitions follow the first form.
            definition = re.findall(r'\\\\.*?\\\\(.*)', split)[0].strip()
        else:
            # Nothing recognisable as a form: keep the raw text rather than
            # reusing the previous segment's definition (or hitting an unbound
            # name on the first segment).
            definition = split.strip()

        extracted_forms.append({
            "forms": forms,
            "definition": definition,
        })
    return extracted_forms
def pokorny_table():
    """Parse the ``"Material"`` field of every scraped entry.

    Reads ``data_pokorny/pokorny_scraped.json`` and splits every material line
    into its language and its forms.

    Returns:
        A list with one item per scraped entry, in file order. Each item is a
        list with one ``{"language": str, "forms": [...]}`` dict per material
        line, where ``"forms"`` is the output of ``extract_material_forms``.
    """
    with open("data_pokorny/pokorny_scraped.json", "r", encoding="utf-8") as fp:
        pokorny = json.load(fp)

    # todo: process the material field
    # formatting stuff:
    #   \\words\\ = italicized
    #   __words__ = underlined
    # breakdown of a single line:
    #   1. mentions the language (presumably), tends to look like ["Ai.", "gr.", "lat.", ...]
    #   2. a generally italicized (but not always) word (presumably called a
    #      "reconstruction" or "language form") - there may be multiple words here
    #   3. the definition
    #   4. semicolon ";" marking the end of definitions
    #   5. 2-4 may repeat.
    # extracted fields:
    #   - language: string
    #   - forms object
    #       - forms: list of forms (strings)
    #       - definition: string
    table = []
    for entry in pokorny:
        processed_materials = []
        for material_line in entry["Material"]:
            # Split up each ; separated segment for this specific language.
            material_line_splits = material_line.split(";")

            # The language is the first space-delimited word; take it off the
            # first segment so only forms and definitions remain.
            language, _, remainder = material_line_splits[0].partition(" ")
            material_line_splits[0] = remainder

            processed_materials.append({
                "language": language,
                "forms": extract_material_forms(material_line_splits),
            })
        # Keep the per-entry result instead of dropping it at the end of the loop.
        table.append(processed_materials)
    return table
def get_base_letters(alt_letters):
    """Group letter variants under their plain base letter.

    The base of a variant is the lower-cased first character left after
    ``remove_non_english_chars`` has cleaned it.

    Args:
        alt_letters: iterable of letter strings.

    Returns:
        A dict whose keys are the base letters in sorted order and whose
        values list, in input order, the variants that reduce to that base,
        leaving out the base letter itself.

    Raises:
        IndexError: if a variant contains no alphabetic character once cleaned.
    """
    # Materialise once: the variants are walked once per base letter below.
    letters = list(alt_letters)
    # Clean each distinct variant a single time.
    base_of = {letter: remove_non_english_chars(letter)[0].lower() for letter in letters}

    return {
        base: [letter for letter in letters if base_of[letter] == base and letter != base]
        for base in sorted(set(base_of.values()))
    }
# Script entry point. Only the material-table parsing is run; the other calls
# are kept commented out as a record of how the helpers were invoked.
if __name__ == '__main__':
    # Example invocation of get_base_letters with a list of letter variants:
    # print(get_base_letters(["p", "pʰ", "b", "bʰ", "t", "tʰ", "d", "dʰ", "ḱ", "ḱʰ", "ǵ", "ǵʰ", "k", "kʰ", "g", "gʰ", "kʷ", "kʷʰ", "gʷ", "gʷʰ", "T", "K", "k(')", "ǵ(ʰ)", "g(')ʰ", "g(')", "k(ʷ)", "g(ʷ)ʰ", "g(ʷ)", "h₁", "h₂", "h₃", "H, hₓ", "s", "z", "F", "w, u̯", "y, i̯", "G", "r", "l", "L", "m", "n", "N", "r̥", "l̥", "m̥", "n̥", "i", "u", "a", "e", "o", "ā", "ē", "ō", "á", "é", "ó", "ā́", "ḗ", "ṓ", "í", "ī", "ī́", "ú", "ū", "ū́", "ə"]))
    pokorny_table()
    # common() builds data_pokorny/pokorny_processed.json; disabled here.
    # common()
    pass