Update ATF importer #552

Open · khoidt wants to merge 47 commits into master from atf-import-update

Changes from 20 commits are shown below.

Commits (47, all by khoidt):
- 68d54f7  Refactor atf importer (WiP) · May 16, 2024
- e3e058a  Update ebl/atf_importer/application/lemma_lookup.py · May 16, 2024
- ab33279  Update ebl/atf_importer/application/lemma_lookup.py · May 16, 2024
- f6bed68  Update ebl/atf_importer/application/atf_importer_base.py · May 16, 2024
- 44f2daf  Update ebl/atf_importer/application/atf_importer_base.py · May 16, 2024
- a3e9e52  Update ebl/atf_importer/domain/atf_preprocessor_cdli.py · May 16, 2024
- 43a9cfa  Update ebl/atf_importer/domain/atf_preprocessor_cdli.py · May 16, 2024
- 860ad08  Update ebl/atf_importer/domain/atf_preprocessor_cdli.py · May 16, 2024
- 171ebac  Fix lark paths · May 16, 2024
- 65aa777  Update test · May 16, 2024
- 0ffb31e  Refactor & update · May 17, 2024
- 72813c8  Clean up · May 17, 2024
- 6bdc503  Refactor more · May 17, 2024
- b50e5b9  Update · May 17, 2024
- 08f4430  Fix type · May 21, 2024
- ac21caa  Improve · May 21, 2024
- 4f53377  Improve · May 21, 2024
- cf568d2  Update & fix preprocessor tests · May 23, 2024
- 9c19bc4  Refactor & update · May 23, 2024
- 115c5f6  Fix test (use transliteration chars) · May 23, 2024
- 0426000  Improve · May 23, 2024
- 7464010  Fix glossary data (WiP) · May 24, 2024
- 8d03672  Update, improve & refactor to fix test (WiP) · May 27, 2024
- b1c8081  Update, refactor & add logging (WiP) · May 28, 2024
- a7e070a  Update logging & improve · May 29, 2024
- e662c72  Refactor · May 31, 2024
- fb84e2c  Update logging · May 31, 2024
- e39e8b3  Update preprocessor & add importer test (WiP) · Jun 3, 2024
- 14be543  Update atf preprocessor (WiP) · Jun 4, 2024
- 18a1ef7  Fix · Jun 4, 2024
- 3c50de9  Update structure, use only ebl atf parser (WiP) · Jun 12, 2024
- 1cea5ff  Refactor, update & fix tests (WiP) · Jul 4, 2024
- 172631a  Update (WiP) · Jul 10, 2024
- b2a0405  Refactor & fix tests (WiP) · Oct 15, 2024
- 4e4693f  Merge remote-tracking branch 'origin/master' into atf-import-update · Oct 15, 2024
- b4b8159  Update, refactor & fix (WiP) · Oct 23, 2024
- 276cc63  Update visitor & transformers (WiP) · Oct 24, 2024
- 0a0e481  Update transformers pipeline & tests (WiP) · Oct 29, 2024
- 60a809f  Add paths in transformers to trace ancestors & break at, fix tests · Oct 31, 2024
- 9f75e7f  Add & update transformers & tests (WiP) · Nov 5, 2024
- 1973bef  Update serialization logic & fix tests (WiP) · Nov 7, 2024
- d3ab25b  Merge remote-tracking branch 'origin/master' into atf-import-update · Nov 7, 2024
- 7e20341  Add transformers and tests, update parser & fix typing · Nov 12, 2024
- f1e3729  Refactor lark grammar (correct at line and structure in general) · Nov 15, 2024
- 9974997  Update & fix tests (WiP) · Nov 19, 2024
- 9ad365b  Fix more tests & format (WiP) · Nov 19, 2024
- eedec78  Restructure main lark parser (WiP) · Nov 26, 2024
Files changed:
675 changes: 97 additions & 578 deletions ebl/atf_importer/application/atf_importer.py

Large diffs are not rendered by default.

107 changes: 107 additions & 0 deletions ebl/atf_importer/application/database_importer.py
@@ -0,0 +1,107 @@
from typing import Dict, List, Tuple, Optional, Sequence
from ebl.app import create_context
from ebl.fragmentarium.application.fragment_updater import FragmentUpdater
from ebl.fragmentarium.application.transliteration_update_factory import (
    TransliterationUpdateFactory,
)
from ebl.fragmentarium.web.dtos import parse_museum_number
from ebl.lemmatization.domain.lemmatization import Lemmatization, LemmatizationToken
from ebl.transliteration.domain.atf import Atf
from ebl.transliteration.domain.lark_parser import parse_atf_lark
from ebl.users.domain.user import AtfImporterUser

class DatabaseImporter:
    def __init__(self, database, logger, username: str):
        self.database = database
        self.logger = logger
        self.user = AtfImporterUser(username)
        context = create_context()
        self.transliteration_factory: TransliterationUpdateFactory = (
            context.get_transliteration_update_factory()
        )
        self.updater: FragmentUpdater = context.get_fragment_updater()

    def import_into_database(self, ebl_lines: Dict[str, List], filename: str):
        museum_number: Optional[str] = self._retrieve_museum_number(
            ebl_lines, filename
        )
        if not museum_number:
            return
        if self._check_fragment_exists(museum_number):
            self._import(ebl_lines, museum_number, filename)

    def _import(self, ebl_lines: Dict[str, List], museum_number: str, filename: str):
        try:
            self._insert_transliterations(
                ebl_lines["transliteration"],
                museum_number,
            )
            self._insert_lemmatization(ebl_lines["lemmatization"], museum_number)
            self.logger.info(f"{filename} successfully imported")
        except Exception as e:
            self.logger.error(f"Error importing {filename}: {str(e)}")

    def _retrieve_museum_number(
        self, ebl_lines: Dict[str, List], filename: str
    ) -> Optional[str]:
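        # Control lines look like "&P348658 = VAT 10565" (example assumed
        # here): the museum number is whatever follows the last "=".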
        for line in ebl_lines["control_lines"]:
            linesplit = line["c_line"].split("=")
            if len(linesplit) > 1:
                return linesplit[-1].strip()
        return None

    def _check_fragment_exists(self, museum_number: str) -> bool:
        exists = list(
            self.database.get_collection("fragments").find(
                {"museumNumber": museum_number}, {"text.lines.0"}
            )
        )
        return bool(exists)

    def _insert_transliterations(
        self,
        transliterations: List[str],
        museum_number: str,
    ) -> None:
        converted_transliteration = "\n".join(transliterations)
        transliteration = self.transliteration_factory.create(
            Atf(converted_transliteration)
        )
        self.updater.update_transliteration(
            parse_museum_number(museum_number), transliteration, self.user
        )

    def _insert_lemmatization(
        self,
        lemmatizations: List[Tuple[str, List[Dict]]],
        museum_number: str,
    ):
        lemmatization_tokens = self._get_lemmatization_tokens(lemmatizations)
        lemmatization = Lemmatization([lemmatization_tokens])
        self.updater.update_lemmatization(
            parse_museum_number(museum_number), lemmatization, self.user
        )

    def _get_lemmatization_tokens(
        self, lemmatizations: List[Tuple[str, List[Dict]]]
    ) -> Sequence[LemmatizationToken]:
        lemmatization_tokens: List[LemmatizationToken] = []
        for text_line, lemmas in lemmatizations:
            ebl_lines = parse_atf_lark(text_line).lines[0].content
            lemmatization_tokens = self._get_lemmatization_tokens_in_lines(
                ebl_lines, lemmas, lemmatization_tokens
            )
        return lemmatization_tokens

    def _get_lemmatization_tokens_in_lines(
        self,
        ebl_lines,
        lemmas,
        lemmatization_tokens: List[LemmatizationToken],
    ) -> List[LemmatizationToken]:
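        # Tokens are matched to lemmas by their surface value; tokens without
        # a matching lemma get a LemmatizationToken with no unique lemmas.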
        for token in ebl_lines:
            lemma_ids = [
                lemma["_id"] for lemma in lemmas if lemma["lemma"] == token.value
            ]
            lemmatization_tokens.append(
                LemmatizationToken(token.value, tuple(lemma_ids) if lemma_ids else None)
            )
        return lemmatization_tokens
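For review convenience, a minimal usage sketch of DatabaseImporter (not part of the diff): it assumes a local MongoDB, the standard logging module, and a configured eBL environment for create_context(); the payload values are invented for illustration.

import logging
from pymongo import MongoClient
from ebl.atf_importer.application.database_importer import DatabaseImporter

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("atf_importer")
database = MongoClient("mongodb://localhost:27017")["ebl"]

importer = DatabaseImporter(database, logger, username="atf-importer")
# "ebl_lines" mirrors the keys consumed above; the control line yields the
# museum number "VAT 10565", which must already exist in "fragments".
ebl_lines = {
    "control_lines": [{"c_line": "&P348658 = VAT 10565"}],
    "transliteration": ["1. a-na"],
    "lemmatization": [("1. a-na", [{"_id": "ana I", "lemma": "a-na"}])],
}
importer.import_into_database(ebl_lines, "example.atf")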
116 changes: 116 additions & 0 deletions ebl/atf_importer/application/glossary_parser.py
@@ -0,0 +1,116 @@
import re
from typing import Dict, List, Tuple, Optional, Iterator, TypedDict


class GlossaryParserData(TypedDict):
    lemgwpos_cf: Dict[str, str]
    forms_senses: Dict[str, List[str]]
    lemposgw_cfgw: Dict[str, Tuple[str, str]]


class GlossaryParser:
    def __init__(self):
        self.lemgwpos_cf: Dict[str, str] = {}
        self.forms_senses: Dict[str, List[str]] = {}
        self.lemposgw_cfgw: Dict[str, Tuple[str, str]] = {}

    @property
    def data(self) -> GlossaryParserData:
        return {
            "lemgwpos_cf": self.lemgwpos_cf,
            "forms_senses": self.forms_senses,
            "lemposgw_cfgw": self.lemposgw_cfgw,
        }

    def parse(self, file: Iterator[str]) -> GlossaryParserData:
        current_entry: Dict[str, str] = {}
        lemmas: List[str] = []
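        # Oracc-style glossaries tag headwords with "@entry", attested
        # spellings with "@form" and meanings with "@sense"; other lines
        # are ignored.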
        for line in file:
            line = line.strip()
            if line.startswith("@entry"):
                lemmas, current_entry = self._handle_entry(line, lemmas)
            elif line.startswith("@form"):
                lemmas = self._handle_form(line, current_entry, lemmas)
            elif line.startswith("@sense"):
                self._handle_sense(line, lemmas, current_entry)
        return self.data

    def _handle_entry(
        self, line: str, lemmas: List[str]
    ) -> Tuple[List[str], Dict[str, str]]:
        lemmas.clear()
        return lemmas, self._parse_entry(line)

    def _handle_form(
        self, line: str, current_entry: Dict[str, str], lemmas: List[str]
    ) -> List[str]:
        lemma = self._parse_form(line, current_entry)
        if lemma:
            lemmas.append(lemma)
        return lemmas

    def _handle_sense(
        self, line: str, lemmas: List[str], current_entry: Dict[str, str]
    ) -> None:
        self._parse_sense(line, lemmas, current_entry)

    def _parse_entry(self, line: str) -> Dict[str, str]:
        entry = {}
        parts = line.split(" ", 2)
        if len(parts) > 1:
            entry["cf"] = parts[1].replace("ʾ", "'").strip()
        description = parts[2] if len(parts) > 2 else ""
        match = re.search(r"\[(.*?)\] (.*)", description)
        if match:
entry["gw"], entry["pos"] = match.groups()
entry["gw"] = entry["gw"].strip()
entry["pos"] = entry["pos"].strip()
return entry

def _parse_form(self, line: str, current_entry: Dict[str, str]) -> Optional[str]:
parts = line.split(" ")
if len(parts) > 2:
lemma = parts[2].lstrip("$").rstrip("\n")
if (
"cf" in current_entry
and "gw" in current_entry
and "pos" in current_entry
):
key = f"{lemma}{current_entry['pos']}{current_entry['gw']}"
self.lemgwpos_cf[key] = current_entry["cf"]
return lemma
return None

def _parse_sense(
self, line: str, lemmas: List[str], current_entry: Dict[str, str]
) -> None:
pos_tag, sense = self._extract_pos_tag_and_sense(line)
for lemma in lemmas:
self._update_forms_senses(lemma, sense)
self._update_lemposgw_cfgw(lemma, pos_tag, sense, current_entry)

def _extract_pos_tag_and_sense(
self, line: str
) -> Tuple[Optional[str], Optional[str]]:
parts = line.split(" ", 2)
pos_tag = parts[1] if len(parts) > 1 else None
sense = parts[2].strip() if len(parts) > 2 else None
return pos_tag, sense

def _update_forms_senses(self, lemma: str, sense: Optional[str]) -> None:
if sense:
if lemma not in self.forms_senses:
self.forms_senses[lemma] = [sense]
else:
self.forms_senses[lemma].append(sense)

def _update_lemposgw_cfgw(
self,
lemma: str,
pos_tag: Optional[str],
sense: Optional[str],
current_entry: Dict[str, str],
) -> None:
if sense and "gw" in current_entry:
sense_key = f"{lemma}{pos_tag}{sense}"
self.lemposgw_cfgw[sense_key] = (current_entry["cf"], current_entry["gw"])
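A minimal usage sketch of GlossaryParser (not part of the diff); the three-line glossary entry is invented, and the expected output follows from the parsing rules above.

from ebl.atf_importer.application.glossary_parser import GlossaryParser

glossary_lines = [
    "@entry šarru [king] N",
    "@form LUGAL $šarru",
    "@sense N ruler",
]
data = GlossaryParser().parse(iter(glossary_lines))
print(data["lemgwpos_cf"])    # {'šarruNking': 'šarru'}
print(data["forms_senses"])   # {'šarru': ['ruler']}
print(data["lemposgw_cfgw"])  # {'šarruNruler': ('šarru', 'king')}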
111 changes: 111 additions & 0 deletions ebl/atf_importer/application/lemma_lookup.py
@@ -0,0 +1,111 @@
from typing import List, Dict, TypedDict, Union


class QueryConfig(TypedDict, total=False):
    lemma_field: str
    lemma_value: Union[str, List[str]]
    # TypedDict fields do not support default values; with total=False the
    # guideword keys are simply optional.
    guideword_field: str
    guideword_value: str


class LemmaLookup:
    def __init__(self, database, config, logger):
        self.database = database
        self.config = config
        self.logger = logger

    def lookup_lemma(self, lemma: str, guideword: str, pos_tag: str) -> List[Dict]:
        if lemma in {"X", "u", "n"}:
            return []
        lemma = lemma.strip()
        guideword = self._clean_guideword(guideword)
        unique_lemmas = self._get_unique_lemmas(lemma, guideword, pos_tag)
        self._log_warning_if_no_lemmas(unique_lemmas, lemma, guideword)
        return [{"_id": lemma_id} for lemma_id in unique_lemmas]

    def _get_unique_lemmas(
        self, lemma: str, guideword: str, pos_tag: str
    ) -> List[str]:
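        # "+"-prefixed lemmas are matched directly against oraccWords; plain
        # lemmas go through the glossary citation form first, with a bare
        # lemma fallback for the configured noun POS tags.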
        if lemma.startswith("+"):
            return self._lookup_prefixed_lemma(lemma[1:], guideword)
        else:
            unique_lemmas = self._lookup_standard_lemma(lemma, guideword, pos_tag)
            if not unique_lemmas and pos_tag in self.config.get("noun_pos_tags", []):
                return self._query_database(
                    {"lemma_field": "oraccWords.lemma", "lemma_value": lemma}
                )
            return unique_lemmas

    def _log_warning_if_no_lemmas(
        self, unique_lemmas: List[str], lemma: str, guideword: str
    ) -> None:
        if not unique_lemmas:
            self.logger.warning(
                "Incompatible lemmatization: No eBL word found for lemma"
                f" '{lemma}' and guide word '{guideword}'"
            )

    def _clean_guideword(self, guideword: str) -> str:
        guideword = guideword.strip().strip("[]")
        return guideword.split("//")[0] if "//" in guideword else guideword

    def _lookup_prefixed_lemma(self, lemma: str, guideword: str) -> List[str]:
        lemma = lemma.replace("ʾ", "'")
        unique_lemmas = self._query_database(
            {
                "lemma_field": "oraccWords.lemma",
                "lemma_value": lemma,
                "guideword_field": "oraccWords.guideWord",
                "guideword_value": guideword,
            }
        ) or self._query_multiple_sources(lemma, guideword)
        return unique_lemmas

    def _lookup_standard_lemma(
        self, lemma: str, guideword: str, pos_tag: str
    ) -> List[str]:
        try:
            citation_form, guideword = self.config["lemposgw_cfgw"][
                lemma + pos_tag + guideword
            ]
            guideword = guideword.split("//")[0] if "//" in guideword else guideword
            unique_lemmas = self._query_database(
                {
                    "lemma_field": "oraccWords.lemma",
                    "lemma_value": citation_form,
                    "guideword_field": "oraccWords.guideWord",
                    "guideword_value": guideword,
                }
            ) or self._query_multiple_sources(citation_form, guideword)
        except KeyError:
            self.logger.warning(
                "Incompatible lemmatization: No citation form"
                f" or guideword found in the glossary for '{lemma}'"
            )
            return []
        return unique_lemmas

    def _query_multiple_sources(self, lemma: str, guideword: str) -> List[str]:
        sources = ["forms.lemma", "lemma"]
        unique_lemmas = []
        for source in sources:
            unique_lemmas += self._query_database(
                {
                    "lemma_field": source,
                    "lemma_value": [lemma],
                    "guideword_field": "guideWord",
                    "guideword_value": guideword,
                }
            )
        return unique_lemmas
    def _query_database(self, config: QueryConfig) -> List[str]:
        query = {config["lemma_field"]: config["lemma_value"]}
        # The guideword keys are optional (total=False), so use .get to avoid
        # a KeyError when a caller supplies only the lemma fields.
        if config.get("guideword_field") and config.get("guideword_value"):
            query[config["guideword_field"]] = config["guideword_value"]
        return [
            entry["_id"]
            for entry in self.database.get_collection("words").find(query, {"_id"})
        ]
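A minimal usage sketch of LemmaLookup (not part of the diff), assuming a local MongoDB with the eBL "words" collection; the config dict combines GlossaryParser output with importer settings, and the noun_pos_tags value is hypothetical.

import logging
from pymongo import MongoClient
from ebl.atf_importer.application.lemma_lookup import LemmaLookup

logger = logging.getLogger("atf_importer")
database = MongoClient("mongodb://localhost:27017")["ebl"]
config = {
    "noun_pos_tags": ["N"],  # hypothetical setting
    "lemposgw_cfgw": {"šarruNking": ("šarru", "king")},
}

lookup = LemmaLookup(database, config, logger)
# Resolves "šarru" + "N" + "king" to the citation form, then queries the
# "words" collection; returns entries like [{"_id": "šarru I"}] on a match.
result = lookup.lookup_lemma("šarru", "[king]", "N")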