Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alignment data export #409

Open
wants to merge 31 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
e7667b8
Add sign export script
fsimonjetz Apr 3, 2023
e9894f8
add pandas to dependencies for export script
fsimonjetz Apr 3, 2023
6ec8941
Remove debug limit
fsimonjetz Apr 4, 2023
6f6ea7a
'Refactored by Sourcery' (#410)
sourcery-ai[bot] Apr 4, 2023
30fb091
Refactoring
fsimonjetz Apr 4, 2023
7064a5b
export template
fsimonjetz Apr 4, 2023
40a1731
add caic reports io
fsimonjetz Apr 5, 2023
7efa57e
fix reference aggregation
fsimonjetz Apr 5, 2023
f1e74fd
Move team names into environment variable
fsimonjetz Apr 5, 2023
23fc057
Fix monthly index
fsimonjetz Apr 5, 2023
9ba5f42
Less restrictive field matching
fsimonjetz Apr 5, 2023
58dde97
better references display
fsimonjetz Apr 13, 2023
77b5d98
Merge branch 'master' into alignment-data-export
fsimonjetz Apr 13, 2023
6be4274
Refactoring; add zip to export
fsimonjetz Apr 13, 2023
76f487e
Ignore output files
fsimonjetz Apr 13, 2023
c7ef43d
Use tar.gz instead of zip
fsimonjetz Apr 13, 2023
9ff50e7
Merge branch 'extend-revision-record' into alignment-data-export
fsimonjetz Apr 14, 2023
d635c93
Remove unrelated changes
fsimonjetz Apr 14, 2023
2a8037a
remove vocab generation
fsimonjetz Apr 17, 2023
b55a3cb
Remove old export script
fsimonjetz Apr 21, 2023
9d1eb91
include more chapter infos
fsimonjetz Apr 25, 2023
674e83c
Remove filtering "empty" texts
fsimonjetz Apr 25, 2023
b12ba58
use stage abbreviation instead of long name;
fsimonjetz Apr 25, 2023
bd0a703
Fix dtypes; add url column
fsimonjetz May 10, 2023
e9ad32f
include colophon and unplaced line counts
fsimonjetz Jun 1, 2023
d7c4eae
Always use the production db for export
fsimonjetz Jun 1, 2023
a9456c6
add colophon option
fsimonjetz Jun 2, 2023
1caa714
Merge branch 'master' into alignment-data-export
fsimonjetz Jun 2, 2023
82b55ab
add sign export task
fsimonjetz Jun 2, 2023
41b6c36
reformatting
fsimonjetz Jun 2, 2023
044cf3b
Merge branch 'master' into alignment-data-export
fsimonjetz Jun 2, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,8 @@ Taskfile.yml
.task

# Core dumps
core.*
core.*

# IO outputs
ebl/io/alignment/*.zip
ebl/io/alignment/*.tar*
3 changes: 3 additions & 0 deletions Taskfile.dist.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,6 @@ tasks:
cmds:
- poetry run python -m ebl.io.fragments.importer import {{.CLI_ARGS}}

export-signs:
cmds:
- poetry run python -m ebl.io.alignment.sign_export {{.CLI_ARGS}}
5 changes: 2 additions & 3 deletions ebl/fragmentarium/application/fragment_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
from ebl.fragmentarium.domain.transliteration_update import TransliterationUpdate
from ebl.lemmatization.domain.lemmatization import Lemmatization
from ebl.users.domain.user import User

COLLECTION = "fragments"
from ebl.transliteration.infrastructure.collections import FRAGMENTS_COLLECTION


class FragmentUpdater:
Expand Down Expand Up @@ -132,7 +131,7 @@ def _create_changelog(
schema = FragmentSchema()
fragment_id = str(fragment.number)
self._changelog.create(
COLLECTION,
FRAGMENTS_COLLECTION,
user.profile,
{"_id": fragment_id, **schema.dump(fragment)},
{"_id": fragment_id, **schema.dump(updated_fragment)},
Expand Down
Empty file added ebl/io/__init__.py
Empty file.
Empty file added ebl/io/alignment/__init__.py
Empty file.
159 changes: 159 additions & 0 deletions ebl/io/alignment/sign_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
from pymongo import MongoClient
from ebl.mongo_collection import MongoCollection
from ebl.transliteration.infrastructure.collections import (
FRAGMENTS_COLLECTION,
CHAPTERS_COLLECTION,
)
from ebl.corpus.domain.manuscript import (
ManuscriptType,
Provenance,
)
from ebl.common.domain.period import Period
from ebl.transliteration.domain.stage import Stage, ABBREVIATIONS as STAGE_ABBREVIATIONS
import os
import tarfile
import pandas as pd
import datetime
from functools import reduce
from urllib.parse import quote as encode_url
import argparse


# disable false positive SettingsWithCopyWarning
pd.options.mode.chained_assignment = None

# Connect to the MongoDB instance named by the MONGODB_URI environment variable
# (a missing variable raises KeyError as soon as this module is loaded) and
# wrap the fragment and chapter collections of the "ebl" database.
client = MongoClient(os.environ["MONGODB_URI"])
database = client.get_database("ebl")
fragments = MongoCollection(database, FRAGMENTS_COLLECTION)
chapters = MongoCollection(database, CHAPTERS_COLLECTION)

# Export files are written to a folder under /tmp whose name carries today's
# date; the folder is created when the module is loaded if it does not exist.
output_folder_name = f"alignment_export_{datetime.date.today()}"
tmp_path = os.path.join("/tmp", output_folder_name)
os.makedirs(tmp_path, exist_ok=True)


def enum_mapping(enum):
    """Return a dict mapping each item's ``long_name`` to its ``abbreviation``.

    ``enum`` is iterated once; every item must expose ``long_name`` and
    ``abbreviation`` attributes. If two items share a long name, the later
    one wins.
    """
    abbreviations_by_long_name = {}
    for member in enum:
        abbreviations_by_long_name[member.long_name] = member.abbreviation
    return abbreviations_by_long_name


if __name__ == "__main__":
    # Script entry point: export the sign strings of fragments and of chapter
    # manuscripts as two TSV files and bundle them into a .tar.gz archive
    # stored next to this module.
    parser = argparse.ArgumentParser(description="Export sign data as TSV files")
    parser.add_argument(
        "--include-colophons",
        help="Include colophon and unplaced lines from manuscripts",
        action="store_true",
    )

    args = parser.parse_args()

    print(f"Starting export (temporary directory {tmp_path}/ )...")
    print("Exporting fragments...")

    # Only fragments with an existing, non-empty "signs" field are exported.
    fragment_signs = fragments.find_many(
        {"signs": {"$exists": True, "$ne": ""}}, projection={"signs": True}
    )
    df_fragments = pd.DataFrame.from_records(fragment_signs)

    df_fragments.to_csv(
        os.path.join(tmp_path, "fragment_signs.tsv"), index=False, sep="\t"
    )

    print("Exporting chapters...")

    siglum_columns = ["provenance", "period", "type", "disambiguator"]
    siglum_enums = [Provenance, Period, ManuscriptType]

    # zip() stops at the shorter list, so "disambiguator" gets no enum and is
    # used verbatim when the siglum is assembled.
    abbreviation_mappings = dict(zip(siglum_columns, siglum_enums))

    # Pair each manuscript with the entry of "signs" at the same position and
    # emit one record per (manuscript, signs) pair.
    chapter_signs = chapters.aggregate(
        [
            {
                "$project": {
                    "manuscripts": {"$zip": {"inputs": ["$manuscripts", "$signs"]}},
                    "textId": 1,
                    "stage": 1,
                    "name": 1,
                }
            },
            {"$unwind": "$manuscripts"},
            {
                "$addFields": {
                    "manuscript": {"$first": "$manuscripts"},
                    "signs": {"$last": "$manuscripts"},
                }
            },
            {
                "$project": {
                    "_id": 0,
                    "category": "$textId.category",
                    "index": "$textId.index",
                    "genre": "$textId.genre",
                    "stage": 1,
                    "name": 1,
                    "provenance": "$manuscript.provenance",
                    "period": "$manuscript.period",
                    "type": "$manuscript.type",
                    "disambiguator": "$manuscript.siglumDisambiguator",
                    "signs": 1,
                    "colophon_lines": "$manuscript.colophon.numberOfLines",
                    "unplaced_lines": "$manuscript.unplacedLines.numberOfLines",
                }
            },
        ]
    )
    df_chapters = pd.DataFrame.from_records(chapter_signs)
    df_chapters["signs"] = df_chapters.signs.fillna("")

    if not args.include_colophons:
        print("Dropping colophon and unplaced lines...")
        # Number of trailing sign lines to cut per manuscript. reindex() yields
        # NaN for a column that is absent and fillna(0) covers manuscripts
        # without a count, so the slice bound below is always an integer
        # (a float or NaN bound would raise TypeError).
        trailing_lines = (
            df_chapters.reindex(columns=["colophon_lines", "unplaced_lines"])
            .fillna(0)
            .astype(int)
            .sum(axis=1)
        )
        # "-count or None" keeps every line when nothing has to be dropped;
        # a plain [:-0] would return an empty list instead.
        df_chapters["signs"] = [
            "\n".join(signs.split("\n")[: -count or None])
            for signs, count in zip(df_chapters["signs"], trailing_lines)
        ]

    # map long names to abbreviations
    for column, enum in abbreviation_mappings.items():
        df_chapters[column] = df_chapters[column].map(enum_mapping(enum))

    stages = {stage.value: STAGE_ABBREVIATIONS[stage] for stage in Stage}
    df_chapters["stage"] = df_chapters["stage"].map(stages)

    # create siglum
    df_chapters["siglum"] = df_chapters[siglum_columns].agg("".join, axis=1)

    df_chapters["category"] = df_chapters["category"].astype(int)
    df_chapters["index"] = df_chapters["index"].astype(int)

    url_columns = ["genre", "category", "index", "stage", "name"]

    # Percent-encode every URL component, then join them with "/".
    df_chapters["url"] = reduce(
        (lambda x, y: x + "/" + y),
        [
            df_chapters[col].fillna("").astype(str).map(encode_url)
            for col in url_columns
        ],
    )

    df_chapters["id"] = "/" + df_chapters["url"] + "#" + df_chapters["siglum"]
    df_chapters = df_chapters[["id", "signs"]]

    # exclude test chapter
    df_chapters = df_chapters[~df_chapters["id"].str.startswith("/L/99/99/")]

    df_chapters.to_csv(
        os.path.join(tmp_path, "chapter_signs.tsv"), index=False, sep="\t"
    )

    # The archive is written into the directory that contains this module.
    tar_path = os.path.join(
        os.path.dirname(__file__),
        f"{output_folder_name}{'' if args.include_colophons else '_no_colophons'}.tar.gz",
    )

    print(f"Storing archive in '{tar_path}'...")

    with tarfile.open(tar_path, "w:gz") as tar:
        tar.add(tmp_path, arcname=os.path.basename(tmp_path))

    print("Done.")
63 changes: 32 additions & 31 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ black = "*"
pymongo-inmemory = "*"
pytest-xdist = "*"
autopep8 = "^1.6.0"
pandas = "*"

[build-system]
requires = ["poetry-core>=1.0.0"]
Expand Down