ElectronicBabylonianLiterature · fsimonjetz · Apr 3, 2023 · Apr 3, 2023 · Apr 4, 2023 · Apr 4, 2023
diff --git a/.gitignore b/.gitignore
@@ -132,4 +132,8 @@ Taskfile.yml
 .task
 
 # Core dumps
-core.*
+core.*
+
+# IO outputs
+ebl/io/alignment/*.zip
+ebl/io/alignment/*.tar*
diff --git a/Taskfile.dist.yml b/Taskfile.dist.yml
@@ -50,3 +50,6 @@ tasks:
     cmds:
       - poetry run python -m ebl.io.fragments.importer import {{.CLI_ARGS}}
 
+  export-signs:
+    cmds:
+      - poetry run python -m ebl.io.alignment.sign_export {{.CLI_ARGS}}
diff --git a/ebl/fragmentarium/application/fragment_updater.py b/ebl/fragmentarium/application/fragment_updater.py
@@ -12,8 +12,7 @@
 from ebl.fragmentarium.domain.transliteration_update import TransliterationUpdate
 from ebl.lemmatization.domain.lemmatization import Lemmatization
 from ebl.users.domain.user import User
-
-COLLECTION = "fragments"
+from ebl.transliteration.infrastructure.collections import FRAGMENTS_COLLECTION
 
 
 class FragmentUpdater:
@@ -132,7 +131,7 @@ def _create_changelog(
         schema = FragmentSchema()
         fragment_id = str(fragment.number)
         self._changelog.create(
-            COLLECTION,
+            FRAGMENTS_COLLECTION,
             user.profile,
             {"_id": fragment_id, **schema.dump(fragment)},
             {"_id": fragment_id, **schema.dump(updated_fragment)},

diff --git a/ebl/io/__init__.py b/ebl/io/__init__.py
diff --git a/ebl/io/alignment/__init__.py b/ebl/io/alignment/__init__.py
diff --git a/ebl/io/alignment/sign_export.py b/ebl/io/alignment/sign_export.py
@@ -0,0 +1,159 @@
+from pymongo import MongoClient
+from ebl.mongo_collection import MongoCollection
+from ebl.transliteration.infrastructure.collections import (
+    FRAGMENTS_COLLECTION,
+    CHAPTERS_COLLECTION,
+)
+from ebl.corpus.domain.manuscript import (
+    ManuscriptType,
+    Provenance,
+)
+from ebl.common.domain.period import Period
+from ebl.transliteration.domain.stage import Stage, ABBREVIATIONS as STAGE_ABBREVIATIONS
+import os
+import tarfile
+import pandas as pd
+import datetime
+from functools import reduce
+from urllib.parse import quote as encode_url
+import argparse
+
+
+# Silence pandas' chained-assignment warning; it is a false positive here.
+pd.set_option("mode.chained_assignment", None)
+# Database handles; os.environ[...] raises KeyError if MONGODB_URI is unset.
+client = MongoClient(os.environ["MONGODB_URI"])
+database = client.get_database("ebl")
+fragments = MongoCollection(database, FRAGMENTS_COLLECTION)
+chapters = MongoCollection(database, CHAPTERS_COLLECTION)
+# Dated staging directory for the export files (reused if it already exists).
+output_folder_name = f"alignment_export_{datetime.date.today()}"
+tmp_path = os.path.join("/tmp", output_folder_name)
+os.makedirs(tmp_path, exist_ok=True)
+
+
+def enum_mapping(enum):  # long name -> abbreviation for every member of `enum`
+    return {member.long_name: member.abbreviation for member in enum}
+
+
+if __name__ == "__main__":
+    # Export fragment and chapter signs as TSV into tmp_path, then archive it.
+    parser = argparse.ArgumentParser(description="Export sign data as TSV files")
+    parser.add_argument(
+        "--include-colophons",
+        help="Include colophon and unplaced lines from manuscripts",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    print(f"Starting export (temporary directory {tmp_path}/ )...")
+    print("Exporting fragments...")
+
+    # Only fragments with a non-empty "signs" field are exported.
+    fragment_signs = fragments.find_many(
+        {"signs": {"$exists": True, "$ne": ""}}, projection={"signs": True}
+    )
+    df_fragments = pd.DataFrame.from_records(fragment_signs)
+    df_fragments.to_csv(
+        os.path.join(tmp_path, "fragment_signs.tsv"), index=False, sep="\t"
+    )
+
+    print("Exporting chapters...")
+
+    siglum_columns = ["provenance", "period", "type", "disambiguator"]
+    siglum_enums = [Provenance, Period, ManuscriptType]
+    # zip() stops at the shorter list: "disambiguator" gets no enum mapping.
+    abbreviation_mappings = dict(zip(siglum_columns, siglum_enums))
+
+    # Pair manuscripts with their signs by position; one record per manuscript.
+    chapter_signs = chapters.aggregate(
+        [
+            {
+                "$project": {
+                    "manuscripts": {"$zip": {"inputs": ["$manuscripts", "$signs"]}},
+                    "textId": 1,
+                    "stage": 1,
+                    "name": 1,
+                }
+            },
+            {"$unwind": "$manuscripts"},
+            {
+                "$addFields": {
+                    "manuscript": {"$first": "$manuscripts"},
+                    "signs": {"$last": "$manuscripts"},
+                }
+            },
+            {
+                "$project": {
+                    "_id": 0,
+                    "category": "$textId.category",
+                    "index": "$textId.index",
+                    "genre": "$textId.genre",
+                    "stage": 1,
+                    "name": 1,
+                    "provenance": "$manuscript.provenance",
+                    "period": "$manuscript.period",
+                    "type": "$manuscript.type",
+                    "disambiguator": "$manuscript.siglumDisambiguator",
+                    "signs": 1,
+                    "colophon_lines": "$manuscript.colophon.numberOfLines",
+                    "unplaced_lines": "$manuscript.unplacedLines.numberOfLines",
+                }
+            },
+        ]
+    )
+    df_chapters = pd.DataFrame.from_records(chapter_signs)
+    df_chapters["signs"] = df_chapters.signs.fillna("")
+
+    if not args.include_colophons:
+        print("Dropping colophon and unplaced lines...")
+        # A record lacking a count yields NaN (and a float column), which cannot
+        # be used as a slice bound; count missing values or columns as 0 lines.
+        counts = df_chapters.reindex(columns=["colophon_lines", "unplaced_lines"])
+        trailing = counts.fillna(0).sum(axis=1).astype(int)
+        # "-n or None" keeps all lines when n == 0 (a plain [:-0] would be empty).
+        df_chapters["signs"] = [
+            "\n".join(lines.split("\n")[: -n or None])
+            for lines, n in zip(df_chapters["signs"], trailing)
+        ]
+
+    # map long names to abbreviations
+    for column, enum in abbreviation_mappings.items():
+        df_chapters[column] = df_chapters[column].map(enum_mapping(enum))
+
+    stages = {stage.value: STAGE_ABBREVIATIONS[stage] for stage in Stage}
+    df_chapters["stage"] = df_chapters["stage"].map(stages)
+
+    # create siglum
+    df_chapters["siglum"] = df_chapters[siglum_columns].agg("".join, axis=1)
+
+    df_chapters["category"] = df_chapters["category"].astype(int)
+    df_chapters["index"] = df_chapters["index"].astype(int)
+
+    url_columns = ["genre", "category", "index", "stage", "name"]
+    df_chapters["url"] = reduce(
+        (lambda x, y: x + "/" + y),
+        [
+            df_chapters[col].fillna("").astype(str).map(encode_url)
+            for col in url_columns
+        ],
+    )
+
+    df_chapters["id"] = "/" + df_chapters["url"] + "#" + df_chapters["siglum"]
+    df_chapters = df_chapters[["id", "signs"]]
+    # exclude test chapter
+    df_chapters = df_chapters[~df_chapters["id"].str.startswith("/L/99/99/")]
+
+    df_chapters.to_csv(
+        os.path.join(tmp_path, "chapter_signs.tsv"), index=False, sep="\t"
+    )
+
+    tar_path = os.path.join(
+        os.path.dirname(__file__),
+        f"{output_folder_name}{'' if args.include_colophons else '_no_colophons'}.tar.gz",
+    )
+    print(f"Storing archive in '{tar_path}'...")
+    with tarfile.open(tar_path, "w:gz") as tar:
+        tar.add(tmp_path, arcname=os.path.basename(tmp_path))
+
+    print("Done.")
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -48,6 +48,7 @@ black = "*"
 pymongo-inmemory = "*"
 pytest-xdist = "*"
 autopep8 = "^1.6.0"
+pandas = "*"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]