N-Gram Matcher #455

Status: Draft. Wants to merge 36 commits into base: master.
Changes shown below are from 20 of the 36 commits.

Commits (36)
6a40a9c  add ngram matcher route (fsimonjetz, Sep 7, 2023)
2e0125d  allow n-grams w. n=1 (fsimonjetz, Sep 8, 2023)
eb005f8  add fragment ngram aggregation (fsimonjetz, Sep 8, 2023)
1d4b686  add chapter ngram aggregation (fsimonjetz, Sep 8, 2023)
24e1617  add general ngram aggregation (fsimonjetz, Sep 11, 2023)
4f81188  add fragment ngram repo (fsimonjetz, Sep 11, 2023)
c0f4cb7  add ngram collections (fsimonjetz, Sep 11, 2023)
2281f6b  add ChapterNGramRepository (fsimonjetz, Sep 11, 2023)
d19f883  add ngram extraction hook function (fsimonjetz, Sep 11, 2023)
fe6dc7b  add FragmentNGramRepository to TransliterationResource (fsimonjetz, Sep 11, 2023)
44afafc  add fragment_ngram_repository to app (fsimonjetz, Sep 11, 2023)
010b2ac  'Refactored by Sourcery' (#456) (sourcery-ai[bot], Sep 12, 2023)
abd4864  fix aggregation, include chapter id (fsimonjetz, Sep 12, 2023)
b82a072  fix line replace bug (fsimonjetz, Sep 12, 2023)
3497f08  add create_chapter_ngram_cache (fsimonjetz, Sep 12, 2023)
b3b830f  refactoring (fsimonjetz, Sep 12, 2023)
7177c6c  add ChapterNGramRepository to app and resources (fsimonjetz, Sep 12, 2023)
b97aa65  add ChapterNGramRepository to LinesResource (fsimonjetz, Sep 12, 2023)
ccb2ce1  remove comment (fsimonjetz, Sep 12, 2023)
ad18ece  add n-gram repositories to test context (fsimonjetz, Sep 12, 2023)
8a1e84c  Refactoring (fsimonjetz, Sep 12, 2023)
0003a59  Add get_ngrams method (fsimonjetz, Sep 13, 2023)
34cf3b5  add Fragment Ngram Repo test (fsimonjetz, Sep 13, 2023)
582a099  update type cast (fsimonjetz, Sep 14, 2023)
0ea0610  add get_ngrams (fsimonjetz, Sep 19, 2023)
b9ebc26  add type hints (fsimonjetz, Sep 19, 2023)
c26b610  add chapter ngram tests, refactor (fsimonjetz, Sep 19, 2023)
4275e08  fix fragment ngram route, add test (fsimonjetz, Sep 19, 2023)
080059e  use global DEFAULT_N (fsimonjetz, Sep 21, 2023)
1fe024c  create ngrams on chapter creation (fsimonjetz, Sep 21, 2023)
7432529  rename param (fsimonjetz, Sep 21, 2023)
27cf1c9  add compute_overlaps method (fsimonjetz, Sep 21, 2023)
949c1b3  refactor (fsimonjetz, Sep 21, 2023)
4b9f2aa  refactor (fsimonjetz, Sep 21, 2023)
06576b1  add FragmentNGramRepository (fsimonjetz, Sep 21, 2023)
63039eb  extend ngram tests (fsimonjetz, Sep 21, 2023)

ebl/app.py (6 additions, 0 deletions)
@@ -9,6 +9,7 @@
 from sentry_sdk import configure_scope
 from sentry_sdk.integrations.falcon import FalconIntegration
 import althaia
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
 import ebl.error_handler
 from ebl.bibliography.infrastructure.bibliography import MongoBibliographyRepository
 from ebl.bibliography.web.bootstrap import create_bibliography_routes
@@ -25,6 +26,9 @@
 from ebl.ebl_ai_client import EblAiClient
 from ebl.files.infrastructure.grid_fs_file_repository import GridFsFileRepository
 from ebl.files.web.bootstrap import create_files_route
+from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
+    FragmentNGramRepository,
+)
 from ebl.markup.web.bootstrap import create_markup_route
 from ebl.fragmentarium.infrastructure.cropped_sign_images_repository import (
     MongoCroppedSignImagesRepository,
@@ -85,6 +89,8 @@ def create_context():
         photo_repository=GridFsFileRepository(database, "photos"),
         folio_repository=GridFsFileRepository(database, "folios"),
         fragment_repository=MongoFragmentRepository(database),
+        fragment_ngram_repository=FragmentNGramRepository(database),
+        chapter_ngram_repository=ChapterNGramRepository(database),
         changelog=Changelog(database),
         bibliography_repository=MongoBibliographyRepository(database),
         text_repository=MongoTextRepository(database),
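Note for orientation: the two new repositories take the same pymongo database handle as the existing ones, so the wiring change is purely additive. A minimal construction sketch (the connection string is an assumption for illustration, not part of this diff):

from pymongo import MongoClient

from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
    FragmentNGramRepository,
)

# Hypothetical standalone connection; inside create_context() the shared
# `database` object is reused for both repositories.
database = MongoClient("mongodb://localhost:27017")["ebl"]

fragment_ngram_repository = FragmentNGramRepository(database)
chapter_ngram_repository = ChapterNGramRepository(database)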

ebl/common/query/util.py (51 additions, 3 deletions)
@@ -1,4 +1,4 @@
-from typing import Union, Dict
+from typing import Optional, Sequence, Union, Dict


 def flatten_field(input_: Union[str, Dict], depth=1) -> Dict:
@@ -16,8 +16,8 @@ def drop_duplicates(input_: Union[str, Dict]) -> Dict:


 def ngrams(input_: Union[str, Dict], n) -> Dict:
-    if n <= 1:
-        raise ValueError("ngram size must be 2 or more")
+    if n <= 0:
+        raise ValueError("ngram size must be 1 or more")
     return {
         "$zip": {
             "inputs": [
@@ -39,3 +39,51 @@

 def filter_array(input_, as_, cond) -> Dict:
     return {"$filter": {"input": input_, "as": as_, "cond": cond}}
+
+
+def aggregate_all_ngrams(
+    input_: Union[str, Dict],
+    N: Sequence[int],
+    output_: str = "ngrams",
+    signs_to_exclude: Optional[Sequence[str]] = None,
+    ngram_field="ngram",
+):
+    if signs_to_exclude is None:
+        signs_to_exclude = ["X", ""]
+
+    exclude_empty = {
+        "$eq": [
+            {
+                "$size": {
+                    "$setIntersection": [
+                        f"$${ngram_field}",
+                        signs_to_exclude,
+                    ]
+                }
+            },
+            0,
+        ]
+    }
+    return [
+        {
+            "$addFields": {
+                output_: drop_duplicates(
+                    filter_array(
+                        {"$concatArrays": [ngrams(input_, n) for n in N if n > 0]},
+                        ngram_field,
+                        exclude_empty,
+                    )
+                )
+            }
+        },
+    ]
+
+
+def replace_all(old: str, new: str):
+    return {
+        "$replaceAll": {
+            "input": "$signs",
+            "find": old,
+            "replacement": new,
+        }
+    }
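A quick intuition check for the helpers above, since the MongoDB expressions are hard to read at a glance: ngrams(input_, n) builds an aggregation expression that, judging by the $zip stage, pairs the sign list with shifted copies of itself, and aggregate_all_ngrams concatenates those windows for every requested n, deduplicates them, and drops any window that touches an excluded sign ($setIntersection size 0). A plain-Python sketch of the same idea, illustrative only and not project code:

from typing import List, Sequence, Tuple


def py_ngrams(signs: Sequence[str], n: int) -> List[Tuple[str, ...]]:
    # Zip the list against itself shifted by 1..n-1 positions.
    return list(zip(*(signs[i:] for i in range(n))))


def py_aggregate_all_ngrams(
    signs: Sequence[str],
    N: Sequence[int],
    signs_to_exclude: Sequence[str] = ("X", ""),
) -> set:
    # Collect every n-gram for every requested size, deduplicate,
    # and skip windows containing an excluded sign (mirrors exclude_empty).
    return {
        gram
        for n in N
        if n > 0
        for gram in py_ngrams(signs, n)
        if not set(gram) & set(signs_to_exclude)
    }


print(py_aggregate_all_ngrams(["ABZ1", "ABZ2", "X", "ABZ3"], N=[1, 2]))
# {('ABZ1',), ('ABZ2',), ('ABZ3',), ('ABZ1', 'ABZ2')} (set order may vary)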

ebl/context.py (6 additions, 0 deletions)
@@ -6,6 +6,7 @@
 from ebl.bibliography.application.bibliography_repository import BibliographyRepository
 from ebl.cache.application.custom_cache import ChapterCache
 from ebl.changelog import Changelog
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
 from ebl.corpus.infrastructure.mongo_text_repository import MongoTextRepository
 from ebl.dictionary.application.word_repository import WordRepository
 from ebl.ebl_ai_client import EblAiClient
@@ -19,6 +20,9 @@
 from ebl.fragmentarium.infrastructure.cropped_sign_images_repository import (
     MongoCroppedSignImagesRepository,
 )
+from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
+    FragmentNGramRepository,
+)
 from ebl.lemmatization.application.suggestion_finder import LemmaRepository
 from ebl.transliteration.application.parallel_line_injector import ParallelLineInjector
 from ebl.transliteration.application.sign_repository import SignRepository
@@ -38,6 +42,8 @@ class Context:
     photo_repository: FileRepository
     folio_repository: FileRepository
     fragment_repository: FragmentRepository
+    fragment_ngram_repository: FragmentNGramRepository
+    chapter_ngram_repository: ChapterNGramRepository
     changelog: Changelog
     bibliography_repository: BibliographyRepository
     text_repository: MongoTextRepository
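The two new fields slot into Context alongside the existing repositories, so downstream code can reach them the same way. A short sketch assuming a populated context; the n-gram sizes are illustrative values, not defaults defined in this diff:

from ebl.context import Context


def rebuild_fragment_ngrams(context: Context, number: dict) -> None:
    # Delegates to the repository carried on the context, like the other repositories.
    context.fragment_ngram_repository.update_ngrams(number, N=[1, 2, 3])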

ebl/corpus/infrastructure/corpus_ngram_repository.py (new file, 72 additions)
@@ -0,0 +1,72 @@
+from ebl.corpus.domain.chapter import ChapterId
+from ebl.corpus.infrastructure.queries import chapter_id_query
+from ebl.errors import NotFoundError
+from ebl.mongo_collection import MongoCollection
+from ebl.transliteration.infrastructure.collections import (
+    CHAPTER_NGRAM_COLLECTION,
+    CHAPTERS_COLLECTION,
+)
+from typing import Optional, Sequence
+
+from ebl.common.query.util import aggregate_all_ngrams, replace_all
+
+NGRAM_FIELD = "ngram"
+
+
+class ChapterNGramRepository:
+    def __init__(self, database):
+        self._chapters = MongoCollection(database, CHAPTERS_COLLECTION)
+        self._ngrams = MongoCollection(database, CHAPTER_NGRAM_COLLECTION)
+
+    def aggregate_chapter_ngrams(
+        self,
+        chapter_id: ChapterId,
+        N: Sequence[int],
+        signs_to_exclude: Optional[Sequence[str]] = None,
+    ):
+        return [
+            {"$match": chapter_id_query(chapter_id)},
+            {"$project": {"signs": 1, "textId": 1, "stage": 1, "name": 1}},
+            {"$unwind": "$signs"},
+            {
+                "$addFields": {
+                    NGRAM_FIELD: {
+                        "$split": [
+                            replace_all("\n", " # "),
+                            " ",
+                        ]
+                    }
+                }
+            },
+            *aggregate_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD, signs_to_exclude),
+            {"$unwind": f"${NGRAM_FIELD}"},
+            {
+                "$group": {
+                    "_id": None,
+                    NGRAM_FIELD: {"$addToSet": f"${NGRAM_FIELD}"},
+                    "textId": {"$first": "$textId"},
+                    "name": {"$first": "$name"},
+                    "stage": {"$first": "$stage"},
+                }
+            },
+            {"$project": {"_id": False}},
+        ]
+
+    def update_ngrams(
+        self,
+        chapter_id: ChapterId,
+        N: Sequence[int],
+        signs_to_exclude: Optional[Sequence[str]] = None,
+    ) -> None:
+        aggregation = self.aggregate_chapter_ngrams(chapter_id, N, signs_to_exclude)
+        if data := next(
+            self._chapters.aggregate(aggregation, allowDiskUse=True),
+            None,
+        ):
+            try:
+                self._ngrams.update_one(
+                    chapter_id_query(chapter_id),
+                    {"$set": {NGRAM_FIELD: data[NGRAM_FIELD]}},
+                )
+            except NotFoundError:
+                self._ngrams.insert_one(data)
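Reading the pipeline: the chapter's sign lines are unwound, line breaks become a "#" separator sign, the text is split into individual signs, n-grams of every requested size are built, and the $group stage collapses them into one deduplicated set per chapter keyed by textId, stage and name. update_ngrams is effectively an upsert: update_one when a document for the chapter already exists, insert_one otherwise. A usage sketch, assuming a pymongo database handle and an existing ChapterId; the sizes are illustrative:

from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository


def refresh_chapter_ngrams(database, chapter_id) -> None:
    # `database` is the handle wired in ebl/app.py; `chapter_id` is a ChapterId
    # as used throughout the corpus module.
    repo = ChapterNGramRepository(database)
    repo.update_ngrams(chapter_id, N=[1, 2, 3])
    # The chapter n-gram collection then holds one document per chapter with
    # textId, stage, name and the deduplicated "ngram" set.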

ebl/corpus/web/bootstrap.py (9 additions, 3 deletions)
@@ -48,9 +48,15 @@ def create_corpus_routes(api: falcon.App, context: Context):
     chapters_by_lemma = ChaptersByLemmaResource(corpus)
     alignment = AlignmentResource(corpus, context.custom_cache)
     manuscript_lemmatization = LemmatizationResource(corpus, context.custom_cache)
-    manuscript = ManuscriptsResource(corpus, context.custom_cache)
-    lines = LinesResource(corpus, context.custom_cache)
-    lines_import = LinesImportResource(corpus, context.custom_cache)
+    manuscript = ManuscriptsResource(
+        corpus, context.custom_cache, context.chapter_ngram_repository
+    )
+    lines = LinesResource(
+        corpus, context.custom_cache, context.chapter_ngram_repository
+    )
+    lines_import = LinesImportResource(
+        corpus, context.custom_cache, context.chapter_ngram_repository
+    )
     colophons = ColophonsResource(corpus)
     unplaced_lines = UnplacedLinesResource(corpus)
     extant_lines = ExtantLinesResource(corpus)
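Only the constructor calls change here; route registration is outside this hunk. For completeness, a minimal sketch of how such a resource is mounted on a Falcon app, with stand-in dependencies and a placeholder URL template that is not the project's actual scheme:

import falcon
from unittest.mock import MagicMock

from ebl.corpus.web.lines import LinesResource

# Stand-ins for the objects create_corpus_routes builds from the context (sketch only).
corpus = MagicMock()
chapter_cache = MagicMock()
chapter_ngram_repository = MagicMock()

api = falcon.App()
lines = LinesResource(corpus, chapter_cache, chapter_ngram_repository)
api.add_route(
    "/texts/{genre}/{category}/{index}/chapters/{stage}/{name}/lines",  # placeholder template
    lines,
)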

ebl/corpus/web/lines.py (18 additions, 2 deletions)
@@ -7,11 +7,13 @@
 from ebl.corpus.application.corpus import Corpus
 from ebl.corpus.domain.line import Line
 from ebl.corpus.domain.lines_update import LinesUpdate
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
 from ebl.corpus.web.chapter_schemas import ApiChapterSchema, ApiLineSchema
 from ebl.corpus.web.display_schemas import LineDetailsDisplay, LineDetailsDisplaySchema
 from ebl.corpus.web.text_utils import create_chapter_id
 from ebl.errors import NotFoundError
 from ebl.marshmallowschema import validate
+from ebl.users.web.update_cache import create_chapter_ngram_cache
 from ebl.users.web.require_scope import require_scope

@@ -39,12 +41,19 @@


 class LinesResource:
-    def __init__(self, corpus: Corpus, cache: ChapterCache):
+    def __init__(
+        self,
+        corpus: Corpus,
+        cache: ChapterCache,
+        ngram_repository: ChapterNGramRepository,
+    ):
         self._corpus = corpus
         self._cache = cache
+        self.ngram_repository = ngram_repository

     @falcon.before(require_scope, "write:texts")
     @validate(LinesUpdateSchema())
+    @falcon.after(create_chapter_ngram_cache)
     def on_post(
         self,
         req: falcon.Request,
@@ -64,12 +73,19 @@


 class LinesImportResource:
-    def __init__(self, corpus: Corpus, cache: ChapterCache):
+    def __init__(
+        self,
+        corpus: Corpus,
+        cache: ChapterCache,
+        ngram_repository: ChapterNGramRepository,
+    ):
         self._corpus = corpus
         self._cache = cache
+        self.ngram_repository = ngram_repository

     @falcon.before(require_scope, "write:texts")
     @validate(LinesImportSchema())
+    @falcon.after(create_chapter_ngram_cache)
     def on_post(
         self,
         req: falcon.Request,
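The key addition here is the @falcon.after(create_chapter_ngram_cache) hook, which runs after the responder so the chapter n-grams can be refreshed once an edit has gone through. The hook's body is not part of this diff; the sketch below only illustrates the shape Falcon expects from an after-hook, with entirely hypothetical internals:

import falcon


def refresh_chapter_ngrams_hook(req: falcon.Request, resp: falcon.Response, resource) -> None:
    # Hypothetical stand-in for create_chapter_ngram_cache: Falcon invokes after-hooks
    # with (req, resp, resource), so the resource's ngram_repository is reachable here
    # once the edit has been applied.
    if hasattr(resource, "ngram_repository"):
        pass  # e.g. resource.ngram_repository.update_ngrams(<chapter id from req>, N=[1, 2, 3])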

ebl/corpus/web/manuscripts.py (10 additions, 1 deletion)
@@ -3,13 +3,15 @@

 from ebl.cache.application.custom_cache import ChapterCache
 from ebl.corpus.application.corpus import Corpus
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
 from ebl.corpus.web.chapter_schemas import (
     ApiChapterSchema,
     ApiManuscriptSchema,
     MuseumNumberString,
 )
 from ebl.corpus.web.text_utils import create_chapter_id
 from ebl.marshmallowschema import validate
+from ebl.users.web.update_cache import create_chapter_ngram_cache
 from ebl.users.web.require_scope import require_scope

@@ -21,9 +23,15 @@ class ManuscriptDtoSchema(Schema):


 class ManuscriptsResource:
-    def __init__(self, corpus: Corpus, cache: ChapterCache):
+    def __init__(
+        self,
+        corpus: Corpus,
+        cache: ChapterCache,
+        ngram_repository: ChapterNGramRepository,
+    ):
         self._corpus = corpus
         self._cache = cache
+        self.ngram_repository = ngram_repository

     def on_get(
         self,
@@ -41,6 +49,7 @@

     @falcon.before(require_scope, "write:texts")
     @validate(ManuscriptDtoSchema())
+    @falcon.after(create_chapter_ngram_cache)
     def on_post(
         self,
         req: falcon.Request,
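All three corpus resources now share the same three-argument constructor. A minimal check of that contract, as a pytest-style sketch with mocks; this is not taken from the project's test suite:

from unittest.mock import MagicMock

from ebl.corpus.web.manuscripts import ManuscriptsResource


def test_manuscripts_resource_keeps_ngram_repository():
    ngram_repository = MagicMock()
    resource = ManuscriptsResource(MagicMock(), MagicMock(), ngram_repository)
    assert resource.ngram_repository is ngram_repository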

ebl/fragmentarium/infrastructure/fragment_ngram_repository.py (new file, 58 additions)
@@ -0,0 +1,58 @@
+from ebl.errors import NotFoundError
+from ebl.mongo_collection import MongoCollection
+from ebl.transliteration.infrastructure.collections import (
+    FRAGMENT_NGRAM_COLLECTION,
+    FRAGMENTS_COLLECTION,
+)
+from typing import Optional, Sequence
+
+from ebl.common.query.util import aggregate_all_ngrams, replace_all
+
+NGRAM_FIELD = "ngram"
+
+
+class FragmentNGramRepository:
+    def __init__(self, database):
+        self._fragments = MongoCollection(database, FRAGMENTS_COLLECTION)
+        self._ngrams = MongoCollection(database, FRAGMENT_NGRAM_COLLECTION)
+
+    def aggregate_fragment_ngrams(
+        self,
+        number: dict,
+        N: Sequence[int],
+        signs_to_exclude: Optional[Sequence[str]] = None,
+    ):
+        return [
+            {"$match": {f"museumNumber.{key}": value for key, value in number.items()}},
+            {
+                "$project": {
+                    f"{NGRAM_FIELD}s": {
+                        "$split": [
+                            replace_all("\n", " # "),
+                            " ",
+                        ]
+                    }
+                }
+            },
+            *aggregate_all_ngrams(
+                f"${NGRAM_FIELD}s", N, f"{NGRAM_FIELD}s", signs_to_exclude
+            ),
+        ]
+
+    def update_ngrams(
+        self,
+        number: dict,
+        N: Sequence[int],
+        signs_to_exclude: Optional[Sequence[str]] = None,
+    ) -> None:
+        aggregation = self.aggregate_fragment_ngrams(number, N, signs_to_exclude)
+        if data := next(
+            self._fragments.aggregate(aggregation, allowDiskUse=True),
+            None,
+        ):
+            try:
+                self._ngrams.update_one(
+                    {"_id": data["_id"]}, {"$set": {"ngrams": data["ngrams"]}}
+                )
+            except NotFoundError:
+                self._ngrams.insert_one(data)
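As with the chapter variant, update_ngrams is effectively an upsert: the aggregation projects the fragment's signs into n-grams, update_one is attempted, and insert_one is used when no n-gram document exists yet. A usage sketch, assuming a pymongo database handle; the museum-number fields and sizes are illustrative values:

from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
    FragmentNGramRepository,
)


def refresh_fragment_ngrams(database, museum_number: dict) -> None:
    # museum_number is passed as its component fields, matching the $match stage above,
    # e.g. {"prefix": "K", "number": "123", "suffix": ""} (illustrative values).
    repo = FragmentNGramRepository(database)
    repo.update_ngrams(museum_number, N=[1, 2, 3], signs_to_exclude=["X", ""])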