From 6a40a9c8f0b9f476e72eecf1b157f2eec1daac48 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 7 Sep 2023 15:54:33 +0000 Subject: [PATCH 01/36] add ngram matcher route --- ebl/fragmentarium/web/bootstrap.py | 4 ++++ ebl/fragmentarium/web/ngram_matcher.py | 13 +++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 ebl/fragmentarium/web/ngram_matcher.py diff --git a/ebl/fragmentarium/web/bootstrap.py b/ebl/fragmentarium/web/bootstrap.py index 442cba654..2ce6ed02a 100644 --- a/ebl/fragmentarium/web/bootstrap.py +++ b/ebl/fragmentarium/web/bootstrap.py @@ -11,6 +11,7 @@ from ebl.fragmentarium.web.folios import FoliosResource from ebl.fragmentarium.web.fragment_genre import FragmentGenreResource from ebl.fragmentarium.web.fragment_script import FragmentScriptResource +from ebl.fragmentarium.web.ngram_matcher import NgramAlignResource from ebl.fragmentarium.web.fragment_date import ( FragmentDateResource, FragmentDatesInTextResource, @@ -73,6 +74,8 @@ def create_fragmentarium_routes(api: falcon.App, context: Context): fragment_date = FragmentDateResource(updater) fragment_dates_in_text = FragmentDatesInTextResource(updater) + ngram_aligner = NgramAlignResource(context.fragment_repository) + fragment_matcher = FragmentMatcherResource( FragmentMatcher(context.fragment_repository) ) @@ -108,6 +111,7 @@ def create_fragmentarium_routes(api: falcon.App, context: Context): routes = [ ("/fragments", fragment_search), + ("/fragments/{number}/align", ngram_aligner), ("/fragments/{number}/match", fragment_matcher), ("/fragments/{number}/genres", fragment_genre), ("/fragments/{number}/script", fragment_script), diff --git a/ebl/fragmentarium/web/ngram_matcher.py b/ebl/fragmentarium/web/ngram_matcher.py new file mode 100644 index 000000000..a35ae2d11 --- /dev/null +++ b/ebl/fragmentarium/web/ngram_matcher.py @@ -0,0 +1,13 @@ +from falcon import Request, Response +from ebl.fragmentarium.application.fragment_repository import FragmentRepository + + +class NgramAlignResource: + def __init__( + self, + repository: FragmentRepository, + ): + self.fragment_repository = repository + + def on_get(self, req: Request, resp: Response, number) -> None: + resp.media = {} From 2e0125d320b4aeffcf8749ce96f80625f2f1d492 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Fri, 8 Sep 2023 13:32:23 +0000 Subject: [PATCH 02/36] allow n-grams w. n=1 --- ebl/common/query/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ebl/common/query/util.py b/ebl/common/query/util.py index ee71de18c..a1ecd360a 100644 --- a/ebl/common/query/util.py +++ b/ebl/common/query/util.py @@ -16,8 +16,8 @@ def drop_duplicates(input_: Union[str, Dict]) -> Dict: def ngrams(input_: Union[str, Dict], n) -> Dict: - if n <= 1: - raise ValueError("ngram size must be 2 or more") + if n <= 0: + raise ValueError("ngram size must be 1 or more") return { "$zip": { "inputs": [ From eb005f8e7b494217e54de5b842ceb3ac4a41c315 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Fri, 8 Sep 2023 13:32:37 +0000 Subject: [PATCH 03/36] add fragment ngram aggregation --- ebl/common/infrastructure/ngrams.py | 56 +++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 ebl/common/infrastructure/ngrams.py diff --git a/ebl/common/infrastructure/ngrams.py b/ebl/common/infrastructure/ngrams.py new file mode 100644 index 000000000..2f3967f44 --- /dev/null +++ b/ebl/common/infrastructure/ngrams.py @@ -0,0 +1,56 @@ +from typing import Optional, Sequence, Union, Dict + +from ebl.common.query.util import drop_duplicates, filter_array, ngrams + +NGRAM_FIELD = "ngram" + + +def create_all_ngrams( + input_: Union[str, Dict], + N: Sequence[int], + output_: str = "ngrams", + signs_to_exclude: Optional[Sequence[str]] = None, +): + if signs_to_exclude is None: + signs_to_exclude = ["X", ""] + + no_empty_signs = { + "$eq": [ + { + "$size": { + "$setIntersection": [ + f"$${NGRAM_FIELD}", + signs_to_exclude, + ] + } + }, + 0, + ] + } + return [ + { + "$addFields": { + output_: drop_duplicates( + filter_array( + {"$concatArrays": [ngrams(input_, n) for n in N if n > 0]}, + NGRAM_FIELD, + no_empty_signs, + ) + ) + } + }, + ] + + +def create_fragment_ngrams( + number: str, N: Sequence[int], signs_to_exclude: Optional[Sequence[str]] = None +): + return [ + {"$match": {"_id": number}}, + {"$project": {NGRAM_FIELD: {"$split": ["$signs", " "]}}}, + *create_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD, signs_to_exclude), + ] + + +def create_chapter_ngrams(): + pass From 1d4b686817b07e76b53c6864a80bdf43fbd45829 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Fri, 8 Sep 2023 14:04:48 +0000 Subject: [PATCH 04/36] add chapter ngram aggregation --- ebl/common/infrastructure/ngrams.py | 43 +++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/ebl/common/infrastructure/ngrams.py b/ebl/common/infrastructure/ngrams.py index 2f3967f44..c164828af 100644 --- a/ebl/common/infrastructure/ngrams.py +++ b/ebl/common/infrastructure/ngrams.py @@ -1,11 +1,13 @@ from typing import Optional, Sequence, Union, Dict from ebl.common.query.util import drop_duplicates, filter_array, ngrams +from ebl.corpus.domain.chapter import ChapterId +from ebl.corpus.infrastructure.queries import chapter_id_query NGRAM_FIELD = "ngram" -def create_all_ngrams( +def aggregate_all_ngrams( input_: Union[str, Dict], N: Sequence[int], output_: str = "ngrams", @@ -42,15 +44,46 @@ def create_all_ngrams( ] -def create_fragment_ngrams( +def aggregate_fragment_ngrams( number: str, N: Sequence[int], signs_to_exclude: Optional[Sequence[str]] = None ): return [ {"$match": {"_id": number}}, {"$project": {NGRAM_FIELD: {"$split": ["$signs", " "]}}}, - *create_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD, signs_to_exclude), + *aggregate_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD, signs_to_exclude), ] -def create_chapter_ngrams(): - pass +def aggregate_chapter_ngrams( + chapter_id: ChapterId, + N: Sequence[int], + linebreak_char="#", + signs_to_exclude: Optional[Sequence[str]] = None, +): + replace_linebreaks = { + "$replaceAll": { + "input": "$signs", + "find": "\n", + "replacement": f" {linebreak_char} ", + } + } + + return [ + {"$match": chapter_id_query(chapter_id)}, + {"$project": {"signs": 1}}, + {"$unwind": "$signs"}, + { + "$project": { + NGRAM_FIELD: { + "$split": [ + replace_linebreaks, + " ", + ] + } + } + }, + *aggregate_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD, signs_to_exclude), + {"$unwind": f"${NGRAM_FIELD}"}, + {"$group": {"_id": None, NGRAM_FIELD: {"$addToSet": f"${NGRAM_FIELD}"}}}, + {"$project": {"_id": False}}, + ] From 24e1617ee6e2da6501b3ae8c1fa3823d9e403a61 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 11 Sep 2023 13:35:32 +0000 Subject: [PATCH 05/36] add general ngram aggregation --- ebl/common/infrastructure/ngrams.py | 89 ----------------------------- ebl/common/query/util.py | 50 +++++++++++++++- 2 files changed, 49 insertions(+), 90 deletions(-) delete mode 100644 ebl/common/infrastructure/ngrams.py diff --git a/ebl/common/infrastructure/ngrams.py b/ebl/common/infrastructure/ngrams.py deleted file mode 100644 index c164828af..000000000 --- a/ebl/common/infrastructure/ngrams.py +++ /dev/null @@ -1,89 +0,0 @@ -from typing import Optional, Sequence, Union, Dict - -from ebl.common.query.util import drop_duplicates, filter_array, ngrams -from ebl.corpus.domain.chapter import ChapterId -from ebl.corpus.infrastructure.queries import chapter_id_query - -NGRAM_FIELD = "ngram" - - -def aggregate_all_ngrams( - input_: Union[str, Dict], - N: Sequence[int], - output_: str = "ngrams", - signs_to_exclude: Optional[Sequence[str]] = None, -): - if signs_to_exclude is None: - signs_to_exclude = ["X", ""] - - no_empty_signs = { - "$eq": [ - { - "$size": { - "$setIntersection": [ - f"$${NGRAM_FIELD}", - signs_to_exclude, - ] - } - }, - 0, - ] - } - return [ - { - "$addFields": { - output_: drop_duplicates( - filter_array( - {"$concatArrays": [ngrams(input_, n) for n in N if n > 0]}, - NGRAM_FIELD, - no_empty_signs, - ) - ) - } - }, - ] - - -def aggregate_fragment_ngrams( - number: str, N: Sequence[int], signs_to_exclude: Optional[Sequence[str]] = None -): - return [ - {"$match": {"_id": number}}, - {"$project": {NGRAM_FIELD: {"$split": ["$signs", " "]}}}, - *aggregate_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD, signs_to_exclude), - ] - - -def aggregate_chapter_ngrams( - chapter_id: ChapterId, - N: Sequence[int], - linebreak_char="#", - signs_to_exclude: Optional[Sequence[str]] = None, -): - replace_linebreaks = { - "$replaceAll": { - "input": "$signs", - "find": "\n", - "replacement": f" {linebreak_char} ", - } - } - - return [ - {"$match": chapter_id_query(chapter_id)}, - {"$project": {"signs": 1}}, - {"$unwind": "$signs"}, - { - "$project": { - NGRAM_FIELD: { - "$split": [ - replace_linebreaks, - " ", - ] - } - } - }, - *aggregate_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD, signs_to_exclude), - {"$unwind": f"${NGRAM_FIELD}"}, - {"$group": {"_id": None, NGRAM_FIELD: {"$addToSet": f"${NGRAM_FIELD}"}}}, - {"$project": {"_id": False}}, - ] diff --git a/ebl/common/query/util.py b/ebl/common/query/util.py index a1ecd360a..0c166de13 100644 --- a/ebl/common/query/util.py +++ b/ebl/common/query/util.py @@ -1,4 +1,4 @@ -from typing import Union, Dict +from typing import Optional, Sequence, Union, Dict def flatten_field(input_: Union[str, Dict], depth=1) -> Dict: @@ -39,3 +39,51 @@ def ngrams(input_: Union[str, Dict], n) -> Dict: def filter_array(input_, as_, cond) -> Dict: return {"$filter": {"input": input_, "as": as_, "cond": cond}} + + +def aggregate_all_ngrams( + input_: Union[str, Dict], + N: Sequence[int], + output_: str = "ngrams", + signs_to_exclude: Optional[Sequence[str]] = None, + ngram_field="ngram", +): + if signs_to_exclude is None: + signs_to_exclude = ["X", ""] + + exclude_empty = { + "$eq": [ + { + "$size": { + "$setIntersection": [ + f"$${ngram_field}", + signs_to_exclude, + ] + } + }, + 0, + ] + } + return [ + { + "$addFields": { + output_: drop_duplicates( + filter_array( + {"$concatArrays": [ngrams(input_, n) for n in N if n > 0]}, + ngram_field, + exclude_empty, + ) + ) + } + }, + ] + + +def replace_all(old: str, new: str): + return { + "$replaceAll": { + "input": "$signs", + "find": old, + "replacement": new, + } + } From 4f81188c94d136c5eb9b35179aae7604d34ffadc Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 11 Sep 2023 13:35:55 +0000 Subject: [PATCH 06/36] add fragment ngram repo --- .../fragment_ngram_repository.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 ebl/fragmentarium/infrastructure/fragment_ngram_repository.py diff --git a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py new file mode 100644 index 000000000..6c7323edc --- /dev/null +++ b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py @@ -0,0 +1,59 @@ +from ebl.errors import NotFoundError +from ebl.mongo_collection import MongoCollection +from ebl.transliteration.infrastructure.collections import ( + FRAGMENT_NGRAM_COLLECTION, + FRAGMENTS_COLLECTION, +) +from typing import Optional, Sequence + +from ebl.common.query.util import aggregate_all_ngrams, replace_all + +NGRAM_FIELD = "ngram" + + +class FragmentNGramRepository: + def __init__(self, database): + self._fragments = MongoCollection(database, FRAGMENTS_COLLECTION) + self._ngrams = MongoCollection(database, FRAGMENT_NGRAM_COLLECTION) + + def aggregate_fragment_ngrams( + self, + number: dict, + N: Sequence[int], + signs_to_exclude: Optional[Sequence[str]] = None, + ): + return [ + {"$match": {f"museumNumber.{key}": value for key, value in number.items()}}, + { + "$project": { + f"{NGRAM_FIELD}s": { + "$split": [ + replace_all("#", " # "), + " ", + ] + } + } + }, + *aggregate_all_ngrams( + f"${NGRAM_FIELD}s", N, f"{NGRAM_FIELD}s", signs_to_exclude + ), + ] + + def update_ngrams( + self, + number: dict, + N: Sequence[int], + signs_to_exclude: Optional[Sequence[str]] = None, + ) -> None: + aggregation = self.aggregate_fragment_ngrams(number, N, signs_to_exclude) + data = next( + self._fragments.aggregate(aggregation, allowDiskUse=True), + None, + ) + if data: + try: + self._ngrams.update_one( + {"_id": data["_id"]}, {"$set": {"ngrams": data["ngrams"]}} + ) + except NotFoundError: + self._ngrams.insert_one(data) From c0f4cb78fbce5e4df33b7b682eb427c8b5ae022c Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 11 Sep 2023 13:36:10 +0000 Subject: [PATCH 07/36] add ngram collections --- ebl/transliteration/infrastructure/collections.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ebl/transliteration/infrastructure/collections.py b/ebl/transliteration/infrastructure/collections.py index 84c152182..7d5a6628a 100644 --- a/ebl/transliteration/infrastructure/collections.py +++ b/ebl/transliteration/infrastructure/collections.py @@ -1,3 +1,5 @@ TEXTS_COLLECTION = "texts" CHAPTERS_COLLECTION = "chapters" FRAGMENTS_COLLECTION = "fragments" +FRAGMENT_NGRAM_COLLECTION = "fragment_ngrams" +CHAPTER_NGRAM_COLLECTION = "chapter_ngrams" From 2281f6b3ee1b52dc8b1e72d13bc1cef1a2810ea4 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 11 Sep 2023 13:36:32 +0000 Subject: [PATCH 08/36] add ChapterNGramRepository --- .../infrastructure/corpus_ngram_repository.py | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 ebl/corpus/infrastructure/corpus_ngram_repository.py diff --git a/ebl/corpus/infrastructure/corpus_ngram_repository.py b/ebl/corpus/infrastructure/corpus_ngram_repository.py new file mode 100644 index 000000000..ead89ae0a --- /dev/null +++ b/ebl/corpus/infrastructure/corpus_ngram_repository.py @@ -0,0 +1,69 @@ +from ebl.corpus.domain.chapter import ChapterId +from ebl.corpus.infrastructure.queries import chapter_id_query +from ebl.errors import NotFoundError +from ebl.mongo_collection import MongoCollection +from ebl.transliteration.infrastructure.collections import ( + CHAPTER_NGRAM_COLLECTION, + CHAPTERS_COLLECTION, +) +from typing import Optional, Sequence + +from ebl.common.query.util import aggregate_all_ngrams, replace_all + +NGRAM_FIELD = "ngram" + + +class ChapterNGramRepository: + def __init__(self, database): + self._chapters = MongoCollection(database, CHAPTERS_COLLECTION) + self._ngrams = MongoCollection(database, CHAPTER_NGRAM_COLLECTION) + + def aggregate_chapter_ngrams( + self, + chapter_id: ChapterId, + N: Sequence[int], + signs_to_exclude: Optional[Sequence[str]] = None, + ): + return [ + {"$match": chapter_id_query(chapter_id)}, + {"$project": {"signs": 1}}, + {"$unwind": "$signs"}, + { + "$project": { + NGRAM_FIELD: { + "$split": [ + replace_all("#", " # "), + " ", + ] + } + } + }, + *aggregate_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD, signs_to_exclude), + {"$unwind": f"${NGRAM_FIELD}"}, + { + "$group": { + "_id": None, + f"{NGRAM_FIELD}s": {"$addToSet": f"${NGRAM_FIELD}"}, + } + }, + {"$project": {"_id": False}}, + ] + + def update_ngrams( + self, + chapter_id: ChapterId, + N: Sequence[int], + signs_to_exclude: Optional[Sequence[str]] = None, + ) -> None: + aggregation = self.aggregate_chapter_ngrams(chapter_id, N, signs_to_exclude) + data = next( + self._chapters.aggregate(aggregation, allowDiskUse=True), + None, + ) + if data: + try: + self._ngrams.update_one( + {"_id": data["_id"]}, {"$set": {"ngrams": data["ngrams"]}} + ) + except NotFoundError: + self._ngrams.insert_one(data) From d19f883a7b4c661abce5ca87c8ce31d376aa56e3 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 11 Sep 2023 13:37:21 +0000 Subject: [PATCH 09/36] add ngram extraction hook function --- ebl/users/web/create_ngram_cache.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 ebl/users/web/create_ngram_cache.py diff --git a/ebl/users/web/create_ngram_cache.py b/ebl/users/web/create_ngram_cache.py new file mode 100644 index 000000000..8f4e37fd7 --- /dev/null +++ b/ebl/users/web/create_ngram_cache.py @@ -0,0 +1,13 @@ +from ebl.fragmentarium.infrastructure.fragment_ngram_repository import ( + FragmentNGramRepository, +) + + +NGRAM_LENGHTS = [1, 2, 3] + + +def create_fragment_ngram_cache(_req, resp, resource): + museum_number_dto = resp.media["museumNumber"] + + ngram_repository: FragmentNGramRepository = resource.ngram_repository + ngram_repository.update_ngrams(museum_number_dto, NGRAM_LENGHTS) From fe6dc7b782b4c4ef518cd2b5f66e6d67668b4f25 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 11 Sep 2023 13:37:43 +0000 Subject: [PATCH 10/36] add FragmentNGramRepository to TransliterationResource --- ebl/fragmentarium/web/transliterations.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/ebl/fragmentarium/web/transliterations.py b/ebl/fragmentarium/web/transliterations.py index 1d8111845..7942791ad 100644 --- a/ebl/fragmentarium/web/transliterations.py +++ b/ebl/fragmentarium/web/transliterations.py @@ -3,9 +3,13 @@ from falcon.media.validators.jsonschema import validate from ebl.fragmentarium.application.fragment_updater import FragmentUpdater +from ebl.fragmentarium.infrastructure.fragment_ngram_repository import ( + FragmentNGramRepository, +) from ebl.fragmentarium.web.dtos import create_response_dto, parse_museum_number from ebl.transliteration.domain.atf import Atf from ebl.transliteration.domain.transliteration_error import TransliterationError +from ebl.users.web.create_ngram_cache import create_fragment_ngram_cache from ebl.users.web.require_scope import require_scope from ebl.errors import DataError from ebl.fragmentarium.domain.fragment import NotLowestJoinError @@ -19,12 +23,21 @@ class TransliterationResource: - def __init__(self, updater: FragmentUpdater, transliteration_factory): + def __init__( + self, + updater: FragmentUpdater, + transliteration_factory, + ngram_repository: FragmentNGramRepository, + ): self._updater = updater self._transliteration_factory = transliteration_factory + # Consumed by falcon.after + self.ngram_repository = ngram_repository + @falcon.before(require_scope, "transliterate:fragments") @validate(TRANSLITERATION_DTO_SCHEMA) + @falcon.after(create_fragment_ngram_cache) def on_post(self, req: Request, resp: Response, number: str) -> None: try: user = req.context.user From 44afafc344e0e73e2ff7494a5d60f4ae86ac1bf4 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 11 Sep 2023 13:38:08 +0000 Subject: [PATCH 11/36] add fragment_ngram_repository to app --- ebl/app.py | 4 ++++ ebl/context.py | 4 ++++ ebl/fragmentarium/web/bootstrap.py | 4 +++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ebl/app.py b/ebl/app.py index e98d4b20f..a657a7208 100644 --- a/ebl/app.py +++ b/ebl/app.py @@ -25,6 +25,9 @@ from ebl.ebl_ai_client import EblAiClient from ebl.files.infrastructure.grid_fs_file_repository import GridFsFileRepository from ebl.files.web.bootstrap import create_files_route +from ebl.fragmentarium.infrastructure.fragment_ngram_repository import ( + FragmentNGramRepository, +) from ebl.markup.web.bootstrap import create_markup_route from ebl.fragmentarium.infrastructure.cropped_sign_images_repository import ( MongoCroppedSignImagesRepository, @@ -85,6 +88,7 @@ def create_context(): photo_repository=GridFsFileRepository(database, "photos"), folio_repository=GridFsFileRepository(database, "folios"), fragment_repository=MongoFragmentRepository(database), + fragment_ngram_repository=FragmentNGramRepository(database), changelog=Changelog(database), bibliography_repository=MongoBibliographyRepository(database), text_repository=MongoTextRepository(database), diff --git a/ebl/context.py b/ebl/context.py index 547aaf43b..997cc297c 100644 --- a/ebl/context.py +++ b/ebl/context.py @@ -19,6 +19,9 @@ from ebl.fragmentarium.infrastructure.cropped_sign_images_repository import ( MongoCroppedSignImagesRepository, ) +from ebl.fragmentarium.infrastructure.fragment_ngram_repository import ( + FragmentNGramRepository, +) from ebl.lemmatization.application.suggestion_finder import LemmaRepository from ebl.transliteration.application.parallel_line_injector import ParallelLineInjector from ebl.transliteration.application.sign_repository import SignRepository @@ -38,6 +41,7 @@ class Context: photo_repository: FileRepository folio_repository: FileRepository fragment_repository: FragmentRepository + fragment_ngram_repository: FragmentNGramRepository changelog: Changelog bibliography_repository: BibliographyRepository text_repository: MongoTextRepository diff --git a/ebl/fragmentarium/web/bootstrap.py b/ebl/fragmentarium/web/bootstrap.py index 2ce6ed02a..ed27d5acd 100644 --- a/ebl/fragmentarium/web/bootstrap.py +++ b/ebl/fragmentarium/web/bootstrap.py @@ -93,7 +93,9 @@ def create_fragmentarium_routes(api: falcon.App, context: Context): lemmatization = LemmatizationResource(updater) references = ReferencesResource(updater) transliteration = TransliterationResource( - updater, context.get_transliteration_update_factory() + updater, + context.get_transliteration_update_factory(), + context.fragment_ngram_repository, ) introduction = IntroductionResource(updater) archaeology = ArchaeologyResource(updater) From 010b2ac2dbbcc28b572eb4ccba91a8257f8cfed5 Mon Sep 17 00:00:00 2001 From: "sourcery-ai[bot]" <58596630+sourcery-ai[bot]@users.noreply.github.com> Date: Tue, 12 Sep 2023 11:56:12 +0200 Subject: [PATCH 12/36] 'Refactored by Sourcery' (#456) Co-authored-by: Sourcery AI <> --- ebl/corpus/infrastructure/corpus_ngram_repository.py | 5 ++--- .../infrastructure/fragment_ngram_repository.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/ebl/corpus/infrastructure/corpus_ngram_repository.py b/ebl/corpus/infrastructure/corpus_ngram_repository.py index ead89ae0a..b0d765dd4 100644 --- a/ebl/corpus/infrastructure/corpus_ngram_repository.py +++ b/ebl/corpus/infrastructure/corpus_ngram_repository.py @@ -56,11 +56,10 @@ def update_ngrams( signs_to_exclude: Optional[Sequence[str]] = None, ) -> None: aggregation = self.aggregate_chapter_ngrams(chapter_id, N, signs_to_exclude) - data = next( + if data := next( self._chapters.aggregate(aggregation, allowDiskUse=True), None, - ) - if data: + ): try: self._ngrams.update_one( {"_id": data["_id"]}, {"$set": {"ngrams": data["ngrams"]}} diff --git a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py index 6c7323edc..c1e23d81d 100644 --- a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py +++ b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py @@ -46,11 +46,10 @@ def update_ngrams( signs_to_exclude: Optional[Sequence[str]] = None, ) -> None: aggregation = self.aggregate_fragment_ngrams(number, N, signs_to_exclude) - data = next( + if data := next( self._fragments.aggregate(aggregation, allowDiskUse=True), None, - ) - if data: + ): try: self._ngrams.update_one( {"_id": data["_id"]}, {"$set": {"ngrams": data["ngrams"]}} From abd48647543f1cca4e6cf83395111a425c87edbd Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 12 Sep 2023 10:52:18 +0000 Subject: [PATCH 13/36] fix aggregation, include chapter id --- .../infrastructure/corpus_ngram_repository.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/ebl/corpus/infrastructure/corpus_ngram_repository.py b/ebl/corpus/infrastructure/corpus_ngram_repository.py index b0d765dd4..cf7a6f5bf 100644 --- a/ebl/corpus/infrastructure/corpus_ngram_repository.py +++ b/ebl/corpus/infrastructure/corpus_ngram_repository.py @@ -26,13 +26,13 @@ def aggregate_chapter_ngrams( ): return [ {"$match": chapter_id_query(chapter_id)}, - {"$project": {"signs": 1}}, + {"$project": {"signs": 1, "textId": 1, "stage": 1, "name": 1}}, {"$unwind": "$signs"}, { - "$project": { + "$addFields": { NGRAM_FIELD: { "$split": [ - replace_all("#", " # "), + replace_all("\n", " # "), " ", ] } @@ -43,7 +43,10 @@ def aggregate_chapter_ngrams( { "$group": { "_id": None, - f"{NGRAM_FIELD}s": {"$addToSet": f"${NGRAM_FIELD}"}, + NGRAM_FIELD: {"$addToSet": f"${NGRAM_FIELD}"}, + "textId": {"$first": "$textId"}, + "name": {"$first": "$name"}, + "stage": {"$first": "$stage"}, } }, {"$project": {"_id": False}}, @@ -62,7 +65,8 @@ def update_ngrams( ): try: self._ngrams.update_one( - {"_id": data["_id"]}, {"$set": {"ngrams": data["ngrams"]}} + chapter_id_query(chapter_id), + {"$set": {NGRAM_FIELD: data[NGRAM_FIELD]}}, ) except NotFoundError: self._ngrams.insert_one(data) From b82a0720efab40c6266dd924675893e22e959379 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 12 Sep 2023 10:52:54 +0000 Subject: [PATCH 14/36] fix line replace bug --- ebl/fragmentarium/infrastructure/fragment_ngram_repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py index c1e23d81d..1543ebec0 100644 --- a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py +++ b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py @@ -28,7 +28,7 @@ def aggregate_fragment_ngrams( "$project": { f"{NGRAM_FIELD}s": { "$split": [ - replace_all("#", " # "), + replace_all("\n", " # "), " ", ] } From 3497f08f9da1c182b2bbe4ef3f30caff59e4ba84 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 12 Sep 2023 10:53:40 +0000 Subject: [PATCH 15/36] add create_chapter_ngram_cache --- ebl/users/web/create_ngram_cache.py | 13 ------------- ebl/users/web/update_cache.py | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 13 deletions(-) delete mode 100644 ebl/users/web/create_ngram_cache.py create mode 100644 ebl/users/web/update_cache.py diff --git a/ebl/users/web/create_ngram_cache.py b/ebl/users/web/create_ngram_cache.py deleted file mode 100644 index 8f4e37fd7..000000000 --- a/ebl/users/web/create_ngram_cache.py +++ /dev/null @@ -1,13 +0,0 @@ -from ebl.fragmentarium.infrastructure.fragment_ngram_repository import ( - FragmentNGramRepository, -) - - -NGRAM_LENGHTS = [1, 2, 3] - - -def create_fragment_ngram_cache(_req, resp, resource): - museum_number_dto = resp.media["museumNumber"] - - ngram_repository: FragmentNGramRepository = resource.ngram_repository - ngram_repository.update_ngrams(museum_number_dto, NGRAM_LENGHTS) diff --git a/ebl/users/web/update_cache.py b/ebl/users/web/update_cache.py new file mode 100644 index 000000000..e14bbaf6d --- /dev/null +++ b/ebl/users/web/update_cache.py @@ -0,0 +1,22 @@ +from marshmallow import EXCLUDE +from ebl.corpus.application.id_schemas import ChapterIdSchema +from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository +from ebl.fragmentarium.infrastructure.fragment_ngram_repository import ( + FragmentNGramRepository, +) + + +NGRAM_LENGHTS = [1, 2, 3] + + +def create_fragment_ngram_cache(_req, resp, resource): + museum_number_dto = resp.media["museumNumber"] + + ngram_repository: FragmentNGramRepository = resource.ngram_repository + ngram_repository.update_ngrams(museum_number_dto, NGRAM_LENGHTS) + + +def create_chapter_ngram_cache(_req, resp, resource): + ngram_repository: ChapterNGramRepository = resource.ngram_repository + chapter_id = ChapterIdSchema().load(resp.media, unknown=EXCLUDE) + ngram_repository.update_ngrams(chapter_id, NGRAM_LENGHTS) From b3b830f58224f1f766b06e9525863cb4d17cb2c9 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 12 Sep 2023 10:53:53 +0000 Subject: [PATCH 16/36] refactoring --- ebl/fragmentarium/web/transliterations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ebl/fragmentarium/web/transliterations.py b/ebl/fragmentarium/web/transliterations.py index 7942791ad..1caf02ee2 100644 --- a/ebl/fragmentarium/web/transliterations.py +++ b/ebl/fragmentarium/web/transliterations.py @@ -9,7 +9,7 @@ from ebl.fragmentarium.web.dtos import create_response_dto, parse_museum_number from ebl.transliteration.domain.atf import Atf from ebl.transliteration.domain.transliteration_error import TransliterationError -from ebl.users.web.create_ngram_cache import create_fragment_ngram_cache +from ebl.users.web.update_cache import create_fragment_ngram_cache from ebl.users.web.require_scope import require_scope from ebl.errors import DataError from ebl.fragmentarium.domain.fragment import NotLowestJoinError From 7177c6c20f36eda35803cc019769e31a2c37c2e4 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 12 Sep 2023 10:54:33 +0000 Subject: [PATCH 17/36] add ChapterNGramRepository to app and resources --- ebl/app.py | 2 ++ ebl/context.py | 2 ++ ebl/corpus/web/bootstrap.py | 8 ++++++-- ebl/corpus/web/lines.py | 12 +++++++++++- ebl/corpus/web/manuscripts.py | 11 ++++++++++- 5 files changed, 31 insertions(+), 4 deletions(-) diff --git a/ebl/app.py b/ebl/app.py index a657a7208..410a5825d 100644 --- a/ebl/app.py +++ b/ebl/app.py @@ -9,6 +9,7 @@ from sentry_sdk import configure_scope from sentry_sdk.integrations.falcon import FalconIntegration import althaia +from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository import ebl.error_handler from ebl.bibliography.infrastructure.bibliography import MongoBibliographyRepository from ebl.bibliography.web.bootstrap import create_bibliography_routes @@ -89,6 +90,7 @@ def create_context(): folio_repository=GridFsFileRepository(database, "folios"), fragment_repository=MongoFragmentRepository(database), fragment_ngram_repository=FragmentNGramRepository(database), + chapter_ngram_repository=ChapterNGramRepository(database), changelog=Changelog(database), bibliography_repository=MongoBibliographyRepository(database), text_repository=MongoTextRepository(database), diff --git a/ebl/context.py b/ebl/context.py index 997cc297c..7798636a1 100644 --- a/ebl/context.py +++ b/ebl/context.py @@ -6,6 +6,7 @@ from ebl.bibliography.application.bibliography_repository import BibliographyRepository from ebl.cache.application.custom_cache import ChapterCache from ebl.changelog import Changelog +from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository from ebl.corpus.infrastructure.mongo_text_repository import MongoTextRepository from ebl.dictionary.application.word_repository import WordRepository from ebl.ebl_ai_client import EblAiClient @@ -42,6 +43,7 @@ class Context: folio_repository: FileRepository fragment_repository: FragmentRepository fragment_ngram_repository: FragmentNGramRepository + chapter_ngram_repository: ChapterNGramRepository changelog: Changelog bibliography_repository: BibliographyRepository text_repository: MongoTextRepository diff --git a/ebl/corpus/web/bootstrap.py b/ebl/corpus/web/bootstrap.py index 362461011..7a88bf734 100644 --- a/ebl/corpus/web/bootstrap.py +++ b/ebl/corpus/web/bootstrap.py @@ -48,8 +48,12 @@ def create_corpus_routes(api: falcon.App, context: Context): chapters_by_lemma = ChaptersByLemmaResource(corpus) alignment = AlignmentResource(corpus, context.custom_cache) manuscript_lemmatization = LemmatizationResource(corpus, context.custom_cache) - manuscript = ManuscriptsResource(corpus, context.custom_cache) - lines = LinesResource(corpus, context.custom_cache) + manuscript = ManuscriptsResource( + corpus, context.custom_cache, context.chapter_ngram_repository + ) + lines = LinesResource( + corpus, context.custom_cache, context.chapter_ngram_repository + ) lines_import = LinesImportResource(corpus, context.custom_cache) colophons = ColophonsResource(corpus) unplaced_lines = UnplacedLinesResource(corpus) diff --git a/ebl/corpus/web/lines.py b/ebl/corpus/web/lines.py index 6ef1a3084..872395224 100644 --- a/ebl/corpus/web/lines.py +++ b/ebl/corpus/web/lines.py @@ -7,11 +7,13 @@ from ebl.corpus.application.corpus import Corpus from ebl.corpus.domain.line import Line from ebl.corpus.domain.lines_update import LinesUpdate +from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository from ebl.corpus.web.chapter_schemas import ApiChapterSchema, ApiLineSchema from ebl.corpus.web.display_schemas import LineDetailsDisplay, LineDetailsDisplaySchema from ebl.corpus.web.text_utils import create_chapter_id from ebl.errors import NotFoundError from ebl.marshmallowschema import validate +from ebl.users.web.update_cache import create_chapter_ngram_cache from ebl.users.web.require_scope import require_scope @@ -39,12 +41,19 @@ class LinesImportSchema(Schema): class LinesResource: - def __init__(self, corpus: Corpus, cache: ChapterCache): + def __init__( + self, + corpus: Corpus, + cache: ChapterCache, + ngram_repository: ChapterNGramRepository, + ): self._corpus = corpus self._cache = cache + self.ngram_repository = ngram_repository @falcon.before(require_scope, "write:texts") @validate(LinesUpdateSchema()) + @falcon.after(create_chapter_ngram_cache) def on_post( self, req: falcon.Request, @@ -70,6 +79,7 @@ def __init__(self, corpus: Corpus, cache: ChapterCache): @falcon.before(require_scope, "write:texts") @validate(LinesImportSchema()) + @falcon.after(create_chapter_ngram_cache) def on_post( self, req: falcon.Request, diff --git a/ebl/corpus/web/manuscripts.py b/ebl/corpus/web/manuscripts.py index 511d99705..35c2d7b60 100644 --- a/ebl/corpus/web/manuscripts.py +++ b/ebl/corpus/web/manuscripts.py @@ -3,6 +3,7 @@ from ebl.cache.application.custom_cache import ChapterCache from ebl.corpus.application.corpus import Corpus +from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository from ebl.corpus.web.chapter_schemas import ( ApiChapterSchema, ApiManuscriptSchema, @@ -10,6 +11,7 @@ ) from ebl.corpus.web.text_utils import create_chapter_id from ebl.marshmallowschema import validate +from ebl.users.web.update_cache import create_chapter_ngram_cache from ebl.users.web.require_scope import require_scope @@ -21,9 +23,15 @@ class ManuscriptDtoSchema(Schema): class ManuscriptsResource: - def __init__(self, corpus: Corpus, cache: ChapterCache): + def __init__( + self, + corpus: Corpus, + cache: ChapterCache, + ngram_repository: ChapterNGramRepository, + ): self._corpus = corpus self._cache = cache + self.ngram_repository = ngram_repository def on_get( self, @@ -41,6 +49,7 @@ def on_get( @falcon.before(require_scope, "write:texts") @validate(ManuscriptDtoSchema()) + @falcon.after(create_chapter_ngram_cache) def on_post( self, req: falcon.Request, From b97aa65752e5be3f93c663807ff19af3df672692 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 12 Sep 2023 11:41:17 +0000 Subject: [PATCH 18/36] add ChapterNGramRepository to LinesResource --- ebl/corpus/web/bootstrap.py | 4 +++- ebl/corpus/web/lines.py | 8 +++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ebl/corpus/web/bootstrap.py b/ebl/corpus/web/bootstrap.py index 7a88bf734..e906242c3 100644 --- a/ebl/corpus/web/bootstrap.py +++ b/ebl/corpus/web/bootstrap.py @@ -54,7 +54,9 @@ def create_corpus_routes(api: falcon.App, context: Context): lines = LinesResource( corpus, context.custom_cache, context.chapter_ngram_repository ) - lines_import = LinesImportResource(corpus, context.custom_cache) + lines_import = LinesImportResource( + corpus, context.custom_cache, context.chapter_ngram_repository + ) colophons = ColophonsResource(corpus) unplaced_lines = UnplacedLinesResource(corpus) extant_lines = ExtantLinesResource(corpus) diff --git a/ebl/corpus/web/lines.py b/ebl/corpus/web/lines.py index 872395224..487c1cb85 100644 --- a/ebl/corpus/web/lines.py +++ b/ebl/corpus/web/lines.py @@ -73,9 +73,15 @@ def on_post( class LinesImportResource: - def __init__(self, corpus: Corpus, cache: ChapterCache): + def __init__( + self, + corpus: Corpus, + cache: ChapterCache, + ngram_repository: ChapterNGramRepository, + ): self._corpus = corpus self._cache = cache + self.ngram_repository = ngram_repository @falcon.before(require_scope, "write:texts") @validate(LinesImportSchema()) From ccb2ce12d07c0b294ded01c1c47fd8d37e1bea5b Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 12 Sep 2023 11:41:25 +0000 Subject: [PATCH 19/36] remove comment --- ebl/fragmentarium/web/transliterations.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ebl/fragmentarium/web/transliterations.py b/ebl/fragmentarium/web/transliterations.py index 1caf02ee2..ba811cb73 100644 --- a/ebl/fragmentarium/web/transliterations.py +++ b/ebl/fragmentarium/web/transliterations.py @@ -31,8 +31,6 @@ def __init__( ): self._updater = updater self._transliteration_factory = transliteration_factory - - # Consumed by falcon.after self.ngram_repository = ngram_repository @falcon.before(require_scope, "transliterate:fragments") From ad18ece73e916c2b06564e3c7a198351dac922b4 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 12 Sep 2023 11:41:53 +0000 Subject: [PATCH 20/36] add n-gram repositories to test context --- ebl/tests/conftest.py | 18 ++++++++++++++++++ ebl/users/web/update_cache.py | 7 +++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/ebl/tests/conftest.py b/ebl/tests/conftest.py index a2c7dcd3f..8b01a9b6e 100644 --- a/ebl/tests/conftest.py +++ b/ebl/tests/conftest.py @@ -27,6 +27,7 @@ from ebl.cache.infrastructure.mongo_cache_repository import MongoCacheRepository from ebl.changelog import Changelog from ebl.corpus.application.corpus import Corpus +from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository from ebl.corpus.infrastructure.mongo_text_repository import MongoTextRepository from ebl.dictionary.application.dictionary_service import Dictionary from ebl.dictionary.infrastructure.word_repository import MongoWordRepository @@ -44,6 +45,9 @@ from ebl.fragmentarium.infrastructure.cropped_sign_images_repository import ( MongoCroppedSignImagesRepository, ) +from ebl.fragmentarium.infrastructure.fragment_ngram_repository import ( + FragmentNGramRepository, +) from ebl.fragmentarium.infrastructure.mongo_annotations_repository import ( MongoAnnotationsRepository, ) @@ -199,6 +203,16 @@ def fragment_repository(database): return MongoFragmentRepository(database) +@pytest.fixture +def fragment_ngram_repository(database): + return FragmentNGramRepository(database) + + +@pytest.fixture +def chapter_ngram_repository(database): + return ChapterNGramRepository(database) + + @pytest.fixture def fragmentarium(fragment_repository): return Fragmentarium(fragment_repository) @@ -401,6 +415,8 @@ def context( photo_repository, folio_repository, fragment_repository, + fragment_ngram_repository, + chapter_ngram_repository, text_repository, changelog, bibliography_repository, @@ -420,6 +436,8 @@ def context( photo_repository=photo_repository, folio_repository=folio_repository, fragment_repository=fragment_repository, + fragment_ngram_repository=fragment_ngram_repository, + chapter_ngram_repository=chapter_ngram_repository, changelog=changelog, bibliography_repository=bibliography_repository, text_repository=text_repository, diff --git a/ebl/users/web/update_cache.py b/ebl/users/web/update_cache.py index e14bbaf6d..339ddd34c 100644 --- a/ebl/users/web/update_cache.py +++ b/ebl/users/web/update_cache.py @@ -10,10 +10,9 @@ def create_fragment_ngram_cache(_req, resp, resource): - museum_number_dto = resp.media["museumNumber"] - - ngram_repository: FragmentNGramRepository = resource.ngram_repository - ngram_repository.update_ngrams(museum_number_dto, NGRAM_LENGHTS) + if museum_number_dto := resp.media.get("museumNumber"): + ngram_repository: FragmentNGramRepository = resource.ngram_repository + ngram_repository.update_ngrams(museum_number_dto, NGRAM_LENGHTS) def create_chapter_ngram_cache(_req, resp, resource): From 8a1e84caaba695d8b7587c0c5d326a9d88527dca Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 12 Sep 2023 15:44:20 +0000 Subject: [PATCH 21/36] Refactoring --- ebl/common/query/util.py | 11 ++++------- ebl/corpus/infrastructure/corpus_ngram_repository.py | 8 +++----- .../infrastructure/fragment_ngram_repository.py | 10 +++------- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/ebl/common/query/util.py b/ebl/common/query/util.py index 0c166de13..f396e0abe 100644 --- a/ebl/common/query/util.py +++ b/ebl/common/query/util.py @@ -1,4 +1,4 @@ -from typing import Optional, Sequence, Union, Dict +from typing import Sequence, Union, Dict def flatten_field(input_: Union[str, Dict], depth=1) -> Dict: @@ -45,18 +45,15 @@ def aggregate_all_ngrams( input_: Union[str, Dict], N: Sequence[int], output_: str = "ngrams", - signs_to_exclude: Optional[Sequence[str]] = None, - ngram_field="ngram", ): - if signs_to_exclude is None: - signs_to_exclude = ["X", ""] + signs_to_exclude = ["X", ""] exclude_empty = { "$eq": [ { "$size": { "$setIntersection": [ - f"$${ngram_field}", + "$$this", signs_to_exclude, ] } @@ -70,7 +67,7 @@ def aggregate_all_ngrams( output_: drop_duplicates( filter_array( {"$concatArrays": [ngrams(input_, n) for n in N if n > 0]}, - ngram_field, + "this", exclude_empty, ) ) diff --git a/ebl/corpus/infrastructure/corpus_ngram_repository.py b/ebl/corpus/infrastructure/corpus_ngram_repository.py index cf7a6f5bf..69e91ca23 100644 --- a/ebl/corpus/infrastructure/corpus_ngram_repository.py +++ b/ebl/corpus/infrastructure/corpus_ngram_repository.py @@ -6,7 +6,7 @@ CHAPTER_NGRAM_COLLECTION, CHAPTERS_COLLECTION, ) -from typing import Optional, Sequence +from typing import Sequence from ebl.common.query.util import aggregate_all_ngrams, replace_all @@ -22,7 +22,6 @@ def aggregate_chapter_ngrams( self, chapter_id: ChapterId, N: Sequence[int], - signs_to_exclude: Optional[Sequence[str]] = None, ): return [ {"$match": chapter_id_query(chapter_id)}, @@ -38,7 +37,7 @@ def aggregate_chapter_ngrams( } } }, - *aggregate_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD, signs_to_exclude), + *aggregate_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD), {"$unwind": f"${NGRAM_FIELD}"}, { "$group": { @@ -56,9 +55,8 @@ def update_ngrams( self, chapter_id: ChapterId, N: Sequence[int], - signs_to_exclude: Optional[Sequence[str]] = None, ) -> None: - aggregation = self.aggregate_chapter_ngrams(chapter_id, N, signs_to_exclude) + aggregation = self.aggregate_chapter_ngrams(chapter_id, N) if data := next( self._chapters.aggregate(aggregation, allowDiskUse=True), None, diff --git a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py index 1543ebec0..fa213ab0e 100644 --- a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py +++ b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py @@ -4,7 +4,7 @@ FRAGMENT_NGRAM_COLLECTION, FRAGMENTS_COLLECTION, ) -from typing import Optional, Sequence +from typing import Sequence from ebl.common.query.util import aggregate_all_ngrams, replace_all @@ -20,7 +20,6 @@ def aggregate_fragment_ngrams( self, number: dict, N: Sequence[int], - signs_to_exclude: Optional[Sequence[str]] = None, ): return [ {"$match": {f"museumNumber.{key}": value for key, value in number.items()}}, @@ -34,18 +33,15 @@ def aggregate_fragment_ngrams( } } }, - *aggregate_all_ngrams( - f"${NGRAM_FIELD}s", N, f"{NGRAM_FIELD}s", signs_to_exclude - ), + *aggregate_all_ngrams(f"${NGRAM_FIELD}s", N, f"{NGRAM_FIELD}s"), ] def update_ngrams( self, number: dict, N: Sequence[int], - signs_to_exclude: Optional[Sequence[str]] = None, ) -> None: - aggregation = self.aggregate_fragment_ngrams(number, N, signs_to_exclude) + aggregation = self.aggregate_fragment_ngrams(number, N) if data := next( self._fragments.aggregate(aggregation, allowDiskUse=True), None, From 0003a59c39e8b427e5491db708c1f1a4afaec44e Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Wed, 13 Sep 2023 10:34:20 +0000 Subject: [PATCH 22/36] Add get_ngrams method --- .../infrastructure/fragment_ngram_repository.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py index fa213ab0e..c0860fe85 100644 --- a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py +++ b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py @@ -52,3 +52,8 @@ def update_ngrams( ) except NotFoundError: self._ngrams.insert_one(data) + + def get_ngrams(self, id_: str): + ngrams = self._ngrams.find_one_by_id(id_)[f"{NGRAM_FIELD}s"] + + return set(map(tuple, ngrams)) From 34cf3b5ca20621fd5bdcec57657050407a94d479 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Wed, 13 Sep 2023 10:35:00 +0000 Subject: [PATCH 23/36] add Fragment Ngram Repo test --- .../test_fragment_ngram_repository.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 ebl/tests/fragmentarium/test_fragment_ngram_repository.py diff --git a/ebl/tests/fragmentarium/test_fragment_ngram_repository.py b/ebl/tests/fragmentarium/test_fragment_ngram_repository.py new file mode 100644 index 000000000..7a91bc71a --- /dev/null +++ b/ebl/tests/fragmentarium/test_fragment_ngram_repository.py @@ -0,0 +1,50 @@ +from typing import Sequence, Set, Tuple, TypeVar + +import pytest +from ebl.tests.factories.fragment import TransliteratedFragmentFactory +from ebl.transliteration.application.museum_number_schema import MuseumNumberSchema + +T = TypeVar("T") + + +def ngrams(sequence: Sequence[T], n: int) -> Set[Tuple[T]]: + return set(zip(*(sequence[i:] for i in range(n)))) + + +def ngrams_from_signs(signs: str, N: Sequence[int]) -> Set[Tuple[str]]: + split_signs = signs.replace("\n", " # ").split() + all_ngrams = set.union(*(ngrams(split_signs, n) for n in N)) + return {ngram for ngram in all_ngrams if "X" not in ngram} + + +N_VALUES = [ + [1], + [1, 2], + [1, 2, 3], + [5], + [99], +] + + +@pytest.mark.parametrize( + "N", + N_VALUES, +) +@pytest.mark.parametrize( + "N_NEW", + N_VALUES, +) +def test_update_ngrams(fragment_repository, fragment_ngram_repository, N, N_NEW): + fragment = TransliteratedFragmentFactory.build() + number = MuseumNumberSchema().dump(fragment.number) + fragment_id = fragment_repository.create(fragment) + + assert not fragment_ngram_repository._ngrams.exists({"_id": fragment_id}) + + fragment_ngram_repository.update_ngrams(number, N) + bigrams = ngrams_from_signs(fragment.signs, N) + assert fragment_ngram_repository.get_ngrams(fragment_id) == bigrams + + fragment_ngram_repository.update_ngrams(number, N_NEW) + bigrams = ngrams_from_signs(fragment.signs, N_NEW) + assert fragment_ngram_repository.get_ngrams(fragment_id) == bigrams From 582a099392934452fc4a0b607c96ef2051395f41 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 14 Sep 2023 12:30:08 +0000 Subject: [PATCH 24/36] update type cast --- ebl/fragmentarium/infrastructure/fragment_ngram_repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py index c0860fe85..21e1b5d88 100644 --- a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py +++ b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py @@ -56,4 +56,4 @@ def update_ngrams( def get_ngrams(self, id_: str): ngrams = self._ngrams.find_one_by_id(id_)[f"{NGRAM_FIELD}s"] - return set(map(tuple, ngrams)) + return {tuple(ngram) for ngram in ngrams} From 0ea06101691fbbe12b524365ada6711e240c6e21 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 19 Sep 2023 09:58:03 +0000 Subject: [PATCH 25/36] add get_ngrams --- ebl/corpus/infrastructure/corpus_ngram_repository.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ebl/corpus/infrastructure/corpus_ngram_repository.py b/ebl/corpus/infrastructure/corpus_ngram_repository.py index 69e91ca23..98c9bee53 100644 --- a/ebl/corpus/infrastructure/corpus_ngram_repository.py +++ b/ebl/corpus/infrastructure/corpus_ngram_repository.py @@ -6,7 +6,7 @@ CHAPTER_NGRAM_COLLECTION, CHAPTERS_COLLECTION, ) -from typing import Sequence +from typing import Sequence, Set, Tuple from ebl.common.query.util import aggregate_all_ngrams, replace_all @@ -68,3 +68,8 @@ def update_ngrams( ) except NotFoundError: self._ngrams.insert_one(data) + + def get_ngrams(self, chapter_id: ChapterId) -> Set[Tuple[str]]: + ngrams = self._ngrams.find_one(chapter_id_query(chapter_id))[NGRAM_FIELD] + + return {tuple(ngram) for ngram in ngrams} From b9ebc269110c3bc2cfd0049b18d37c937342c11a Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 19 Sep 2023 09:58:11 +0000 Subject: [PATCH 26/36] add type hints --- .../infrastructure/fragment_ngram_repository.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py index 21e1b5d88..59acc6e41 100644 --- a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py +++ b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py @@ -4,7 +4,7 @@ FRAGMENT_NGRAM_COLLECTION, FRAGMENTS_COLLECTION, ) -from typing import Sequence +from typing import Sequence, Set, Tuple from ebl.common.query.util import aggregate_all_ngrams, replace_all @@ -25,7 +25,7 @@ def aggregate_fragment_ngrams( {"$match": {f"museumNumber.{key}": value for key, value in number.items()}}, { "$project": { - f"{NGRAM_FIELD}s": { + NGRAM_FIELD: { "$split": [ replace_all("\n", " # "), " ", @@ -33,7 +33,7 @@ def aggregate_fragment_ngrams( } } }, - *aggregate_all_ngrams(f"${NGRAM_FIELD}s", N, f"{NGRAM_FIELD}s"), + *aggregate_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD), ] def update_ngrams( @@ -48,12 +48,12 @@ def update_ngrams( ): try: self._ngrams.update_one( - {"_id": data["_id"]}, {"$set": {"ngrams": data["ngrams"]}} + {"_id": data["_id"]}, {"$set": {NGRAM_FIELD: data[NGRAM_FIELD]}} ) except NotFoundError: self._ngrams.insert_one(data) - def get_ngrams(self, id_: str): - ngrams = self._ngrams.find_one_by_id(id_)[f"{NGRAM_FIELD}s"] + def get_ngrams(self, id_: str) -> Set[Tuple[str]]: + ngrams = self._ngrams.find_one_by_id(id_)[NGRAM_FIELD] return {tuple(ngram) for ngram in ngrams} From c26b6107f717d9695a931509200c176e7d63eaac Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 19 Sep 2023 09:58:25 +0000 Subject: [PATCH 27/36] add chapter ngram tests, refactor --- ebl/tests/common/ngram_test_support.py | 20 ++++++++++ .../corpus/test_chapter_ngram_repository.py | 38 +++++++++++++++++++ .../test_fragment_ngram_repository.py | 36 ++++-------------- 3 files changed, 66 insertions(+), 28 deletions(-) create mode 100644 ebl/tests/common/ngram_test_support.py create mode 100644 ebl/tests/corpus/test_chapter_ngram_repository.py diff --git a/ebl/tests/common/ngram_test_support.py b/ebl/tests/common/ngram_test_support.py new file mode 100644 index 000000000..31d93229c --- /dev/null +++ b/ebl/tests/common/ngram_test_support.py @@ -0,0 +1,20 @@ +from typing import Sequence, Set, Tuple, TypeVar + +T = TypeVar("T") + +N_VALUES = [ + [1], + [1, 2], + [1, 2, 3], + [5], +] + + +def _ngrams(sequence: Sequence[T], n: int) -> Set[Tuple[T]]: + return set(zip(*(sequence[i:] for i in range(n)))) + + +def ngrams_from_signs(signs: str, N: Sequence[int]) -> Set[Tuple[str]]: + split_signs = signs.replace("\n", " # ").split() + all_ngrams = set.union(*(_ngrams(split_signs, n) for n in N)) + return {ngram for ngram in all_ngrams if "X" not in ngram} diff --git a/ebl/tests/corpus/test_chapter_ngram_repository.py b/ebl/tests/corpus/test_chapter_ngram_repository.py new file mode 100644 index 000000000..530758d33 --- /dev/null +++ b/ebl/tests/corpus/test_chapter_ngram_repository.py @@ -0,0 +1,38 @@ +from typing import Sequence, Set, Tuple, Optional + +import pytest +from ebl.tests.factories.corpus import ChapterFactory +from ebl.corpus.infrastructure.queries import chapter_id_query +from ebl.tests.common.ngram_test_support import ngrams_from_signs, N_VALUES + + +def chapter_ngrams_from_signs( + chapter_signs: Sequence[Optional[str]], N: Sequence[int] +) -> Set[Tuple[str]]: + return set.union( + *(ngrams_from_signs(signs, N) for signs in chapter_signs if signs is not None) + ) + + +@pytest.mark.parametrize( + "N", + N_VALUES, +) +@pytest.mark.parametrize( + "N_NEW", + N_VALUES, +) +def test_update_chapter_ngrams(text_repository, chapter_ngram_repository, N, N_NEW): + chapter = ChapterFactory.build() + text_repository.create_chapter(chapter) + + assert not chapter_ngram_repository._ngrams.exists(chapter_id_query(chapter.id_)) + + chapter_ngram_repository.update_ngrams(chapter.id_, N) + ngrams = chapter_ngrams_from_signs(chapter.signs, N) + + assert chapter_ngram_repository.get_ngrams(chapter.id_) == ngrams + + chapter_ngram_repository.update_ngrams(chapter.id_, N_NEW) + ngrams = chapter_ngrams_from_signs(chapter.signs, N_NEW) + assert chapter_ngram_repository.get_ngrams(chapter.id_) == ngrams diff --git a/ebl/tests/fragmentarium/test_fragment_ngram_repository.py b/ebl/tests/fragmentarium/test_fragment_ngram_repository.py index 7a91bc71a..a4c88ebc0 100644 --- a/ebl/tests/fragmentarium/test_fragment_ngram_repository.py +++ b/ebl/tests/fragmentarium/test_fragment_ngram_repository.py @@ -1,29 +1,7 @@ -from typing import Sequence, Set, Tuple, TypeVar - import pytest from ebl.tests.factories.fragment import TransliteratedFragmentFactory from ebl.transliteration.application.museum_number_schema import MuseumNumberSchema - -T = TypeVar("T") - - -def ngrams(sequence: Sequence[T], n: int) -> Set[Tuple[T]]: - return set(zip(*(sequence[i:] for i in range(n)))) - - -def ngrams_from_signs(signs: str, N: Sequence[int]) -> Set[Tuple[str]]: - split_signs = signs.replace("\n", " # ").split() - all_ngrams = set.union(*(ngrams(split_signs, n) for n in N)) - return {ngram for ngram in all_ngrams if "X" not in ngram} - - -N_VALUES = [ - [1], - [1, 2], - [1, 2, 3], - [5], - [99], -] +from ebl.tests.common.ngram_test_support import ngrams_from_signs, N_VALUES @pytest.mark.parametrize( @@ -34,7 +12,9 @@ def ngrams_from_signs(signs: str, N: Sequence[int]) -> Set[Tuple[str]]: "N_NEW", N_VALUES, ) -def test_update_ngrams(fragment_repository, fragment_ngram_repository, N, N_NEW): +def test_update_fragment_ngrams( + fragment_repository, fragment_ngram_repository, N, N_NEW +): fragment = TransliteratedFragmentFactory.build() number = MuseumNumberSchema().dump(fragment.number) fragment_id = fragment_repository.create(fragment) @@ -42,9 +22,9 @@ def test_update_ngrams(fragment_repository, fragment_ngram_repository, N, N_NEW) assert not fragment_ngram_repository._ngrams.exists({"_id": fragment_id}) fragment_ngram_repository.update_ngrams(number, N) - bigrams = ngrams_from_signs(fragment.signs, N) - assert fragment_ngram_repository.get_ngrams(fragment_id) == bigrams + ngrams = ngrams_from_signs(fragment.signs, N) + assert fragment_ngram_repository.get_ngrams(fragment_id) == ngrams fragment_ngram_repository.update_ngrams(number, N_NEW) - bigrams = ngrams_from_signs(fragment.signs, N_NEW) - assert fragment_ngram_repository.get_ngrams(fragment_id) == bigrams + ngrams = ngrams_from_signs(fragment.signs, N_NEW) + assert fragment_ngram_repository.get_ngrams(fragment_id) == ngrams From 4275e0875387290dbbba3db34183a772d08f5652 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 19 Sep 2023 12:29:07 +0000 Subject: [PATCH 28/36] fix fragment ngram route, add test --- .../fragment_ngram_repository.py | 11 +++++++ ebl/fragmentarium/web/bootstrap.py | 4 +-- ebl/fragmentarium/web/ngram_matcher.py | 15 ++++++---- .../test_fragment_ngram_route.py | 29 +++++++++++++++++++ 4 files changed, 52 insertions(+), 7 deletions(-) create mode 100644 ebl/tests/fragmentarium/test_fragment_ngram_route.py diff --git a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py index 59acc6e41..548c38a1b 100644 --- a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py +++ b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py @@ -1,5 +1,7 @@ from ebl.errors import NotFoundError from ebl.mongo_collection import MongoCollection +from ebl.transliteration.application.museum_number_schema import MuseumNumberSchema +from ebl.transliteration.domain.museum_number import MuseumNumber from ebl.transliteration.infrastructure.collections import ( FRAGMENT_NGRAM_COLLECTION, FRAGMENTS_COLLECTION, @@ -57,3 +59,12 @@ def get_ngrams(self, id_: str) -> Set[Tuple[str]]: ngrams = self._ngrams.find_one_by_id(id_)[NGRAM_FIELD] return {tuple(ngram) for ngram in ngrams} + + def get_or_set_ngrams(self, id_: str, N: Sequence[int]) -> Set[Tuple[str]]: + try: + ngrams = self._ngrams.find_one_by_id(id_)[NGRAM_FIELD] + except NotFoundError: + self.update_ngrams(MuseumNumberSchema().dump(MuseumNumber.of(id_)), N) + ngrams = self._ngrams.find_one_by_id(id_)[NGRAM_FIELD] + + return {tuple(ngram) for ngram in ngrams} diff --git a/ebl/fragmentarium/web/bootstrap.py b/ebl/fragmentarium/web/bootstrap.py index ed27d5acd..773e7cb85 100644 --- a/ebl/fragmentarium/web/bootstrap.py +++ b/ebl/fragmentarium/web/bootstrap.py @@ -74,7 +74,7 @@ def create_fragmentarium_routes(api: falcon.App, context: Context): fragment_date = FragmentDateResource(updater) fragment_dates_in_text = FragmentDatesInTextResource(updater) - ngram_aligner = NgramAlignResource(context.fragment_repository) + ngrams = NgramAlignResource(context.fragment_ngram_repository) fragment_matcher = FragmentMatcherResource( FragmentMatcher(context.fragment_repository) @@ -113,7 +113,7 @@ def create_fragmentarium_routes(api: falcon.App, context: Context): routes = [ ("/fragments", fragment_search), - ("/fragments/{number}/align", ngram_aligner), + ("/fragments/{number}/ngrams", ngrams), ("/fragments/{number}/match", fragment_matcher), ("/fragments/{number}/genres", fragment_genre), ("/fragments/{number}/script", fragment_script), diff --git a/ebl/fragmentarium/web/ngram_matcher.py b/ebl/fragmentarium/web/ngram_matcher.py index a35ae2d11..a9738475a 100644 --- a/ebl/fragmentarium/web/ngram_matcher.py +++ b/ebl/fragmentarium/web/ngram_matcher.py @@ -1,13 +1,18 @@ from falcon import Request, Response -from ebl.fragmentarium.application.fragment_repository import FragmentRepository +from ebl.fragmentarium.infrastructure.fragment_ngram_repository import ( + FragmentNGramRepository, +) + +DEFAULT_N = [1, 2, 3] class NgramAlignResource: def __init__( self, - repository: FragmentRepository, + ngram_repository: FragmentNGramRepository, ): - self.fragment_repository = repository + self.ngram_repository = ngram_repository - def on_get(self, req: Request, resp: Response, number) -> None: - resp.media = {} + def on_get(self, _req: Request, resp: Response, number: str) -> None: + N = _req.get_param_as_list("n", transform=int, default=DEFAULT_N) + resp.media = list(self.ngram_repository.get_or_set_ngrams(number, N)) diff --git a/ebl/tests/fragmentarium/test_fragment_ngram_route.py b/ebl/tests/fragmentarium/test_fragment_ngram_route.py new file mode 100644 index 000000000..edbb5e69f --- /dev/null +++ b/ebl/tests/fragmentarium/test_fragment_ngram_route.py @@ -0,0 +1,29 @@ +import pytest +from ebl.tests.factories.fragment import TransliteratedFragmentFactory +from ebl.tests.common.ngram_test_support import ngrams_from_signs, N_VALUES +import falcon + +from ebl.transliteration.application.museum_number_schema import MuseumNumberSchema + + +@pytest.mark.parametrize( + "N", + N_VALUES, +) +@pytest.mark.parametrize("pre_generate_ngrams", [True, False]) +def test_update_fragment_ngrams( + client, fragmentarium, fragment_ngram_repository, N, pre_generate_ngrams +): + fragment = TransliteratedFragmentFactory.build() + fragment_id = fragmentarium.create(fragment) + + if pre_generate_ngrams: + number = MuseumNumberSchema().dump(fragment.number) + fragment_ngram_repository.update_ngrams(number, N) + + result = client.simulate_get(f"/fragments/{fragment_id}/ngrams", params={"n": N}) + + assert result.status == falcon.HTTP_OK + assert {tuple(ngram) for ngram in result.json} == ngrams_from_signs( + fragment.signs, N + ) From 080059efdbd32d5433a52e545d16b15c2de625ac Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 21 Sep 2023 10:03:26 +0000 Subject: [PATCH 29/36] use global DEFAULT_N --- ebl/common/infrastructure/ngrams.py | 1 + ebl/users/web/update_cache.py | 8 +++----- 2 files changed, 4 insertions(+), 5 deletions(-) create mode 100644 ebl/common/infrastructure/ngrams.py diff --git a/ebl/common/infrastructure/ngrams.py b/ebl/common/infrastructure/ngrams.py new file mode 100644 index 000000000..889870b98 --- /dev/null +++ b/ebl/common/infrastructure/ngrams.py @@ -0,0 +1 @@ +DEFAULT_N = [1, 2, 3] diff --git a/ebl/users/web/update_cache.py b/ebl/users/web/update_cache.py index 339ddd34c..4b6d23934 100644 --- a/ebl/users/web/update_cache.py +++ b/ebl/users/web/update_cache.py @@ -1,4 +1,5 @@ from marshmallow import EXCLUDE +from ebl.common.infrastructure.ngrams import DEFAULT_N from ebl.corpus.application.id_schemas import ChapterIdSchema from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository from ebl.fragmentarium.infrastructure.fragment_ngram_repository import ( @@ -6,16 +7,13 @@ ) -NGRAM_LENGHTS = [1, 2, 3] - - def create_fragment_ngram_cache(_req, resp, resource): if museum_number_dto := resp.media.get("museumNumber"): ngram_repository: FragmentNGramRepository = resource.ngram_repository - ngram_repository.update_ngrams(museum_number_dto, NGRAM_LENGHTS) + ngram_repository.set_ngrams(museum_number_dto, DEFAULT_N) def create_chapter_ngram_cache(_req, resp, resource): ngram_repository: ChapterNGramRepository = resource.ngram_repository chapter_id = ChapterIdSchema().load(resp.media, unknown=EXCLUDE) - ngram_repository.update_ngrams(chapter_id, NGRAM_LENGHTS) + ngram_repository.set_ngrams(chapter_id, DEFAULT_N) From 1fe024cd80fa78d25421b3c4db0b73c6c72c0a7d Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 21 Sep 2023 10:04:02 +0000 Subject: [PATCH 30/36] create ngrams on chapter creation --- ebl/corpus/application/text_repository.py | 2 +- ebl/corpus/infrastructure/mongo_text_repository.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ebl/corpus/application/text_repository.py b/ebl/corpus/application/text_repository.py index a1ed7fd61..67bef7956 100644 --- a/ebl/corpus/application/text_repository.py +++ b/ebl/corpus/application/text_repository.py @@ -19,7 +19,7 @@ def create(self, text: Text) -> None: ... @abstractmethod - def create_chapter(self, chapter: Chapter) -> None: + def create_chapter(self, chapter: Chapter, N: Optional[Sequence[int]]) -> None: ... @abstractmethod diff --git a/ebl/corpus/infrastructure/mongo_text_repository.py b/ebl/corpus/infrastructure/mongo_text_repository.py index 1c61fccef..8cd39bb5a 100644 --- a/ebl/corpus/infrastructure/mongo_text_repository.py +++ b/ebl/corpus/infrastructure/mongo_text_repository.py @@ -6,6 +6,7 @@ from ebl.bibliography.infrastructure.bibliography import join_reference_documents +from ebl.common.infrastructure.ngrams import DEFAULT_N from ebl.common.query.query_result import CorpusQueryResult from ebl.common.query.query_schemas import CorpusQueryResultSchema from ebl.corpus.application.text_repository import TextRepository @@ -28,6 +29,7 @@ from ebl.corpus.infrastructure.chapter_query_filters import ( filter_query_by_transliteration, ) +from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository from ebl.corpus.infrastructure.corpus_search_aggregations import CorpusPatternMatcher from ebl.corpus.infrastructure.manuscript_lemma_filter import ( filter_manuscripts_by_lemma, @@ -68,6 +70,7 @@ class MongoTextRepository(TextRepository): def __init__(self, database: Database): self._texts = MongoCollection(database, TEXTS_COLLECTION) self._chapters = MongoCollection(database, CHAPTERS_COLLECTION) + self._ngram_repository = ChapterNGramRepository(database) def create_indexes(self) -> None: self._texts.create_index( @@ -107,8 +110,11 @@ def create_indexes(self) -> None: def create(self, text: Text) -> None: self._texts.insert_one(TextSchema(exclude=["chapters"]).dump(text)) - def create_chapter(self, chapter: Chapter) -> None: + def create_chapter( + self, chapter: Chapter, N: Optional[Sequence[int]] = None + ) -> None: self._chapters.insert_one(ChapterSchema().dump(chapter)) + self._ngram_repository.set_ngrams(chapter.id_, N or DEFAULT_N) def find(self, id_: TextId) -> Text: try: From 7432529292a7132a714f5506cd32479698a5c548 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 21 Sep 2023 11:58:33 +0000 Subject: [PATCH 31/36] rename param --- ebl/corpus/application/text_repository.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ebl/corpus/application/text_repository.py b/ebl/corpus/application/text_repository.py index 67bef7956..df0485f4b 100644 --- a/ebl/corpus/application/text_repository.py +++ b/ebl/corpus/application/text_repository.py @@ -19,7 +19,9 @@ def create(self, text: Text) -> None: ... @abstractmethod - def create_chapter(self, chapter: Chapter, N: Optional[Sequence[int]]) -> None: + def create_chapter( + self, chapter: Chapter, ngram_n: Optional[Sequence[int]] = None + ) -> None: ... @abstractmethod From 27cf1c9833c3d039ab92316f72e7671f3dc0efaf Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 21 Sep 2023 11:58:49 +0000 Subject: [PATCH 32/36] add compute_overlaps method --- .../infrastructure/corpus_ngram_repository.py | 87 ++++++++++++++++--- 1 file changed, 73 insertions(+), 14 deletions(-) diff --git a/ebl/corpus/infrastructure/corpus_ngram_repository.py b/ebl/corpus/infrastructure/corpus_ngram_repository.py index 98c9bee53..9f9be2a54 100644 --- a/ebl/corpus/infrastructure/corpus_ngram_repository.py +++ b/ebl/corpus/infrastructure/corpus_ngram_repository.py @@ -1,3 +1,4 @@ +from ebl.corpus.application.id_schemas import ChapterIdSchema from ebl.corpus.domain.chapter import ChapterId from ebl.corpus.infrastructure.queries import chapter_id_query from ebl.errors import NotFoundError @@ -6,7 +7,7 @@ CHAPTER_NGRAM_COLLECTION, CHAPTERS_COLLECTION, ) -from typing import Sequence, Set, Tuple +from typing import List, Optional, Sequence, Set, Tuple from ebl.common.query.util import aggregate_all_ngrams, replace_all @@ -22,7 +23,7 @@ def aggregate_chapter_ngrams( self, chapter_id: ChapterId, N: Sequence[int], - ): + ) -> Sequence[dict]: return [ {"$match": chapter_id_query(chapter_id)}, {"$project": {"signs": 1, "textId": 1, "stage": 1, "name": 1}}, @@ -51,25 +52,83 @@ def aggregate_chapter_ngrams( {"$project": {"_id": False}}, ] - def update_ngrams( + def set_ngrams( self, chapter_id: ChapterId, N: Sequence[int], - ) -> None: + ) -> Set[Tuple[str]]: aggregation = self.aggregate_chapter_ngrams(chapter_id, N) - if data := next( + data = next( self._chapters.aggregate(aggregation, allowDiskUse=True), - None, - ): - try: - self._ngrams.update_one( - chapter_id_query(chapter_id), - {"$set": {NGRAM_FIELD: data[NGRAM_FIELD]}}, - ) - except NotFoundError: - self._ngrams.insert_one(data) + {NGRAM_FIELD: [], **ChapterIdSchema().dump(chapter_id)}, + ) + try: + self._ngrams.update_one( + chapter_id_query(chapter_id), + {"$set": {NGRAM_FIELD: data[NGRAM_FIELD]}}, + ) + except NotFoundError: + self._ngrams.insert_one(data) + + return {tuple(ngram) for ngram in data[NGRAM_FIELD]} def get_ngrams(self, chapter_id: ChapterId) -> Set[Tuple[str]]: ngrams = self._ngrams.find_one(chapter_id_query(chapter_id))[NGRAM_FIELD] return {tuple(ngram) for ngram in ngrams} + + def get_or_set_ngrams( + self, chapter_id: ChapterId, N: Sequence[int] + ) -> Set[Tuple[str]]: + try: + return self.get_ngrams(chapter_id) + except NotFoundError: + ngrams = self.set_ngrams(chapter_id, N) + + return ngrams + + def compute_overlaps( + self, ngrams: Set[Tuple[str]], limit: Optional[int] = None + ) -> Sequence[dict]: + ngram_list = list(ngrams) + pipeline: List[dict] = [ + {"$match": {"textId.category": {"$ne": 99}}}, + { + "$project": { + "_id": 0, + "textId": 1, + "name": 1, + "stage": 1, + "overlap": { + "$let": { + "vars": { + "intersection": { + "$size": { + "$setIntersection": ["$ngram", ngram_list] + } + }, + "minLength": { + "$min": [ + {"$size": "$ngram"}, + {"$size": [ngram_list]}, + ] + }, + }, + "in": { + "$cond": [ + {"$eq": ["$$minLength", 0]}, + 0.0, + {"$divide": ["$$intersection", "$$minLength"]}, + ] + }, + } + }, + } + }, + {"$sort": {"overlap": -1}}, + ] + + if limit: + pipeline.append({"$limit": limit}) + + return list(self._ngrams.aggregate(pipeline, allowDiskUse=True)) From 949c1b33c7b7385456e8594fb8b231afc57743c7 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 21 Sep 2023 11:59:02 +0000 Subject: [PATCH 33/36] refactor --- ebl/corpus/infrastructure/mongo_text_repository.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ebl/corpus/infrastructure/mongo_text_repository.py b/ebl/corpus/infrastructure/mongo_text_repository.py index 8cd39bb5a..277c2c196 100644 --- a/ebl/corpus/infrastructure/mongo_text_repository.py +++ b/ebl/corpus/infrastructure/mongo_text_repository.py @@ -111,10 +111,10 @@ def create(self, text: Text) -> None: self._texts.insert_one(TextSchema(exclude=["chapters"]).dump(text)) def create_chapter( - self, chapter: Chapter, N: Optional[Sequence[int]] = None + self, chapter: Chapter, ngram_n: Optional[Sequence[int]] = None ) -> None: self._chapters.insert_one(ChapterSchema().dump(chapter)) - self._ngram_repository.set_ngrams(chapter.id_, N or DEFAULT_N) + self._ngram_repository.set_ngrams(chapter.id_, ngram_n or DEFAULT_N) def find(self, id_: TextId) -> Text: try: From 4b9f2aaf04ae0177c5ca35db9254008163e3a879 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 21 Sep 2023 12:00:27 +0000 Subject: [PATCH 34/36] refactor --- .../application/fragment_repository.py | 7 ++++- .../fragment_ngram_repository.py | 31 +++++++++++-------- ebl/fragmentarium/web/bootstrap.py | 4 ++- ebl/fragmentarium/web/ngram_matcher.py | 10 ++++-- ebl/tests/common/ngram_test_support.py | 4 ++- .../test_fragment_ngram_repository.py | 6 ++-- ebl/users/web/update_cache.py | 5 ++- 7 files changed, 43 insertions(+), 24 deletions(-) diff --git a/ebl/fragmentarium/application/fragment_repository.py b/ebl/fragmentarium/application/fragment_repository.py index f88efee6d..d4f5a0f18 100644 --- a/ebl/fragmentarium/application/fragment_repository.py +++ b/ebl/fragmentarium/application/fragment_repository.py @@ -17,7 +17,12 @@ def create_indexes(self) -> None: ... @abstractmethod - def create(self, fragment: Fragment, sort_key: Optional[int] = None) -> str: + def create( + self, + fragment: Fragment, + sort_key: Optional[int] = None, + ngram_n: Optional[Sequence[int]] = None, + ) -> str: ... @abstractmethod diff --git a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py index 548c38a1b..66476b2a9 100644 --- a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py +++ b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py @@ -1,6 +1,5 @@ from ebl.errors import NotFoundError from ebl.mongo_collection import MongoCollection -from ebl.transliteration.application.museum_number_schema import MuseumNumberSchema from ebl.transliteration.domain.museum_number import MuseumNumber from ebl.transliteration.infrastructure.collections import ( FRAGMENT_NGRAM_COLLECTION, @@ -9,6 +8,7 @@ from typing import Sequence, Set, Tuple from ebl.common.query.util import aggregate_all_ngrams, replace_all +from ebl.transliteration.infrastructure.queries import museum_number_is NGRAM_FIELD = "ngram" @@ -20,11 +20,11 @@ def __init__(self, database): def aggregate_fragment_ngrams( self, - number: dict, + number: MuseumNumber, N: Sequence[int], ): return [ - {"$match": {f"museumNumber.{key}": value for key, value in number.items()}}, + {"$match": museum_number_is(number)}, { "$project": { NGRAM_FIELD: { @@ -38,11 +38,11 @@ def aggregate_fragment_ngrams( *aggregate_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD), ] - def update_ngrams( + def set_ngrams( self, - number: dict, + number: MuseumNumber, N: Sequence[int], - ) -> None: + ) -> Set[Tuple[str]]: aggregation = self.aggregate_fragment_ngrams(number, N) if data := next( self._fragments.aggregate(aggregation, allowDiskUse=True), @@ -55,16 +55,21 @@ def update_ngrams( except NotFoundError: self._ngrams.insert_one(data) - def get_ngrams(self, id_: str) -> Set[Tuple[str]]: - ngrams = self._ngrams.find_one_by_id(id_)[NGRAM_FIELD] + return {tuple(ngram) for ngram in data[NGRAM_FIELD]} + + return set() + + def get_ngrams(self, number: MuseumNumber) -> Set[Tuple[str]]: + ngrams = self._ngrams.find_one_by_id(str(number))[NGRAM_FIELD] return {tuple(ngram) for ngram in ngrams} - def get_or_set_ngrams(self, id_: str, N: Sequence[int]) -> Set[Tuple[str]]: + def get_or_set_ngrams( + self, number: MuseumNumber, N: Sequence[int] + ) -> Set[Tuple[str]]: try: - ngrams = self._ngrams.find_one_by_id(id_)[NGRAM_FIELD] + return self.get_ngrams(number) except NotFoundError: - self.update_ngrams(MuseumNumberSchema().dump(MuseumNumber.of(id_)), N) - ngrams = self._ngrams.find_one_by_id(id_)[NGRAM_FIELD] + ngrams = self.set_ngrams(number, N) - return {tuple(ngram) for ngram in ngrams} + return ngrams diff --git a/ebl/fragmentarium/web/bootstrap.py b/ebl/fragmentarium/web/bootstrap.py index 773e7cb85..0d5fabdb8 100644 --- a/ebl/fragmentarium/web/bootstrap.py +++ b/ebl/fragmentarium/web/bootstrap.py @@ -74,7 +74,9 @@ def create_fragmentarium_routes(api: falcon.App, context: Context): fragment_date = FragmentDateResource(updater) fragment_dates_in_text = FragmentDatesInTextResource(updater) - ngrams = NgramAlignResource(context.fragment_ngram_repository) + ngrams = NgramAlignResource( + context.fragment_ngram_repository, context.chapter_ngram_repository + ) fragment_matcher = FragmentMatcherResource( FragmentMatcher(context.fragment_repository) diff --git a/ebl/fragmentarium/web/ngram_matcher.py b/ebl/fragmentarium/web/ngram_matcher.py index a9738475a..0018a6616 100644 --- a/ebl/fragmentarium/web/ngram_matcher.py +++ b/ebl/fragmentarium/web/ngram_matcher.py @@ -1,18 +1,22 @@ from falcon import Request, Response +from ebl.common.infrastructure.ngrams import DEFAULT_N +from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository from ebl.fragmentarium.infrastructure.fragment_ngram_repository import ( FragmentNGramRepository, ) - -DEFAULT_N = [1, 2, 3] +from ebl.transliteration.domain.museum_number import MuseumNumber class NgramAlignResource: def __init__( self, ngram_repository: FragmentNGramRepository, + chapter_ngram_repository: ChapterNGramRepository, ): self.ngram_repository = ngram_repository + self.chapter_ngram_repository = chapter_ngram_repository def on_get(self, _req: Request, resp: Response, number: str) -> None: N = _req.get_param_as_list("n", transform=int, default=DEFAULT_N) - resp.media = list(self.ngram_repository.get_or_set_ngrams(number, N)) + ngrams = self.ngram_repository.get_or_set_ngrams(MuseumNumber.of(number), N) + resp.media = self.chapter_ngram_repository.compute_overlaps(ngrams) diff --git a/ebl/tests/common/ngram_test_support.py b/ebl/tests/common/ngram_test_support.py index 31d93229c..767bbfd2e 100644 --- a/ebl/tests/common/ngram_test_support.py +++ b/ebl/tests/common/ngram_test_support.py @@ -1,11 +1,13 @@ from typing import Sequence, Set, Tuple, TypeVar +from ebl.common.infrastructure.ngrams import DEFAULT_N + T = TypeVar("T") N_VALUES = [ + DEFAULT_N, [1], [1, 2], - [1, 2, 3], [5], ] diff --git a/ebl/tests/fragmentarium/test_fragment_ngram_repository.py b/ebl/tests/fragmentarium/test_fragment_ngram_repository.py index a4c88ebc0..fcd754053 100644 --- a/ebl/tests/fragmentarium/test_fragment_ngram_repository.py +++ b/ebl/tests/fragmentarium/test_fragment_ngram_repository.py @@ -1,6 +1,5 @@ import pytest from ebl.tests.factories.fragment import TransliteratedFragmentFactory -from ebl.transliteration.application.museum_number_schema import MuseumNumberSchema from ebl.tests.common.ngram_test_support import ngrams_from_signs, N_VALUES @@ -16,15 +15,14 @@ def test_update_fragment_ngrams( fragment_repository, fragment_ngram_repository, N, N_NEW ): fragment = TransliteratedFragmentFactory.build() - number = MuseumNumberSchema().dump(fragment.number) fragment_id = fragment_repository.create(fragment) assert not fragment_ngram_repository._ngrams.exists({"_id": fragment_id}) - fragment_ngram_repository.update_ngrams(number, N) + fragment_ngram_repository.set_ngrams(fragment.number, N) ngrams = ngrams_from_signs(fragment.signs, N) assert fragment_ngram_repository.get_ngrams(fragment_id) == ngrams - fragment_ngram_repository.update_ngrams(number, N_NEW) + fragment_ngram_repository.set_ngrams(fragment.number, N_NEW) ngrams = ngrams_from_signs(fragment.signs, N_NEW) assert fragment_ngram_repository.get_ngrams(fragment_id) == ngrams diff --git a/ebl/users/web/update_cache.py b/ebl/users/web/update_cache.py index 4b6d23934..9827cf3fb 100644 --- a/ebl/users/web/update_cache.py +++ b/ebl/users/web/update_cache.py @@ -5,12 +5,15 @@ from ebl.fragmentarium.infrastructure.fragment_ngram_repository import ( FragmentNGramRepository, ) +from ebl.transliteration.application.museum_number_schema import MuseumNumberSchema def create_fragment_ngram_cache(_req, resp, resource): if museum_number_dto := resp.media.get("museumNumber"): ngram_repository: FragmentNGramRepository = resource.ngram_repository - ngram_repository.set_ngrams(museum_number_dto, DEFAULT_N) + ngram_repository.set_ngrams( + MuseumNumberSchema().load(museum_number_dto), DEFAULT_N + ) def create_chapter_ngram_cache(_req, resp, resource): From 06576b11af3d274f8f3345159c1c72c9bf48c317 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 21 Sep 2023 12:00:45 +0000 Subject: [PATCH 35/36] add FragmentNGramRepository --- .../infrastructure/mongo_fragment_repository.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py index 33494a7eb..250b6cf58 100644 --- a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py +++ b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py @@ -6,6 +6,7 @@ from ebl.bibliography.infrastructure.bibliography import join_reference_documents from ebl.common.domain.scopes import Scope +from ebl.common.infrastructure.ngrams import DEFAULT_N from ebl.common.query.query_result import QueryResult from ebl.common.query.query_schemas import QueryResultSchema from ebl.errors import NotFoundError @@ -19,6 +20,9 @@ from ebl.fragmentarium.domain.joins import Join from ebl.fragmentarium.domain.line_to_vec_encoding import LineToVecEncoding from ebl.fragmentarium.infrastructure.collections import JOINS_COLLECTION +from ebl.fragmentarium.infrastructure.fragment_ngram_repository import ( + FragmentNGramRepository, +) from ebl.fragmentarium.infrastructure.fragment_search_aggregations import PatternMatcher from ebl.fragmentarium.domain.date import Date, DateSchema @@ -51,6 +55,7 @@ class MongoFragmentRepository(FragmentRepository): def __init__(self, database): self._fragments = MongoCollection(database, FRAGMENTS_COLLECTION) self._joins = MongoCollection(database, JOINS_COLLECTION) + self._ngram_repository = FragmentNGramRepository(database) def create_indexes(self) -> None: self._fragments.create_index( @@ -102,14 +107,16 @@ def count_lines(self): except StopIteration: return 0 - def create(self, fragment, sort_key=None): - return self._fragments.insert_one( + def create(self, fragment, sort_key=None, ngram_n=None): + id_ = self._fragments.insert_one( { "_id": str(fragment.number), **FragmentSchema(exclude=["joins"]).dump(fragment), **({} if sort_key is None else {"_sortKey": sort_key}), } ) + self._ngram_repository.set_ngrams(MuseumNumber.of(id_), ngram_n or DEFAULT_N) + return id_ def create_many(self, fragments: Sequence[Fragment]) -> Sequence[str]: schema = FragmentSchema(exclude=["joins"]) From 63039eb8e81345d4ded0a27d83c86c82ad38aff3 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 21 Sep 2023 12:00:55 +0000 Subject: [PATCH 36/36] extend ngram tests --- .../corpus/test_chapter_ngram_repository.py | 14 +++--- .../test_fragment_ngram_route.py | 43 ++++++++++++++----- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/ebl/tests/corpus/test_chapter_ngram_repository.py b/ebl/tests/corpus/test_chapter_ngram_repository.py index 530758d33..608557239 100644 --- a/ebl/tests/corpus/test_chapter_ngram_repository.py +++ b/ebl/tests/corpus/test_chapter_ngram_repository.py @@ -14,6 +14,13 @@ def chapter_ngrams_from_signs( ) +def test_create_chapter_sets_ngrams(text_repository, chapter_ngram_repository): + chapter = ChapterFactory.build() + text_repository.create_chapter(chapter) + + assert chapter_ngram_repository._ngrams.exists(chapter_id_query(chapter.id_)) + + @pytest.mark.parametrize( "N", N_VALUES, @@ -24,15 +31,12 @@ def chapter_ngrams_from_signs( ) def test_update_chapter_ngrams(text_repository, chapter_ngram_repository, N, N_NEW): chapter = ChapterFactory.build() - text_repository.create_chapter(chapter) - - assert not chapter_ngram_repository._ngrams.exists(chapter_id_query(chapter.id_)) + text_repository.create_chapter(chapter, N) - chapter_ngram_repository.update_ngrams(chapter.id_, N) ngrams = chapter_ngrams_from_signs(chapter.signs, N) assert chapter_ngram_repository.get_ngrams(chapter.id_) == ngrams - chapter_ngram_repository.update_ngrams(chapter.id_, N_NEW) + chapter_ngram_repository.set_ngrams(chapter.id_, N_NEW) ngrams = chapter_ngrams_from_signs(chapter.signs, N_NEW) assert chapter_ngram_repository.get_ngrams(chapter.id_) == ngrams diff --git a/ebl/tests/fragmentarium/test_fragment_ngram_route.py b/ebl/tests/fragmentarium/test_fragment_ngram_route.py index edbb5e69f..469d3856d 100644 --- a/ebl/tests/fragmentarium/test_fragment_ngram_route.py +++ b/ebl/tests/fragmentarium/test_fragment_ngram_route.py @@ -1,29 +1,52 @@ +from typing import Sequence import pytest +from ebl.corpus.application.id_schemas import ChapterIdSchema +from ebl.corpus.domain.chapter import Chapter +from ebl.fragmentarium.domain.fragment import Fragment +from ebl.tests.corpus.test_chapter_ngram_repository import chapter_ngrams_from_signs +from ebl.tests.factories.corpus import ChapterFactory from ebl.tests.factories.fragment import TransliteratedFragmentFactory from ebl.tests.common.ngram_test_support import ngrams_from_signs, N_VALUES import falcon -from ebl.transliteration.application.museum_number_schema import MuseumNumberSchema +SIGNS = ["X BA KU ABZ075", "KI DU ABZ411 BA MA TI\nX MU TA MA UD", "KU ABZ411 MA KI"] + + +def compute_overlap(fragment: Fragment, chapter: Chapter, N: Sequence[int]) -> float: + F = ngrams_from_signs(fragment.signs, N) + C = chapter_ngrams_from_signs(chapter.signs, N) + + return (len(F & C) / min(len(F), len(C))) if F and C else 0.0 @pytest.mark.parametrize( "N", N_VALUES, ) -@pytest.mark.parametrize("pre_generate_ngrams", [True, False]) -def test_update_fragment_ngrams( - client, fragmentarium, fragment_ngram_repository, N, pre_generate_ngrams +def test_match_fragment_ngrams( + client, + fragment_repository, + text_repository, + N, ): fragment = TransliteratedFragmentFactory.build() - fragment_id = fragmentarium.create(fragment) + fragment_id = fragment_repository.create(fragment, ngram_n=N) + chapters = [ChapterFactory.build(signs=(signs,)) for signs in SIGNS] - if pre_generate_ngrams: - number = MuseumNumberSchema().dump(fragment.number) - fragment_ngram_repository.update_ngrams(number, N) + for chapter in chapters: + text_repository.create_chapter(chapter, N) result = client.simulate_get(f"/fragments/{fragment_id}/ngrams", params={"n": N}) assert result.status == falcon.HTTP_OK - assert {tuple(ngram) for ngram in result.json} == ngrams_from_signs( - fragment.signs, N + assert result.json == sorted( + ( + { + **ChapterIdSchema().dump(chapter.id_), + "overlap": compute_overlap(fragment, chapter, N), + } + for chapter in chapters + ), + key=lambda item: item["overlap"], + reverse=True, )