diff --git a/ebl/app.py b/ebl/app.py
index e98d4b20f..410a5825d 100644
--- a/ebl/app.py
+++ b/ebl/app.py
@@ -9,6 +9,7 @@
 from sentry_sdk import configure_scope
 from sentry_sdk.integrations.falcon import FalconIntegration
 import althaia
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
 import ebl.error_handler
 from ebl.bibliography.infrastructure.bibliography import MongoBibliographyRepository
 from ebl.bibliography.web.bootstrap import create_bibliography_routes
@@ -25,6 +26,9 @@ from ebl.ebl_ai_client import EblAiClient
 from ebl.files.infrastructure.grid_fs_file_repository import GridFsFileRepository
 from ebl.files.web.bootstrap import create_files_route
+from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
+    FragmentNGramRepository,
+)
 from ebl.markup.web.bootstrap import create_markup_route
 from ebl.fragmentarium.infrastructure.cropped_sign_images_repository import (
     MongoCroppedSignImagesRepository,
 )
@@ -85,6 +89,8 @@ def create_context():
         photo_repository=GridFsFileRepository(database, "photos"),
         folio_repository=GridFsFileRepository(database, "folios"),
         fragment_repository=MongoFragmentRepository(database),
+        fragment_ngram_repository=FragmentNGramRepository(database),
+        chapter_ngram_repository=ChapterNGramRepository(database),
         changelog=Changelog(database),
         bibliography_repository=MongoBibliographyRepository(database),
         text_repository=MongoTextRepository(database),
diff --git a/ebl/common/infrastructure/ngrams.py b/ebl/common/infrastructure/ngrams.py
new file mode 100644
index 000000000..889870b98
--- /dev/null
+++ b/ebl/common/infrastructure/ngrams.py
@@ -0,0 +1 @@
+DEFAULT_N = [1, 2, 3]
diff --git a/ebl/common/query/util.py b/ebl/common/query/util.py
index ee71de18c..f396e0abe 100644
--- a/ebl/common/query/util.py
+++ b/ebl/common/query/util.py
@@ -1,4 +1,4 @@
-from typing import Union, Dict
+from typing import Sequence, Union, Dict
 
 
 def flatten_field(input_: Union[str, Dict], depth=1) -> Dict:
@@ -16,8 +16,8 @@ def drop_duplicates(input_: Union[str, Dict]) -> Dict:
 
 
 def ngrams(input_: Union[str, Dict], n) -> Dict:
-    if n <= 1:
-        raise ValueError("ngram size must be 2 or more")
+    if n <= 0:
+        raise ValueError("ngram size must be 1 or more")
     return {
         "$zip": {
             "inputs": [
@@ -39,3 +39,48 @@ def filter_array(input_, as_, cond) -> Dict:
     return {"$filter": {"input": input_, "as": as_, "cond": cond}}
+
+
+def aggregate_all_ngrams(
+    input_: Union[str, Dict],
+    N: Sequence[int],
+    output_: str = "ngrams",
+):
+    signs_to_exclude = ["X", ""]
+
+    exclude_empty = {
+        "$eq": [
+            {
+                "$size": {
+                    "$setIntersection": [
+                        "$$this",
+                        signs_to_exclude,
+                    ]
+                }
+            },
+            0,
+        ]
+    }
+    return [
+        {
+            "$addFields": {
+                output_: drop_duplicates(
+                    filter_array(
+                        {"$concatArrays": [ngrams(input_, n) for n in N if n > 0]},
+                        "this",
+                        exclude_empty,
+                    )
+                )
+            }
+        },
+    ]
+
+
+def replace_all(old: str, new: str):
+    return {
+        "$replaceAll": {
+            "input": "$signs",
+            "find": old,
+            "replacement": new,
+        }
+    }
diff --git a/ebl/context.py b/ebl/context.py
index 547aaf43b..7798636a1 100644
--- a/ebl/context.py
+++ b/ebl/context.py
@@ -6,6 +6,7 @@
 from ebl.bibliography.application.bibliography_repository import BibliographyRepository
 from ebl.cache.application.custom_cache import ChapterCache
 from ebl.changelog import Changelog
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
 from ebl.corpus.infrastructure.mongo_text_repository import MongoTextRepository
 from ebl.dictionary.application.word_repository import WordRepository
 from ebl.ebl_ai_client import EblAiClient
@@ -19,6 +20,9 @@
 from ebl.fragmentarium.infrastructure.cropped_sign_images_repository import (
     MongoCroppedSignImagesRepository,
 )
+from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
+    FragmentNGramRepository,
+)
 from ebl.lemmatization.application.suggestion_finder import LemmaRepository
 from ebl.transliteration.application.parallel_line_injector import ParallelLineInjector
 from ebl.transliteration.application.sign_repository import SignRepository
@@ -38,6 +42,8 @@ class Context:
     photo_repository: FileRepository
     folio_repository: FileRepository
     fragment_repository: FragmentRepository
+    fragment_ngram_repository: FragmentNGramRepository
+    chapter_ngram_repository: ChapterNGramRepository
     changelog: Changelog
     bibliography_repository: BibliographyRepository
     text_repository: MongoTextRepository
diff --git a/ebl/corpus/application/text_repository.py b/ebl/corpus/application/text_repository.py
index a1ed7fd61..df0485f4b 100644
--- a/ebl/corpus/application/text_repository.py
+++ b/ebl/corpus/application/text_repository.py
@@ -19,7 +19,9 @@ def create(self, text: Text) -> None:
         ...
 
     @abstractmethod
-    def create_chapter(self, chapter: Chapter) -> None:
+    def create_chapter(
+        self, chapter: Chapter, ngram_n: Optional[Sequence[int]] = None
+    ) -> None:
         ...
 
     @abstractmethod
diff --git a/ebl/corpus/infrastructure/corpus_ngram_repository.py b/ebl/corpus/infrastructure/corpus_ngram_repository.py
new file mode 100644
index 000000000..9f9be2a54
--- /dev/null
+++ b/ebl/corpus/infrastructure/corpus_ngram_repository.py
@@ -0,0 +1,134 @@
+from ebl.corpus.application.id_schemas import ChapterIdSchema
+from ebl.corpus.domain.chapter import ChapterId
+from ebl.corpus.infrastructure.queries import chapter_id_query
+from ebl.errors import NotFoundError
+from ebl.mongo_collection import MongoCollection
+from ebl.transliteration.infrastructure.collections import (
+    CHAPTER_NGRAM_COLLECTION,
+    CHAPTERS_COLLECTION,
+)
+from typing import List, Optional, Sequence, Set, Tuple
+
+from ebl.common.query.util import aggregate_all_ngrams, replace_all
+
+NGRAM_FIELD = "ngram"
+
+
+class ChapterNGramRepository:
+    def __init__(self, database):
+        self._chapters = MongoCollection(database, CHAPTERS_COLLECTION)
+        self._ngrams = MongoCollection(database, CHAPTER_NGRAM_COLLECTION)
+
+    def aggregate_chapter_ngrams(
+        self,
+        chapter_id: ChapterId,
+        N: Sequence[int],
+    ) -> Sequence[dict]:
+        return [
+            {"$match": chapter_id_query(chapter_id)},
+            {"$project": {"signs": 1, "textId": 1, "stage": 1, "name": 1}},
+            {"$unwind": "$signs"},
+            {
+                "$addFields": {
+                    NGRAM_FIELD: {
+                        "$split": [
+                            replace_all("\n", " # "),
+                            " ",
+                        ]
+                    }
+                }
+            },
+            *aggregate_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD),
+            {"$unwind": f"${NGRAM_FIELD}"},
+            {
+                "$group": {
+                    "_id": None,
+                    NGRAM_FIELD: {"$addToSet": f"${NGRAM_FIELD}"},
+                    "textId": {"$first": "$textId"},
+                    "name": {"$first": "$name"},
+                    "stage": {"$first": "$stage"},
+                }
+            },
+            {"$project": {"_id": False}},
+        ]
+
+    def set_ngrams(
+        self,
+        chapter_id: ChapterId,
+        N: Sequence[int],
+    ) -> Set[Tuple[str]]:
+        aggregation = self.aggregate_chapter_ngrams(chapter_id, N)
+        data = next(
+            self._chapters.aggregate(aggregation, allowDiskUse=True),
+            {NGRAM_FIELD: [], **ChapterIdSchema().dump(chapter_id)},
+        )
+        try:
+            self._ngrams.update_one(
+                chapter_id_query(chapter_id),
+                {"$set": {NGRAM_FIELD: data[NGRAM_FIELD]}},
+            )
+        except NotFoundError:
+            self._ngrams.insert_one(data)
+
+        return {tuple(ngram) for ngram in data[NGRAM_FIELD]}
+
+    def get_ngrams(self, chapter_id: ChapterId) -> Set[Tuple[str]]:
+        ngrams = self._ngrams.find_one(chapter_id_query(chapter_id))[NGRAM_FIELD]
+
+        return {tuple(ngram) for ngram in ngrams}
+
+    def get_or_set_ngrams(
+        self, chapter_id: ChapterId, N: Sequence[int]
+    ) -> Set[Tuple[str]]:
+        try:
+            return self.get_ngrams(chapter_id)
+        except NotFoundError:
+            ngrams = self.set_ngrams(chapter_id, N)
+
+        return ngrams
+
+    def compute_overlaps(
+        self, ngrams: Set[Tuple[str]], limit: Optional[int] = None
+    ) -> Sequence[dict]:
+        ngram_list = list(ngrams)
+        pipeline: List[dict] = [
+            {"$match": {"textId.category": {"$ne": 99}}},
+            {
+                "$project": {
+                    "_id": 0,
+                    "textId": 1,
+                    "name": 1,
+                    "stage": 1,
+                    "overlap": {
+                        "$let": {
+                            "vars": {
+                                "intersection": {
+                                    "$size": {
+                                        "$setIntersection": ["$ngram", ngram_list]
+                                    }
+                                },
+                                "minLength": {
+                                    "$min": [
+                                        {"$size": "$ngram"},
+                                        {"$size": [ngram_list]},
+                                    ]
+                                },
+                            },
+                            "in": {
+                                "$cond": [
+                                    {"$eq": ["$$minLength", 0]},
+                                    0.0,
+                                    {"$divide": ["$$intersection", "$$minLength"]},
+                                ]
+                            },
+                        }
+                    },
+                }
+            },
+            {"$sort": {"overlap": -1}},
+        ]
+
+        if limit:
+            pipeline.append({"$limit": limit})
+
+        return list(self._ngrams.aggregate(pipeline, allowDiskUse=True))
diff --git a/ebl/corpus/infrastructure/mongo_text_repository.py b/ebl/corpus/infrastructure/mongo_text_repository.py
index 1c61fccef..277c2c196 100644
--- a/ebl/corpus/infrastructure/mongo_text_repository.py
+++ b/ebl/corpus/infrastructure/mongo_text_repository.py
@@ -6,6 +6,7 @@
 from ebl.bibliography.infrastructure.bibliography import join_reference_documents
+from ebl.common.infrastructure.ngrams import DEFAULT_N
 from ebl.common.query.query_result import CorpusQueryResult
 from ebl.common.query.query_schemas import CorpusQueryResultSchema
 from ebl.corpus.application.text_repository import TextRepository
@@ -28,6 +29,7 @@ from ebl.corpus.infrastructure.chapter_query_filters import (
     filter_query_by_transliteration,
 )
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
 from ebl.corpus.infrastructure.corpus_search_aggregations import CorpusPatternMatcher
 from ebl.corpus.infrastructure.manuscript_lemma_filter import (
     filter_manuscripts_by_lemma,
 )
@@ -68,6 +70,7 @@ class MongoTextRepository(TextRepository):
     def __init__(self, database: Database):
         self._texts = MongoCollection(database, TEXTS_COLLECTION)
         self._chapters = MongoCollection(database, CHAPTERS_COLLECTION)
+        self._ngram_repository = ChapterNGramRepository(database)
 
     def create_indexes(self) -> None:
         self._texts.create_index(
@@ -107,8 +110,11 @@ def create_indexes(self) -> None:
     def create(self, text: Text) -> None:
         self._texts.insert_one(TextSchema(exclude=["chapters"]).dump(text))
 
-    def create_chapter(self, chapter: Chapter) -> None:
+    def create_chapter(
+        self, chapter: Chapter, ngram_n: Optional[Sequence[int]] = None
+    ) -> None:
         self._chapters.insert_one(ChapterSchema().dump(chapter))
+        self._ngram_repository.set_ngrams(chapter.id_, ngram_n or DEFAULT_N)
 
     def find(self, id_: TextId) -> Text:
         try:
diff --git a/ebl/corpus/web/bootstrap.py b/ebl/corpus/web/bootstrap.py
index 362461011..e906242c3 100644
--- a/ebl/corpus/web/bootstrap.py
+++ b/ebl/corpus/web/bootstrap.py
@@ -48,9 +48,15 @@ def create_corpus_routes(api: falcon.App, context: Context):
     chapters_by_lemma = ChaptersByLemmaResource(corpus)
     alignment = AlignmentResource(corpus, context.custom_cache)
     manuscript_lemmatization = LemmatizationResource(corpus, context.custom_cache)
-    manuscript = ManuscriptsResource(corpus, context.custom_cache)
-    lines = LinesResource(corpus, context.custom_cache)
-    lines_import = LinesImportResource(corpus, context.custom_cache)
+    manuscript = ManuscriptsResource(
+        corpus, context.custom_cache, context.chapter_ngram_repository
+    )
+    lines = LinesResource(
+        corpus, context.custom_cache, context.chapter_ngram_repository
+    )
+    lines_import = LinesImportResource(
+        corpus, context.custom_cache, context.chapter_ngram_repository
+    )
     colophons = ColophonsResource(corpus)
     unplaced_lines = UnplacedLinesResource(corpus)
     extant_lines = ExtantLinesResource(corpus)
diff --git a/ebl/corpus/web/lines.py b/ebl/corpus/web/lines.py
index 6ef1a3084..487c1cb85 100644
--- a/ebl/corpus/web/lines.py
+++ b/ebl/corpus/web/lines.py
@@ -7,11 +7,13 @@
 from ebl.corpus.application.corpus import Corpus
 from ebl.corpus.domain.line import Line
 from ebl.corpus.domain.lines_update import LinesUpdate
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
 from ebl.corpus.web.chapter_schemas import ApiChapterSchema, ApiLineSchema
 from ebl.corpus.web.display_schemas import LineDetailsDisplay, LineDetailsDisplaySchema
 from ebl.corpus.web.text_utils import create_chapter_id
 from ebl.errors import NotFoundError
 from ebl.marshmallowschema import validate
+from ebl.users.web.update_cache import create_chapter_ngram_cache
 from ebl.users.web.require_scope import require_scope
@@ -39,12 +41,19 @@ class LinesImportSchema(Schema):
 
 
 class LinesResource:
-    def __init__(self, corpus: Corpus, cache: ChapterCache):
+    def __init__(
+        self,
+        corpus: Corpus,
+        cache: ChapterCache,
+        ngram_repository: ChapterNGramRepository,
+    ):
         self._corpus = corpus
         self._cache = cache
+        self.ngram_repository = ngram_repository
 
     @falcon.before(require_scope, "write:texts")
     @validate(LinesUpdateSchema())
+    @falcon.after(create_chapter_ngram_cache)
     def on_post(
         self,
         req: falcon.Request,
@@ -64,12 +73,19 @@
 
 
 class LinesImportResource:
-    def __init__(self, corpus: Corpus, cache: ChapterCache):
+    def __init__(
+        self,
+        corpus: Corpus,
+        cache: ChapterCache,
+        ngram_repository: ChapterNGramRepository,
+    ):
         self._corpus = corpus
         self._cache = cache
+        self.ngram_repository = ngram_repository
 
     @falcon.before(require_scope, "write:texts")
     @validate(LinesImportSchema())
+    @falcon.after(create_chapter_ngram_cache)
     def on_post(
         self,
         req: falcon.Request,
diff --git a/ebl/corpus/web/manuscripts.py b/ebl/corpus/web/manuscripts.py
index 511d99705..35c2d7b60 100644
--- a/ebl/corpus/web/manuscripts.py
+++ b/ebl/corpus/web/manuscripts.py
@@ -3,6 +3,7 @@
 from ebl.cache.application.custom_cache import ChapterCache
 from ebl.corpus.application.corpus import Corpus
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
 from ebl.corpus.web.chapter_schemas import (
     ApiChapterSchema,
     ApiManuscriptSchema,
@@ -10,6 +11,7 @@
 )
 from ebl.corpus.web.text_utils import create_chapter_id
 from ebl.marshmallowschema import validate
+from ebl.users.web.update_cache import create_chapter_ngram_cache
 from ebl.users.web.require_scope import require_scope
@@ -21,9 +23,15 @@ class ManuscriptDtoSchema(Schema):
 
 
 class ManuscriptsResource:
-    def __init__(self, corpus: Corpus, cache: ChapterCache):
+    def __init__(
+        self,
+        corpus: Corpus,
+        cache: ChapterCache,
+        ngram_repository: ChapterNGramRepository,
+    ):
         self._corpus = corpus
         self._cache = cache
+        self.ngram_repository = ngram_repository
 
     def on_get(
         self,
@@ -41,6 +49,7 @@ def on_get(
 
     @falcon.before(require_scope, "write:texts")
     @validate(ManuscriptDtoSchema())
+    @falcon.after(create_chapter_ngram_cache)
     def on_post(
         self,
         req: falcon.Request,
diff --git a/ebl/fragmentarium/application/fragment_repository.py b/ebl/fragmentarium/application/fragment_repository.py
index f88efee6d..d4f5a0f18 100644
--- a/ebl/fragmentarium/application/fragment_repository.py
+++ b/ebl/fragmentarium/application/fragment_repository.py
@@ -17,7 +17,12 @@ def create_indexes(self) -> None:
         ...
 
     @abstractmethod
-    def create(self, fragment: Fragment, sort_key: Optional[int] = None) -> str:
+    def create(
+        self,
+        fragment: Fragment,
+        sort_key: Optional[int] = None,
+        ngram_n: Optional[Sequence[int]] = None,
+    ) -> str:
         ...
 
     @abstractmethod
diff --git a/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py
new file mode 100644
index 000000000..66476b2a9
--- /dev/null
+++ b/ebl/fragmentarium/infrastructure/fragment_ngram_repository.py
@@ -0,0 +1,75 @@
+from ebl.errors import NotFoundError
+from ebl.mongo_collection import MongoCollection
+from ebl.transliteration.domain.museum_number import MuseumNumber
+from ebl.transliteration.infrastructure.collections import (
+    FRAGMENT_NGRAM_COLLECTION,
+    FRAGMENTS_COLLECTION,
+)
+from typing import Sequence, Set, Tuple
+
+from ebl.common.query.util import aggregate_all_ngrams, replace_all
+from ebl.transliteration.infrastructure.queries import museum_number_is
+
+NGRAM_FIELD = "ngram"
+
+
+class FragmentNGramRepository:
+    def __init__(self, database):
+        self._fragments = MongoCollection(database, FRAGMENTS_COLLECTION)
+        self._ngrams = MongoCollection(database, FRAGMENT_NGRAM_COLLECTION)
+
+    def aggregate_fragment_ngrams(
+        self,
+        number: MuseumNumber,
+        N: Sequence[int],
+    ):
+        return [
+            {"$match": museum_number_is(number)},
+            {
+                "$project": {
+                    NGRAM_FIELD: {
+                        "$split": [
+                            replace_all("\n", " # "),
+                            " ",
+                        ]
+                    }
+                }
+            },
+            *aggregate_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD),
+        ]
+
+    def set_ngrams(
+        self,
+        number: MuseumNumber,
+        N: Sequence[int],
+    ) -> Set[Tuple[str]]:
+        aggregation = self.aggregate_fragment_ngrams(number, N)
+        if data := next(
+            self._fragments.aggregate(aggregation, allowDiskUse=True),
+            None,
+        ):
+            try:
+                self._ngrams.update_one(
+                    {"_id": data["_id"]}, {"$set": {NGRAM_FIELD: data[NGRAM_FIELD]}}
+                )
+            except NotFoundError:
+                self._ngrams.insert_one(data)
+
+            return {tuple(ngram) for ngram in data[NGRAM_FIELD]}
+
+        return set()
+
+    def get_ngrams(self, number: MuseumNumber) -> Set[Tuple[str]]:
+        ngrams = self._ngrams.find_one_by_id(str(number))[NGRAM_FIELD]
+
+        return {tuple(ngram) for ngram in ngrams}
+
+    def get_or_set_ngrams(
+        self, number: MuseumNumber, N: Sequence[int]
+    ) -> Set[Tuple[str]]:
+        try:
+            return self.get_ngrams(number)
+        except NotFoundError:
+            ngrams = self.set_ngrams(number, N)
+
+        return ngrams
diff --git a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py
index 33494a7eb..250b6cf58 100644
--- a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py
+++ b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py
@@ -6,6 +6,7 @@
 from ebl.bibliography.infrastructure.bibliography import join_reference_documents
 from ebl.common.domain.scopes import Scope
+from ebl.common.infrastructure.ngrams import DEFAULT_N
 from ebl.common.query.query_result import QueryResult
 from ebl.common.query.query_schemas import QueryResultSchema
 from ebl.errors import NotFoundError
@@ -19,6 +20,9 @@
 from ebl.fragmentarium.domain.joins import Join
 from ebl.fragmentarium.domain.line_to_vec_encoding import LineToVecEncoding
 from ebl.fragmentarium.infrastructure.collections import JOINS_COLLECTION
+from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
+    FragmentNGramRepository,
+)
 from ebl.fragmentarium.infrastructure.fragment_search_aggregations import PatternMatcher
 from ebl.fragmentarium.domain.date import Date, DateSchema
@@ -51,6 +55,7 @@ class MongoFragmentRepository(FragmentRepository):
     def __init__(self, database):
         self._fragments = MongoCollection(database, FRAGMENTS_COLLECTION)
         self._joins = MongoCollection(database, JOINS_COLLECTION)
+        self._ngram_repository = FragmentNGramRepository(database)
 
     def create_indexes(self) -> None:
         self._fragments.create_index(
@@ -102,14 +107,16 @@ def count_lines(self):
         except StopIteration:
             return 0
 
-    def create(self, fragment, sort_key=None):
-        return self._fragments.insert_one(
+    def create(self, fragment, sort_key=None, ngram_n=None):
+        id_ = self._fragments.insert_one(
             {
                 "_id": str(fragment.number),
                 **FragmentSchema(exclude=["joins"]).dump(fragment),
                 **({} if sort_key is None else {"_sortKey": sort_key}),
             }
         )
+        self._ngram_repository.set_ngrams(MuseumNumber.of(id_), ngram_n or DEFAULT_N)
+        return id_
 
     def create_many(self, fragments: Sequence[Fragment]) -> Sequence[str]:
         schema = FragmentSchema(exclude=["joins"])
diff --git a/ebl/fragmentarium/web/bootstrap.py b/ebl/fragmentarium/web/bootstrap.py
index 442cba654..0d5fabdb8 100644
--- a/ebl/fragmentarium/web/bootstrap.py
+++ b/ebl/fragmentarium/web/bootstrap.py
@@ -11,6 +11,7 @@ from ebl.fragmentarium.web.folios import FoliosResource
 from ebl.fragmentarium.web.fragment_genre import FragmentGenreResource
 from ebl.fragmentarium.web.fragment_script import FragmentScriptResource
+from ebl.fragmentarium.web.ngram_matcher import NgramAlignResource
 from ebl.fragmentarium.web.fragment_date import (
     FragmentDateResource,
     FragmentDatesInTextResource,
 )
@@ -73,6 +74,10 @@ def create_fragmentarium_routes(api: falcon.App, context: Context):
     fragment_date = FragmentDateResource(updater)
     fragment_dates_in_text = FragmentDatesInTextResource(updater)
 
+    ngrams = NgramAlignResource(
+        context.fragment_ngram_repository, context.chapter_ngram_repository
+    )
+
     fragment_matcher = FragmentMatcherResource(
         FragmentMatcher(context.fragment_repository)
     )
@@ -90,7 +95,9 @@ def create_fragmentarium_routes(api: falcon.App, context: Context):
     lemmatization = LemmatizationResource(updater)
     references = ReferencesResource(updater)
     transliteration = TransliterationResource(
-        updater, context.get_transliteration_update_factory()
+        updater,
+        context.get_transliteration_update_factory(),
+        context.fragment_ngram_repository,
     )
     introduction = IntroductionResource(updater)
     archaeology = ArchaeologyResource(updater)
@@ -108,6 +115,7 @@ def create_fragmentarium_routes(api: falcon.App, context: Context):
     routes = [
         ("/fragments", fragment_search),
+        ("/fragments/{number}/ngrams", ngrams),
         ("/fragments/{number}/match", fragment_matcher),
         ("/fragments/{number}/genres", fragment_genre),
         ("/fragments/{number}/script", fragment_script),
diff --git a/ebl/fragmentarium/web/ngram_matcher.py b/ebl/fragmentarium/web/ngram_matcher.py
new file mode 100644
index 000000000..0018a6616
--- /dev/null
+++ b/ebl/fragmentarium/web/ngram_matcher.py
@@ -0,0 +1,22 @@
+from falcon import Request, Response
+from ebl.common.infrastructure.ngrams import DEFAULT_N
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
+from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
+    FragmentNGramRepository,
+)
+from ebl.transliteration.domain.museum_number import MuseumNumber
+
+
+class NgramAlignResource:
+    def __init__(
+        self,
+        ngram_repository: FragmentNGramRepository,
+        chapter_ngram_repository: ChapterNGramRepository,
+    ):
+        self.ngram_repository = ngram_repository
+        self.chapter_ngram_repository = chapter_ngram_repository
+
+    def on_get(self, _req: Request, resp: Response, number: str) -> None:
+        N = _req.get_param_as_list("n", transform=int, default=DEFAULT_N)
+        ngrams = self.ngram_repository.get_or_set_ngrams(MuseumNumber.of(number), N)
+        resp.media = self.chapter_ngram_repository.compute_overlaps(ngrams)
diff --git a/ebl/fragmentarium/web/transliterations.py b/ebl/fragmentarium/web/transliterations.py
index 1d8111845..ba811cb73 100644
--- a/ebl/fragmentarium/web/transliterations.py
+++ b/ebl/fragmentarium/web/transliterations.py
@@ -3,9 +3,13 @@
 from falcon.media.validators.jsonschema import validate
 
 from ebl.fragmentarium.application.fragment_updater import FragmentUpdater
+from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
+    FragmentNGramRepository,
+)
 from ebl.fragmentarium.web.dtos import create_response_dto, parse_museum_number
 from ebl.transliteration.domain.atf import Atf
 from ebl.transliteration.domain.transliteration_error import TransliterationError
+from ebl.users.web.update_cache import create_fragment_ngram_cache
 from ebl.users.web.require_scope import require_scope
 from ebl.errors import DataError
 from ebl.fragmentarium.domain.fragment import NotLowestJoinError
@@ -19,12 +23,19 @@
 
 
 class TransliterationResource:
-    def __init__(self, updater: FragmentUpdater, transliteration_factory):
+    def __init__(
+        self,
+        updater: FragmentUpdater,
+        transliteration_factory,
+        ngram_repository: FragmentNGramRepository,
+    ):
         self._updater = updater
         self._transliteration_factory = transliteration_factory
+        self.ngram_repository = ngram_repository
 
     @falcon.before(require_scope, "transliterate:fragments")
     @validate(TRANSLITERATION_DTO_SCHEMA)
+    @falcon.after(create_fragment_ngram_cache)
     def on_post(self, req: Request, resp: Response, number: str) -> None:
         try:
             user = req.context.user
diff --git a/ebl/tests/common/ngram_test_support.py b/ebl/tests/common/ngram_test_support.py
new file mode 100644
index 000000000..767bbfd2e
--- /dev/null
+++ b/ebl/tests/common/ngram_test_support.py
@@ -0,0 +1,22 @@
+from typing import Sequence, Set, Tuple, TypeVar
+
+from ebl.common.infrastructure.ngrams import DEFAULT_N
+
+T = TypeVar("T")
+
+N_VALUES = [
+    DEFAULT_N,
+    [1],
+    [1, 2],
+    [5],
+]
+
+
+def _ngrams(sequence: Sequence[T], n: int) -> Set[Tuple[T]]:
+    return set(zip(*(sequence[i:] for i in range(n))))
+
+
+def ngrams_from_signs(signs: str, N: Sequence[int]) -> Set[Tuple[str]]:
+    split_signs = signs.replace("\n", " # ").split()
+    all_ngrams = set.union(*(_ngrams(split_signs, n) for n in N))
+    return {ngram for ngram in all_ngrams if "X" not in ngram}
diff --git a/ebl/tests/conftest.py b/ebl/tests/conftest.py
index a2c7dcd3f..8b01a9b6e 100644
--- a/ebl/tests/conftest.py
+++ b/ebl/tests/conftest.py
@@ -27,6 +27,7 @@
 from ebl.cache.infrastructure.mongo_cache_repository import MongoCacheRepository
 from ebl.changelog import Changelog
 from ebl.corpus.application.corpus import Corpus
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
 from ebl.corpus.infrastructure.mongo_text_repository import MongoTextRepository
 from ebl.dictionary.application.dictionary_service import Dictionary
 from ebl.dictionary.infrastructure.word_repository import MongoWordRepository
@@ -44,6 +45,9 @@
 from ebl.fragmentarium.infrastructure.cropped_sign_images_repository import (
     MongoCroppedSignImagesRepository,
 )
+from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
+    FragmentNGramRepository,
+)
 from ebl.fragmentarium.infrastructure.mongo_annotations_repository import (
     MongoAnnotationsRepository,
 )
@@ -199,6 +203,16 @@ def fragment_repository(database):
     return MongoFragmentRepository(database)
 
 
+@pytest.fixture
+def fragment_ngram_repository(database):
+    return FragmentNGramRepository(database)
+
+
+@pytest.fixture
+def chapter_ngram_repository(database):
+    return ChapterNGramRepository(database)
+
+
 @pytest.fixture
 def fragmentarium(fragment_repository):
     return Fragmentarium(fragment_repository)
@@ -401,6 +415,8 @@ def context(
     photo_repository,
     folio_repository,
     fragment_repository,
+    fragment_ngram_repository,
+    chapter_ngram_repository,
     text_repository,
     changelog,
     bibliography_repository,
@@ -420,6 +436,8 @@ def context(
         photo_repository=photo_repository,
         folio_repository=folio_repository,
         fragment_repository=fragment_repository,
+        fragment_ngram_repository=fragment_ngram_repository,
+        chapter_ngram_repository=chapter_ngram_repository,
         changelog=changelog,
         bibliography_repository=bibliography_repository,
         text_repository=text_repository,
diff --git a/ebl/tests/corpus/test_chapter_ngram_repository.py b/ebl/tests/corpus/test_chapter_ngram_repository.py
new file mode 100644
index 000000000..608557239
--- /dev/null
+++ b/ebl/tests/corpus/test_chapter_ngram_repository.py
@@ -0,0 +1,42 @@
+from typing import Sequence, Set, Tuple, Optional
+
+import pytest
+from ebl.tests.factories.corpus import ChapterFactory
+from ebl.corpus.infrastructure.queries import chapter_id_query
+from ebl.tests.common.ngram_test_support import ngrams_from_signs, N_VALUES
+
+
+def chapter_ngrams_from_signs(
+    chapter_signs: Sequence[Optional[str]], N: Sequence[int]
+) -> Set[Tuple[str]]:
+    return set.union(
+        *(ngrams_from_signs(signs, N) for signs in chapter_signs if signs is not None)
+    )
+
+
+def test_create_chapter_sets_ngrams(text_repository, chapter_ngram_repository):
+    chapter = ChapterFactory.build()
+    text_repository.create_chapter(chapter)
+
+    assert chapter_ngram_repository._ngrams.exists(chapter_id_query(chapter.id_))
+
+
+@pytest.mark.parametrize(
+    "N",
+    N_VALUES,
+)
+@pytest.mark.parametrize(
+    "N_NEW",
+    N_VALUES,
+)
+def test_update_chapter_ngrams(text_repository, chapter_ngram_repository, N, N_NEW):
+    chapter = ChapterFactory.build()
+    text_repository.create_chapter(chapter, N)
+
+    ngrams = chapter_ngrams_from_signs(chapter.signs, N)
+
+    assert chapter_ngram_repository.get_ngrams(chapter.id_) == ngrams
+
+    chapter_ngram_repository.set_ngrams(chapter.id_, N_NEW)
+    ngrams = chapter_ngrams_from_signs(chapter.signs, N_NEW)
+    assert chapter_ngram_repository.get_ngrams(chapter.id_) == ngrams
diff --git a/ebl/tests/fragmentarium/test_fragment_ngram_repository.py b/ebl/tests/fragmentarium/test_fragment_ngram_repository.py
new file mode 100644
index 000000000..fcd754053
--- /dev/null
+++ b/ebl/tests/fragmentarium/test_fragment_ngram_repository.py
@@ -0,0 +1,28 @@
+import pytest
+from ebl.tests.factories.fragment import TransliteratedFragmentFactory
+from ebl.tests.common.ngram_test_support import ngrams_from_signs, N_VALUES
+
+
+@pytest.mark.parametrize(
+    "N",
+    N_VALUES,
+)
+@pytest.mark.parametrize(
+    "N_NEW",
+    N_VALUES,
+)
+def test_update_fragment_ngrams(
+    fragment_repository, fragment_ngram_repository, N, N_NEW
+):
+    fragment = TransliteratedFragmentFactory.build()
+    fragment_id = fragment_repository.create(fragment)
+
+    assert not fragment_ngram_repository._ngrams.exists({"_id": fragment_id})
+
+    fragment_ngram_repository.set_ngrams(fragment.number, N)
+    ngrams = ngrams_from_signs(fragment.signs, N)
+    assert fragment_ngram_repository.get_ngrams(fragment_id) == ngrams
+
+    fragment_ngram_repository.set_ngrams(fragment.number, N_NEW)
+    ngrams = ngrams_from_signs(fragment.signs, N_NEW)
+    assert fragment_ngram_repository.get_ngrams(fragment_id) == ngrams
diff --git a/ebl/tests/fragmentarium/test_fragment_ngram_route.py b/ebl/tests/fragmentarium/test_fragment_ngram_route.py
new file mode 100644
index 000000000..469d3856d
--- /dev/null
+++ b/ebl/tests/fragmentarium/test_fragment_ngram_route.py
@@ -0,0 +1,52 @@
+from typing import Sequence
+import pytest
+from ebl.corpus.application.id_schemas import ChapterIdSchema
+from ebl.corpus.domain.chapter import Chapter
+from ebl.fragmentarium.domain.fragment import Fragment
+from ebl.tests.corpus.test_chapter_ngram_repository import chapter_ngrams_from_signs
+from ebl.tests.factories.corpus import ChapterFactory
+from ebl.tests.factories.fragment import TransliteratedFragmentFactory
+from ebl.tests.common.ngram_test_support import ngrams_from_signs, N_VALUES
+import falcon
+
+SIGNS = ["X BA KU ABZ075", "KI DU ABZ411 BA MA TI\nX MU TA MA UD", "KU ABZ411 MA KI"]
+
+
+def compute_overlap(fragment: Fragment, chapter: Chapter, N: Sequence[int]) -> float:
+    F = ngrams_from_signs(fragment.signs, N)
+    C = chapter_ngrams_from_signs(chapter.signs, N)
+
+    return (len(F & C) / min(len(F), len(C))) if F and C else 0.0
+
+
+@pytest.mark.parametrize(
+    "N",
+    N_VALUES,
+)
+def test_match_fragment_ngrams(
+    client,
+    fragment_repository,
+    text_repository,
+    N,
+):
+    fragment = TransliteratedFragmentFactory.build()
+    fragment_id = fragment_repository.create(fragment, ngram_n=N)
+    chapters = [ChapterFactory.build(signs=(signs,)) for signs in SIGNS]
+
+    for chapter in chapters:
+        text_repository.create_chapter(chapter, N)
+
+    result = client.simulate_get(f"/fragments/{fragment_id}/ngrams", params={"n": N})
+
+    assert result.status == falcon.HTTP_OK
+    assert result.json == sorted(
+        (
+            {
+                **ChapterIdSchema().dump(chapter.id_),
+                "overlap": compute_overlap(fragment, chapter, N),
+            }
+            for chapter in chapters
+        ),
+        key=lambda item: item["overlap"],
+        reverse=True,
+    )
diff --git a/ebl/transliteration/infrastructure/collections.py b/ebl/transliteration/infrastructure/collections.py
index 84c152182..7d5a6628a 100644
--- a/ebl/transliteration/infrastructure/collections.py
+++ b/ebl/transliteration/infrastructure/collections.py
@@ -1,3 +1,5 @@
 TEXTS_COLLECTION = "texts"
 CHAPTERS_COLLECTION = "chapters"
 FRAGMENTS_COLLECTION = "fragments"
+FRAGMENT_NGRAM_COLLECTION = "fragment_ngrams"
+CHAPTER_NGRAM_COLLECTION = "chapter_ngrams"
diff --git a/ebl/users/web/update_cache.py b/ebl/users/web/update_cache.py
new file mode 100644
index 000000000..9827cf3fb
--- /dev/null
+++ b/ebl/users/web/update_cache.py
@@ -0,0 +1,22 @@
+from marshmallow import EXCLUDE
+from ebl.common.infrastructure.ngrams import DEFAULT_N
+from ebl.corpus.application.id_schemas import ChapterIdSchema
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
+from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
+    FragmentNGramRepository,
+)
+from ebl.transliteration.application.museum_number_schema import MuseumNumberSchema
+
+
+def create_fragment_ngram_cache(_req, resp, resource):
+    if museum_number_dto := resp.media.get("museumNumber"):
+        ngram_repository: FragmentNGramRepository = resource.ngram_repository
+        ngram_repository.set_ngrams(
+            MuseumNumberSchema().load(museum_number_dto), DEFAULT_N
+        )
+
+
+def create_chapter_ngram_cache(_req, resp, resource):
+    ngram_repository: ChapterNGramRepository = resource.ngram_repository
+    chapter_id = ChapterIdSchema().load(resp.media, unknown=EXCLUDE)
+    ngram_repository.set_ngrams(chapter_id, DEFAULT_N)
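
For reference, the aggregation added in ebl/common/query/util.py computes sign n-grams inside MongoDB: replace_all("\n", " # ") turns line breaks into the separator sign "#", $split tokenizes the signs string, and aggregate_all_ngrams builds the windows for every n in N and drops n-grams containing excluded signs. The following is a minimal pure-Python sketch of the same idea, restating the test helper from ebl/tests/common/ngram_test_support.py; the sample signs string is illustrative only.

from typing import Sequence, Set, Tuple


def _ngrams(sequence: Sequence[str], n: int) -> Set[Tuple[str, ...]]:
    # Consecutive windows of length n, e.g. ("KU", "ABZ411") for n = 2.
    return set(zip(*(sequence[i:] for i in range(n))))


def ngrams_from_signs(signs: str, N: Sequence[int]) -> Set[Tuple[str, ...]]:
    # Line breaks become the separator sign "#", mirroring replace_all("\n", " # ").
    split_signs = signs.replace("\n", " # ").split()
    all_ngrams = set.union(*(_ngrams(split_signs, n) for n in N))
    # Drop n-grams containing the unknown sign "X"; the MongoDB pipeline
    # additionally excludes empty strings via signs_to_exclude.
    return {ngram for ngram in all_ngrams if "X" not in ngram}


print(ngrams_from_signs("KU ABZ411 MA KI", [1, 2]))
# Seven n-grams (set order may vary): the four unigrams plus
# ("KU", "ABZ411"), ("ABZ411", "MA") and ("MA", "KI").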
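The new GET /fragments/{number}/ngrams route returns chapters ranked by n-gram overlap with the fragment: compute_overlaps projects textId, name, stage and an overlap score, sorted by that score in descending order. The score corresponds to the pure-Python check in test_match_fragment_ngrams, namely the size of the intersection divided by the size of the smaller n-gram set, or 0.0 when either set is empty. A sketch with made-up sign data:

from typing import Set, Tuple

Ngrams = Set[Tuple[str, ...]]


def overlap(fragment_ngrams: Ngrams, chapter_ngrams: Ngrams) -> float:
    # |F intersect C| / min(|F|, |C|), guarding against empty sets.
    if not fragment_ngrams or not chapter_ngrams:
        return 0.0
    shared = fragment_ngrams & chapter_ngrams
    return len(shared) / min(len(fragment_ngrams), len(chapter_ngrams))


print(overlap({("KU",), ("ABZ411",)}, {("KU",), ("MA",), ("KI",)}))  # 0.5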