Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

N-Gram Matcher #455

Draft
wants to merge 36 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
6a40a9c
add ngram matcher route
fsimonjetz Sep 7, 2023
2e0125d
allow n-grams w. n=1
fsimonjetz Sep 8, 2023
eb005f8
add fragment ngram aggregation
fsimonjetz Sep 8, 2023
1d4b686
add chapter ngram aggregation
fsimonjetz Sep 8, 2023
24e1617
add general ngram aggregation
fsimonjetz Sep 11, 2023
4f81188
add fragment ngram repo
fsimonjetz Sep 11, 2023
c0f4cb7
add ngram collections
fsimonjetz Sep 11, 2023
2281f6b
add ChapterNGramRepository
fsimonjetz Sep 11, 2023
d19f883
add ngram extraction hook function
fsimonjetz Sep 11, 2023
fe6dc7b
add FragmentNGramRepository to TransliterationResource
fsimonjetz Sep 11, 2023
44afafc
add fragment_ngram_repository to app
fsimonjetz Sep 11, 2023
010b2ac
'Refactored by Sourcery' (#456)
sourcery-ai[bot] Sep 12, 2023
abd4864
fix aggregation, include chapter id
fsimonjetz Sep 12, 2023
b82a072
fix line replace bug
fsimonjetz Sep 12, 2023
3497f08
add create_chapter_ngram_cache
fsimonjetz Sep 12, 2023
b3b830f
refactoring
fsimonjetz Sep 12, 2023
7177c6c
add ChapterNGramRepository to app and resources
fsimonjetz Sep 12, 2023
b97aa65
add ChapterNGramRepository to LinesResource
fsimonjetz Sep 12, 2023
ccb2ce1
remove comment
fsimonjetz Sep 12, 2023
ad18ece
add n-gram repositories to test context
fsimonjetz Sep 12, 2023
8a1e84c
Refactoring
fsimonjetz Sep 12, 2023
0003a59
Add get_ngrams method
fsimonjetz Sep 13, 2023
34cf3b5
add Fragment Ngram Repo test
fsimonjetz Sep 13, 2023
582a099
update type cast
fsimonjetz Sep 14, 2023
0ea0610
add get_ngrams
fsimonjetz Sep 19, 2023
b9ebc26
add type hints
fsimonjetz Sep 19, 2023
c26b610
add chapter ngram tests, refactor
fsimonjetz Sep 19, 2023
4275e08
fix fragment ngram route, add test
fsimonjetz Sep 19, 2023
080059e
use global DEFAULT_N
fsimonjetz Sep 21, 2023
1fe024c
create ngrams on chapter creation
fsimonjetz Sep 21, 2023
7432529
rename param
fsimonjetz Sep 21, 2023
27cf1c9
add compute_overlaps method
fsimonjetz Sep 21, 2023
949c1b3
refactor
fsimonjetz Sep 21, 2023
4b9f2aa
refactor
fsimonjetz Sep 21, 2023
06576b1
add FragmentNGramRepository
fsimonjetz Sep 21, 2023
63039eb
extend ngram tests
fsimonjetz Sep 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions ebl/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from sentry_sdk import configure_scope
from sentry_sdk.integrations.falcon import FalconIntegration
import althaia
from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
import ebl.error_handler
from ebl.bibliography.infrastructure.bibliography import MongoBibliographyRepository
from ebl.bibliography.web.bootstrap import create_bibliography_routes
Expand All @@ -25,6 +26,9 @@
from ebl.ebl_ai_client import EblAiClient
from ebl.files.infrastructure.grid_fs_file_repository import GridFsFileRepository
from ebl.files.web.bootstrap import create_files_route
from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
FragmentNGramRepository,
)
from ebl.markup.web.bootstrap import create_markup_route
from ebl.fragmentarium.infrastructure.cropped_sign_images_repository import (
MongoCroppedSignImagesRepository,
Expand Down Expand Up @@ -85,6 +89,8 @@ def create_context():
photo_repository=GridFsFileRepository(database, "photos"),
folio_repository=GridFsFileRepository(database, "folios"),
fragment_repository=MongoFragmentRepository(database),
fragment_ngram_repository=FragmentNGramRepository(database),
chapter_ngram_repository=ChapterNGramRepository(database),
changelog=Changelog(database),
bibliography_repository=MongoBibliographyRepository(database),
text_repository=MongoTextRepository(database),
Expand Down
1 change: 1 addition & 0 deletions ebl/common/infrastructure/ngrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Default n-gram sizes (unigrams, bigrams, trigrams), used as the fallback
# when a caller does not pass explicit sizes.
DEFAULT_N = [1, 2, 3]
51 changes: 48 additions & 3 deletions ebl/common/query/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Union, Dict
from typing import Sequence, Union, Dict


def flatten_field(input_: Union[str, Dict], depth=1) -> Dict:
Expand All @@ -16,8 +16,8 @@ def drop_duplicates(input_: Union[str, Dict]) -> Dict:


def ngrams(input_: Union[str, Dict], n) -> Dict:
if n <= 1:
raise ValueError("ngram size must be 2 or more")
if n <= 0:
raise ValueError("ngram size must be 1 or more")
return {
"$zip": {
"inputs": [
Expand All @@ -39,3 +39,48 @@ def ngrams(input_: Union[str, Dict], n) -> Dict:

def filter_array(input_, as_, cond) -> Dict:
return {"$filter": {"input": input_, "as": as_, "cond": cond}}


def aggregate_all_ngrams(
    input_: Union[str, Dict],
    N: Sequence[int],
    output_: str = "ngrams",
) -> Sequence[Dict]:
    """Build the aggregation stage(s) that compute n-grams of several sizes.

    Args:
        input_: Expression resolving to the array the n-grams are built from.
        N: The n-gram sizes to generate. Non-positive sizes are skipped
            instead of being passed on to ``ngrams`` (which rejects them).
        output_: Name of the field the resulting n-grams are written to.

    Returns:
        A list with a single ``$addFields`` stage. It is returned as a list
        so callers can splice it into a pipeline with ``*``.
    """
    signs_to_exclude = ["X", ""]

    # Condition for ``$filter``: keep an n-gram ("$$this") only if it shares
    # no element with ``signs_to_exclude``, i.e. the intersection is empty.
    exclude_empty = {
        "$eq": [
            {
                "$size": {
                    "$setIntersection": [
                        "$$this",
                        signs_to_exclude,
                    ]
                }
            },
            0,
        ]
    }

    # One n-gram array per requested size, concatenated into a single array.
    all_ngrams = {"$concatArrays": [ngrams(input_, n) for n in N if n > 0]}

    return [
        {
            "$addFields": {
                # NOTE(review): drop_duplicates presumably removes repeated
                # n-grams; its body is not visible here, so confirm.
                output_: drop_duplicates(
                    filter_array(all_ngrams, "this", exclude_empty)
                )
            }
        },
    ]


def replace_all(old: str, new: str, input_: Union[str, Dict] = "$signs") -> Dict:
    """Build a MongoDB ``$replaceAll`` expression.

    Args:
        old: The substring to search for.
        new: The replacement for every occurrence of ``old``.
        input_: Expression resolving to the string to operate on. Defaults to
            ``"$signs"``, the value that used to be hard-coded, so existing
            two-argument calls behave exactly as before.

    Returns:
        The ``$replaceAll`` expression as a dict, ready to be embedded in an
        aggregation stage.
    """
    return {
        "$replaceAll": {
            "input": input_,
            "find": old,
            "replacement": new,
        }
    }
6 changes: 6 additions & 0 deletions ebl/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from ebl.bibliography.application.bibliography_repository import BibliographyRepository
from ebl.cache.application.custom_cache import ChapterCache
from ebl.changelog import Changelog
from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
from ebl.corpus.infrastructure.mongo_text_repository import MongoTextRepository
from ebl.dictionary.application.word_repository import WordRepository
from ebl.ebl_ai_client import EblAiClient
Expand All @@ -19,6 +20,9 @@
from ebl.fragmentarium.infrastructure.cropped_sign_images_repository import (
MongoCroppedSignImagesRepository,
)
from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
FragmentNGramRepository,
)
from ebl.lemmatization.application.suggestion_finder import LemmaRepository
from ebl.transliteration.application.parallel_line_injector import ParallelLineInjector
from ebl.transliteration.application.sign_repository import SignRepository
Expand All @@ -38,6 +42,8 @@ class Context:
photo_repository: FileRepository
folio_repository: FileRepository
fragment_repository: FragmentRepository
fragment_ngram_repository: FragmentNGramRepository
chapter_ngram_repository: ChapterNGramRepository
changelog: Changelog
bibliography_repository: BibliographyRepository
text_repository: MongoTextRepository
Expand Down
4 changes: 3 additions & 1 deletion ebl/corpus/application/text_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ def create(self, text: Text) -> None:
...

@abstractmethod
def create_chapter(self, chapter: Chapter) -> None:
def create_chapter(
self, chapter: Chapter, ngram_n: Optional[Sequence[int]] = None
) -> None:
...

@abstractmethod
Expand Down
134 changes: 134 additions & 0 deletions ebl/corpus/infrastructure/corpus_ngram_repository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
from ebl.corpus.application.id_schemas import ChapterIdSchema
from ebl.corpus.domain.chapter import ChapterId
from ebl.corpus.infrastructure.queries import chapter_id_query
from ebl.errors import NotFoundError
from ebl.mongo_collection import MongoCollection
from ebl.transliteration.infrastructure.collections import (
CHAPTER_NGRAM_COLLECTION,
CHAPTERS_COLLECTION,
)
from typing import List, Optional, Sequence, Set, Tuple

from ebl.common.query.util import aggregate_all_ngrams, replace_all

# Name of the document field under which a chapter's n-grams are stored.
NGRAM_FIELD = "ngram"


class ChapterNGramRepository:
    """Compute, store and compare sign n-grams of corpus chapters.

    Chapters are read from ``CHAPTERS_COLLECTION``; the n-grams derived from
    them are kept in ``CHAPTER_NGRAM_COLLECTION``.
    """

    def __init__(self, database):
        self._chapters = MongoCollection(database, CHAPTERS_COLLECTION)
        self._ngrams = MongoCollection(database, CHAPTER_NGRAM_COLLECTION)

    def aggregate_chapter_ngrams(
        self,
        chapter_id: ChapterId,
        N: Sequence[int],
    ) -> Sequence[dict]:
        """Build the pipeline that extracts all n-grams of one chapter.

        Args:
            chapter_id: The chapter whose signs are processed.
            N: The n-gram sizes to generate.

        Returns:
            Aggregation stages yielding a single document with the chapter's
            ``textId``, ``name``, ``stage`` and the set of its n-grams.
        """
        return [
            {"$match": chapter_id_query(chapter_id)},
            {"$project": {"signs": 1, "textId": 1, "stage": 1, "name": 1}},
            # One document per entry of the ``signs`` array.
            {"$unwind": "$signs"},
            {
                "$addFields": {
                    # Line breaks become a "#" token of their own, then the
                    # string is tokenized on single spaces.
                    NGRAM_FIELD: {
                        "$split": [
                            replace_all("\n", " # "),
                            " ",
                        ]
                    }
                }
            },
            *aggregate_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD),
            {"$unwind": f"${NGRAM_FIELD}"},
            {
                # Merge the n-grams of all unwound documents into one set.
                "$group": {
                    "_id": None,
                    NGRAM_FIELD: {"$addToSet": f"${NGRAM_FIELD}"},
                    "textId": {"$first": "$textId"},
                    "name": {"$first": "$name"},
                    "stage": {"$first": "$stage"},
                }
            },
            {"$project": {"_id": False}},
        ]

    def set_ngrams(
        self,
        chapter_id: ChapterId,
        N: Sequence[int],
    ) -> Set[Tuple[str, ...]]:
        """Compute the n-grams of a chapter and persist them.

        An existing n-gram document is updated; if there is none, a new one
        is inserted.

        Args:
            chapter_id: The chapter to (re)compute n-grams for.
            N: The n-gram sizes to generate.

        Returns:
            The computed n-grams as a set of tuples.
        """
        aggregation = self.aggregate_chapter_ngrams(chapter_id, N)
        # If the aggregation yields nothing, fall back to an entry with an
        # empty n-gram list plus the serialized chapter id.
        data = next(
            self._chapters.aggregate(aggregation, allowDiskUse=True),
            {NGRAM_FIELD: [], **ChapterIdSchema().dump(chapter_id)},
        )
        try:
            self._ngrams.update_one(
                chapter_id_query(chapter_id),
                {"$set": {NGRAM_FIELD: data[NGRAM_FIELD]}},
            )
        except NotFoundError:
            # NOTE(review): relies on the collection wrapper raising
            # NotFoundError when no document matched the update; confirm.
            self._ngrams.insert_one(data)

        return {tuple(ngram) for ngram in data[NGRAM_FIELD]}

    def get_ngrams(self, chapter_id: ChapterId) -> Set[Tuple[str, ...]]:
        """Return the stored n-grams of a chapter as a set of tuples.

        NOTE(review): ``get_or_set_ngrams`` expects this lookup to raise
        ``NotFoundError`` when no n-gram document exists; confirm that
        ``find_one`` of the collection wrapper behaves that way.
        """
        ngrams = self._ngrams.find_one(chapter_id_query(chapter_id))[NGRAM_FIELD]

        return {tuple(ngram) for ngram in ngrams}

    def get_or_set_ngrams(
        self, chapter_id: ChapterId, N: Sequence[int]
    ) -> Set[Tuple[str, ...]]:
        """Return the stored n-grams of a chapter, computing them if missing.

        Args:
            chapter_id: The chapter to look up.
            N: The n-gram sizes to generate when nothing is stored yet.
        """
        try:
            return self.get_ngrams(chapter_id)
        except NotFoundError:
            return self.set_ngrams(chapter_id, N)

    def compute_overlaps(
        self, ngrams: Set[Tuple[str, ...]], limit: Optional[int] = None
    ) -> Sequence[dict]:
        """Rank stored chapters by n-gram overlap with the given n-grams.

        The overlap of a chapter is the size of the intersection of both
        n-gram sets divided by the size of the smaller set, or ``0.0`` if
        the smaller set is empty.

        Args:
            ngrams: The n-grams to compare the stored chapters against.
            limit: Maximum number of results; a falsy value means no limit.

        Returns:
            Documents with ``textId``, ``name``, ``stage`` and ``overlap``,
            sorted by descending overlap.
        """
        ngram_list = list(ngrams)
        # The size of the query set is constant, so it is computed here
        # instead of shipping the whole list a second time just for $size.
        query_size = len(ngram_list)
        ngram_path = f"${NGRAM_FIELD}"
        pipeline: List[dict] = [
            # NOTE(review): category 99 is excluded; its meaning is not
            # visible in this file.
            {"$match": {"textId.category": {"$ne": 99}}},
            {
                "$project": {
                    "_id": 0,
                    "textId": 1,
                    "name": 1,
                    "stage": 1,
                    "overlap": {
                        "$let": {
                            "vars": {
                                "intersection": {
                                    "$size": {
                                        "$setIntersection": [
                                            ngram_path,
                                            ngram_list,
                                        ]
                                    }
                                },
                                "minLength": {
                                    "$min": [
                                        {"$size": ngram_path},
                                        query_size,
                                    ]
                                },
                            },
                            "in": {
                                # Guard against division by zero.
                                "$cond": [
                                    {"$eq": ["$$minLength", 0]},
                                    0.0,
                                    {"$divide": ["$$intersection", "$$minLength"]},
                                ]
                            },
                        }
                    },
                }
            },
            {"$sort": {"overlap": -1}},
        ]

        if limit:
            pipeline.append({"$limit": limit})

        return list(self._ngrams.aggregate(pipeline, allowDiskUse=True))
8 changes: 7 additions & 1 deletion ebl/corpus/infrastructure/mongo_text_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@


from ebl.bibliography.infrastructure.bibliography import join_reference_documents
from ebl.common.infrastructure.ngrams import DEFAULT_N
from ebl.common.query.query_result import CorpusQueryResult
from ebl.common.query.query_schemas import CorpusQueryResultSchema
from ebl.corpus.application.text_repository import TextRepository
Expand All @@ -28,6 +29,7 @@
from ebl.corpus.infrastructure.chapter_query_filters import (
filter_query_by_transliteration,
)
from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
from ebl.corpus.infrastructure.corpus_search_aggregations import CorpusPatternMatcher
from ebl.corpus.infrastructure.manuscript_lemma_filter import (
filter_manuscripts_by_lemma,
Expand Down Expand Up @@ -68,6 +70,7 @@ class MongoTextRepository(TextRepository):
def __init__(self, database: Database):
self._texts = MongoCollection(database, TEXTS_COLLECTION)
self._chapters = MongoCollection(database, CHAPTERS_COLLECTION)
self._ngram_repository = ChapterNGramRepository(database)

def create_indexes(self) -> None:
self._texts.create_index(
Expand Down Expand Up @@ -107,8 +110,11 @@ def create_indexes(self) -> None:
def create(self, text: Text) -> None:
self._texts.insert_one(TextSchema(exclude=["chapters"]).dump(text))

def create_chapter(self, chapter: Chapter) -> None:
def create_chapter(
self, chapter: Chapter, ngram_n: Optional[Sequence[int]] = None
) -> None:
self._chapters.insert_one(ChapterSchema().dump(chapter))
self._ngram_repository.set_ngrams(chapter.id_, ngram_n or DEFAULT_N)

def find(self, id_: TextId) -> Text:
try:
Expand Down
12 changes: 9 additions & 3 deletions ebl/corpus/web/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,15 @@ def create_corpus_routes(api: falcon.App, context: Context):
chapters_by_lemma = ChaptersByLemmaResource(corpus)
alignment = AlignmentResource(corpus, context.custom_cache)
manuscript_lemmatization = LemmatizationResource(corpus, context.custom_cache)
manuscript = ManuscriptsResource(corpus, context.custom_cache)
lines = LinesResource(corpus, context.custom_cache)
lines_import = LinesImportResource(corpus, context.custom_cache)
manuscript = ManuscriptsResource(
corpus, context.custom_cache, context.chapter_ngram_repository
)
lines = LinesResource(
corpus, context.custom_cache, context.chapter_ngram_repository
)
lines_import = LinesImportResource(
corpus, context.custom_cache, context.chapter_ngram_repository
)
colophons = ColophonsResource(corpus)
unplaced_lines = UnplacedLinesResource(corpus)
extant_lines = ExtantLinesResource(corpus)
Expand Down
20 changes: 18 additions & 2 deletions ebl/corpus/web/lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
from ebl.corpus.application.corpus import Corpus
from ebl.corpus.domain.line import Line
from ebl.corpus.domain.lines_update import LinesUpdate
from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
from ebl.corpus.web.chapter_schemas import ApiChapterSchema, ApiLineSchema
from ebl.corpus.web.display_schemas import LineDetailsDisplay, LineDetailsDisplaySchema
from ebl.corpus.web.text_utils import create_chapter_id
from ebl.errors import NotFoundError
from ebl.marshmallowschema import validate
from ebl.users.web.update_cache import create_chapter_ngram_cache
from ebl.users.web.require_scope import require_scope


Expand Down Expand Up @@ -39,12 +41,19 @@ class LinesImportSchema(Schema):


class LinesResource:
def __init__(self, corpus: Corpus, cache: ChapterCache):
def __init__(
self,
corpus: Corpus,
cache: ChapterCache,
ngram_repository: ChapterNGramRepository,
):
self._corpus = corpus
self._cache = cache
self.ngram_repository = ngram_repository

@falcon.before(require_scope, "write:texts")
@validate(LinesUpdateSchema())
@falcon.after(create_chapter_ngram_cache)
def on_post(
self,
req: falcon.Request,
Expand All @@ -64,12 +73,19 @@ def on_post(


class LinesImportResource:
def __init__(self, corpus: Corpus, cache: ChapterCache):
def __init__(
self,
corpus: Corpus,
cache: ChapterCache,
ngram_repository: ChapterNGramRepository,
):
self._corpus = corpus
self._cache = cache
self.ngram_repository = ngram_repository

@falcon.before(require_scope, "write:texts")
@validate(LinesImportSchema())
@falcon.after(create_chapter_ngram_cache)
def on_post(
self,
req: falcon.Request,
Expand Down
Loading
Loading