N-Gram Matcher #455

Status: Draft. Wants to merge 36 commits into base: master.
Changes shown below are from 20 of the 36 commits.

Commits (36)
6a40a9c  add ngram matcher route (fsimonjetz, Sep 7, 2023)
2e0125d  allow n-grams w. n=1 (fsimonjetz, Sep 8, 2023)
eb005f8  add fragment ngram aggregation (fsimonjetz, Sep 8, 2023)
1d4b686  add chapter ngram aggregation (fsimonjetz, Sep 8, 2023)
24e1617  add general ngram aggregation (fsimonjetz, Sep 11, 2023)
4f81188  add fragment ngram repo (fsimonjetz, Sep 11, 2023)
c0f4cb7  add ngram collections (fsimonjetz, Sep 11, 2023)
2281f6b  add ChapterNGramRepository (fsimonjetz, Sep 11, 2023)
d19f883  add ngram extraction hook function (fsimonjetz, Sep 11, 2023)
fe6dc7b  add FragmentNGramRepository to TransliterationResource (fsimonjetz, Sep 11, 2023)
44afafc  add fragment_ngram_repository to app (fsimonjetz, Sep 11, 2023)
010b2ac  'Refactored by Sourcery' (#456) (sourcery-ai[bot], Sep 12, 2023)
abd4864  fix aggregation, include chapter id (fsimonjetz, Sep 12, 2023)
b82a072  fix line replace bug (fsimonjetz, Sep 12, 2023)
3497f08  add create_chapter_ngram_cache (fsimonjetz, Sep 12, 2023)
b3b830f  refactoring (fsimonjetz, Sep 12, 2023)
7177c6c  add ChapterNGramRepository to app and resources (fsimonjetz, Sep 12, 2023)
b97aa65  add ChapterNGramRepository to LinesResource (fsimonjetz, Sep 12, 2023)
ccb2ce1  remove comment (fsimonjetz, Sep 12, 2023)
ad18ece  add n-gram repositories to test context (fsimonjetz, Sep 12, 2023)
8a1e84c  Refactoring (fsimonjetz, Sep 12, 2023)
0003a59  Add get_ngrams method (fsimonjetz, Sep 13, 2023)
34cf3b5  add Fragment Ngram Repo test (fsimonjetz, Sep 13, 2023)
582a099  update type cast (fsimonjetz, Sep 14, 2023)
0ea0610  add get_ngrams (fsimonjetz, Sep 19, 2023)
b9ebc26  add type hints (fsimonjetz, Sep 19, 2023)
c26b610  add chapter ngram tests, refactor (fsimonjetz, Sep 19, 2023)
4275e08  fix fragment ngram route, add test (fsimonjetz, Sep 19, 2023)
080059e  use global DEFAULT_N (fsimonjetz, Sep 21, 2023)
1fe024c  create ngrams on chapter creation (fsimonjetz, Sep 21, 2023)
7432529  rename param (fsimonjetz, Sep 21, 2023)
27cf1c9  add compute_overlaps method (fsimonjetz, Sep 21, 2023)
949c1b3  refactor (fsimonjetz, Sep 21, 2023)
4b9f2aa  refactor (fsimonjetz, Sep 21, 2023)
06576b1  add FragmentNGramRepository (fsimonjetz, Sep 21, 2023)
63039eb  extend ngram tests (fsimonjetz, Sep 21, 2023)

ebl/app.py (6 additions, 0 deletions)
@@ -9,6 +9,7 @@
 from sentry_sdk import configure_scope
 from sentry_sdk.integrations.falcon import FalconIntegration
 import althaia
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
 import ebl.error_handler
 from ebl.bibliography.infrastructure.bibliography import MongoBibliographyRepository
 from ebl.bibliography.web.bootstrap import create_bibliography_routes
@@ -25,6 +26,9 @@
 from ebl.ebl_ai_client import EblAiClient
 from ebl.files.infrastructure.grid_fs_file_repository import GridFsFileRepository
 from ebl.files.web.bootstrap import create_files_route
+from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
+    FragmentNGramRepository,
+)
 from ebl.markup.web.bootstrap import create_markup_route
 from ebl.fragmentarium.infrastructure.cropped_sign_images_repository import (
     MongoCroppedSignImagesRepository,
@@ -85,6 +89,8 @@ def create_context():
         photo_repository=GridFsFileRepository(database, "photos"),
         folio_repository=GridFsFileRepository(database, "folios"),
         fragment_repository=MongoFragmentRepository(database),
+        fragment_ngram_repository=FragmentNGramRepository(database),
+        chapter_ngram_repository=ChapterNGramRepository(database),
         changelog=Changelog(database),
         bibliography_repository=MongoBibliographyRepository(database),
         text_repository=MongoTextRepository(database),
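Note for orientation: the two new repositories take the same pymongo database handle as the existing ones, so the wiring change is purely additive. A minimal construction sketch (the connection string is an assumption for illustration, not part of this diff):

from pymongo import MongoClient

from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
    FragmentNGramRepository,
)

# Hypothetical standalone connection; inside create_context() the shared
# `database` object is reused for both repositories.
database = MongoClient("mongodb://localhost:27017")["ebl"]

fragment_ngram_repository = FragmentNGramRepository(database)
chapter_ngram_repository = ChapterNGramRepository(database)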

ebl/common/query/util.py (51 additions, 3 deletions)
@@ -1,4 +1,4 @@
-from typing import Union, Dict
+from typing import Optional, Sequence, Union, Dict


 def flatten_field(input_: Union[str, Dict], depth=1) -> Dict:
@@ -16,8 +16,8 @@ def drop_duplicates(input_: Union[str, Dict]) -> Dict:


 def ngrams(input_: Union[str, Dict], n) -> Dict:
-    if n <= 1:
-        raise ValueError("ngram size must be 2 or more")
+    if n <= 0:
+        raise ValueError("ngram size must be 1 or more")
     return {
         "$zip": {
             "inputs": [
@@ -39,3 +39,51 @@

 def filter_array(input_, as_, cond) -> Dict:
     return {"$filter": {"input": input_, "as": as_, "cond": cond}}
+
+
+def aggregate_all_ngrams(
+    input_: Union[str, Dict],
+    N: Sequence[int],
+    output_: str = "ngrams",
+    signs_to_exclude: Optional[Sequence[str]] = None,
+    ngram_field="ngram",
+):
+    if signs_to_exclude is None:
+        signs_to_exclude = ["X", ""]
+
+    exclude_empty = {
+        "$eq": [
+            {
+                "$size": {
+                    "$setIntersection": [
+                        f"$${ngram_field}",
+                        signs_to_exclude,
+                    ]
+                }
+            },
+            0,
+        ]
+    }
+    return [
+        {
+            "$addFields": {
+                output_: drop_duplicates(
+                    filter_array(
+                        {"$concatArrays": [ngrams(input_, n) for n in N if n > 0]},
+                        ngram_field,
+                        exclude_empty,
+                    )
+                )
+            }
+        },
+    ]
+
+
+def replace_all(old: str, new: str):
+    return {
+        "$replaceAll": {
+            "input": "$signs",
+            "find": old,
+            "replacement": new,
+        }
+    }
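A quick intuition check for the helpers above, since the MongoDB expressions are hard to read at a glance: ngrams(input_, n) builds an aggregation expression that, judging by the $zip stage, pairs the sign list with shifted copies of itself, and aggregate_all_ngrams concatenates those windows for every requested n, deduplicates them, and drops any window that touches an excluded sign ($setIntersection size 0). A plain-Python sketch of the same idea, illustrative only and not project code:

from typing import List, Sequence, Tuple


def py_ngrams(signs: Sequence[str], n: int) -> List[Tuple[str, ...]]:
    # Zip the list against itself shifted by 1..n-1 positions.
    return list(zip(*(signs[i:] for i in range(n))))


def py_aggregate_all_ngrams(
    signs: Sequence[str],
    N: Sequence[int],
    signs_to_exclude: Sequence[str] = ("X", ""),
) -> set:
    # Collect every n-gram for every requested size, deduplicate,
    # and skip windows containing an excluded sign (mirrors exclude_empty).
    return {
        gram
        for n in N
        if n > 0
        for gram in py_ngrams(signs, n)
        if not set(gram) & set(signs_to_exclude)
    }


print(py_aggregate_all_ngrams(["ABZ1", "ABZ2", "X", "ABZ3"], N=[1, 2]))
# {('ABZ1',), ('ABZ2',), ('ABZ3',), ('ABZ1', 'ABZ2')} (set order may vary)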

ebl/context.py (6 additions, 0 deletions)
@@ -6,6 +6,7 @@
 from ebl.bibliography.application.bibliography_repository import BibliographyRepository
 from ebl.cache.application.custom_cache import ChapterCache
 from ebl.changelog import Changelog
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
 from ebl.corpus.infrastructure.mongo_text_repository import MongoTextRepository
 from ebl.dictionary.application.word_repository import WordRepository
 from ebl.ebl_ai_client import EblAiClient
@@ -19,6 +20,9 @@
 from ebl.fragmentarium.infrastructure.cropped_sign_images_repository import (
     MongoCroppedSignImagesRepository,
 )
+from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
+    FragmentNGramRepository,
+)
 from ebl.lemmatization.application.suggestion_finder import LemmaRepository
 from ebl.transliteration.application.parallel_line_injector import ParallelLineInjector
 from ebl.transliteration.application.sign_repository import SignRepository
@@ -38,6 +42,8 @@ class Context:
     photo_repository: FileRepository
     folio_repository: FileRepository
     fragment_repository: FragmentRepository
+    fragment_ngram_repository: FragmentNGramRepository
+    chapter_ngram_repository: ChapterNGramRepository
     changelog: Changelog
     bibliography_repository: BibliographyRepository
     text_repository: MongoTextRepository
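The two new fields slot into Context alongside the existing repositories, so downstream code can reach them the same way. A short sketch assuming a populated context; the n-gram sizes are illustrative values, not defaults defined in this diff:

from ebl.context import Context


def rebuild_fragment_ngrams(context: Context, number: dict) -> None:
    # Delegates to the repository carried on the context, like the other repositories.
    context.fragment_ngram_repository.update_ngrams(number, N=[1, 2, 3])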

ebl/corpus/infrastructure/corpus_ngram_repository.py (new file, 72 additions)
@@ -0,0 +1,72 @@
+from ebl.corpus.domain.chapter import ChapterId
+from ebl.corpus.infrastructure.queries import chapter_id_query
+from ebl.errors import NotFoundError
+from ebl.mongo_collection import MongoCollection
+from ebl.transliteration.infrastructure.collections import (
+    CHAPTER_NGRAM_COLLECTION,
+    CHAPTERS_COLLECTION,
+)
+from typing import Optional, Sequence
+
+from ebl.common.query.util import aggregate_all_ngrams, replace_all
+
+NGRAM_FIELD = "ngram"
+
+
+class ChapterNGramRepository:
+    def __init__(self, database):
+        self._chapters = MongoCollection(database, CHAPTERS_COLLECTION)
+        self._ngrams = MongoCollection(database, CHAPTER_NGRAM_COLLECTION)
+
+    def aggregate_chapter_ngrams(
+        self,
+        chapter_id: ChapterId,
+        N: Sequence[int],
+        signs_to_exclude: Optional[Sequence[str]] = None,
+    ):
+        return [
+            {"$match": chapter_id_query(chapter_id)},
+            {"$project": {"signs": 1, "textId": 1, "stage": 1, "name": 1}},
+            {"$unwind": "$signs"},
+            {
+                "$addFields": {
+                    NGRAM_FIELD: {
+                        "$split": [
+                            replace_all("\n", " # "),
+                            " ",
+                        ]
+                    }
+                }
+            },
+            *aggregate_all_ngrams(f"${NGRAM_FIELD}", N, NGRAM_FIELD, signs_to_exclude),
+            {"$unwind": f"${NGRAM_FIELD}"},
+            {
+                "$group": {
+                    "_id": None,
+                    NGRAM_FIELD: {"$addToSet": f"${NGRAM_FIELD}"},
+                    "textId": {"$first": "$textId"},
+                    "name": {"$first": "$name"},
+                    "stage": {"$first": "$stage"},
+                }
+            },
+            {"$project": {"_id": False}},
+        ]
+
+    def update_ngrams(
+        self,
+        chapter_id: ChapterId,
+        N: Sequence[int],
+        signs_to_exclude: Optional[Sequence[str]] = None,
+    ) -> None:
+        aggregation = self.aggregate_chapter_ngrams(chapter_id, N, signs_to_exclude)
+        if data := next(
+            self._chapters.aggregate(aggregation, allowDiskUse=True),
+            None,
+        ):
+            try:
+                self._ngrams.update_one(
+                    chapter_id_query(chapter_id),
+                    {"$set": {NGRAM_FIELD: data[NGRAM_FIELD]}},
+                )
+            except NotFoundError:
+                self._ngrams.insert_one(data)
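Reading the pipeline: the chapter's sign lines are unwound, line breaks become a "#" separator sign, the text is split into individual signs, n-grams of every requested size are built, and the $group stage collapses them into one deduplicated set per chapter keyed by textId, stage and name. update_ngrams is effectively an upsert: update_one when a document for the chapter already exists, insert_one otherwise. A usage sketch, assuming a pymongo database handle and an existing ChapterId; the sizes are illustrative:

from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository


def refresh_chapter_ngrams(database, chapter_id) -> None:
    # `database` is the handle wired in ebl/app.py; `chapter_id` is a ChapterId
    # as used throughout the corpus module.
    repo = ChapterNGramRepository(database)
    repo.update_ngrams(chapter_id, N=[1, 2, 3])
    # The chapter n-gram collection then holds one document per chapter with
    # textId, stage, name and the deduplicated "ngram" set.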

ebl/corpus/web/bootstrap.py (9 additions, 3 deletions)
@@ -48,9 +48,15 @@ def create_corpus_routes(api: falcon.App, context: Context):
     chapters_by_lemma = ChaptersByLemmaResource(corpus)
     alignment = AlignmentResource(corpus, context.custom_cache)
     manuscript_lemmatization = LemmatizationResource(corpus, context.custom_cache)
-    manuscript = ManuscriptsResource(corpus, context.custom_cache)
-    lines = LinesResource(corpus, context.custom_cache)
-    lines_import = LinesImportResource(corpus, context.custom_cache)
+    manuscript = ManuscriptsResource(
+        corpus, context.custom_cache, context.chapter_ngram_repository
+    )
+    lines = LinesResource(
+        corpus, context.custom_cache, context.chapter_ngram_repository
+    )
+    lines_import = LinesImportResource(
+        corpus, context.custom_cache, context.chapter_ngram_repository
+    )
     colophons = ColophonsResource(corpus)
     unplaced_lines = UnplacedLinesResource(corpus)
     extant_lines = ExtantLinesResource(corpus)
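Only the constructor calls change here; route registration is outside this hunk. For completeness, a minimal sketch of how such a resource is mounted on a Falcon app, with stand-in dependencies and a placeholder URL template that is not the project's actual scheme:

import falcon
from unittest.mock import MagicMock

from ebl.corpus.web.lines import LinesResource

# Stand-ins for the objects create_corpus_routes builds from the context (sketch only).
corpus = MagicMock()
chapter_cache = MagicMock()
chapter_ngram_repository = MagicMock()

api = falcon.App()
lines = LinesResource(corpus, chapter_cache, chapter_ngram_repository)
api.add_route(
    "/texts/{genre}/{category}/{index}/chapters/{stage}/{name}/lines",  # placeholder template
    lines,
)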

ebl/corpus/web/lines.py (18 additions, 2 deletions)
@@ -7,11 +7,13 @@
 from ebl.corpus.application.corpus import Corpus
 from ebl.corpus.domain.line import Line
 from ebl.corpus.domain.lines_update import LinesUpdate
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
 from ebl.corpus.web.chapter_schemas import ApiChapterSchema, ApiLineSchema
 from ebl.corpus.web.display_schemas import LineDetailsDisplay, LineDetailsDisplaySchema
 from ebl.corpus.web.text_utils import create_chapter_id
 from ebl.errors import NotFoundError
 from ebl.marshmallowschema import validate
+from ebl.users.web.update_cache import create_chapter_ngram_cache
 from ebl.users.web.require_scope import require_scope

@@ -39,12 +41,19 @@


 class LinesResource:
-    def __init__(self, corpus: Corpus, cache: ChapterCache):
+    def __init__(
+        self,
+        corpus: Corpus,
+        cache: ChapterCache,
+        ngram_repository: ChapterNGramRepository,
+    ):
         self._corpus = corpus
         self._cache = cache
+        self.ngram_repository = ngram_repository

     @falcon.before(require_scope, "write:texts")
     @validate(LinesUpdateSchema())
+    @falcon.after(create_chapter_ngram_cache)
     def on_post(
         self,
         req: falcon.Request,
@@ -64,12 +73,19 @@


 class LinesImportResource:
-    def __init__(self, corpus: Corpus, cache: ChapterCache):
+    def __init__(
+        self,
+        corpus: Corpus,
+        cache: ChapterCache,
+        ngram_repository: ChapterNGramRepository,
+    ):
         self._corpus = corpus
         self._cache = cache
+        self.ngram_repository = ngram_repository

     @falcon.before(require_scope, "write:texts")
     @validate(LinesImportSchema())
+    @falcon.after(create_chapter_ngram_cache)
     def on_post(
         self,
         req: falcon.Request,
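The key addition here is the @falcon.after(create_chapter_ngram_cache) hook, which runs after the responder so the chapter n-grams can be refreshed once an edit has gone through. The hook's body is not part of this diff; the sketch below only illustrates the shape Falcon expects from an after-hook, with entirely hypothetical internals:

import falcon


def refresh_chapter_ngrams_hook(req: falcon.Request, resp: falcon.Response, resource) -> None:
    # Hypothetical stand-in for create_chapter_ngram_cache: Falcon invokes after-hooks
    # with (req, resp, resource), so the resource's ngram_repository is reachable here
    # once the edit has been applied.
    if hasattr(resource, "ngram_repository"):
        pass  # e.g. resource.ngram_repository.update_ngrams(<chapter id from req>, N=[1, 2, 3])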

ebl/corpus/web/manuscripts.py (10 additions, 1 deletion)
@@ -3,13 +3,15 @@

 from ebl.cache.application.custom_cache import ChapterCache
 from ebl.corpus.application.corpus import Corpus
+from ebl.corpus.infrastructure.corpus_ngram_repository import ChapterNGramRepository
 from ebl.corpus.web.chapter_schemas import (
     ApiChapterSchema,
     ApiManuscriptSchema,
     MuseumNumberString,
 )
 from ebl.corpus.web.text_utils import create_chapter_id
 from ebl.marshmallowschema import validate
+from ebl.users.web.update_cache import create_chapter_ngram_cache
 from ebl.users.web.require_scope import require_scope

@@ -21,9 +23,15 @@ class ManuscriptDtoSchema(Schema):


 class ManuscriptsResource:
-    def __init__(self, corpus: Corpus, cache: ChapterCache):
+    def __init__(
+        self,
+        corpus: Corpus,
+        cache: ChapterCache,
+        ngram_repository: ChapterNGramRepository,
+    ):
         self._corpus = corpus
         self._cache = cache
+        self.ngram_repository = ngram_repository

     def on_get(
         self,
@@ -41,6 +49,7 @@

     @falcon.before(require_scope, "write:texts")
     @validate(ManuscriptDtoSchema())
+    @falcon.after(create_chapter_ngram_cache)
     def on_post(
         self,
         req: falcon.Request,
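All three corpus resources now share the same three-argument constructor. A minimal check of that contract, as a pytest-style sketch with mocks; this is not taken from the project's test suite:

from unittest.mock import MagicMock

from ebl.corpus.web.manuscripts import ManuscriptsResource


def test_manuscripts_resource_keeps_ngram_repository():
    ngram_repository = MagicMock()
    resource = ManuscriptsResource(MagicMock(), MagicMock(), ngram_repository)
    assert resource.ngram_repository is ngram_repository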

ebl/fragmentarium/infrastructure/fragment_ngram_repository.py (new file, 58 additions)
@@ -0,0 +1,58 @@
+from ebl.errors import NotFoundError
+from ebl.mongo_collection import MongoCollection
+from ebl.transliteration.infrastructure.collections import (
+    FRAGMENT_NGRAM_COLLECTION,
+    FRAGMENTS_COLLECTION,
+)
+from typing import Optional, Sequence
+
+from ebl.common.query.util import aggregate_all_ngrams, replace_all
+
+NGRAM_FIELD = "ngram"
+
+
+class FragmentNGramRepository:
+    def __init__(self, database):
+        self._fragments = MongoCollection(database, FRAGMENTS_COLLECTION)
+        self._ngrams = MongoCollection(database, FRAGMENT_NGRAM_COLLECTION)
+
+    def aggregate_fragment_ngrams(
+        self,
+        number: dict,
+        N: Sequence[int],
+        signs_to_exclude: Optional[Sequence[str]] = None,
+    ):
+        return [
+            {"$match": {f"museumNumber.{key}": value for key, value in number.items()}},
+            {
+                "$project": {
+                    f"{NGRAM_FIELD}s": {
+                        "$split": [
+                            replace_all("\n", " # "),
+                            " ",
+                        ]
+                    }
+                }
+            },
+            *aggregate_all_ngrams(
+                f"${NGRAM_FIELD}s", N, f"{NGRAM_FIELD}s", signs_to_exclude
+            ),
+        ]
+
+    def update_ngrams(
+        self,
+        number: dict,
+        N: Sequence[int],
+        signs_to_exclude: Optional[Sequence[str]] = None,
+    ) -> None:
+        aggregation = self.aggregate_fragment_ngrams(number, N, signs_to_exclude)
+        if data := next(
+            self._fragments.aggregate(aggregation, allowDiskUse=True),
+            None,
+        ):
+            try:
+                self._ngrams.update_one(
+                    {"_id": data["_id"]}, {"$set": {"ngrams": data["ngrams"]}}
+                )
+            except NotFoundError:
+                self._ngrams.insert_one(data)
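As with the chapter variant, update_ngrams is effectively an upsert: the aggregation projects the fragment's signs into n-grams, update_one is attempted, and insert_one is used when no n-gram document exists yet. A usage sketch, assuming a pymongo database handle; the museum-number fields and sizes are illustrative values:

from ebl.fragmentarium.infrastructure.fragment_ngram_repository import (
    FragmentNGramRepository,
)


def refresh_fragment_ngrams(database, museum_number: dict) -> None:
    # museum_number is passed as its component fields, matching the $match stage above,
    # e.g. {"prefix": "K", "number": "123", "suffix": ""} (illustrative values).
    repo = FragmentNGramRepository(database)
    repo.update_ngrams(museum_number, N=[1, 2, 3], signs_to_exclude=["X", ""])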