Skip to content

Commit

Permalink
Add mimetype inclusion/exclusion flags to reingest command
Browse files (browse the repository at this point in the history)
  • Loading branch information
stchris committed Feb 28, 2024
1 parent c7a1b5e commit d908f63
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 4 deletions.
12 changes: 10 additions & 2 deletions aleph/logic/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,21 @@ def _generate():


def reingest_collection(
    collection,
    job_id=None,
    index=False,
    flush=True,
    include_ingest=False,
    include_mimetypes=None,
    exclude_mimetypes=None,
):
    """Trigger a re-ingest for the documents in the collection.

    Args:
        collection: The collection whose documents are re-ingested; its
            ``id`` and ``ns`` attributes are read here.
        job_id: Identifier attached to every queued ingest task. A random
            one is generated via ``Job.random_id()`` when omitted.
        index: Forwarded unchanged to ``ingest_entity`` for each document.
        flush: When true, ``ingest_flush`` is called on the collection
            before any document is queued.
        include_ingest: Forwarded to ``ingest_flush`` (only used when
            ``flush`` is true).
        include_mimetypes: Optional iterable of MIME types; when non-empty,
            only documents with one of these types are re-ingested.
        exclude_mimetypes: Optional iterable of MIME types; when non-empty,
            documents with one of these types are skipped.
    """
    job_id = job_id or Job.random_id()
    if flush:
        ingest_flush(collection, include_ingest=include_ingest)
    # Empty or None filters are treated as "no restriction" by the query
    # helper, so they are passed through as-is.
    documents = Document.by_collection_and_mimetype(
        collection.id, include_mimetypes, exclude_mimetypes
    )
    for document in documents:
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=index)

Expand Down
30 changes: 28 additions & 2 deletions aleph/manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,11 +205,37 @@ def reindex_casefiles(flush=False):
@click.option("--index", is_flag=True, default=False)
@click.option("--include_ingest", is_flag=True, default=False)
@click.option("--flush/--no-flush", default=True)
def reingest(foreign_id, index=False, flush=True, include_ingest=False):
@click.option(
"--include-mimetype",
"include_mimetypes",
multiple=True,
default=[],
help="Only include documents with these mimetypes (multiple mentions possible)",
)
@click.option(
"--exclude-mimetype",
"exclude_mimetypes",
multiple=True,
default=[],
help="Exclude documents with these mimetypes (multiple mentions possible)",
)
def reingest(
foreign_id,
index=False,
flush=True,
include_ingest=False,
include_mimetypes=[],
exclude_mimetypes=[],
):
"""Process documents and database entities and index them."""
collection = get_collection(foreign_id)
reingest_collection(
collection, index=index, flush=flush, include_ingest=include_ingest
collection,
index=index,
flush=flush,
include_ingest=include_ingest,
include_mimetypes=include_mimetypes,
exclude_mimetypes=exclude_mimetypes,
)


Expand Down
13 changes: 13 additions & 0 deletions aleph/model/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,19 @@ def by_collection(cls, collection_id=None):
q = q.yield_per(5000)
return q

@classmethod
def by_collection_and_mimetype(
    cls, collection_id=None, include_mimetypes=None, exclude_mimetypes=None
):
    """Query a collection's documents, optionally filtered by MIME type.

    The MIME type is read from the ``mime_type`` entry of ``meta``.

    Args:
        collection_id: ID of the collection whose documents are selected.
        include_mimetypes: Optional iterable of MIME types. When non-empty,
            only documents whose MIME type is in it are returned.
        exclude_mimetypes: Optional iterable of MIME types. When non-empty,
            documents whose MIME type is in it are left out; documents
            without a recorded MIME type are kept.

    Returns:
        The query built from ``cls.all()``, configured with
        ``yield_per(5000)``.
    """
    mime_type = cls.meta["mime_type"].astext
    q = cls.all()
    q = q.filter(cls.collection_id == collection_id)
    if include_mimetypes:
        q = q.filter(mime_type.in_(include_mimetypes))
    if exclude_mimetypes:
        # SQL `NOT IN` evaluates to NULL (not TRUE) when the left-hand side
        # is NULL, which would silently drop every document that has no
        # mime_type recorded. Keep those rows explicitly.
        q = q.filter((mime_type.is_(None)) | (~mime_type.in_(exclude_mimetypes)))
    q = q.yield_per(5000)
    return q

def to_proxy(self, ns=None):
ns = ns or self.collection.ns
proxy = model.get_proxy(
Expand Down
16 changes: 16 additions & 0 deletions aleph/tests/test_ingest_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,22 @@ def test_upload_csv_doc(self):
assert doc.schema == Document.SCHEMA, doc.schema
assert doc.meta["countries"] == ["de", "us"], doc.meta
assert doc.meta["languages"] == ["eng"], doc.meta
assert doc.meta["mime_type"] == "text/csv", doc.meta

assert (
Document.by_collection_and_mimetype(
collection_id=1, include_mimetypes=["text/csv"], exclude_mimetypes=None
).count()
== 1
)
assert (
Document.by_collection_and_mimetype(
collection_id=1,
include_mimetypes=None,
exclude_mimetypes=["text/csv"],
).count()
== 0
)

status = get_status(self.col)
assert status.get("pending") == 1, status
Expand Down

0 comments on commit d908f63

Please sign in to comment.