Skip to content

Commit

Permalink
Merge pull request #657 from alephdata/bug/3894
Browse files: browse the repository at this point in the history
Add Workbook metadata to Table entities
  • Loading branch information
catileptic authored Dec 17, 2024
2 parents 97050ac + 325eced commit c014ac9
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 2 deletions.
16 changes: 15 additions & 1 deletion ingestors/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
from ftmstore import get_dataset
from servicelayer.cache import get_redis
from servicelayer.logs import configure_logging
from servicelayer.taskqueue import Dataset, Task
from servicelayer.taskqueue import (
Dataset,
Task,
get_rabbitmq_channel,
declare_rabbitmq_queue,
)
from servicelayer import settings as sl_settings
from servicelayer.archive.util import ensure_path
from servicelayer import settings as sls
Expand Down Expand Up @@ -78,6 +83,7 @@ def _ingest_path(db, dataset, path, languages=[]):
entity.make_id(checksum)
entity.set("fileName", path.name)
log.info("Queue: %r", entity.to_dict())

manager.queue_entity(entity)
if path.is_dir():
DirectoryIngestor.crawl(manager, path)
Expand Down Expand Up @@ -116,6 +122,7 @@ def analyze(dataset):
def debug(path, languages=None):
"""Debug the ingest for the given path."""
settings.fts.DATABASE_URI = "sqlite:////tmp/debug.sqlite3"
settings.TESTING = True

# collection ID that is meant for testing purposes only
debug_datatset_id = 100
Expand All @@ -126,6 +133,13 @@ def debug(path, languages=None):
database_uri=settings.fts.DATABASE_URI,
)
db.delete()
channel = get_rabbitmq_channel()
qos_mapping = {
settings.STAGE_INGEST: settings.RABBITMQ_QOS_INGEST_QUEUE,
settings.STAGE_ANALYZE: settings.RABBITMQ_QOS_ANALYZE_QUEUE,
}
for queue_name in qos_mapping.keys():
declare_rabbitmq_queue(channel, queue_name, qos_mapping[queue_name])
_ingest_path(db, debug_datatset_id, path, languages=languages)
worker = get_worker()
worker.process(blocking=False)
Expand Down
10 changes: 10 additions & 0 deletions ingestors/tabular/ods.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,16 @@ def ingest(self, file_path, entity):
table = self.manager.make_entity("Table", parent=entity)
table.make_id(entity.id, name)
table.set("title", name)
# add workbook metadata to individual tables
for metadatum in [
"authoredAt",
"author",
"summary",
"generator",
"date",
"processingAgent",
]:
table.set(metadatum, entity.get(metadatum))
# Emit a partial table fragment with parent reference and name
# early, so that we don't have orphan fragments in case of an error
# in the middle of processing.
Expand Down
11 changes: 11 additions & 0 deletions ingestors/tabular/xls.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,17 @@ def ingest(self, file_path, entity):
table = self.manager.make_entity("Table", parent=entity)
table.make_id(entity.id, sheet.name)
table.set("title", sheet.name)
# add workbook metadata to individual tables
for metadatum in [
"authoredAt",
"modifiedAt",
"author",
"summary",
"generator",
"language",
"processingAgent",
]:
table.set(metadatum, entity.get(metadatum))
# Emit a partial table fragment with parent reference and name
# early, so that we don't have orphan fragments in case of an error
# in the middle of processing.
Expand Down
12 changes: 11 additions & 1 deletion ingestors/tabular/xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,22 @@ def ingest(self, file_path, entity):
table = self.manager.make_entity("Table", parent=entity)
table.make_id(entity.id, name)
table.set("title", name)
# add workbook metadata to individual tables
for metadatum in [
"authoredAt",
"modifiedAt",
"author",
"summary",
"generator",
"language",
"processingAgent",
]:
table.set(metadatum, entity.get(metadatum))
# Emit a partial table fragment with parent reference and name
# early, so that we don't have orphan fragments in case of an error
# in the middle of processing.
# See https://github.com/alephdata/ingest-file/issues/171
self.manager.emit_entity(table, fragment="initial")
log.debug("Sheet: %s", name)
self.emit_row_tuples(table, self.generate_rows(sheet))
if table.has("csvHash"):
self.manager.emit_entity(table)
Expand Down
Binary file added tests/fixtures/staff_list.xlsx
Binary file not shown.
21 changes: 21 additions & 0 deletions tests/test_tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,24 @@ def test_password_protected_xls(self):
self.assertIn(ENCRYPTED_MSG, err)
status = self.manager.entities[0].first("processingStatus")
self.assertEqual("failure", status)

def test_metadata_inheritance(self):
    """Check that Tables emitted for a workbook mirror its metadata.

    Ingests the ``staff_list.xlsx`` fixture, expects exactly three Table
    entities, and compares a fixed set of properties on every Table
    against the same properties on the emitted Workbook entity.
    """
    inherited_props = (
        "authoredAt",
        "modifiedAt",
        "author",
        "summary",
        "generator",
        "language",
        "processingAgent",
    )

    path, source_entity = self.fixture("staff_list.xlsx")
    self.manager.ingest(path, source_entity)

    tables = self.get_emitted("Table")
    workbook = self.get_emitted("Workbook").pop()
    self.assertEqual(len(tables), 3)

    # Every table must carry the same value as the workbook for each
    # of the listed properties.
    for prop in inherited_props:
        for table in tables:
            self.assertEqual(table.get(prop), workbook.get(prop))

0 comments on commit c014ac9

Please sign in to comment.