Skip to content

Commit

Permalink
Add CLI tool to import AIP from METS file (#355)
Browse files Browse the repository at this point in the history
Added a CLI tool to import AIP-related data from a METS file.
  • Loading branch information
mcantelon committed Nov 25, 2024
1 parent 911875f commit bb43912
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 67 deletions.
79 changes: 79 additions & 0 deletions AIPscan/Aggregator/mets_parse_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,20 @@
"""Collects a number of functions that aid in the retrieval of
information from an AIP METS file.
"""
import os

import lxml
import metsrw
import requests

from AIPscan.Aggregator import database_helpers
from AIPscan.Aggregator.task_helpers import (
create_numbered_subdirs,
get_mets_url,
write_mets,
)
from AIPscan.helpers import file_sha256_hash
from AIPscan.models import AIP


class METSError(Exception):
Expand Down Expand Up @@ -101,3 +106,77 @@ def download_mets(
download_file = write_mets(mets_response, package_uuid, numbered_subdir)

return download_file


def _remove_mets_file(filename, logger):
    """Delete the METS file at ``filename``.

    Deletion is best-effort: an ``OSError`` is logged as a warning on
    ``logger`` and is not re-raised.
    """
    try:
        os.remove(filename)
    except OSError as err:
        logger.warning("Unable to delete METS file: {}".format(err))


def import_from_mets(
    filename,
    aip_size,
    package_uuid,
    storage_service_id,
    storage_location_id,
    fetch_job_id,
    origin_pipeline_id,
    logger,
    delete_file=False,
):
    """Create AIP database records from a METS file on disk.

    The file is hashed first; if an AIP with the same METS hash is already
    recorded the file is treated as a duplicate and nothing is imported.
    Otherwise any existing AIP records with the same ``package_uuid`` are
    deleted and a new AIP record is created and populated from the METS.

    :param filename: Path to the METS file.
    :param aip_size: Size stored on the new AIP record.
    :param package_uuid: AIP UUID; also used as the transfer name when the
        original name cannot be read from the METS.
    :param storage_service_id: Passed through to ``create_aip_object``.
    :param storage_location_id: Passed through to ``create_aip_object``.
    :param fetch_job_id: Passed through to ``create_aip_object``.
    :param origin_pipeline_id: Passed through to ``create_aip_object``.
    :param logger: Logger used for progress and problem reports.
    :param delete_file: When true, remove the METS file after a successful
        import or when it is skipped as a duplicate.

    :returns: None.
    """
    mets_name = os.path.basename(filename)
    mets_hash = file_sha256_hash(filename)

    # If METS file's hash matches an existing value, this is a duplicate of an
    # existing AIP and we can safely ignore it.
    matching_aip = AIP.query.filter_by(mets_sha256=mets_hash).first()
    if matching_aip is not None:
        logger.info(
            "Skipping METS file {} - identical to existing record".format(mets_name)
        )
        if delete_file:
            _remove_mets_file(filename, logger)
        return

    logger.info("Processing METS file {}".format(mets_name))

    try:
        mets = parse_mets_with_metsrw(filename)
    except METSError as err:
        # Report the failure instead of returning silently. The file is left
        # in place here, as before, so it remains available for inspection.
        logger.error("Unable to parse METS file {}: {}".format(mets_name, err))
        return

    try:
        original_name = get_aip_original_name(mets)
    except METSError as err:
        # The name is not essential: fall back to the package UUID, but make
        # the substitution visible in the log.
        logger.warning(
            "Unable to read original name from METS file {} ({}); "
            "using AIP UUID {} instead".format(mets_name, err, package_uuid)
        )
        original_name = package_uuid

    # Delete records of any previous versions of this AIP, which will shortly
    # be replaced by new records from the updated METS.
    previous_aips = AIP.query.filter_by(uuid=package_uuid).all()
    for previous_aip in previous_aips:
        logger.info(
            "Deleting record for AIP {} to replace from newer METS".format(package_uuid)
        )
        database_helpers.delete_aip_object(previous_aip)

    aip = database_helpers.create_aip_object(
        package_uuid=package_uuid,
        transfer_name=original_name,
        create_date=mets.createdate,
        mets_sha256=mets_hash,
        size=aip_size,
        storage_service_id=storage_service_id,
        storage_location_id=storage_location_id,
        fetch_job_id=fetch_job_id,
        origin_pipeline_id=origin_pipeline_id,
    )

    database_helpers.process_aip_data(aip, mets)

    # Delete METS file.
    if delete_file:
        _remove_mets_file(filename, logger)

Check warning on line 182 in AIPscan/Aggregator/mets_parse_helpers.py

View check run for this annotation

Codecov / codecov/patch

AIPscan/Aggregator/mets_parse_helpers.py#L181-L182

Added lines #L181 - L182 were not covered by tests
75 changes: 11 additions & 64 deletions AIPscan/Aggregator/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,14 @@
from AIPscan import db, typesense_helpers
from AIPscan.Aggregator import database_helpers
from AIPscan.Aggregator.celery_helpers import write_celery_update
from AIPscan.Aggregator.mets_parse_helpers import (
METSError,
download_mets,
get_aip_original_name,
parse_mets_with_metsrw,
)
from AIPscan.Aggregator.mets_parse_helpers import download_mets, import_from_mets
from AIPscan.Aggregator.task_helpers import (
format_api_url_with_limit_offset,
parse_package_list_file,
process_package_object,
summarize_fetch_job_results,
)
from AIPscan.extensions import celery
from AIPscan.helpers import file_sha256_hash
from AIPscan.models import (
AIP,
Agent,
Expand Down Expand Up @@ -320,66 +314,19 @@ def get_mets(
timestamp_str,
package_list_no,
)
mets_name = os.path.basename(download_file)
mets_hash = file_sha256_hash(download_file)

# If METS file's hash matches an existing value, this is a duplicate of an
# existing AIP and we can safely ignore it.
matching_aip = AIP.query.filter_by(mets_sha256=mets_hash).first()
if matching_aip is not None:
tasklogger.info(
"Skipping METS file {} - identical to existing record".format(mets_name)
)
try:
os.remove(download_file)
except OSError as err:
tasklogger.warning("Unable to delete METS file: {}".format(err))
return

tasklogger.info("Processing METS file {}".format(mets_name))

try:
mets = parse_mets_with_metsrw(download_file)
except METSError:
# An error we need to log and report back to the user.
return

try:
original_name = get_aip_original_name(mets)
except METSError:
# Some other error with the METS file that we might want to
# log and act upon.
original_name = package_uuid

# Delete records of any previous versions of this AIP, which will shortly
# be replaced by new records from the updated METS.
previous_aips = AIP.query.filter_by(uuid=package_uuid).all()
for previous_aip in previous_aips:
tasklogger.info(
"Deleting record for AIP {} to replace from newer METS".format(package_uuid)
)
database_helpers.delete_aip_object(previous_aip)

aip = database_helpers.create_aip_object(
package_uuid=package_uuid,
transfer_name=original_name,
create_date=mets.createdate,
mets_sha256=mets_hash,
size=aip_size,
storage_service_id=storage_service_id,
storage_location_id=storage_location_id,
fetch_job_id=fetch_job_id,
origin_pipeline_id=origin_pipeline_id,
import_from_mets(
download_file,
aip_size,
package_uuid,
storage_service_id,
storage_location_id,
fetch_job_id,
origin_pipeline_id,
tasklogger,
delete_file=True,
)

database_helpers.process_aip_data(aip, mets)

# Delete downloaded METS file.
try:
os.remove(download_file)
except OSError as err:
tasklogger.warning("Unable to delete METS file: {}".format(err))


@celery.task()
def delete_fetch_job(fetch_job_id):
Expand Down
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Copyright Artefactual Systems Inc (2021)

* [Screenshots](#screenshots)
* [Installation](#installation)
* [Tools](#tools)
* [Usage](#usage)

## Screenshots
Expand Down Expand Up @@ -282,6 +283,18 @@ These individual fetch jobs shouldn't be deleted, via the AIPscan web UI,
until all fetch jobs (for each "page") have run. Otherwise the cached list of
packages will be deleted and the package list will have to be downloaded again.

### METS import

The METS import tool, `tools/mets-import`, allows a METS file from an AIP to
be used to import data representing an AIP and its contents.

The AIP UUID will be parsed from the METS filename, if present. Otherwise
it can be specified using the `--aip-uuid` CLI option.

The size of the AIP must be specified using the `--aip-size` CLI option. The
UUID of the pipeline it originated from must be specified using the
`--origin-pipeline-uuid` CLI option.

### Running tools

These should be run using the same system user and virtual environment that
Expand Down
9 changes: 6 additions & 3 deletions tools/app/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@ def create_app_instance(configuration, db):
return app


def raise_click_error(message):
    """Abort the running Click command with ``message`` and exit code 1."""
    failure = click.ClickException(message)
    failure.exit_code = 1
    raise failure


def log_and_raise_click_error(logger, message):
    """Log ``message`` as critical on ``logger``, then abort the command."""
    logger.critical(message)
    raise_click_error(message)
90 changes: 90 additions & 0 deletions tools/mets-import
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env python3
import logging
import os
import pathlib
import sys
import uuid
from datetime import datetime

import click
from app import cli

from AIPscan import db
from AIPscan.Aggregator import database_helpers
from AIPscan.Aggregator.mets_parse_helpers import import_from_mets
from config import CONFIGS


# Position of the AIP UUID inside a METS file's base name: five leading
# characters are skipped and the next 36 are read as the UUID.
# NOTE(review): presumably this matches names like "METS.<uuid>.xml" - confirm.
_UUID_OFFSET = 5
_UUID_LENGTH = 36


def _aip_uuid_from_filename(filename):
    """Return the AIP UUID embedded in the base name of ``filename``.

    :raises ValueError: if the expected slice of the name is not a UUID.
    """
    base_name = os.path.basename(filename)
    candidate = base_name[_UUID_OFFSET : _UUID_OFFSET + _UUID_LENGTH]
    return str(uuid.UUID(candidate))


@click.command()
@click.option("--ss-id", "-s", required=True, help="Storage service ID.", type=int)
@click.option(
    "--location-id", "-l", required=True, help="Storage location ID.", type=int
)
@click.option("--aip-size", "-a", required=True, help="AIP size.", type=int)
@click.option(
    "--origin-pipeline-uuid",
    "-o",
    required=True,
    help="Origin pipeline UUID.",
    type=str,
)
@click.option("--aip-uuid", "-u", help="Package UUID.", type=str)
@click.option("--verbose", "-v", is_flag=True, help="Show debug messages.", type=bool)
@click.argument("filename")
def main(
    ss_id, location_id, aip_size, origin_pipeline_uuid, aip_uuid, verbose, filename
):
    """Import data representing an AIP and its contents from a METS file."""
    # Check if METS file exists. A directory is rejected too: the import
    # needs a file whose contents can be hashed and parsed.
    if not pathlib.Path(filename).is_file():
        cli.raise_click_error("METS file does not exist.")

    # Log to screen
    logger_name = pathlib.PurePosixPath(sys.argv[0]).name
    logger = logging.getLogger(logger_name)

    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(level=log_level)

    # Try to parse AIP UUID from METS filename, if not specified
    if not aip_uuid:
        try:
            aip_uuid = _aip_uuid_from_filename(filename)
            logger.info(f"Parsed AIP UUID {aip_uuid} from filename")
        except ValueError:
            cli.raise_click_error("No AIP UUID found in METS filename.")

    # Initialize Flask app context
    app = cli.create_app_instance(CONFIGS[cli.config_name], db)

    with app.app_context():
        # Create a fetch_job and take note of its ID
        datetime_obj_start = datetime.now().replace(microsecond=0)
        session_id = str(uuid.uuid4())

        fetch_job = database_helpers.create_fetch_job(
            datetime_obj_start, session_id, ss_id
        )
        fetch_job_id = fetch_job.id

        # Import METS file
        logger.info("Importing...")

        # NOTE(review): the pipeline UUID string is passed in the position
        # that import_from_mets names ``origin_pipeline_id`` - confirm that a
        # UUID (rather than a database ID) is what create_aip_object expects.
        import_from_mets(
            filename,
            aip_size,
            aip_uuid,
            ss_id,
            location_id,
            fetch_job_id,
            origin_pipeline_uuid,
            logger,
            delete_file=False,
        )

        logger.info("Done.")


if __name__ == "__main__":
    main()

0 comments on commit bb43912

Please sign in to comment.