diff --git a/AIPscan/Aggregator/mets_parse_helpers.py b/AIPscan/Aggregator/mets_parse_helpers.py
index 96651d79..53605e8e 100644
--- a/AIPscan/Aggregator/mets_parse_helpers.py
+++ b/AIPscan/Aggregator/mets_parse_helpers.py
@@ -3,15 +3,20 @@
 """Collects a number of functions that aid in the retrieval of
 information from an AIP METS file.
 """
+import os
+
 import lxml
 import metsrw
 import requests
 
+from AIPscan.Aggregator import database_helpers
 from AIPscan.Aggregator.task_helpers import (
     create_numbered_subdirs,
     get_mets_url,
     write_mets,
 )
+from AIPscan.helpers import file_sha256_hash
+from AIPscan.models import AIP
 
 
 class METSError(Exception):
@@ -101,3 +106,77 @@ def download_mets(
     download_file = write_mets(mets_response, package_uuid, numbered_subdir)
 
     return download_file
+
+
+def import_from_mets(
+    filename,
+    aip_size,
+    package_uuid,
+    storage_service_id,
+    storage_location_id,
+    fetch_job_id,
+    origin_pipeline_id,
+    logger,
+    delete_file=False,
+):
+    mets_name = os.path.basename(filename)
+    mets_hash = file_sha256_hash(filename)
+
+    # If METS file's hash matches an existing value, this is a duplicate of an
+    # existing AIP and we can safely ignore it.
+    matching_aip = AIP.query.filter_by(mets_sha256=mets_hash).first()
+    if matching_aip is not None:
+        logger.info(
+            "Skipping METS file {} - identical to existing record".format(mets_name)
+        )
+        try:
+            if delete_file:
+                os.remove(filename)
+        except OSError as err:
+            logger.warning("Unable to delete METS file: {}".format(err))
+        return
+
+    logger.info("Processing METS file {}".format(mets_name))
+
+    try:
+        mets = parse_mets_with_metsrw(filename)
+    except METSError:
+        # An error we need to log and report back to the user.
+        return
+
+    try:
+        original_name = get_aip_original_name(mets)
+    except METSError:
+        # Some other error with the METS file that we might want to
+        # log and act upon.
+        original_name = package_uuid
+
+    # Delete records of any previous versions of this AIP, which will shortly
+    # be replaced by new records from the updated METS.
+    previous_aips = AIP.query.filter_by(uuid=package_uuid).all()
+    for previous_aip in previous_aips:
+        logger.info(
+            "Deleting record for AIP {} to replace from newer METS".format(package_uuid)
+        )
+        database_helpers.delete_aip_object(previous_aip)
+
+    aip = database_helpers.create_aip_object(
+        package_uuid=package_uuid,
+        transfer_name=original_name,
+        create_date=mets.createdate,
+        mets_sha256=mets_hash,
+        size=aip_size,
+        storage_service_id=storage_service_id,
+        storage_location_id=storage_location_id,
+        fetch_job_id=fetch_job_id,
+        origin_pipeline_id=origin_pipeline_id,
+    )
+
+    database_helpers.process_aip_data(aip, mets)
+
+    # Delete METS file.
+    if delete_file:
+        try:
+            os.remove(filename)
+        except OSError as err:
+            logger.warning("Unable to delete METS file: {}".format(err))
diff --git a/AIPscan/Aggregator/tasks.py b/AIPscan/Aggregator/tasks.py
index 16b1bdb1..fdd106a5 100644
--- a/AIPscan/Aggregator/tasks.py
+++ b/AIPscan/Aggregator/tasks.py
@@ -10,12 +10,7 @@
 from AIPscan import db, typesense_helpers
 from AIPscan.Aggregator import database_helpers
 from AIPscan.Aggregator.celery_helpers import write_celery_update
-from AIPscan.Aggregator.mets_parse_helpers import (
-    METSError,
-    download_mets,
-    get_aip_original_name,
-    parse_mets_with_metsrw,
-)
+from AIPscan.Aggregator.mets_parse_helpers import download_mets, import_from_mets
 from AIPscan.Aggregator.task_helpers import (
     format_api_url_with_limit_offset,
     parse_package_list_file,
@@ -23,7 +18,6 @@
     summarize_fetch_job_results,
 )
 from AIPscan.extensions import celery
-from AIPscan.helpers import file_sha256_hash
 from AIPscan.models import (
     AIP,
     Agent,
@@ -320,66 +314,19 @@ def get_mets(
         timestamp_str,
         package_list_no,
     )
-    mets_name = os.path.basename(download_file)
-    mets_hash = file_sha256_hash(download_file)
-
-    # If METS file's hash matches an existing value, this is a duplicate of an
-    # existing AIP and we can safely ignore it.
-    matching_aip = AIP.query.filter_by(mets_sha256=mets_hash).first()
-    if matching_aip is not None:
-        tasklogger.info(
-            "Skipping METS file {} - identical to existing record".format(mets_name)
-        )
-        try:
-            os.remove(download_file)
-        except OSError as err:
-            tasklogger.warning("Unable to delete METS file: {}".format(err))
-        return
 
-    tasklogger.info("Processing METS file {}".format(mets_name))
-
-    try:
-        mets = parse_mets_with_metsrw(download_file)
-    except METSError:
-        # An error we need to log and report back to the user.
-        return
-
-    try:
-        original_name = get_aip_original_name(mets)
-    except METSError:
-        # Some other error with the METS file that we might want to
-        # log and act upon.
-        original_name = package_uuid
-
-    # Delete records of any previous versions of this AIP, which will shortly
-    # be replaced by new records from the updated METS.
-    previous_aips = AIP.query.filter_by(uuid=package_uuid).all()
-    for previous_aip in previous_aips:
-        tasklogger.info(
-            "Deleting record for AIP {} to replace from newer METS".format(package_uuid)
-        )
-        database_helpers.delete_aip_object(previous_aip)
-
-    aip = database_helpers.create_aip_object(
-        package_uuid=package_uuid,
-        transfer_name=original_name,
-        create_date=mets.createdate,
-        mets_sha256=mets_hash,
-        size=aip_size,
-        storage_service_id=storage_service_id,
-        storage_location_id=storage_location_id,
-        fetch_job_id=fetch_job_id,
-        origin_pipeline_id=origin_pipeline_id,
+    import_from_mets(
+        download_file,
+        aip_size,
+        package_uuid,
+        storage_service_id,
+        storage_location_id,
+        fetch_job_id,
+        origin_pipeline_id,
+        tasklogger,
+        delete_file=True,
     )
 
-    database_helpers.process_aip_data(aip, mets)
-
-    # Delete downloaded METS file.
-    try:
-        os.remove(download_file)
-    except OSError as err:
-        tasklogger.warning("Unable to delete METS file: {}".format(err))
-
 
 @celery.task()
 def delete_fetch_job(fetch_job_id):
diff --git a/README.md b/README.md
index 11ebf5f1..d0f0f4f2 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ Copyright Artefactual Systems Inc (2021)
 
 * [Screenshots](#screenshots)
 * [Installation](#installation)
+* [Tools](#tools)
 * [Usage](#usage)
 
 ## Screenshots
@@ -282,6 +283,18 @@ These individual fetch jobs shouldn't be deleted, via the AIPscan web UI, until
 all fetch jobs (for each "page") have run. Otherwise the cached list of
 packages will be deleted and the package list will have to be downloaded
 again.
 
+### METS import
+
+The METS import tool, `tools/mets-import`, uses an AIP's METS file to import
+data representing the AIP and its contents.
+
+The AIP UUID will be parsed from the METS filename, if present. Otherwise
+it can be specified using the `--aip-uuid` CLI option.
+
+The size of the AIP must be specified using the `--aip-size` CLI option. The
+UUID of the pipeline it originated from must be specified using the
+`--origin-pipeline-uuid` CLI option.
+
 ### Running tools
 
 These should be run using the same system user and virtual environment that
diff --git a/tools/app/cli.py b/tools/app/cli.py
index dce7ae49..b0f2f414 100644
--- a/tools/app/cli.py
+++ b/tools/app/cli.py
@@ -20,9 +20,12 @@ def create_app_instance(configuration, db):
     return app
 
 
-def log_and_raise_click_error(logger, message):
-    logger.critical(message)
-
+def raise_click_error(message):
     err = click.ClickException(message)
     err.exit_code = 1
     raise err
+
+
+def log_and_raise_click_error(logger, message):
+    logger.critical(message)
+    raise_click_error(message)
diff --git a/tools/mets-import b/tools/mets-import
new file mode 100755
index 00000000..babcd582
--- /dev/null
+++ b/tools/mets-import
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+import logging
+import os
+import pathlib
+import sys
+import uuid
+from datetime import datetime
+
+import click
+from app import cli
+
+from AIPscan import db
+from AIPscan.Aggregator import database_helpers
+from AIPscan.Aggregator.mets_parse_helpers import import_from_mets
+from config import CONFIGS
+
+
+@click.command()
+@click.option("--ss-id", "-s", required=True, help="Storage service ID.", type=int)
+@click.option(
+    "--location-id", "-l", required=True, help="Storage location ID.", type=int
+)
+@click.option("--aip-size", "-a", required=True, help="AIP size.", type=int)
+@click.option(
+    "--origin-pipeline-uuid",
+    "-o",
+    required=True,
+    help="Origin pipeline UUID.",
+    type=str,
+)
+@click.option("--aip-uuid", "-u", help="Package UUID.", type=str)
+@click.option("--verbose", "-v", is_flag=True, help="Show debug messages.", type=bool)
+@click.argument("filename")
+def main(
+    ss_id, location_id, aip_size, origin_pipeline_uuid, aip_uuid, verbose, filename
+):
+    # Check if METS file exists
+    if not pathlib.Path(filename).exists():
+        cli.raise_click_error("METS file does not exist.")
+
+    # Log to screen
+    logger_name = pathlib.PurePosixPath(sys.argv[0]).name
+    logger = logging.getLogger(logger_name)
+
+    if verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    # Try to parse AIP UUID from METS filename, if not specified
+    if not aip_uuid:
+        try:
+            aip_uuid = str(uuid.UUID(os.path.basename(filename)[5:41]))
+            logger.info(f"Parsed AIP UUID {aip_uuid} from filename")
+        except ValueError:
+            cli.raise_click_error("No AIP UUID found in METS filename.")
+
+    # Initialize Flask app context
+    app = cli.create_app_instance(CONFIGS[cli.config_name], db)
+
+    with app.app_context():
+        # Create a fetch_job and take note of its ID
+        datetime_obj_start = datetime.now().replace(microsecond=0)
+        session_id = str(uuid.uuid4())
+
+        fetch_job = database_helpers.create_fetch_job(
+            datetime_obj_start, session_id, ss_id
+        )
+        fetch_job_id = fetch_job.id
+
+        # Import METS file
+        logger.info("Importing...")
+
+        import_from_mets(
+            filename,
+            aip_size,
+            aip_uuid,
+            ss_id,
+            location_id,
+            fetch_job_id,
+            origin_pipeline_uuid,
+            logger,
+            delete_file=False,
+        )
+
+        logger.info("Done.")
+
+
+if __name__ == "__main__":
+    main()
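
The UUID-from-filename fallback in `tools/mets-import` relies on the slice `[5:41]`. A minimal sketch of why that works, assuming Archivematica's `METS.<UUID>.xml` naming convention (the example path is hypothetical):

```python
# "METS." is 5 characters and a canonical UUID is 36, so characters
# 5..41 of the basename hold the AIP UUID under the assumed convention.
import os
import uuid

filename = "/tmp/METS.9b9f129c-8062-471b-a009-9ee0ad655f08.xml"  # hypothetical

aip_uuid = str(uuid.UUID(os.path.basename(filename)[5:41]))
print(aip_uuid)  # 9b9f129c-8062-471b-a009-9ee0ad655f08

# A basename without an embedded UUID raises ValueError, which the tool
# converts into a click error asking for an explicit --aip-uuid.
```

A hypothetical invocation, with made-up IDs and size, might look like `tools/mets-import --ss-id 1 --location-id 1 --aip-size 1024 --origin-pipeline-uuid 11111111-2222-3333-4444-555555555555 METS.9b9f129c-8062-471b-a009-9ee0ad655f08.xml`.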
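
The duplicate check in `import_from_mets` depends on `file_sha256_hash` from `AIPscan.helpers`, whose implementation is not part of this diff. A sketch of what such a helper plausibly looks like (the function body and chunk size here are assumptions, not the real code):

```python
import hashlib

def file_sha256_hash(filepath, chunk_size=1024 * 1024):
    """Return the hex SHA-256 digest of the file at filepath."""
    digest = hashlib.sha256()
    with open(filepath, "rb") as f:
        # Hash in chunks so a large METS file never sits wholly in memory.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
```

Because the digest is stored on the AIP record as `mets_sha256`, re-running the import against an unchanged METS file is a no-op: the matching hash short-circuits before any parsing happens.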
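
Since the refactor moves all parsing and persistence out of the Celery task, `import_from_mets` can also be driven from other scripts. A hypothetical batch importer assembled from the same pieces `tools/mets-import` uses (the directory, numeric IDs, and pipeline UUID below are invented; `cli.config_name` is assumed to resolve as it does in the tool itself):

```python
import logging
import pathlib
import uuid
from datetime import datetime

from app import cli

from AIPscan import db
from AIPscan.Aggregator import database_helpers
from AIPscan.Aggregator.mets_parse_helpers import import_from_mets
from config import CONFIGS

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("mets-batch-import")

app = cli.create_app_instance(CONFIGS[cli.config_name], db)

with app.app_context():
    # One fetch job groups every AIP imported by this run.
    fetch_job = database_helpers.create_fetch_job(
        datetime.now().replace(microsecond=0), str(uuid.uuid4()), 1  # ss_id
    )

    for mets_path in sorted(pathlib.Path("/tmp/mets-exports").glob("METS.*.xml")):
        import_from_mets(
            str(mets_path),
            0,  # AIP size is unknown here; the CLI requires --aip-size instead
            str(uuid.UUID(mets_path.name[5:41])),  # package UUID from filename
            1,  # storage service ID (invented)
            1,  # storage location ID (invented)
            fetch_job.id,
            "11111111-2222-3333-4444-555555555555",  # origin pipeline UUID (invented)
            logger,
            delete_file=False,
        )
```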