From c39cff268a50fa71857ed69420d51db8fd1afa3b Mon Sep 17 00:00:00 2001 From: Mike Cantelon Date: Sat, 23 Nov 2024 16:19:51 -0800 Subject: [PATCH] Add CLI tool to import AIP from METS file (#355) Added a CLI tool to import AIP-related data from a METS file. --- AIPscan/Aggregator/mets_parse_helpers.py | 79 ++++++++++++++++++++++ AIPscan/Aggregator/tasks.py | 75 ++++----------------- tools/app/cli.py | 9 ++- tools/import_mets | 84 ++++++++++++++++++++++++ 4 files changed, 180 insertions(+), 67 deletions(-) create mode 100755 tools/import_mets diff --git a/AIPscan/Aggregator/mets_parse_helpers.py b/AIPscan/Aggregator/mets_parse_helpers.py index 96651d79..53605e8e 100644 --- a/AIPscan/Aggregator/mets_parse_helpers.py +++ b/AIPscan/Aggregator/mets_parse_helpers.py @@ -3,15 +3,20 @@ """Collects a number of functions that aid in the retrieval of information from an AIP METS file. """ +import os + import lxml import metsrw import requests +from AIPscan.Aggregator import database_helpers from AIPscan.Aggregator.task_helpers import ( create_numbered_subdirs, get_mets_url, write_mets, ) +from AIPscan.helpers import file_sha256_hash +from AIPscan.models import AIP class METSError(Exception): @@ -101,3 +106,77 @@ def download_mets( download_file = write_mets(mets_response, package_uuid, numbered_subdir) return download_file + + +def import_from_mets( + filename, + aip_size, + package_uuid, + storage_service_id, + storage_location_id, + fetch_job_id, + origin_pipeline_id, + logger, + delete_file=False, +): + mets_name = os.path.basename(filename) + mets_hash = file_sha256_hash(filename) + + # If METS file's hash matches an existing value, this is a duplicate of an + # existing AIP and we can safely ignore it. + matching_aip = AIP.query.filter_by(mets_sha256=mets_hash).first() + if matching_aip is not None: + logger.info( + "Skipping METS file {} - identical to existing record".format(mets_name) + ) + try: + if delete_file: + os.remove(filename) + except OSError as err: + logger.warning("Unable to delete METS file: {}".format(err)) + return + + logger.info("Processing METS file {}".format(mets_name)) + + try: + mets = parse_mets_with_metsrw(filename) + except METSError: + # An error we need to log and report back to the user. + return + + try: + original_name = get_aip_original_name(mets) + except METSError: + # Some other error with the METS file that we might want to + # log and act upon. + original_name = package_uuid + + # Delete records of any previous versions of this AIP, which will shortly + # be replaced by new records from the updated METS. + previous_aips = AIP.query.filter_by(uuid=package_uuid).all() + for previous_aip in previous_aips: + logger.info( + "Deleting record for AIP {} to replace from newer METS".format(package_uuid) + ) + database_helpers.delete_aip_object(previous_aip) + + aip = database_helpers.create_aip_object( + package_uuid=package_uuid, + transfer_name=original_name, + create_date=mets.createdate, + mets_sha256=mets_hash, + size=aip_size, + storage_service_id=storage_service_id, + storage_location_id=storage_location_id, + fetch_job_id=fetch_job_id, + origin_pipeline_id=origin_pipeline_id, + ) + + database_helpers.process_aip_data(aip, mets) + + # Delete METS file. + if delete_file: + try: + os.remove(filename) + except OSError as err: + logger.warning("Unable to delete METS file: {}".format(err)) diff --git a/AIPscan/Aggregator/tasks.py b/AIPscan/Aggregator/tasks.py index 16b1bdb1..fdd106a5 100644 --- a/AIPscan/Aggregator/tasks.py +++ b/AIPscan/Aggregator/tasks.py @@ -10,12 +10,7 @@ from AIPscan import db, typesense_helpers from AIPscan.Aggregator import database_helpers from AIPscan.Aggregator.celery_helpers import write_celery_update -from AIPscan.Aggregator.mets_parse_helpers import ( - METSError, - download_mets, - get_aip_original_name, - parse_mets_with_metsrw, -) +from AIPscan.Aggregator.mets_parse_helpers import download_mets, import_from_mets from AIPscan.Aggregator.task_helpers import ( format_api_url_with_limit_offset, parse_package_list_file, @@ -23,7 +18,6 @@ summarize_fetch_job_results, ) from AIPscan.extensions import celery -from AIPscan.helpers import file_sha256_hash from AIPscan.models import ( AIP, Agent, @@ -320,66 +314,19 @@ def get_mets( timestamp_str, package_list_no, ) - mets_name = os.path.basename(download_file) - mets_hash = file_sha256_hash(download_file) - - # If METS file's hash matches an existing value, this is a duplicate of an - # existing AIP and we can safely ignore it. - matching_aip = AIP.query.filter_by(mets_sha256=mets_hash).first() - if matching_aip is not None: - tasklogger.info( - "Skipping METS file {} - identical to existing record".format(mets_name) - ) - try: - os.remove(download_file) - except OSError as err: - tasklogger.warning("Unable to delete METS file: {}".format(err)) - return - tasklogger.info("Processing METS file {}".format(mets_name)) - - try: - mets = parse_mets_with_metsrw(download_file) - except METSError: - # An error we need to log and report back to the user. - return - - try: - original_name = get_aip_original_name(mets) - except METSError: - # Some other error with the METS file that we might want to - # log and act upon. - original_name = package_uuid - - # Delete records of any previous versions of this AIP, which will shortly - # be replaced by new records from the updated METS. - previous_aips = AIP.query.filter_by(uuid=package_uuid).all() - for previous_aip in previous_aips: - tasklogger.info( - "Deleting record for AIP {} to replace from newer METS".format(package_uuid) - ) - database_helpers.delete_aip_object(previous_aip) - - aip = database_helpers.create_aip_object( - package_uuid=package_uuid, - transfer_name=original_name, - create_date=mets.createdate, - mets_sha256=mets_hash, - size=aip_size, - storage_service_id=storage_service_id, - storage_location_id=storage_location_id, - fetch_job_id=fetch_job_id, - origin_pipeline_id=origin_pipeline_id, + import_from_mets( + download_file, + aip_size, + package_uuid, + storage_service_id, + storage_location_id, + fetch_job_id, + origin_pipeline_id, + tasklogger, + delete_file=True, ) - database_helpers.process_aip_data(aip, mets) - - # Delete downloaded METS file. - try: - os.remove(download_file) - except OSError as err: - tasklogger.warning("Unable to delete METS file: {}".format(err)) - @celery.task() def delete_fetch_job(fetch_job_id): diff --git a/tools/app/cli.py b/tools/app/cli.py index dce7ae49..b0f2f414 100644 --- a/tools/app/cli.py +++ b/tools/app/cli.py @@ -20,9 +20,12 @@ def create_app_instance(configuration, db): return app -def log_and_raise_click_error(logger, message): - logger.critical(message) - +def raise_click_error(message): err = click.ClickException(message) err.exit_code = 1 raise err + + +def log_and_raise_click_error(logger, message): + logger.critical(message) + raise_click_error(message) diff --git a/tools/import_mets b/tools/import_mets new file mode 100755 index 00000000..7408ae81 --- /dev/null +++ b/tools/import_mets @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +import logging +import os +import pathlib +import sys +import uuid +from datetime import datetime + +import click +from app import cli + +from AIPscan import db +from AIPscan.Aggregator import database_helpers +from AIPscan.Aggregator.mets_parse_helpers import import_from_mets +from config import CONFIGS + + +@click.command() +@click.option("--ss-id", "-s", required=True, help="Storage service ID.", type=int) +@click.option( + "--location-id", "-l", required=True, help="Storage location ID.", type=int +) +@click.option("--aip-size", "-a", required=True, help="AIP size.", type=int) +@click.option( + "--origin-pipeline-id", "-o", required=True, help="Origin pipeline UUID.", type=str +) +@click.option("--package-uuid", "-u", help="Package UUID.", type=str) +@click.option("--verbose", "-v", is_flag=True, help="Show debug messages.", type=bool) +@click.argument("filename") +def main(ss_id, location_id, aip_size, origin_pipeline_id, package_uuid, verbose, filename): + # Check if METS file exists + if not pathlib.Path(filename).exists(): + cli.raise_click_error("METS file does not exist.") + + # Log to screen + logger_name = pathlib.PurePosixPath(sys.argv[0]).name + logger = logging.getLogger(logger_name) + + if verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + # Try to parse package UUID from METS filename, if not specified + if not package_uuid: + try: + package_uuid = str(uuid.UUID(os.path.basename(filename)[5:41])) + logger.info(f"Parsed UUID {package_uuid} from filename") + except ValueError: + cli.raise_click_error("No package UUID in filename.") + + # Initialize Flask app context + app = cli.create_app_instance(CONFIGS[cli.config_name], db) + + with app.app_context(): + # Create a fetch_job and take note of its ID + datetime_obj_start = datetime.now().replace(microsecond=0) + session_id = str(uuid.uuid4()) + + fetch_job = database_helpers.create_fetch_job( + datetime_obj_start, session_id, ss_id + ) + fetch_job_id = fetch_job.id + + # Import METS file + logger.info("Importing...") + + import_from_mets( + filename, + aip_size, + package_uuid, + ss_id, + location_id, + fetch_job_id, + origin_pipeline_id, + logger, + delete_file=False, + ) + + logger.info("Done.") + + +if __name__ == "__main__": + main()