Add CLI tool to import AIP from METS file (#355)
Added a CLI tool to import AIP-related data from a METS file.
mcantelon committed Nov 24, 2024
1 parent 911875f commit c39cff2
Showing 4 changed files with 180 additions and 67 deletions.
79 changes: 79 additions & 0 deletions AIPscan/Aggregator/mets_parse_helpers.py
@@ -3,15 +3,20 @@
"""Collects a number of functions that aid in the retrieval of
information from an AIP METS file.
"""
import os

import lxml
import metsrw
import requests

from AIPscan.Aggregator import database_helpers
from AIPscan.Aggregator.task_helpers import (
create_numbered_subdirs,
get_mets_url,
write_mets,
)
from AIPscan.helpers import file_sha256_hash
from AIPscan.models import AIP


class METSError(Exception):
@@ -101,3 +106,77 @@ def download_mets(
download_file = write_mets(mets_response, package_uuid, numbered_subdir)

return download_file


def import_from_mets(
filename,
aip_size,
package_uuid,
storage_service_id,
storage_location_id,
fetch_job_id,
origin_pipeline_id,
logger,
delete_file=False,
):
mets_name = os.path.basename(filename)
mets_hash = file_sha256_hash(filename)

# If METS file's hash matches an existing value, this is a duplicate of an
# existing AIP and we can safely ignore it.
matching_aip = AIP.query.filter_by(mets_sha256=mets_hash).first()
if matching_aip is not None:
logger.info(
"Skipping METS file {} - identical to existing record".format(mets_name)
)
try:
if delete_file:
os.remove(filename)
except OSError as err:
logger.warning("Unable to delete METS file: {}".format(err))
return

logger.info("Processing METS file {}".format(mets_name))

try:
mets = parse_mets_with_metsrw(filename)
except METSError:
# An error we need to log and report back to the user.
return

try:
original_name = get_aip_original_name(mets)
except METSError:
# Some other error with the METS file that we might want to
# log and act upon.
original_name = package_uuid

# Delete records of any previous versions of this AIP, which will shortly
# be replaced by new records from the updated METS.
previous_aips = AIP.query.filter_by(uuid=package_uuid).all()
for previous_aip in previous_aips:
logger.info(
"Deleting record for AIP {} to replace from newer METS".format(package_uuid)
)
database_helpers.delete_aip_object(previous_aip)

aip = database_helpers.create_aip_object(
package_uuid=package_uuid,
transfer_name=original_name,
create_date=mets.createdate,
mets_sha256=mets_hash,
size=aip_size,
storage_service_id=storage_service_id,
storage_location_id=storage_location_id,
fetch_job_id=fetch_job_id,
origin_pipeline_id=origin_pipeline_id,
)

database_helpers.process_aip_data(aip, mets)

# Delete METS file.
if delete_file:
try:
os.remove(filename)
except OSError as err:
logger.warning("Unable to delete METS file: {}".format(err))
75 changes: 11 additions & 64 deletions AIPscan/Aggregator/tasks.py
@@ -10,20 +10,14 @@
from AIPscan import db, typesense_helpers
from AIPscan.Aggregator import database_helpers
from AIPscan.Aggregator.celery_helpers import write_celery_update
from AIPscan.Aggregator.mets_parse_helpers import (
METSError,
download_mets,
get_aip_original_name,
parse_mets_with_metsrw,
)
from AIPscan.Aggregator.mets_parse_helpers import download_mets, import_from_mets
from AIPscan.Aggregator.task_helpers import (
format_api_url_with_limit_offset,
parse_package_list_file,
process_package_object,
summarize_fetch_job_results,
)
from AIPscan.extensions import celery
from AIPscan.helpers import file_sha256_hash
from AIPscan.models import (
AIP,
Agent,
@@ -320,66 +314,19 @@ def get_mets(
timestamp_str,
package_list_no,
)
mets_name = os.path.basename(download_file)
mets_hash = file_sha256_hash(download_file)

# If METS file's hash matches an existing value, this is a duplicate of an
# existing AIP and we can safely ignore it.
matching_aip = AIP.query.filter_by(mets_sha256=mets_hash).first()
if matching_aip is not None:
tasklogger.info(
"Skipping METS file {} - identical to existing record".format(mets_name)
)
try:
os.remove(download_file)
except OSError as err:
tasklogger.warning("Unable to delete METS file: {}".format(err))
return

tasklogger.info("Processing METS file {}".format(mets_name))

try:
mets = parse_mets_with_metsrw(download_file)
except METSError:
# An error we need to log and report back to the user.
return

try:
original_name = get_aip_original_name(mets)
except METSError:
# Some other error with the METS file that we might want to
# log and act upon.
original_name = package_uuid

# Delete records of any previous versions of this AIP, which will shortly
# be replaced by new records from the updated METS.
previous_aips = AIP.query.filter_by(uuid=package_uuid).all()
for previous_aip in previous_aips:
tasklogger.info(
"Deleting record for AIP {} to replace from newer METS".format(package_uuid)
)
database_helpers.delete_aip_object(previous_aip)

aip = database_helpers.create_aip_object(
package_uuid=package_uuid,
transfer_name=original_name,
create_date=mets.createdate,
mets_sha256=mets_hash,
size=aip_size,
storage_service_id=storage_service_id,
storage_location_id=storage_location_id,
fetch_job_id=fetch_job_id,
origin_pipeline_id=origin_pipeline_id,
import_from_mets(
download_file,
aip_size,
package_uuid,
storage_service_id,
storage_location_id,
fetch_job_id,
origin_pipeline_id,
tasklogger,
delete_file=True,
)

database_helpers.process_aip_data(aip, mets)

# Delete downloaded METS file.
try:
os.remove(download_file)
except OSError as err:
tasklogger.warning("Unable to delete METS file: {}".format(err))


@celery.task()
def delete_fetch_job(fetch_job_id):
9 changes: 6 additions & 3 deletions tools/app/cli.py
@@ -20,9 +20,12 @@ def create_app_instance(configuration, db):
return app


def log_and_raise_click_error(logger, message):
logger.critical(message)

def raise_click_error(message):
err = click.ClickException(message)
err.exit_code = 1
raise err


def log_and_raise_click_error(logger, message):
logger.critical(message)
raise_click_error(message)
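The cli.py change splits the old helper in two: raise_click_error can now be called before any logger exists (the new tool uses it when the METS file is missing), while log_and_raise_click_error keeps its previous behaviour by logging and then delegating. A minimal sketch of a command using both helpers, assuming the tools' app.cli module is importable as in the script below; the command, argument, and messages here are hypothetical, not part of the commit:

import logging
import pathlib

import click

from app import cli


@click.command()
@click.argument("filename")
def check(filename):
    # Before logging is configured, fail with a plain Click error (exit code 1).
    if not pathlib.Path(filename).exists():
        cli.raise_click_error("File does not exist.")

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("check")

    # Once a logger is available, log the problem and raise in one call.
    if pathlib.Path(filename).suffix != ".xml":
        cli.log_and_raise_click_error(logger, "Expected an XML file.")

    logger.info("File looks OK: %s", filename)


if __name__ == "__main__":
    check()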
84 changes: 84 additions & 0 deletions tools/import_mets
@@ -0,0 +1,84 @@
#!/usr/bin/env python3
import logging
import os
import pathlib
import sys
import uuid
from datetime import datetime

import click
from app import cli

from AIPscan import db
from AIPscan.Aggregator import database_helpers
from AIPscan.Aggregator.mets_parse_helpers import import_from_mets
from config import CONFIGS


@click.command()
@click.option("--ss-id", "-s", required=True, help="Storage service ID.", type=int)
@click.option(
"--location-id", "-l", required=True, help="Storage location ID.", type=int
)
@click.option("--aip-size", "-a", required=True, help="AIP size.", type=int)
@click.option(
"--origin-pipeline-id", "-o", required=True, help="Origin pipeline UUID.", type=str
)
@click.option("--package-uuid", "-u", help="Package UUID.", type=str)
@click.option("--verbose", "-v", is_flag=True, help="Show debug messages.", type=bool)
@click.argument("filename")
def main(ss_id, location_id, aip_size, origin_pipeline_id, package_uuid, verbose, filename):
# Check if METS file exists
if not pathlib.Path(filename).exists():
cli.raise_click_error("METS file does not exist.")

# Log to screen
logger_name = pathlib.PurePosixPath(sys.argv[0]).name
logger = logging.getLogger(logger_name)

if verbose:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)

# Try to parse package UUID from METS filename, if not specified
if not package_uuid:
try:
package_uuid = str(uuid.UUID(os.path.basename(filename)[5:41]))
logger.info(f"Parsed UUID {package_uuid} from filename")
except ValueError:
cli.raise_click_error("No package UUID in filename.")

# Initialize Flask app context
app = cli.create_app_instance(CONFIGS[cli.config_name], db)

with app.app_context():
# Create a fetch_job and take note of its ID
datetime_obj_start = datetime.now().replace(microsecond=0)
session_id = str(uuid.uuid4())

fetch_job = database_helpers.create_fetch_job(
datetime_obj_start, session_id, ss_id
)
fetch_job_id = fetch_job.id

# Import METS file
logger.info("Importing...")

import_from_mets(
filename,
aip_size,
package_uuid,
ss_id,
location_id,
fetch_job_id,
origin_pipeline_id,
logger,
delete_file=False,
)

logger.info("Done.")


if __name__ == "__main__":
main()
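The [5:41] slice in the script presumably relies on the Archivematica convention of naming an AIP's METS file METS.<UUID>.xml: the five-character "METS." prefix is followed by the 36-character package UUID. A small illustration of that parse, with a made-up UUID:

import os
import uuid

# Example filename following the METS.<UUID>.xml pattern (UUID is invented).
filename = "/tmp/METS.4a9a1179-2b78-4a29-b6a1-0c7e8e3a6f7b.xml"

# Characters 5 through 40 of the basename are the 36-character package UUID;
# uuid.UUID() validates the value before it is used.
package_uuid = str(uuid.UUID(os.path.basename(filename)[5:41]))
print(package_uuid)  # 4a9a1179-2b78-4a29-b6a1-0c7e8e3a6f7b

With a METS file named that way, an invocation would look roughly like ./tools/import_mets -s 1 -l 1 -a 1048576 -o <pipeline-uuid> /path/to/METS.<uuid>.xml, where every value is a placeholder.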
