From 887c3aa3ada43cf6a1921be1e89492759e858416 Mon Sep 17 00:00:00 2001 From: Jeffery Antoniuk Date: Fri, 17 May 2024 17:23:29 -0600 Subject: [PATCH] Add node set gathering plus AIP creation * linked to: https://gitlab.com/calincs/cwrc/leaf/leaf-base-i8/-/commits/review/preservation_v1 --- .dockerignore | 4 ++ .github/workflows/anchore.yml | 50 +++++++++++++++++ .github/workflows/push.yml | 102 ++++++++++++++++++++++++++++++++++ Dockerfile | 25 +++++++++ rootfs/drupal/api.py | 49 ++++++++++++++++ rootfs/drupal/utilities.py | 66 ++++++++++++++++++++++ rootfs/get_node_ids.py | 66 ++++++++++++++++++++++ rootfs/requirements.txt | 1 + rootfs/tests/unit_tests.py | 15 +++++ 9 files changed, 378 insertions(+) create mode 100644 .dockerignore create mode 100644 .github/workflows/anchore.yml create mode 100644 .github/workflows/push.yml create mode 100644 Dockerfile create mode 100644 rootfs/drupal/api.py create mode 100644 rootfs/drupal/utilities.py create mode 100644 rootfs/get_node_ids.py create mode 100644 rootfs/requirements.txt create mode 100644 rootfs/tests/unit_tests.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..704d551 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,4 @@ +Dockerfile +README.md +tests +tests/**/* diff --git a/.github/workflows/anchore.yml b/.github/workflows/anchore.yml new file mode 100644 index 0000000..a950b45 --- /dev/null +++ b/.github/workflows/anchore.yml @@ -0,0 +1,50 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +# This workflow checks out code, builds an image, performs a container image +# vulnerability scan with Anchore's Grype tool, and integrates the results with GitHub Advanced Security +# code scanning feature. For more information on the Anchore scan action usage +# and parameters, see https://github.com/anchore/scan-action. 
For more +# information on Anchore's container image scanning tool Grype, see +# https://github.com/anchore/grype +name: Anchore Grype vulnerability scan + +on: + push: + branches: [ "main" ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ "main" ] + schedule: + - cron: '39 23 * * 1' + +permissions: + contents: read + +jobs: + Anchore-Build-Scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + runs-on: ubuntu-latest + steps: + - name: Check out the code + uses: actions/checkout@v4 + - name: Build the Docker image + run: docker build . --file Dockerfile --tag localbuild/testimage:latest + - name: Run the Anchore Grype scan action + uses: anchore/scan-action@3343887d815d7b07465f6fdcd395bd66508d486a #v3.6.4 + id: scan + with: + image: "localbuild/testimage:latest" + # fail-build: true + fail-build: false + severity-cutoff: critical + - name: Upload vulnerability report + uses: github/codeql-action/upload-sarif@v3 + if: always() + with: + sarif_file: ${{ steps.scan.outputs.sarif }} diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml new file mode 100644 index 0000000..003fdc2 --- /dev/null +++ b/.github/workflows/push.yml @@ -0,0 +1,102 @@ +name: Build and publish + +on: + push: + branches: [ "main" ] + # Publish semver tags as releases. + tags: [ 'v*.*.*' ] + pull_request: + branches: [ "main" ] + +env: + # Use docker.io for Docker Hub if empty + REGISTRY: ghcr.io + # github.repository as / + IMAGE_NAME: ${{ github.repository }} + +jobs: + build: + + runs-on: ubuntu-latest + + permissions: + contents: read + packages: write + # This is used to complete the identity challenge + # with sigstore/fulcio when running outside of PRs. 
+ id-token: write + attestations: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Install the cosign tool except on PR + # https://github.com/sigstore/cosign-installer + #- name: Install cosign + # if: github.event_name != 'pull_request' + # uses: sigstore/cosign-installer@6e04d228eb30da1757ee4e1dd75a0ec73a653e06 #v3.1.1 + # with: + # cosign-release: 'v2.1.1' + + # Set up BuildKit Docker container builder to be able to build + # multi-platform images and export cache + # https://github.com/docker/setup-buildx-action + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@d70bba72b1f3fd22344832f00baa16ece964efeb # v3.3.0 + + # Login against a Docker registry except on PR + # https://github.com/docker/login-action + - name: Log into registry ${{ env.REGISTRY }} + # if: github.event_name != 'pull_request' + uses: docker/login-action@e92390c5fb421da1463c202d546fed0ec5c39f20 # v3.1.0 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # Extract metadata (tags, labels) for Docker + # https://github.com/docker/metadata-action + - name: Extract Docker metadata + id: meta + uses: docker/metadata-action@8e5442c4ef9f78752691e2d8f8d19755c6f78e81 # v5.5.1 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + + # Build and push Docker image with Buildx (don't push on PR) + # https://github.com/docker/build-push-action + - name: Build and push Docker image + id: build-and-push + uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0 + with: + context: . + # push: ${{ github.event_name != 'pull_request' }} + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + # Sign the resulting Docker image digest except on PRs. + # This will only write to the public Rekor transparency log when the Docker + # repository is public to avoid leaking data. 
If you would like to publish + # transparency data even for private images, pass --force to cosign below. + # https://github.com/sigstore/cosign + #- name: Sign the published Docker image + # if: ${{ github.event_name != 'pull_request' }} + # env: + # # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable + # TAGS: ${{ steps.meta.outputs.tags }} + # DIGEST: ${{ steps.build-and-push.outputs.digest }} + # # This step uses the identity token to provision an ephemeral certificate + # # against the sigstore community Fulcio instance. + # run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST} + + # https://docs.github.com/en/actions/publishing-packages/publishing-docker-images + # https://docs.github.com/en/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds + - name: Generate artifact attestation + uses: actions/attest-build-provenance@v1 + with: + subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}} + subject-digest: ${{ steps.build-and-push.outputs.digest }} + push-to-registry: true diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8377b24 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,25 @@ +# syntax=docker/dockerfile:1.7 +ARG BAGGER_REPOSITORY +ARG BAGGER_TAG + +FROM --platform=$BUILDPLATFORM ${BAGGER_REPOSITORY:-ghcr.io/cwrc}/isle-bagger:${BAGGER_TAG:-v0.0.1} + +# Install packages and tools that allow for basic downloads. +RUN --mount=type=cache,id=bagger-apk-${TARGETARCH},sharing=locked,target=/var/cache/apk \ + apk add --no-cache \ + python3 \ + py-pip \ + py3-requests \ + && \ + echo '' > /root/.ash_history + +WORKDIR /var/www/ + +# requries v24+ of Docker +# https://github.com/docker/build-push-action/issues/761 +#COPY --chown=nginx:nginx --link rootfs / +COPY --chown=nginx:nginx rootfs / + +#RUN find /var/www/bagger ! 
# --- rootfs/drupal/api.py ----------------------------------------------------
"""
Drupal API utility functions.

Thin wrappers around a ``requests.Session`` for the Drupal JSON views used by
the preservation scripts.
"""

import requests

from urllib.parse import urljoin


def init_session(args, username, password):
    """Return a ``requests.Session`` authenticated with HTTP Basic auth.

    Parameters:
        args: parsed CLI arguments; currently unused but kept for interface
            compatibility (an explicit ``user/login`` POST against
            ``args.server`` may be reinstated later — see the original
            commented-out code).
        username / password: Drupal credentials.
    """
    session = requests.Session()
    session.auth = (username, password)
    return session


def _get_view(session, server, endpoint):
    """GET a view endpoint relative to ``server``; raise on non-2xx.

    Shared helper for the two timestamp views below (they were previously
    copy-paste duplicates).
    """
    response = session.get(urljoin(server, endpoint))
    response.raise_for_status()
    return response


def get_node_list(session, server, page=0, date_filter=''):
    """Return one page of the preservation node-timestamp view.

    ``date_filter`` restricts results to items changed after the given
    date (server-side ``changed`` filter); empty string means no filter.
    """
    return _get_view(
        session,
        server,
        f"views/preservation_show_node_timestamps?page={page}&changed={date_filter}",
    )


def get_media_list(session, server, page=0, date_filter=''):
    """Return one page of the preservation media-timestamp view."""
    return _get_view(
        session,
        server,
        f"views/preservation_show_media_timestamps?page={page}&changed={date_filter}",
    )


# --- rootfs/drupal/utilities.py ----------------------------------------------
"""
Script utility functions.
"""

import json
import subprocess

from drupal import api as drupalApi


def id_list_from_nodes(session, args):
    """Build ``{nid: {"changed": timestamp}}`` from the paged node view.

    Pages through the view until an empty page is returned.
    NOTE(review): assumes each row has ``nid[0].value`` and
    ``changed[0].value`` — confirm against the view's JSON display config.
    """
    node_list = {}
    page = 0
    while True:
        # Fix: the original reused the name `node` for both the HTTP
        # response and the per-row loop variable.
        response = drupalApi.get_node_list(session, args.server, page, args.date)
        rows = json.loads(response.content)
        if not rows:
            break
        for row in rows:
            node_list[row["nid"][0]["value"]] = {"changed": row["changed"][0]["value"]}
        page += 1
    return node_list


def id_list_merge_with_media(session, args, node_list):
    """Merge media changes into ``node_list`` in place.

    Media edits do not bump the parent node's revision timestamp, so media
    are queried separately; a media item's parent node is added (or its
    ``changed`` stamp raised) when the media changed more recently.  Media
    not attached to any node (no ``field_media_of`` target) are skipped.
    Returns ``None``; mutates ``node_list``.
    """
    page = 0
    while True:
        response = drupalApi.get_media_list(session, args.server, page, args.date)
        media_rows = json.loads(response.content)
        if not media_rows:
            break
        for media in media_rows:
            media_of = None
            if "field_media_of" in media and len(media["field_media_of"]) >= 1 \
                    and "target_id" in media["field_media_of"][0]:
                media_of = media["field_media_of"][0]["target_id"]
            media_changed = media["changed"][0]["value"] if "changed" in media else None
            if media_of is None or media_changed is None:
                continue
            # Add the parent node, or refresh its timestamp if the media
            # changed after the node itself last changed.
            if media_of not in node_list or node_list[media_of]["changed"] < media_changed:
                node_list[media_of] = {"changed": media_changed}
        page += 1


def create_aip(node_list, bagger_app_path):
    """Create an archival information package (AIP) per node id.

    Runs (per node, from ``bagger_app_path``):
        ./bin/console app:islandora_bagger:create_bag -vvv \
            --settings=var/sample_per_bag_config.yaml --node=<nid>

    Raises ``subprocess.CalledProcessError`` if the bagger exits non-zero.
    """
    for nid in node_list:
        # Fix: the original formatted f'--node={node.key}', but iterating a
        # dict yields plain keys (the node ids) which have no `.key`
        # attribute — that raised AttributeError on the first node.
        subprocess.run(
            [
                './bin/console',
                'app:islandora_bagger:create_bag',
                '-vvv',
                '--settings=var/sample_per_bag_config.yaml',
                f'--node={nid}',
            ],
            stdout=subprocess.PIPE,
            check=True,
            cwd=bagger_app_path,
        )
##############################################################################
# desc: connect to a Drupal instance, get a list of Drupal Nodes and Media
#       that have changed since a supplied date and return a list of Drupal
#       Nodes (e.g., to preserve in an AIP - archival information package)
# usage: python3 get_node_ids.py --server ${server_name} \
#            --output ${output_path} --date '2024-05-16T16:51:52'
# license: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
# date: June 15, 2022
##############################################################################

from getpass import getpass
from time import sleep  # TODO: honour --wait between API calls
import argparse
import json
import logging
import os

from drupal import api as drupalApi
from drupal import utilities as drupalUtilities


def parse_args():
    """Parse and return command-line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--server', required=True, help='Servername.')
    parser.add_argument('--output', required=True, help='Location to store JSON (like) output file.')
    parser.add_argument('--date', required=False, help='Items changed after the given date.')
    parser.add_argument('--wait', required=False, help='Time to wait between API calls.', type=float, default=0.1)
    parser.add_argument('--logging_level', required=False, help='Logging level.', default=logging.WARNING)
    return parser.parse_args()


def process(args, session, output_file):
    """Gather changed node ids, write them as JSON, then build AIPs.

    Parameters:
        args: parsed CLI arguments (plus ``BAGGER_APP_PATH`` set by main()).
        session: authenticated requests session against ``args.server``.
        output_file: open text handle receiving the JSON node list.
    """
    # get a list of Drupal Node IDs changed since a given optional date
    node_list = drupalUtilities.id_list_from_nodes(session, args)
    logging.debug("node list after node pass: %s", node_list)

    # inspect Drupal Media for changes: a Media change does not transitively
    # change the associated Node's changed timestamp, so if Media changed
    # then add the associated Node ID to the list
    drupalUtilities.id_list_merge_with_media(session, args, node_list)
    logging.debug("node list after media merge: %s", node_list)

    # Fix: the original opened --output but never wrote to it; persist the
    # gathered list so downstream steps (and humans) can inspect it.
    json.dump(node_list, output_file)

    # create archival information packages
    drupalUtilities.create_aip(node_list, args.BAGGER_APP_PATH)

    # TODO: upload archival information packages


def main():
    """Entry point: parse args, authenticate, and run the pipeline."""
    args = parse_args()

    # Fix: the original used item assignment (args['BAGGER_APP_PATH']),
    # which raises TypeError — argparse.Namespace only supports attributes.
    args.BAGGER_APP_PATH = os.getenv('BAGGER_APP_PATH')

    # Fix: --logging_level was parsed but never applied.
    logging.basicConfig(level=args.logging_level)

    username = input('Username:')
    password = getpass('Password:')

    session = drupalApi.init_session(args, username, password)

    with open(args.output, 'wt', encoding="utf-8", newline='') as output_file:
        process(args, session, output_file)


if __name__ == "__main__":
    main()