Skip to content

Commit

Permalink
Add node set gathering plus AIP creation
Browse files Browse the repository at this point in the history
  • Loading branch information
jefferya committed May 17, 2024
1 parent b1f56cc commit 887c3aa
Show file tree
Hide file tree
Showing 9 changed files with 378 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Dockerfile
README.md
tests
tests/**/*
50 changes: 50 additions & 0 deletions .github/workflows/anchore.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# This workflow checks out code, builds an image, performs a container image
# vulnerability scan with Anchore's Grype tool, and integrates the results with GitHub Advanced Security
# code scanning feature. For more information on the Anchore scan action usage
# and parameters, see https://github.com/anchore/scan-action. For more
# information on Anchore's container image scanning tool Grype, see
# https://github.com/anchore/grype
name: Anchore Grype vulnerability scan

on:
  push:
    branches: [ "main" ]
  pull_request:
    # The branches below must be a subset of the branches above
    branches: [ "main" ]
  schedule:
    - cron: '39 23 * * 1'

# Default to read-only; the job grants itself the extra scopes it needs below.
permissions:
  contents: read

jobs:
  Anchore-Build-Scan:
    permissions:
      contents: read # for actions/checkout to fetch code
      security-events: write # for github/codeql-action/upload-sarif to upload SARIF results
      actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status
    runs-on: ubuntu-latest
    steps:
      - name: Check out the code
        uses: actions/checkout@v4
      - name: Build the Docker image
        run: docker build . --file Dockerfile --tag localbuild/testimage:latest
      - name: Run the Anchore Grype scan action
        uses: anchore/scan-action@3343887d815d7b07465f6fdcd395bd66508d486a #v3.6.4
        id: scan
        with:
          image: "localbuild/testimage:latest"
          # fail-build: true
          fail-build: false
          severity-cutoff: critical
      - name: Upload vulnerability report
        # if: always() so the SARIF report is uploaded even when the scan fails
        uses: github/codeql-action/upload-sarif@v3
        if: always()
        with:
          sarif_file: ${{ steps.scan.outputs.sarif }}
102 changes: 102 additions & 0 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
name: Build and publish

on:
  push:
    branches: [ "main" ]
    # Publish semver tags as releases.
    tags: [ 'v*.*.*' ]
  pull_request:
    branches: [ "main" ]

env:
  # Use docker.io for Docker Hub if empty
  REGISTRY: ghcr.io
  # github.repository as <account>/<repo>
  IMAGE_NAME: ${{ github.repository }}

jobs:
  build:

    runs-on: ubuntu-latest

    permissions:
      contents: read
      packages: write
      # This is used to complete the identity challenge
      # with sigstore/fulcio when running outside of PRs.
      id-token: write
      attestations: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Install the cosign tool except on PR
      # https://github.com/sigstore/cosign-installer
      #- name: Install cosign
      #  if: github.event_name != 'pull_request'
      #  uses: sigstore/cosign-installer@6e04d228eb30da1757ee4e1dd75a0ec73a653e06 #v3.1.1
      #  with:
      #    cosign-release: 'v2.1.1'

      # Set up BuildKit Docker container builder to be able to build
      # multi-platform images and export cache
      # https://github.com/docker/setup-buildx-action
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@d70bba72b1f3fd22344832f00baa16ece964efeb # v3.3.0

      # Login against a Docker registry except on PR
      # https://github.com/docker/login-action
      - name: Log into registry ${{ env.REGISTRY }}
        # NOTE(review): the PR guard below is commented out, so pull_request
        # runs also log in and push; fork PRs lack the token scopes for this —
        # confirm this is intended.
        # if: github.event_name != 'pull_request'
        uses: docker/login-action@e92390c5fb421da1463c202d546fed0ec5c39f20 # v3.1.0
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Extract metadata (tags, labels) for Docker
      # https://github.com/docker/metadata-action
      - name: Extract Docker metadata
        id: meta
        uses: docker/metadata-action@8e5442c4ef9f78752691e2d8f8d19755c6f78e81 # v5.5.1
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push Docker image with Buildx (don't push on PR)
      # https://github.com/docker/build-push-action
      - name: Build and push Docker image
        id: build-and-push
        uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
        with:
          context: .
          # push: ${{ github.event_name != 'pull_request' }}
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

      # Sign the resulting Docker image digest except on PRs.
      # This will only write to the public Rekor transparency log when the Docker
      # repository is public to avoid leaking data. If you would like to publish
      # transparency data even for private images, pass --force to cosign below.
      # https://github.com/sigstore/cosign
      #- name: Sign the published Docker image
      #  if: ${{ github.event_name != 'pull_request' }}
      #  env:
      #    # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable
      #    TAGS: ${{ steps.meta.outputs.tags }}
      #    DIGEST: ${{ steps.build-and-push.outputs.digest }}
      #  # This step uses the identity token to provision an ephemeral certificate
      #  # against the sigstore community Fulcio instance.
      #  run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST}

      # https://docs.github.com/en/actions/publishing-packages/publishing-docker-images
      # https://docs.github.com/en/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds
      - name: Generate artifact attestation
        uses: actions/attest-build-provenance@v1
        with:
          subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          subject-digest: ${{ steps.build-and-push.outputs.digest }}
          push-to-registry: true
25 changes: 25 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# syntax=docker/dockerfile:1.7
# Build-time overrides for the base image repository and tag.
ARG BAGGER_REPOSITORY
ARG BAGGER_TAG

# Base image: isle-bagger (defaults: ghcr.io/cwrc, tag v0.0.1).
FROM --platform=$BUILDPLATFORM ${BAGGER_REPOSITORY:-ghcr.io/cwrc}/isle-bagger:${BAGGER_TAG:-v0.0.1}

# Install packages and tools that allow for basic downloads.
# The apk cache mount is keyed per target arch and shared (locked) across builds.
RUN --mount=type=cache,id=bagger-apk-${TARGETARCH},sharing=locked,target=/var/cache/apk \
apk add --no-cache \
python3 \
py-pip \
py3-requests \
&& \
echo '' > /root/.ash_history

WORKDIR /var/www/

# requires v24+ of Docker
# https://github.com/docker/build-push-action/issues/761
#COPY --chown=nginx:nginx --link rootfs /
COPY --chown=nginx:nginx rootfs /

#RUN find /var/www/bagger ! -user nginx -exec chown nginx:ng

#RUN pip install -r requirements.txt --user
49 changes: 49 additions & 0 deletions rootfs/drupal/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""
Drupal API utility functions
"""

import requests

from urllib.parse import urljoin


# initialize a session with API endpoint
def init_session(args, username, password):
    """Return a requests.Session preconfigured with HTTP Basic credentials.

    :param args: parsed CLI arguments (unused here; kept for interface parity)
    :param username: Drupal account name
    :param password: Drupal account password
    :return: requests.Session with ``.auth`` set
    """
    # NOTE: a cookie-based /user/login?_format=json flow was present but
    # disabled in the original; Basic auth on every request is used instead.
    api_session = requests.Session()
    api_session.auth = (username, password)
    return api_session


#
def get_node_list(session, server, page=0, date_filter=''):
    """Fetch one page of the node-timestamp preservation view.

    :param session: authenticated requests.Session
    :param server: base URL of the Drupal server
    :param page: zero-based results page
    :param date_filter: value for the view's ``changed`` filter ('' = no filter)
    :return: raw requests.Response
    :raises requests.HTTPError: on a non-2xx status
    """
    endpoint = f"views/preservation_show_node_timestamps?page={page}&changed={date_filter}"
    resp = session.get(urljoin(server, endpoint))
    resp.raise_for_status()
    return resp

#
def get_media_list(session, server, page=0, date_filter=''):
    """Fetch one page of the media-timestamp preservation view.

    Same contract as get_node_list, but for Drupal Media entities.
    """
    endpoint = f"views/preservation_show_media_timestamps?page={page}&changed={date_filter}"
    resp = session.get(urljoin(server, endpoint))
    resp.raise_for_status()
    return resp
66 changes: 66 additions & 0 deletions rootfs/drupal/utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""
Script utility functions
"""

import json
import subprocess

from drupal import api as drupalApi

# build list of ids from Drupal Nodes
def id_list_from_nodes(session, args):
    """Build ``{node_id: {"changed": timestamp}}`` for changed Drupal nodes.

    Pages through the preservation view (filtered by ``args.date``) until an
    empty page is returned.

    :param session: authenticated API session
    :param args: parsed CLI arguments providing ``server`` and ``date``
    :return: dict keyed by node id
    """
    changed_by_nid = {}
    page = 0
    while True:
        response = drupalApi.get_node_list(session, args.server, page, args.date)
        rows = json.loads(response.content)
        if not rows:
            break
        for row in rows:
            # view rows expose nid/changed as single-element [{"value": ...}] lists
            changed_by_nid[row["nid"][0]["value"]] = {"changed": row["changed"][0]["value"]}
        page += 1
    return changed_by_nid


# query media as media changes are not reflected as node revisions
# exclude Drupal Media not attached to a Drupal Node
def id_list_merge_with_media(session, args, node_list):
    """Fold media changes into ``node_list`` (mutated in place).

    Media edits are not reflected as node revisions, so media are queried
    separately; media not attached to a Drupal Node are excluded.

    :param session: authenticated API session
    :param args: parsed CLI arguments providing ``server`` and ``date``
    :param node_list: ``{node_id: {"changed": timestamp}}`` to update in place
    """
    page = 0
    while True:
        response = drupalApi.get_media_list(session, args.server, page, args.date)
        rows = json.loads(response.content)
        if not rows:
            break
        for row in rows:
            parent_nid = None
            field = row["field_media_of"] if "field_media_of" in row else []
            if len(field) >= 1 and "target_id" in field[0]:
                parent_nid = field[0]["target_id"]
            changed = row["changed"][0]["value"] if "changed" in row else None
            if parent_nid is None or changed is None:
                # orphaned media (or media without a timestamp) is not preserved
                continue
            if parent_nid not in node_list or node_list[parent_nid]["changed"] < changed:
                # media changed more recently than the parent node record
                node_list[parent_nid] = {"changed": changed}
        page += 1

# create archival information package
def create_aip(node_list, bagger_app_path):
    """Create an archival information package (AIP) for each node id.

    Runs the islandora_bagger console command once per node, from inside the
    bagger app directory, equivalent to:
      cd ${BAGGER_APP_DIR} && ./bin/console app:islandora_bagger:create_bag \
          -vvv --settings=var/sample_per_bag_config.yaml --node=1

    :param node_list: mapping of node id -> {"changed": timestamp}
    :param bagger_app_path: directory containing the bagger ./bin/console app
    :raises subprocess.CalledProcessError: if a bagger invocation fails (check=True)
    """
    for nid in node_list:
        # BUG FIX: iterating a dict yields its keys directly; the original
        # `node.key` raised AttributeError on the first node. Use the key value.
        subprocess.run(
            ['./bin/console', 'app:islandora_bagger:create_bag', '-vvv',
             '--settings=var/sample_per_bag_config.yaml', f'--node={nid}'],
            stdout=subprocess.PIPE,
            check=True,
            cwd=bagger_app_path
        )
66 changes: 66 additions & 0 deletions rootfs/get_node_ids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
##############################################################################################
# desc: connect to a Drupal instance, get a list of Drupal Nodes and Media that have changed
# since a supplied date and return a list of Drupal Nodes (e.g., to preserve in an
# AIP - archival information package)
# usage: python3 get_node_ids.py --server ${server_name} --output ${output_path} --date '2024-05-16T16:51:52'
# license: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
# date: June 15, 2022
##############################################################################################

from getpass import getpass
from time import sleep
import argparse
import json
import logging
import os

from drupal import api as drupalApi
from drupal import utilities as drupalUtilities

#
def parse_args():
    """Define and parse the command-line interface for this script."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--server', required=True, help='Servername.')
    arg_parser.add_argument('--output', required=True, help='Location to store JSON (like) output file.')
    arg_parser.add_argument('--date', required=False, help='Items changed after the given date.')
    arg_parser.add_argument('--wait', required=False, help='Time to wait between API calls.', type=float, default=0.1)
    arg_parser.add_argument('--logging_level', required=False, help='Logging level.', default=logging.WARNING)
    return arg_parser.parse_args()


#
def process(args, session, output_file):
    """Gather the set of changed node ids and create an AIP for each.

    :param args: parsed CLI arguments (server, date, BAGGER_APP_PATH, ...)
    :param session: authenticated API session
    :param output_file: open output handle (accepted but not written here)
    """
    # node ids changed since the (optional) --date, keyed by node id
    node_list = drupalUtilities.id_list_from_nodes(session, args)
    print(node_list)

    # A Media change does not transitively update the associated Node's
    # changed timestamp, so merge in node ids whose attached media changed.
    drupalUtilities.id_list_merge_with_media(session, args, node_list)
    print(node_list)

    # create archival information packages
    drupalUtilities.create_aip(node_list, args.BAGGER_APP_PATH)

# upload archival information packages
#
def main():
    """Entry point: collect credentials, query Drupal, and build AIPs."""
    args = parse_args()
    # BUG FIX: argparse.Namespace does not support item assignment —
    # `args['BAGGER_APP_PATH'] = ...` raised TypeError. Set the attribute
    # instead, matching the `args.BAGGER_APP_PATH` read in process().
    args.BAGGER_APP_PATH = os.getenv('BAGGER_APP_PATH')

    username = input('Username:')
    password = getpass('Password:')

    session = drupalApi.init_session(args, username, password)

    with open(args.output, 'wt', encoding="utf-8", newline='') as output_file:
        process(args, session, output_file)


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions rootfs/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
requests>=2.31
15 changes: 15 additions & 0 deletions rootfs/tests/unit_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
""" Very quickly written unit tests for a one-time script
"""


import csv
import os
import pytest
import pytest_mock
import shutil
import sys

from swiftclient.service import ClientException, SwiftError, SwiftService, SwiftUploadObject

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

0 comments on commit 887c3aa

Please sign in to comment.