fix json to parquet trigger cron #853
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: upload-and-deploy | |
on: | |
push: | |
branches: "*" | |
tags-ignore: "*" | |
env: | |
NAMESPACE: main | |
PYTHON_VERSION: 3.9 | |
DEV_INPUT_BUCKET: recover-dev-input-data | |
DEV_RAW_BUCKET: recover-dev-raw-data | |
DEV_INTERMEDIATE_BUCKET: recover-dev-intermediate-data | |
DEV_PROCESSED_BUCKET: recover-dev-processed-data | |
PROD_INPUT_BUCKET: recover-input-data | |
PROD_RAW_BUCKET: recover-raw-data | |
PROD_INTERMEDIATE_BUCKET: recover-intermediate-data | |
INTEGRATION_TEST_NUM_EXPORTS: 28 | |
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true | |
jobs: | |
pre-commit: | |
name: Run pre-commit hooks against all files | |
runs-on: ubuntu-latest | |
steps: | |
- uses: actions/checkout@v3 | |
- uses: actions/setup-python@v5 | |
with: | |
python-version: ${{ env.PYTHON_VERSION }} | |
- uses: pre-commit/[email protected] | |
deploy-snowflake: | |
name: Deploy Snowflake resources | |
needs: pre-commit | |
runs-on: ubuntu-latest | |
env: | |
PRIVATE_KEY_PASSPHRASE: ${{ secrets.SNOWFLAKE_PRIVATE_KEY_PASSPHRASE }} | |
SNOWFLAKE_ENVIRONMENT: ${{ github.ref_name == 'main' && 'staging' || github.ref_name }} | |
steps: | |
- uses: actions/checkout@v3 | |
- name: Configure Snowflake connection | |
run: | | |
# Create temporary files for config.toml and our private key | |
config_file=$(mktemp) | |
private_key_file=$(mktemp) | |
# Write to the private key file | |
echo "${{ secrets.SNOWFLAKE_PRIVATE_KEY }}" > $private_key_file | |
# Write to config.toml file | |
echo 'default_connection_name = "recover"' >> $config_file | |
echo '[connections.recover]' >> $config_file | |
echo "account = \"${{ vars.SNOWFLAKE_ACCOUNT }}\"" >> $config_file | |
echo "user = \"${{ vars.SNOWFLAKE_USER }}\"" >> $config_file | |
echo "role = \"${{ vars.SNOWFLAKE_ROLE }}\"" >> $config_file | |
echo 'warehouse = "RECOVER_XSMALL"' >> $config_file | |
echo 'authenticator = "SNOWFLAKE_JWT"' >> $config_file | |
echo "private_key_path = \"$private_key_file\"" >> $config_file | |
# Write config.toml path to global environment | |
echo "SNOWFLAKE_CONFIG_PATH=$config_file" >> $GITHUB_ENV | |
- name: Configuration file information | |
run: | | |
echo "Snowflake configuration is located at $SNOWFLAKE_CONFIG_PATH" | |
cat $SNOWFLAKE_CONFIG_PATH | |
- name: Install Snowflake CLI | |
uses: Snowflake-Labs/[email protected] | |
with: | |
default-config-file-path: ${{ env.SNOWFLAKE_CONFIG_PATH }} | |
- name: Test Snowflake connection | |
run: | | |
snow --version | |
snow connection test | |
- name: Deploy Snowflake objects | |
run: | | |
snow sql \ | |
-D "git_branch=$GITHUB_REF_NAME" \ | |
-D "environment=$SNOWFLAKE_ENVIRONMENT" \ | |
-f snowflake/objects/deploy.sql | |
upload-files: | |
name: Upload files to S3 bucket in development | |
runs-on: ubuntu-latest | |
needs: pre-commit | |
# These permissions are needed to interact with GitHub's OIDC Token endpoint. | |
permissions: | |
id-token: write | |
contents: read | |
environment: develop | |
steps: | |
- name: Setup code, pipenv, aws | |
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3 | |
with: | |
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }} | |
role_session_name: GitHubActions-${{ github.repository_owner }}-${{ github.event.repository.name }}-${{ github.run_id }} | |
python_version: ${{ env.PYTHON_VERSION }} | |
- name: Setup sam | |
uses: aws-actions/setup-sam@v2 | |
- name: Set namespace for non-default branch | |
if: github.ref_name != 'main' | |
run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV | |
- name: Copy files to templates bucket, use dev cloudformation bucket | |
run: > | |
python src/scripts/manage_artifacts/artifacts.py | |
--upload | |
--namespace $NAMESPACE | |
--cfn_bucket ${{ vars.CFN_BUCKET }} | |
--shareable-artifacts-bucket ${{ vars.SHAREABLE_ARTIFACTS_BUCKET }} | |
nonglue-unit-tests: | |
name: Runs unit tests that are not dependent on aws-glue package resources | |
runs-on: ubuntu-latest | |
needs: pre-commit | |
# These permissions are needed to interact with GitHub's OIDC Token endpoint. | |
permissions: | |
id-token: write | |
contents: read | |
environment: develop | |
steps: | |
- name: Setup code, pipenv, aws | |
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3 | |
with: | |
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }} | |
role_session_name: ${{ github.event.repository.name }}-${{ github.run_id }}-nonglue-unit-tests | |
python_version: ${{ env.PYTHON_VERSION }} | |
- name: Install additional python dependency | |
run: | | |
pipenv install ecs_logging~=2.0 | |
pipenv install pytest-datadir | |
- name: Test scripts with pytest (lambda, etc.) | |
run: | | |
pipenv run python -m pytest -v \ | |
tests/test_s3_event_config_lambda.py \ | |
tests/test_s3_to_glue_lambda.py \ | |
tests/test_lambda_dispatch.py \ | |
tests/test_consume_logs.py \ | |
tests/test_lambda_raw.py \ | |
tests/test_lambda_raw_sync.py | |
- name: Test dev synapse folders for STS access with pytest | |
run: > | |
pipenv run python -m pytest tests/test_setup_external_storage.py | |
--test-bucket $DEV_INPUT_BUCKET | |
--test-synapse-folder-id syn51758510 | |
--namespace $NAMESPACE | |
--test-sts-permission read_only | |
-v | |
pipenv run python -m pytest tests/test_setup_external_storage.py | |
--test-bucket $DEV_PROCESSED_BUCKET | |
--test-synapse-folder-id syn51084525 | |
--namespace $NAMESPACE/parquet | |
--test-sts-permission read_write | |
-v | |
pytest-docker: | |
name: Build and push testing docker images to the pytest ECR repository. | |
needs: pre-commit | |
runs-on: ubuntu-latest | |
# These permissions are needed to interact with GitHub's OIDC Token endpoint. | |
permissions: | |
id-token: write | |
contents: read | |
strategy: | |
matrix: | |
include: | |
- tag_name: aws_glue_3 | |
dockerfile: tests/Dockerfile.aws_glue_3 | |
- tag_name: aws_glue_4 | |
dockerfile: tests/Dockerfile.aws_glue_4 | |
environment: develop | |
steps: | |
- name: Assume AWS role | |
uses: aws-actions/configure-aws-credentials@v4 | |
with: | |
role-to-assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }} | |
aws-region: "us-east-1" | |
# unmasking of the AWS account ID allows the acct id to pass through outputs | |
mask-aws-account-id: "no" | |
- name: Login to Amazon ECR | |
id: login-ecr | |
uses: aws-actions/amazon-ecr-login@v1 | |
- name: Get ECR secret names | |
id: ecr | |
run: | | |
usernameKey=docker_username_$(echo ${{ steps.login-ecr.outputs.registry }} | tr '.-' _) | |
echo "username-key=$usernameKey" >> $GITHUB_OUTPUT | |
passwordKey=docker_password_$(echo ${{ steps.login-ecr.outputs.registry }} | tr '.-' _) | |
echo "password-key=$passwordKey" >> $GITHUB_OUTPUT | |
- uses: actions/checkout@v3 | |
- name: Set up Docker Buildx | |
uses: docker/setup-buildx-action@v2 | |
- name: Build and push to ECR | |
id: docker-build-push | |
uses: docker/build-push-action@v4 | |
with: | |
push: true | |
tags: ${{ steps.login-ecr.outputs.registry }}/pytest:${{ github.ref_name }}_${{ matrix.tag_name }} | |
file: ${{ matrix.dockerfile }} | |
cache-from: type=local,src=/tmp/.buildx-cache | |
cache-to: type=local,dest=/tmp/.buildx-cache | |
outputs: | |
ecr-registry: ${{ steps.login-ecr.outputs.registry }} | |
ecr-username: ${{ steps.login-ecr.outputs[steps.ecr.outputs.username-key] }} | |
ecr-password: ${{ steps.login-ecr.outputs[steps.ecr.outputs.password-key] }} | |
glue-unit-tests: | |
name: Run Pytest unit tests for AWS glue | |
needs: pytest-docker | |
environment: develop | |
runs-on: ubuntu-latest | |
# These permissions are needed to interact with GitHub's OIDC Token endpoint. | |
permissions: | |
id-token: write | |
contents: read | |
strategy: | |
matrix: | |
tag_name: | |
["aws_glue_3", "aws_glue_4"] | |
container: | |
image: ${{ needs.pytest-docker.outputs.ecr-registry }}/pytest:${{ github.ref_name }}_${{ matrix.tag_name }} | |
credentials: | |
username: ${{ needs.pytest-docker.outputs.ecr-username }} | |
password: ${{ needs.pytest-docker.outputs.ecr-password }} | |
env: | |
DISABLE_SSL: true | |
options: "--user root" | |
steps: | |
- name: Assume AWS role | |
uses: aws-actions/configure-aws-credentials@v2 | |
with: | |
role-to-assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }} | |
aws-region: "us-east-1" | |
- uses: actions/checkout@v3 | |
- run: chown -R glue_user $GITHUB_WORKSPACE | |
- run: su - glue_user --command "aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID" | |
- run: su - glue_user --command "aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY" | |
- run: su - glue_user --command "aws configure set aws_session_token $AWS_SESSION_TOKEN" | |
- run: su - glue_user --command "aws configure set region $AWS_REGION" | |
- name: Set namespace for non-default branch or for tag | |
if: github.ref_name != 'main' | |
run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV | |
- name: Run Pytest unit tests under AWS Glue 3.0 | |
if: matrix.tag_name == 'aws_glue_3' | |
run: | | |
su - glue_user --command "cd $GITHUB_WORKSPACE && python3 -m pytest \ | |
tests/test_s3_to_json.py \ | |
tests/test_compare_parquet_datasets.py -v" | |
- name: Run unit tests for JSON to Parquet under AWS Glue 4.0 | |
if: matrix.tag_name == 'aws_glue_4' | |
run: > | |
su - glue_user --command "cd $GITHUB_WORKSPACE && | |
python3 -m pytest tests/test_json_to_parquet.py --namespace $NAMESPACE -v" | |
- name: Run unit tests for Great Expectations on Parquet under AWS Glue 4.0 | |
if: matrix.tag_name == 'aws_glue_4' | |
run: > | |
su - glue_user --command "cd $GITHUB_WORKSPACE && | |
python3 -m pytest tests/test_run_great_expectations_on_parquet.py -v" | |
sceptre-deploy-develop: | |
name: Deploys branch using sceptre | |
runs-on: ubuntu-latest | |
needs: [upload-files, nonglue-unit-tests, glue-unit-tests] | |
environment: develop | |
# These permissions are needed to interact with GitHub's OIDC Token endpoint. | |
permissions: | |
id-token: write | |
contents: read | |
steps: | |
- name: Setup code, pipenv, aws | |
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3 | |
with: | |
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }} | |
role_session_name: GitHubActions-${{ github.repository_owner }}-${{ github.event.repository.name }}-${{ github.run_id }} | |
python_version: ${{ env.PYTHON_VERSION }} | |
- name: Create directory for remote sceptre templates | |
run: mkdir -p templates/remote/ | |
- name: Set namespace for non-default branch | |
if: github.ref_name != 'main' | |
run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV | |
- name: "Deploy sceptre stacks to dev" | |
run: pipenv run sceptre --debug --var "namespace=${{ env.NAMESPACE }}" launch develop --yes | |
- name: Delete preexisting S3 event notification for this namespace | |
uses: gagoar/invoke-aws-lambda@v3 | |
with: | |
AWS_ACCESS_KEY_ID: ${{ env.AWS_ACCESS_KEY_ID }} | |
AWS_SECRET_ACCESS_KEY: ${{ env.AWS_SECRET_ACCESS_KEY }} | |
AWS_SESSION_TOKEN: ${{ env.AWS_SESSION_TOKEN }} | |
REGION: ${{ env.AWS_REGION }} | |
FunctionName: ${{ env.NAMESPACE }}-S3EventConfig | |
Payload: '{"RequestType": "Delete"}' | |
LogType: Tail | |
- name: Create S3 event notification for this namespace | |
uses: gagoar/invoke-aws-lambda@v3 | |
with: | |
AWS_ACCESS_KEY_ID: ${{ env.AWS_ACCESS_KEY_ID }} | |
AWS_SECRET_ACCESS_KEY: ${{ env.AWS_SECRET_ACCESS_KEY }} | |
AWS_SESSION_TOKEN: ${{ env.AWS_SESSION_TOKEN }} | |
REGION: ${{ env.AWS_REGION }} | |
FunctionName: ${{ env.NAMESPACE }}-S3EventConfig | |
Payload: '{"RequestType": "Create"}' | |
LogType: Tail | |
integration-test-develop-cleanup: | |
name: Cleanup non-main branch data before integration tests | |
runs-on: ubuntu-latest | |
needs: sceptre-deploy-develop | |
environment: develop | |
# These permissions are needed to interact with GitHub's OIDC Token endpoint. | |
permissions: | |
id-token: write | |
contents: read | |
steps: | |
- name: Setup code, pipenv, aws | |
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3 | |
with: | |
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }} | |
role_session_name: GitHubActions-${{ github.repository_owner }}-${{ github.event.repository.name }}-${{ github.run_id }} | |
python_version: ${{ env.PYTHON_VERSION }} | |
- name: Set namespace for non-default branch | |
if: github.ref_name != 'main' | |
run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV | |
- name: Clean input data bucket | |
run: > | |
pipenv run python src/scripts/manage_artifacts/clean_for_integration_test.py | |
--bucket $DEV_INPUT_BUCKET | |
--bucket_prefix "${{ env.NAMESPACE }}/" | |
- name: Clean raw data bucket | |
run: > | |
pipenv run python src/scripts/manage_artifacts/clean_for_integration_test.py | |
--bucket $DEV_RAW_BUCKET | |
--bucket_prefix "${{ env.NAMESPACE }}/json/" | |
- name: Clean intermediate data bucket | |
run: > | |
pipenv run python src/scripts/manage_artifacts/clean_for_integration_test.py | |
--bucket $DEV_INTERMEDIATE_BUCKET | |
--bucket_prefix "${{ env.NAMESPACE }}/json/" | |
integration-test-develop: | |
name: Triggers ETL workflow with S3 test files | |
runs-on: ubuntu-latest | |
needs: [sceptre-deploy-develop, integration-test-develop-cleanup] | |
environment: develop | |
# These permissions are needed to interact with GitHub's OIDC Token endpoint. | |
permissions: | |
id-token: write | |
contents: read | |
env: | |
EXPORT_S3_KEY_PREFIX: pilot-data | |
steps: | |
- name: Setup code, pipenv, aws | |
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3 | |
with: | |
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }} | |
role_session_name: integration-test-${{ github.run_id }} | |
python_version: ${{ env.PYTHON_VERSION }} | |
- name: Set namespace for non-default branch or for tag | |
if: github.ref_name != 'main' | |
run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV | |
- name: Fetch the most recent exports. | |
id: recent-exports | |
run: | | |
# Retrieve the last ~2 weeks of exports from each cohort | |
# Ignore keys which end with "/" and which match "owner.txt" | |
echo "KEYS=$( | |
aws s3api list-objects-v2 \ | |
--bucket $DEV_INPUT_BUCKET \ | |
--prefix $EXPORT_S3_KEY_PREFIX \ | |
--query '((sort_by(Contents[? !ends_with(Key, `/`) && !contains(Key, `owner.txt`)], &LastModified)[::-1])[:${{ env.INTEGRATION_TEST_NUM_EXPORTS }}])[*].Key' | | |
jq -c | |
)" >> "$GITHUB_OUTPUT" | |
- name: Copy most recent exports to this namespace | |
run: > | |
echo '${{ steps.recent-exports.outputs.KEYS }}' | jq -r '.[]' | while read -r key; do | |
aws s3 cp "s3://$DEV_INPUT_BUCKET/$key" "s3://$DEV_INPUT_BUCKET/$NAMESPACE/${key#"$EXPORT_S3_KEY_PREFIX"/}"; | |
done | |
- name: Write most recent exports to S3 cloudformation bucket | |
run: > | |
echo '${{ steps.recent-exports.outputs.KEYS }}' | | |
jq --arg bucket "s3://$DEV_INPUT_BUCKET/" '.[] |= $bucket + .' | | |
aws s3 cp - "s3://${{ vars.CFN_BUCKET }}/$NAMESPACE/integration_test_exports.json" | |
sceptre-deploy-staging: | |
name: Deploys to staging of prod using sceptre | |
runs-on: ubuntu-latest | |
needs: integration-test-develop | |
if: github.ref_name == 'main' | |
environment: prod | |
# These permissions are needed to interact with GitHub's OIDC Token endpoint. | |
permissions: | |
id-token: write | |
contents: read | |
steps: | |
- name: Setup code, pipenv, aws | |
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3 | |
with: | |
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }} | |
role_session_name: GitHubActions-${{ github.repository_owner }}-${{ github.event.repository.name }}-${{ github.run_id }} | |
python_version: ${{ env.PYTHON_VERSION }} | |
- name: Copy files to templates bucket | |
run: > | |
python src/scripts/manage_artifacts/artifacts.py | |
--upload | |
--namespace staging | |
--cfn_bucket ${{ vars.CFN_BUCKET }} | |
--shareable-artifacts-bucket ${{ vars.SHAREABLE_ARTIFACTS_BUCKET }} | |
- name: Create directory for remote sceptre templates | |
run: mkdir -p templates/remote/ | |
- name: Deploy sceptre stacks to staging on prod | |
run: pipenv run sceptre --var "namespace=staging" launch prod --yes | |
integration-test-staging-cleanup: | |
name: Cleanup main branch staging data before integration tests | |
runs-on: ubuntu-latest | |
needs: sceptre-deploy-staging | |
if: github.ref_name == 'main' | |
environment: prod | |
# These permissions are needed to interact with GitHub's OIDC Token endpoint. | |
permissions: | |
id-token: write | |
contents: read | |
steps: | |
- name: Setup code, pipenv, aws | |
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3 | |
with: | |
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }} | |
role_session_name: GitHubActions-${{ github.repository_owner }}-${{ github.event.repository.name }}-${{ github.run_id }} | |
python_version: ${{ env.PYTHON_VERSION }} | |
- name: Clean input data bucket | |
run: > | |
pipenv run python src/scripts/manage_artifacts/clean_for_integration_test.py | |
--bucket $PROD_INPUT_BUCKET | |
--bucket_prefix "staging/" | |
- name: Clean raw data bucket | |
run: > | |
pipenv run python src/scripts/manage_artifacts/clean_for_integration_test.py | |
--bucket $PROD_RAW_BUCKET | |
--bucket_prefix "staging/json/" | |
- name: Clean intermediate data bucket | |
run: > | |
pipenv run python src/scripts/manage_artifacts/clean_for_integration_test.py | |
--bucket $PROD_INTERMEDIATE_BUCKET | |
--bucket_prefix "staging/json/" | |
integration-test-staging: | |
name: Triggers staging workflow with production data | |
runs-on: ubuntu-latest | |
needs: [sceptre-deploy-staging, integration-test-staging-cleanup] | |
environment: prod | |
# These permissions are needed to interact with GitHub's OIDC Token endpoint. | |
permissions: | |
id-token: write | |
contents: read | |
env: | |
EXPORT_S3_KEY_PREFIX: main | |
steps: | |
- name: Setup code, pipenv, aws | |
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3 | |
with: | |
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }} | |
role_session_name: integration-test-${{ github.run_id }} | |
python_version: ${{ env.PYTHON_VERSION }} | |
- name: Fetch the most recent exports. | |
id: recent-exports | |
run: | | |
# Retrieve the last ~2 weeks of exports from each cohort | |
# Ignore keys which end with "/" and which match "owner.txt" | |
echo "KEYS=$( | |
aws s3api list-objects-v2 \ | |
--bucket $PROD_INPUT_BUCKET \ | |
--prefix "$EXPORT_S3_KEY_PREFIX/" \ | |
--query '((sort_by(Contents[? !ends_with(Key, `/`) && !contains(Key, `owner.txt`)], &LastModified)[::-1])[:${{ env.INTEGRATION_TEST_NUM_EXPORTS }}])[*].Key' | | |
jq -c | |
)" >> "$GITHUB_OUTPUT" | |
- name: Copy most recent exports to this namespace | |
run: > | |
echo '${{ steps.recent-exports.outputs.KEYS }}' | jq -r '.[]' | while read -r key; do | |
aws s3 cp "s3://$PROD_INPUT_BUCKET/$key" "s3://$PROD_INPUT_BUCKET/staging/${key#"$EXPORT_S3_KEY_PREFIX"/}"; | |
done | |
- name: Write most recent exports to S3 cloudformation bucket | |
run: > | |
echo '${{ steps.recent-exports.outputs.KEYS }}' | | |
jq --arg bucket "s3://$PROD_INPUT_BUCKET/" '.[] |= $bucket + .' | | |
aws s3 cp - "s3://${{ vars.CFN_BUCKET }}/staging/integration_test_exports.json" |