Skip to content

fix json to parquet trigger cron #853

fix json to parquet trigger cron

fix json to parquet trigger cron #853

name: upload-and-deploy
on:
push:
branches: "*"
tags-ignore: "*"
env:
NAMESPACE: main
PYTHON_VERSION: 3.9
DEV_INPUT_BUCKET: recover-dev-input-data
DEV_RAW_BUCKET: recover-dev-raw-data
DEV_INTERMEDIATE_BUCKET: recover-dev-intermediate-data
DEV_PROCESSED_BUCKET: recover-dev-processed-data
PROD_INPUT_BUCKET: recover-input-data
PROD_RAW_BUCKET: recover-raw-data
PROD_INTERMEDIATE_BUCKET: recover-intermediate-data
INTEGRATION_TEST_NUM_EXPORTS: 28
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
jobs:
pre-commit:
name: Run pre-commit hooks against all files
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: pre-commit/[email protected]
deploy-snowflake:
name: Deploy Snowflake resources
needs: pre-commit
runs-on: ubuntu-latest
env:
PRIVATE_KEY_PASSPHRASE: ${{ secrets.SNOWFLAKE_PRIVATE_KEY_PASSPHRASE }}
SNOWFLAKE_ENVIRONMENT: ${{ github.ref_name == 'main' && 'staging' || github.ref_name }}
steps:
- uses: actions/checkout@v3
- name: Configure Snowflake connection
run: |
# Create temporary files for config.toml and our private key
config_file=$(mktemp)
private_key_file=$(mktemp)
# Write to the private key file
echo "${{ secrets.SNOWFLAKE_PRIVATE_KEY }}" > $private_key_file
# Write to config.toml file
echo 'default_connection_name = "recover"' >> $config_file
echo '[connections.recover]' >> $config_file
echo "account = \"${{ vars.SNOWFLAKE_ACCOUNT }}\"" >> $config_file
echo "user = \"${{ vars.SNOWFLAKE_USER }}\"" >> $config_file
echo "role = \"${{ vars.SNOWFLAKE_ROLE }}\"" >> $config_file
echo 'warehouse = "RECOVER_XSMALL"' >> $config_file
echo 'authenticator = "SNOWFLAKE_JWT"' >> $config_file
echo "private_key_path = \"$private_key_file\"" >> $config_file
# Write config.toml path to global environment
echo "SNOWFLAKE_CONFIG_PATH=$config_file" >> $GITHUB_ENV
- name: Configuration file information
run: |
echo "Snowflake configuration is located at $SNOWFLAKE_CONFIG_PATH"
cat $SNOWFLAKE_CONFIG_PATH
- name: Install Snowflake CLI
uses: Snowflake-Labs/[email protected]
with:
default-config-file-path: ${{ env.SNOWFLAKE_CONFIG_PATH }}
- name: Test Snowflake connection
run: |
snow --version
snow connection test
- name: Deploy Snowflake objects
run: |
snow sql \
-D "git_branch=$GITHUB_REF_NAME" \
-D "environment=$SNOWFLAKE_ENVIRONMENT" \
-f snowflake/objects/deploy.sql
upload-files:
name: Upload files to S3 bucket in development
runs-on: ubuntu-latest
needs: pre-commit
# These permissions are needed to interact with GitHub's OIDC Token endpoint.
permissions:
id-token: write
contents: read
environment: develop
steps:
- name: Setup code, pipenv, aws
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3
with:
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
role_session_name: GitHubActions-${{ github.repository_owner }}-${{ github.event.repository.name }}-${{ github.run_id }}
python_version: ${{ env.PYTHON_VERSION }}
- name: Setup sam
uses: aws-actions/setup-sam@v2
- name: Set namespace for non-default branch
if: github.ref_name != 'main'
run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV
- name: Copy files to templates bucket, use dev cloudformation bucket
run: >
python src/scripts/manage_artifacts/artifacts.py
--upload
--namespace $NAMESPACE
--cfn_bucket ${{ vars.CFN_BUCKET }}
--shareable-artifacts-bucket ${{ vars.SHAREABLE_ARTIFACTS_BUCKET }}
nonglue-unit-tests:
name: Runs unit tests that are not dependent on aws-glue package resources
runs-on: ubuntu-latest
needs: pre-commit
# These permissions are needed to interact with GitHub's OIDC Token endpoint.
permissions:
id-token: write
contents: read
environment: develop
steps:
- name: Setup code, pipenv, aws
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3
with:
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
role_session_name: ${{ github.event.repository.name }}-${{ github.run_id }}-nonglue-unit-tests
python_version: ${{ env.PYTHON_VERSION }}
- name: Install additional python dependency
run: |
pipenv install ecs_logging~=2.0
pipenv install pytest-datadir
- name: Test scripts with pytest (lambda, etc.)
run: |
pipenv run python -m pytest -v \
tests/test_s3_event_config_lambda.py \
tests/test_s3_to_glue_lambda.py \
tests/test_lambda_dispatch.py \
tests/test_consume_logs.py \
tests/test_lambda_raw.py \
tests/test_lambda_raw_sync.py
- name: Test dev synapse folders for STS access with pytest
run: >
pipenv run python -m pytest tests/test_setup_external_storage.py
--test-bucket $DEV_INPUT_BUCKET
--test-synapse-folder-id syn51758510
--namespace $NAMESPACE
--test-sts-permission read_only
-v
pipenv run python -m pytest tests/test_setup_external_storage.py
--test-bucket $DEV_PROCESSED_BUCKET
--test-synapse-folder-id syn51084525
--namespace $NAMESPACE/parquet
--test-sts-permission read_write
-v
pytest-docker:
name: Build and push testing docker images to the pytest ECR repository.
needs: pre-commit
runs-on: ubuntu-latest
# These permissions are needed to interact with GitHub's OIDC Token endpoint.
permissions:
id-token: write
contents: read
strategy:
matrix:
include:
- tag_name: aws_glue_3
dockerfile: tests/Dockerfile.aws_glue_3
- tag_name: aws_glue_4
dockerfile: tests/Dockerfile.aws_glue_4
environment: develop
steps:
- name: Assume AWS role
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
aws-region: "us-east-1"
# unmasking of the AWS account ID allows the acct id to pass through outputs
mask-aws-account-id: "no"
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v1
- name: Get ECR secret names
id: ecr
run: |
usernameKey=docker_username_$(echo ${{ steps.login-ecr.outputs.registry }} | tr '.-' _)
echo "username-key=$usernameKey" >> $GITHUB_OUTPUT
passwordKey=docker_password_$(echo ${{ steps.login-ecr.outputs.registry }} | tr '.-' _)
echo "password-key=$passwordKey" >> $GITHUB_OUTPUT
- uses: actions/checkout@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Build and push to ECR
id: docker-build-push
uses: docker/build-push-action@v4
with:
push: true
tags: ${{ steps.login-ecr.outputs.registry }}/pytest:${{ github.ref_name }}_${{ matrix.tag_name }}
file: ${{ matrix.dockerfile }}
cache-from: type=local,src=/tmp/.buildx-cache
cache-to: type=local,dest=/tmp/.buildx-cache
outputs:
ecr-registry: ${{ steps.login-ecr.outputs.registry }}
ecr-username: ${{ steps.login-ecr.outputs[steps.ecr.outputs.username-key] }}
ecr-password: ${{ steps.login-ecr.outputs[steps.ecr.outputs.password-key] }}
glue-unit-tests:
name: Run Pytest unit tests for AWS glue
needs: pytest-docker
environment: develop
runs-on: ubuntu-latest
# These permissions are needed to interact with GitHub's OIDC Token endpoint.
permissions:
id-token: write
contents: read
strategy:
matrix:
tag_name:
["aws_glue_3", "aws_glue_4"]
container:
image: ${{ needs.pytest-docker.outputs.ecr-registry }}/pytest:${{ github.ref_name }}_${{ matrix.tag_name }}
credentials:
username: ${{ needs.pytest-docker.outputs.ecr-username }}
password: ${{ needs.pytest-docker.outputs.ecr-password }}
env:
DISABLE_SSL: true
options: "--user root"
steps:
- name: Assume AWS role
uses: aws-actions/configure-aws-credentials@v2
with:
role-to-assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
aws-region: "us-east-1"
- uses: actions/checkout@v3
- run: chown -R glue_user $GITHUB_WORKSPACE
- run: su - glue_user --command "aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID"
- run: su - glue_user --command "aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY"
- run: su - glue_user --command "aws configure set aws_session_token $AWS_SESSION_TOKEN"
- run: su - glue_user --command "aws configure set region $AWS_REGION"
- name: Set namespace for non-default branch or for tag
if: github.ref_name != 'main'
run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV
- name: Run Pytest unit tests under AWS Glue 3.0
if: matrix.tag_name == 'aws_glue_3'
run: |
su - glue_user --command "cd $GITHUB_WORKSPACE && python3 -m pytest \
tests/test_s3_to_json.py \
tests/test_compare_parquet_datasets.py -v"
- name: Run unit tests for JSON to Parquet under AWS Glue 4.0
if: matrix.tag_name == 'aws_glue_4'
run: >
su - glue_user --command "cd $GITHUB_WORKSPACE &&
python3 -m pytest tests/test_json_to_parquet.py --namespace $NAMESPACE -v"
- name: Run unit tests for Great Expectations on Parquet under AWS Glue 4.0
if: matrix.tag_name == 'aws_glue_4'
run: >
su - glue_user --command "cd $GITHUB_WORKSPACE &&
python3 -m pytest tests/test_run_great_expectations_on_parquet.py -v"
sceptre-deploy-develop:
name: Deploys branch using sceptre
runs-on: ubuntu-latest
needs: [upload-files, nonglue-unit-tests, glue-unit-tests]
environment: develop
# These permissions are needed to interact with GitHub's OIDC Token endpoint.
permissions:
id-token: write
contents: read
steps:
- name: Setup code, pipenv, aws
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3
with:
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
role_session_name: GitHubActions-${{ github.repository_owner }}-${{ github.event.repository.name }}-${{ github.run_id }}
python_version: ${{ env.PYTHON_VERSION }}
- name: Create directory for remote sceptre templates
run: mkdir -p templates/remote/
- name: Set namespace for non-default branch
if: github.ref_name != 'main'
run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV
- name: "Deploy sceptre stacks to dev"
run: pipenv run sceptre --debug --var "namespace=${{ env.NAMESPACE }}" launch develop --yes
- name: Delete preexisting S3 event notification for this namespace
uses: gagoar/invoke-aws-lambda@v3
with:
AWS_ACCESS_KEY_ID: ${{ env.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ env.AWS_SECRET_ACCESS_KEY }}
AWS_SESSION_TOKEN: ${{ env.AWS_SESSION_TOKEN }}
REGION: ${{ env.AWS_REGION }}
FunctionName: ${{ env.NAMESPACE }}-S3EventConfig
Payload: '{"RequestType": "Delete"}'
LogType: Tail
- name: Create S3 event notification for this namespace
uses: gagoar/invoke-aws-lambda@v3
with:
AWS_ACCESS_KEY_ID: ${{ env.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ env.AWS_SECRET_ACCESS_KEY }}
AWS_SESSION_TOKEN: ${{ env.AWS_SESSION_TOKEN }}
REGION: ${{ env.AWS_REGION }}
FunctionName: ${{ env.NAMESPACE }}-S3EventConfig
Payload: '{"RequestType": "Create"}'
LogType: Tail
integration-test-develop-cleanup:
name: Cleanup non-main branch data before integration tests
runs-on: ubuntu-latest
needs: sceptre-deploy-develop
environment: develop
# These permissions are needed to interact with GitHub's OIDC Token endpoint.
permissions:
id-token: write
contents: read
steps:
- name: Setup code, pipenv, aws
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3
with:
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
role_session_name: GitHubActions-${{ github.repository_owner }}-${{ github.event.repository.name }}-${{ github.run_id }}
python_version: ${{ env.PYTHON_VERSION }}
- name: Set namespace for non-default branch
if: github.ref_name != 'main'
run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV
- name: Clean input data bucket
run: >
pipenv run python src/scripts/manage_artifacts/clean_for_integration_test.py
--bucket $DEV_INPUT_BUCKET
--bucket_prefix "${{ env.NAMESPACE }}/"
- name: Clean raw data bucket
run: >
pipenv run python src/scripts/manage_artifacts/clean_for_integration_test.py
--bucket $DEV_RAW_BUCKET
--bucket_prefix "${{ env.NAMESPACE }}/json/"
- name: Clean intermediate data bucket
run: >
pipenv run python src/scripts/manage_artifacts/clean_for_integration_test.py
--bucket $DEV_INTERMEDIATE_BUCKET
--bucket_prefix "${{ env.NAMESPACE }}/json/"
integration-test-develop:
name: Triggers ETL workflow with S3 test files
runs-on: ubuntu-latest
needs: [sceptre-deploy-develop, integration-test-develop-cleanup]
environment: develop
# These permissions are needed to interact with GitHub's OIDC Token endpoint.
permissions:
id-token: write
contents: read
env:
EXPORT_S3_KEY_PREFIX: pilot-data
steps:
- name: Setup code, pipenv, aws
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3
with:
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
role_session_name: integration-test-${{ github.run_id }}
python_version: ${{ env.PYTHON_VERSION }}
- name: Set namespace for non-default branch or for tag
if: github.ref_name != 'main'
run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV
- name: Fetch the most recent exports.
id: recent-exports
run: |
# Retrieve the last ~2 weeks of exports from each cohort
# Ignore keys which end with "/" and which match "owner.txt"
echo "KEYS=$(
aws s3api list-objects-v2 \
--bucket $DEV_INPUT_BUCKET \
--prefix $EXPORT_S3_KEY_PREFIX \
--query '((sort_by(Contents[? !ends_with(Key, `/`) && !contains(Key, `owner.txt`)], &LastModified)[::-1])[:${{ env.INTEGRATION_TEST_NUM_EXPORTS }}])[*].Key' |
jq -c
)" >> "$GITHUB_OUTPUT"
- name: Copy most recent exports to this namespace
run: >
echo '${{ steps.recent-exports.outputs.KEYS }}' | jq -r '.[]' | while read -r key; do
aws s3 cp "s3://$DEV_INPUT_BUCKET/$key" "s3://$DEV_INPUT_BUCKET/$NAMESPACE/${key#"$EXPORT_S3_KEY_PREFIX"/}";
done
- name: Write most recent exports to S3 cloudformation bucket
run: >
echo '${{ steps.recent-exports.outputs.KEYS }}' |
jq --arg bucket "s3://$DEV_INPUT_BUCKET/" '.[] |= $bucket + .' |
aws s3 cp - "s3://${{ vars.CFN_BUCKET }}/$NAMESPACE/integration_test_exports.json"
sceptre-deploy-staging:
name: Deploys to staging of prod using sceptre
runs-on: ubuntu-latest
needs: integration-test-develop
if: github.ref_name == 'main'
environment: prod
# These permissions are needed to interact with GitHub's OIDC Token endpoint.
permissions:
id-token: write
contents: read
steps:
- name: Setup code, pipenv, aws
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3
with:
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
role_session_name: GitHubActions-${{ github.repository_owner }}-${{ github.event.repository.name }}-${{ github.run_id }}
python_version: ${{ env.PYTHON_VERSION }}
- name: Copy files to templates bucket
run: >
python src/scripts/manage_artifacts/artifacts.py
--upload
--namespace staging
--cfn_bucket ${{ vars.CFN_BUCKET }}
--shareable-artifacts-bucket ${{ vars.SHAREABLE_ARTIFACTS_BUCKET }}
- name: Create directory for remote sceptre templates
run: mkdir -p templates/remote/
- name: Deploy sceptre stacks to staging on prod
run: pipenv run sceptre --var "namespace=staging" launch prod --yes
integration-test-staging-cleanup:
name: Cleanup main branch staging data before integration tests
runs-on: ubuntu-latest
needs: sceptre-deploy-staging
if: github.ref_name == 'main'
environment: prod
# These permissions are needed to interact with GitHub's OIDC Token endpoint.
permissions:
id-token: write
contents: read
steps:
- name: Setup code, pipenv, aws
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3
with:
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
role_session_name: GitHubActions-${{ github.repository_owner }}-${{ github.event.repository.name }}-${{ github.run_id }}
python_version: ${{ env.PYTHON_VERSION }}
- name: Clean input data bucket
run: >
pipenv run python src/scripts/manage_artifacts/clean_for_integration_test.py
--bucket $PROD_INPUT_BUCKET
--bucket_prefix "staging/"
- name: Clean raw data bucket
run: >
pipenv run python src/scripts/manage_artifacts/clean_for_integration_test.py
--bucket $PROD_RAW_BUCKET
--bucket_prefix "staging/json/"
- name: Clean intermediate data bucket
run: >
pipenv run python src/scripts/manage_artifacts/clean_for_integration_test.py
--bucket $PROD_INTERMEDIATE_BUCKET
--bucket_prefix "staging/json/"
integration-test-staging:
name: Triggers staging workflow with production data
runs-on: ubuntu-latest
needs: [sceptre-deploy-staging, integration-test-staging-cleanup]
environment: prod
# These permissions are needed to interact with GitHub's OIDC Token endpoint.
permissions:
id-token: write
contents: read
env:
EXPORT_S3_KEY_PREFIX: main
steps:
- name: Setup code, pipenv, aws
uses: Sage-Bionetworks/action-pipenv-aws-setup@v3
with:
role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
role_session_name: integration-test-${{ github.run_id }}
python_version: ${{ env.PYTHON_VERSION }}
- name: Fetch the most recent exports.
id: recent-exports
run: |
# Retrieve the last ~2 weeks of exports from each cohort
# Ignore keys which end with "/" and which match "owner.txt"
echo "KEYS=$(
aws s3api list-objects-v2 \
--bucket $PROD_INPUT_BUCKET \
--prefix "$EXPORT_S3_KEY_PREFIX/" \
--query '((sort_by(Contents[? !ends_with(Key, `/`) && !contains(Key, `owner.txt`)], &LastModified)[::-1])[:${{ env.INTEGRATION_TEST_NUM_EXPORTS }}])[*].Key' |
jq -c
)" >> "$GITHUB_OUTPUT"
- name: Copy most recent exports to this namespace
run: >
echo '${{ steps.recent-exports.outputs.KEYS }}' | jq -r '.[]' | while read -r key; do
aws s3 cp "s3://$PROD_INPUT_BUCKET/$key" "s3://$PROD_INPUT_BUCKET/staging/${key#"$EXPORT_S3_KEY_PREFIX"/}";
done
- name: Write most recent exports to S3 cloudformation bucket
run: >
echo '${{ steps.recent-exports.outputs.KEYS }}' |
jq --arg bucket "s3://$PROD_INPUT_BUCKET/" '.[] |= $bucket + .' |
aws s3 cp - "s3://${{ vars.CFN_BUCKET }}/staging/integration_test_exports.json"