Skip to content

feat(datasets): add 4 salt marsh story dataset configs #132

feat(datasets): add 4 salt marsh story dataset configs

feat(datasets): add 4 salt marsh story dataset configs #132

Workflow file for this run

# This GitHub Actions workflow automates publishing dataset collections
# to a staging environment.
# It is triggered by a pull request to the main branch that modifies
# files within the ingestion-data/staging/dataset-config/ directory.
# The workflow:
# - publishes the newly added datasets,
# - continuously updates the workflow status in a PR comment.
name: Publish collection to staging
on:
  pull_request:
    branches: ['main']
    # `closed` is not part of the default pull_request activity types
    # (opened, synchronize, reopened). It has to be listed explicitly,
    # otherwise the publish-to-prod-on-pr-merge job, which checks for
    # github.event.action == 'closed', can never run.
    types:
      - opened
      - synchronize
      - reopened
      - closed
    paths:
      # Run the workflow only if files inside this path are updated
      - 'ingestion-data/staging/dataset-config/*'
  # NOTE(review): every job in this workflow is gated on pull_request
  # events (directly via `if`, or indirectly via `needs`), so runs started
  # by this push trigger skip all jobs - confirm the trigger is still wanted.
  push:
    branches:
      - main
permissions:
  # Needed to create and edit the status comment on the pull request
  pull-requests: write
  contents: read
jobs:
  # Publishes the dataset configs added by a pull request to staging.
  # Only runs when the pull request is opened or receives new commits.
  publish-new-datasets:
    runs-on: ubuntu-latest
    environment: staging
    if: >-
      ${{
        github.event_name == 'pull_request' &&
        (github.event.action == 'opened' || github.event.action == 'synchronize')
      }}
    outputs:
      # Files that were published successfully; read by downstream jobs
      publishedCollections: ${{ steps.publish-collections.outputs.success_collections }}
    steps:
      - uses: actions/checkout@v4
# Initializes the PR status comment.
# An existing comment containing "### Workflow Status" is overwritten,
# otherwise a new one is created, so the PR keeps a single status comment.
# The comment id is exposed as the COMMENT_ID step output.
- name: Initialize PR comment with workflow start
  id: init-comment
  env:
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
    body="### Workflow Status
    **Starting workflow...** [View action run]($WORKFLOW_URL)"
    # Get the PR number
    PR_NUMBER=${{ github.event.pull_request.number }}
    # Look up the id of the first existing status comment.
    # --paginate walks every page of comments; without it only the first
    # page is searched and a duplicate status comment would be created on
    # PRs with many comments.
    COMMENT_ID=$(gh api --paginate "repos/${{ github.repository }}/issues/${PR_NUMBER}/comments" \
      --jq '.[] | select(.body | contains("### Workflow Status")) | .id' | head -n 1)
    if [ -z "$COMMENT_ID" ]; then
      # No existing comment, create a new one
      COMMENT_ID=$(gh api "repos/${{ github.repository }}/issues/${PR_NUMBER}/comments" -f body="$body" --jq '.id')
    else
      # Comment exists, overwrite the existing comment
      gh api "repos/${{ github.repository }}/issues/comments/$COMMENT_ID" -X PATCH -f body="$body"
    fi
    echo "COMMENT_ID=$COMMENT_ID" >> "$GITHUB_OUTPUT"
# Collects the files changed by the pull request.
# Later steps read this step's `added_files` output, i.e. only newly
# added files are published.
# The pattern is limited to JSON files in the dataset-config directory
# (the same directory that triggers this workflow), so unrelated JSON
# files added in the same PR are not sent to the publish endpoint.
# NOTE(review): consider pinning this third-party action to a full commit
# SHA instead of a mutable tag.
- name: Get newly added files
  id: changed-files
  uses: tj-actions/changed-files@v45
  with:
    files: |
      ingestion-data/staging/dataset-config/*.json
# Logs one line per newly added file reported by the changed-files step
- name: List all newly added files
  env:
    ADDED_FILES: ${{ steps.changed-files.outputs.added_files }}
  run: |
    # Unquoted on purpose: the list is split on whitespace
    for added in ${ADDED_FILES}
    do
      printf '%s was added\n' "$added"
    done
# Requests an access token with the OAuth2 client-credentials grant
# (service client id/secret; no username/password needed).
# The token is exposed as the ACCESS_TOKEN step output.
- name: Get auth token
  id: get-token
  env:
    # Passed through the environment rather than expanded into the script
    # text, so the values are never part of the generated shell source
    COGNITO_DOMAIN: ${{ vars.STAGING_COGNITO_DOMAIN }}
    CLIENT_ID: ${{ vars.STAGING_CLIENT_ID }}
    CLIENT_SECRET: ${{ secrets.STAGING_CLIENT_SECRET }}
  run: |
    response=$(curl -sS -X POST \
      "${COGNITO_DOMAIN}/oauth2/token" \
      -H "Content-Type: application/x-www-form-urlencoded" \
      -d "grant_type=client_credentials" \
      -d "client_id=${CLIENT_ID}" \
      -d "client_secret=${CLIENT_SECRET}"
    )
    # `// empty` turns a missing/null access_token into an empty string;
    # plain `jq -r` would print the literal text "null", which would slip
    # past the emptiness check below and in later steps
    access_token=$(echo "$response" | jq -r '.access_token // empty')
    if [ -z "$access_token" ]; then
      echo "Failed to obtain an access token from the token endpoint"
      exit 1
    fi
    # Keep the token out of the logs of this and subsequent steps
    echo "::add-mask::$access_token"
    echo "ACCESS_TOKEN=$access_token" >> "$GITHUB_OUTPUT"
# POSTs every newly added dataset config to the /dataset/publish endpoint.
# - Exposes the files that were published successfully as the
#   `success_collections` step output (space separated).
# - Appends a per-collection status list to the PR status comment.
# - Fails the step if no request succeeded (HTTP 200/201) or if a listed
#   file does not exist.
- name: Publish all newly added collections to staging
  id: publish-collections
  env:
    ADDED_FILES: ${{ steps.changed-files.outputs.added_files }}
    WORKFLOWS_URL: ${{ vars.STAGING_WORKFLOWS_URL }}
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    AUTH_TOKEN: ${{ steps.get-token.outputs.ACCESS_TOKEN }}
    COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }}
  run: |
    if [ -z "$WORKFLOWS_URL" ]; then
      echo "WORKFLOWS_URL is not set"
      exit 1
    fi
    if [ -z "$AUTH_TOKEN" ]; then
      echo "AUTH_TOKEN is not set"
      exit 1
    fi
    # Drop a single trailing slash so the joined URL has no double slash
    publish_url="${WORKFLOWS_URL%/}/dataset/publish"
    # Track successful publications
    all_failed=true
    declare -a success_collections=()
    status_message='### Collection Publication Status'$'\n'
    if [ -z "$ADDED_FILES" ]; then
      # Make the reason for the failure below visible in the PR comment
      status_message+='- No newly added dataset config files were found.'$'\n'
    fi
    # Unquoted on purpose: the list is split on whitespace
    for file in ${ADDED_FILES}; do
      echo "$file"
      if [ ! -f "$file" ]; then
        echo "File $file does not exist"
        exit 1
      fi
      dataset_config=$(jq '.' "$file")
      collection_id=$(jq -r '.collection' "$file")
      echo "Publishing $collection_id"
      # The response body goes to response.txt, so stdout carries only the
      # HTTP status code written by -w
      status_code=$(curl -s -w "%{http_code}" -o response.txt -X POST "$publish_url" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $AUTH_TOKEN" \
        -d "$dataset_config"
      )
      # Update status message based on response code
      if [ "$status_code" -eq 200 ] || [ "$status_code" -eq 201 ]; then
        echo "$collection_id successfully published ✅"
        status_message+="- **$collection_id**: Successfully published ✅"$'\n'
        success_collections+=("$file")
        all_failed=false
      else
        echo "$collection_id failed to publish ❌"
        # Show the server's answer in the job log to ease debugging
        cat response.txt || true
        echo
        status_message+="- **$collection_id**: Failed to publish. Error code $status_code. ❌"$'\n'
      fi
    done
    # Output only successful collections to be used in subsequent steps
    echo "success_collections=${success_collections[*]}" >> "$GITHUB_OUTPUT"
    # Update the PR comment before deciding whether to fail, so the
    # per-collection results are posted even when every request failed
    CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" "/repos/${{ github.repository }}/issues/comments/$COMMENT_ID" --jq '.body')
    UPDATED_BODY="$CURRENT_BODY
    $status_message"
    gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" "/repos/${{ github.repository }}/issues/comments/$COMMENT_ID" -f body="$UPDATED_BODY"
    # Exit workflow if all the requests fail
    if [ "$all_failed" = true ]; then
      echo "All collections failed to publish."
      exit 1
    fi
# Installs Python 3.9 and caches its installation directory.
# NOTE(review): no later step in this job invokes python or pip - confirm
# these two steps are still needed here.
- name: Set up Python
  uses: actions/setup-python@v5
  with:
    python-version: '3.9'
- uses: actions/cache@v4
  with:
    path: ${{ env.pythonLocation }}
    # Hash every requirements.txt in the repository: the file this workflow
    # installs from is ./scripts/requirements.txt, so hashing only a
    # root-level requirements.txt would leave the key unaffected by
    # dependency changes.
    key: ${{ env.pythonLocation }}-pip-${{ hashFiles('**/requirements.txt') }}
# Runs only when an earlier step of this job failed: appends a failure
# notice with a link to the run logs to the PR status comment.
- name: Update PR comment on overall workflow failure
  if: failure()
  env:
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }}
  run: |
    # COMMENT_ID is empty when the init-comment step itself failed;
    # there is no comment to update in that case
    if [ -z "$COMMENT_ID" ]; then
      echo "No status comment id available; skipping PR comment update."
      exit 0
    fi
    WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
    CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" "/repos/${{ github.repository }}/issues/comments/$COMMENT_ID" --jq '.body')
    # No space after the opening ** - with a space the markers are not
    # treated as emphasis and the asterisks are shown literally
    UPDATED_BODY="$CURRENT_BODY
    **❌ The workflow run failed. [See logs here]($WORKFLOW_URL)**"
    gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" "/repos/${{ github.repository }}/issues/comments/$COMMENT_ID" -f body="$UPDATED_BODY"
# Generates a dataset MDX file for every collection config that the
# publish-new-datasets job reported as successfully published.
# NOTE(review): this job does not run actions/setup-python, so pip and
# python3 come from the runner image - confirm that is intended.
create-mdx-files:
  runs-on: ubuntu-latest
  needs: publish-new-datasets
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Use output from dataset-publication-and-configuration
      env:
        # The value is derived from file names in the pull request, so it
        # is passed through the environment instead of being expanded into
        # the script text (avoids shell injection via crafted names)
        PUBLISHED_COLLECTION_FILES: ${{ needs.publish-new-datasets.outputs.publishedCollections }}
      run: |
        echo "The output from the previous step is: ${PUBLISHED_COLLECTION_FILES}"
    # Creates a slim dataset mdx file for each collection based on the dataset config json
    - name: Create dataset mdx for given collections
      env:
        PUBLISHED_COLLECTION_FILES: ${{ needs.publish-new-datasets.outputs.publishedCollections }}
      run: |
        echo "$PUBLISHED_COLLECTION_FILES"
        pip install -r ./scripts/requirements.txt
        # Unquoted on purpose: the list is split on whitespace
        for file in ${PUBLISHED_COLLECTION_FILES}; do
          python3 ./scripts/generate-mdx.py "$file"
        done
# Placeholder job: runs after create-mdx-files and only prints a message
open-veda-config-pr:
  needs: create-mdx-files
  runs-on: ubuntu-latest
  steps:
    - name: Open veda-config PR
      run: echo "NO-OP. Placeholder for future job that will open a Pull Request in veda-config for a dashboard preview for the new/changed datasets."
# Placeholder job for production publication.
# Runs only for pull_request events whose action is `closed` and whose
# pull request was merged. The workflow's pull_request trigger must deliver
# `closed` events for this condition to ever hold; they are not among the
# default activity types.
publish-to-prod-on-pr-merge:
  runs-on: ubuntu-latest
  if: >-
    ${{
      github.event_name == 'pull_request' &&
      github.event.action == 'closed' &&
      github.event.pull_request.merged == true
    }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Publish to production on PR merge
      run: |
        echo "NO-OP. This step runs when a PR is merged."