# .github/workflows/pr.yml

# This GitHub Actions workflow automates the process of
# publishing dataset collections to a staging environment.
# It is triggered by a pull request to the main branch
# that modifies any files within the ingestion-data/staging/dataset-config/ directory.
# The workflow includes steps to
#   - publish the datasets,
#   - open a PR in the veda-config repository with the generated MDX files,
#   - continuously update the status of the workflow in the PR comment

name: Publish collection to staging

on:
  pull_request:
    branches: ['main']
    # `closed` has to be listed explicitly: when `types` is omitted only
    # opened, synchronize and reopened events are delivered, so the
    # publish-to-prod-on-pr-merge job (which checks for action == 'closed')
    # could never run. The other three types keep the previous behaviour.
    types: [opened, synchronize, reopened, closed]
    paths:
      # Run the workflow only if files inside this path are updated
      - 'ingestion-data/staging/dataset-config/*'

  # NOTE(review): every job below is gated (directly or through `needs`) on
  # github.event_name == 'pull_request', so push runs currently do nothing.
  push:
    branches:
      - main

# Scopes granted to GITHUB_TOKEN: read the repository, write PR comments
permissions:
  contents: read
  pull-requests: write

jobs:
  # Publishes the dataset configs added by a PR to staging.
  # Runs when a PR is opened or receives new commits, and exposes the
  # published files and the status-comment id to downstream jobs.
  publish-new-datasets:
    if: github.event_name == 'pull_request' && (github.event.action == 'opened' || github.event.action == 'synchronize')
    environment: staging
    runs-on: ubuntu-latest
    outputs:
      commentId: ${{ steps.init-comment.outputs.COMMENT_ID }}
      publishedCollections: ${{ steps.publish-collections.outputs.success_collections }}
    steps:
      - uses: actions/checkout@v4

      # Initializes the PR status comment.
      # Reuses the existing "### Workflow Status" comment when there is one
      # (overwriting its body) and creates a new comment otherwise, so the PR
      # carries a single status comment instead of one per run.
      # Output: COMMENT_ID - id of the comment that later steps append to.
      - name: Initialize PR comment with workflow start
        id: init-comment
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          body="### Workflow Status
          **Starting workflow...** [View action run]($WORKFLOW_URL)"

          # Ids of all comments on this PR that carry the status header.
          # --paginate walks every page of results; without it only the first
          # page is searched, so on a PR with many comments the existing status
          # comment could be missed and a duplicate created.
          COMMENT_IDS=$(gh api --paginate "repos/$REPO/issues/$PR_NUMBER/comments" --jq '.[] | select(.body | contains("### Workflow Status")) | .id')

          # Keep the first match (empty when there is none)
          COMMENT_ID=$(head -n 1 <<< "$COMMENT_IDS")

          if [ -z "$COMMENT_ID" ]; then
            # No existing comment, create a new one and remember its id
            COMMENT_ID=$(gh api "repos/$REPO/issues/$PR_NUMBER/comments" -f body="$body" --jq '.id')
          else
            # Comment exists, overwrite the existing comment
            gh api "repos/$REPO/issues/comments/$COMMENT_ID" -X PATCH -f body="$body"
          fi

          echo "COMMENT_ID=$COMMENT_ID" >> "$GITHUB_OUTPUT"

      # Collects the files changed by this PR, restricted to .json files.
      # The results are exposed as step outputs; the steps below read
      # `added_files`, i.e. only the newly added files.
      # NOTE(review): this third-party action is referenced by a mutable tag;
      # consider pinning it to a full commit SHA.
      - name: Get newly added files
        id: changed-files
        uses: tj-actions/changed-files@v45
        with:
          files: |
            **.json

      # Logs one line per newly added file (debugging aid)
      - name: List all newly added files
        env:
          ADDED_FILES: ${{ steps.changed-files.outputs.added_files }}
        run: |
          for added in $ADDED_FILES; do
            printf '%s was added\n' "$added"
          done

      # Uses service client creds to get a token (client_credentials grant)
      # No username/password needed
      # Output: ACCESS_TOKEN - bearer token used by the publish step
      - name: Get auth token
        id: get-token
        env:
          # Passed through the environment instead of being interpolated into
          # the script text, so the values are never part of the shell source
          COGNITO_DOMAIN: ${{ vars.STAGING_COGNITO_DOMAIN }}
          CLIENT_ID: ${{ vars.STAGING_CLIENT_ID }}
          CLIENT_SECRET: ${{ secrets.STAGING_CLIENT_SECRET }}
        run: |
          response=$(curl -sS -X POST \
            "$COGNITO_DOMAIN/oauth2/token" \
            -H "Content-Type: application/x-www-form-urlencoded" \
            -d "grant_type=client_credentials" \
            -d "client_id=$CLIENT_ID" \
            -d "client_secret=$CLIENT_SECRET"
          )

          # `// empty` turns a missing/null access_token into an empty string
          # instead of the literal text "null", which would otherwise slip
          # through the emptiness check in the publish step
          access_token=$(echo "$response" | jq -r '.access_token // empty')

          if [ -z "$access_token" ]; then
            echo "Failed to obtain an access token"
            exit 1
          fi

          # Step outputs are not secrets: mask the token so it is redacted
          # wherever it would appear in the logs of this and later steps
          echo "::add-mask::$access_token"
          echo "ACCESS_TOKEN=$access_token" >> "$GITHUB_OUTPUT"

      # Makes a request to the /dataset/publish endpoint for every newly added file
      # Outputs only the files that were successfully published
      # (success_collections, used by other steps and jobs)
      # Updates the PR comment with the status of each collection publication
      # If none of the requests are successful, the workflow fails
      - name: Publish all newly added collections to staging
        id: publish-collections
        env:
          ADDED_FILES: ${{ steps.changed-files.outputs.added_files }}
          WORKFLOWS_URL: ${{ vars.STAGING_WORKFLOWS_URL }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          AUTH_TOKEN: ${{ steps.get-token.outputs.ACCESS_TOKEN }}
          COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }}
        run: |
          if [ -z "$WORKFLOWS_URL" ]; then
            echo "WORKFLOWS_URL is not set"
            exit 1
          fi

          if [ -z "$AUTH_TOKEN" ]; then
            echo "AUTH_TOKEN is not set"
            exit 1
          fi

          # Strip one trailing slash so the joined URL never contains "//"
          publish_url="${WORKFLOWS_URL%/}/dataset/publish"

          # Track successful publications
          all_failed=true
          declare -a success_collections=()

          # Bullets are joined with a bare newline. A line break written inside
          # the quoted string would drag the script's indentation into the
          # message and render every bullet nested under the previous one.
          newline=$'\n'
          status_message="### Collection Publication Status${newline}"

          for file in ${ADDED_FILES}; do
            echo "$file"
            if [ -f "$file" ]; then
              dataset_config=$(jq '.' "$file")
              collection_id=$(jq -r '.collection' "$file")

              echo "Publishing $collection_id"
              # -o diverts the response body to a file and -w prints the HTTP
              # status code, so $response ends with the status code
              response=$(curl -s -w "%{http_code}" -o response.txt -X POST "$publish_url" \
                -H "Content-Type: application/json" \
                -H "Authorization: Bearer $AUTH_TOKEN" \
                -d "$dataset_config"
              )

              status_code=$(tail -n1 <<< "$response")

              # Update status message based on response code
              if [ "$status_code" -eq 200 ] || [ "$status_code" -eq 201 ]; then
                echo "$collection_id successfully published ✅"
                status_message+="- **$collection_id**: Successfully published ✅${newline}"
                success_collections+=("$file")
                all_failed=false
              else
                echo "$collection_id failed to publish ❌"
                status_message+="- **$collection_id**: Failed to publish. Error code $status_code. ❌${newline}"
              fi
            else
              echo "File $file does not exist"
              exit 1
            fi
          done

          # Update the PR comment first, so the per-collection statuses
          # (including the error codes) are visible even when every
          # publication failed and the step exits below
          CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
          UPDATED_BODY="$CURRENT_BODY

          $status_message"
          gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"

          # Exit workflow if all the requests fail
          if [ "$all_failed" = true ]; then
            echo "All collections failed to publish."
            exit 1
          fi

          # Output only successful collections (space-separated file paths)
          # to be used in subsequent steps
          echo "success_collections=${success_collections[*]}" >> "$GITHUB_OUTPUT"

      # Installs Python 3.9 on this job's runner.
      # NOTE(review): no later step in this job visibly invokes Python; the
      # MDX generation script runs in the create-mdx-files-and-open-pr job,
      # which has its own runner - confirm this setup is needed here.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'
      # Caches the Python installation directory, keyed on its path and on
      # the hash of requirements.txt.
      # NOTE(review): the key hashes `requirements.txt` at the repository
      # root, while the only install in this workflow reads
      # ./scripts/requirements.txt - confirm which file is intended.
      - uses: actions/cache@v4
        with:
          path: ${{ env.pythonLocation }}
          key: ${{ env.pythonLocation }}-pip-${{ hashFiles('requirements.txt') }}

      # If the job fails at any point, the PR comment is updated with a
      # failure notice linking to this run
      - name: Update PR comment on overall workflow failure
        if: failure()
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }}
        run: |
          # When the failure happened before the status comment was created
          # there is nothing to append to; skip instead of failing again
          if [ -z "$COMMENT_ID" ]; then
            echo "No status comment available to update"
            exit 0
          fi

          WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
          # No space after the opening ** - Markdown does not render bold
          # when the delimiter is followed by whitespace
          UPDATED_BODY="$CURRENT_BODY

          **❌ The workflow run failed. [See logs here]($WORKFLOW_URL)**"
          gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"

  # Generates an MDX file per published collection and opens a PR with
  # them against the veda-config repository.
  # Starts only after publish-new-datasets has completed successfully.
  create-mdx-files-and-open-pr:
    needs: publish-new-datasets
    environment: staging
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Logs the list of published files received from publish-new-datasets.
      # The value derives from file paths in the PR, so it is handed over via
      # the environment rather than interpolated into the script text, where
      # a crafted file name could inject shell commands.
      - name: Use output from publish-new-datasets
        env:
          PUBLISHED_COLLECTION_FILES: ${{ needs.publish-new-datasets.outputs.publishedCollections }}
        run: |
          echo "The output from the previous step is: $PUBLISHED_COLLECTION_FILES"

      # Creates a slim dataset mdx file for each collection based on the dataset config json
      # Runs ./scripts/generate-mdx.py once per published file and collects the
      # value each run prints as a collection id
      # Exports the ids to later steps as the comma-separated env var `collection_ids`
      - name: Create dataset mdx for given collections
        env:
          PUBLISHED_COLLECTION_FILES: ${{ needs.publish-new-datasets.outputs.publishedCollections }}
        run: |
          echo "$PUBLISHED_COLLECTION_FILES"
          collection_ids=""
          pip install -r ./scripts/requirements.txt
          for file in ${PUBLISHED_COLLECTION_FILES}; do
            collection_id=$(python3 ./scripts/generate-mdx.py "$file")
            # Trim surrounding double quotes and whitespace.
            # [[:space:]] is used because inside a bracket expression "\s" is
            # not a whitespace class: it matches a backslash or the letter
            # "s", which would eat leading/trailing "s" characters of the id
            collection_id=$(echo "$collection_id" | sed 's/^[[:space:]"]*//;s/[[:space:]"]*$//')
            echo "Processed collection ID: $collection_id"
            collection_ids="$collection_ids$collection_id,"
          done
          # Remove trailing comma
          collection_ids=${collection_ids%,}
          echo "Final collection_ids: $collection_ids"
          echo "collection_ids=${collection_ids}" >> "$GITHUB_ENV"

      # Exports VEDA_CONFIG_REPO ("<org>/<name>") for the following steps
      - name: Set up Variables
        run: |
          printf '%s\n' "VEDA_CONFIG_REPO=${{ vars.VEDA_CONFIG_REPO_ORG }}/${{ vars.VEDA_CONFIG_REPO_NAME }}" >> $GITHUB_ENV

      # Clones the target repository next to the checked-out code and lists
      # the workspace contents
      - name: Clone veda-config repository
        run: |
          repo_url="https://github.com/${{ env.VEDA_CONFIG_REPO }}.git"
          git clone "$repo_url"
          ls

      # Copies every generated .mdx file into the datasets/ directory of the
      # cloned veda-config repository
      - name: Copy untracked mdx files to veda-config
        run: |
          echo "Copying untracked .mdx files to veda-config repository"
          ls ./ingestion-data/dataset-mdx/
          # Create the directory the files are copied INTO; a datasets/
          # directory in the workspace root is never used by the cp below
          mkdir -p veda-config/datasets
          find ingestion-data/dataset-mdx/ -name '*.mdx' -exec cp {} veda-config/datasets/ \;

      # Commits the copied MDX files to a new branch of veda-config, pushes
      # it and opens a PR against `develop`.
      # The branch name is derived from the list of published files, so the
      # same set of files always maps to the same branch (which is recreated).
      # Output: PR_URL - URL printed by `gh pr create`.
      - name: Create veda-config PR with changes
        id: create-pr
        env:
          GITHUB_TOKEN: ${{ secrets.VEDA_CONFIG_REPO_ACCESS_TOKEN }}
          COMMENT_ID: ${{ needs.publish-new-datasets.outputs.commentId }}
          # Must come from `needs`: the publish-collections step lives in the
          # publish-new-datasets job, so `steps.publish-collections` is always
          # empty here and every run would hash to the same branch name
          PUBLISHED_COLLECTION_FILES: ${{ needs.publish-new-datasets.outputs.publishedCollections }}
        run: |
          cd veda-config
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          # Token is read from the environment, not interpolated into the script
          git remote set-url origin "https://${GITHUB_TOKEN}@github.com/${VEDA_CONFIG_REPO}"

          hash=$(echo -n "$PUBLISHED_COLLECTION_FILES" | md5sum | cut -d ' ' -f 1)
          NEW_BRANCH="add-dataset-$hash"

          git fetch origin
          # Recreate the branch from scratch if a previous run already pushed it
          if git ls-remote --exit-code --heads origin "$NEW_BRANCH"; then
            git push origin --delete "$NEW_BRANCH"
          fi
          git checkout -b "$NEW_BRANCH"

          git status
          git add .
          git commit -m "feat: add MDX files for dataset(s) [Automated workflow]"
          git push origin "$NEW_BRANCH"

          # Convert the comma-separated list (exported by the mdx step as
          # `collection_ids`) into bullet points. $'\n' is a real newline;
          # "\n" inside double quotes would reach the PR body as a literal
          # backslash followed by "n"
          collection_bullet_points=""
          IFS=',' read -ra IDs <<< "$collection_ids"
          for id in "${IDs[@]}"; do
            collection_bullet_points+="- $id"$'\n'
          done

          body="### Add dataset(s) - [Automated PR by ${{ github.actor }}]"$'\n\n'"$collection_bullet_points"

          echo "$body"
          # The title carries the first collection id. `collection_ids` is a
          # plain comma-separated string, not JSON, so it is read from the
          # array above instead of being piped through jq
          first_collection_id="${IDs[0]:-}"
          PR_URL=$(gh pr create -R "$VEDA_CONFIG_REPO" -H "$NEW_BRANCH" -B develop --title "Add dataset(s) $first_collection_id [Automated PR by ${{ github.actor }}]" --body "$body")

          echo "PR_URL=$PR_URL" >> "$GITHUB_OUTPUT"
          echo "PR creation succeeded!"

      # Updates the comment with a link to the above PR
      - name: Update PR comment with PR creation result
        if: success()
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          COMMENT_ID: ${{ needs.publish-new-datasets.outputs.commentId }}
          # Read from the environment so the URL is never re-parsed as shell
          PR_URL: ${{ steps.create-pr.outputs.PR_URL }}
        run: |
          # All script lines share one indentation level so no stray leading
          # space ends up in the text appended to the comment
          CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
          UPDATED_BODY="$CURRENT_BODY

          **A PR has been created with the dataset configuration: 🗺️ [PR link]($PR_URL)**"
          gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"

      # Appends a failure notice to the status comment when the PR creation
      # step itself failed
      - name: Update PR comment on PR creation failure
        if: failure() && steps.create-pr.outcome == 'failure'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          COMMENT_ID: ${{ needs.publish-new-datasets.outputs.commentId }}
        run: |
          CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
          # No space before the closing ** - Markdown does not render bold
          # when the closing delimiter is preceded by whitespace
          UPDATED_BODY="$CURRENT_BODY

          **Failed ❌ to create a PR with the dataset configuration. 😔**"
          gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"

      # If the job fails anywhere other than the PR creation step, the PR
      # comment is updated with a failure notice linking to this run
      - name: Update PR comment on overall workflow failure
        if: failure() && steps.create-pr.outcome != 'failure'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          COMMENT_ID: ${{ needs.publish-new-datasets.outputs.commentId }}
        run: |
          WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"

          # Output WORKFLOW_URL to logs for verification.
          # This must stay outside the multi-line UPDATED_BODY string below:
          # placed inside it, the echo's own double quote ends the string
          # early and the shell tries to execute the remainder as a command
          echo "Workflow URL: $WORKFLOW_URL"

          CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
          # No space after the opening ** so that Markdown renders it bold
          UPDATED_BODY="$CURRENT_BODY

          **❌ The workflow run failed. [See logs here]($WORKFLOW_URL)**"
          gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"

          echo "Updated Comment Body: $UPDATED_BODY"

  # Placeholder for publishing to production.
  # Gated on a pull_request `closed` event whose PR was merged.
  publish-to-prod-on-pr-merge:
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged == true
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Publish to production on PR merge
        run: |
          echo "NO-OP. This step runs when a PR is merged."