Skip to content

Satyaog/feature/covalent #58

Satyaog/feature/covalent

Satyaog/feature/covalent #58

Workflow file for this run

# Cloud benchmark workflow: runs for pull requests that target master.
name: cloud-tests

on:
  pull_request:
    branches: [master]

# Scopes granted to the workflow token for every job in this file.
permissions:
  contents: write
  id-token: write
jobs:
  cloud-tests:
    runs-on: ubuntu-latest
    environment: cloud-ci

    # One job is created per entry listed under `include`.
    strategy:
      fail-fast: true
      matrix:
        include:
          - arch: cuda
            exclude: "no-cuda"
            run_on: azure__a100
          # - arch: rocm
          #   exclude: "no-rocm"

    # A newer push to the same ref / arch / target cancels the run in flight.
    concurrency:
      group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.run_on }}"
      cancel-in-progress: true

    defaults:
      run:
        # Login shell (-l) that stops at the first failing command (-e).
        shell: bash -el {0}

    # Environment shared by every step of the job.
    env:
      MILABENCH_CONFIG: "config/standard.yaml"
      MILABENCH_SYSTEM: "config/cloud-multinodes-system.yaml"
      MILABENCH_BASE: "output"
      MILABENCH_ARGS: ""
      MILABENCH_DASH: "no"
      ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}"
      ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}"
      AZURE_CORE_OUTPUT: "none"

    steps:
# Check out the repository, authenticating with the workflow token.
- uses: actions/checkout@v3
  with:
    token: "${{ github.token }}"
# Install the Python interpreter used by the following steps.
- uses: actions/setup-python@v2
  with:
    # Kept as a quoted string: an unquoted version is parsed as a float,
    # so a later bump to 3.10 would silently become 3.1.
    python-version: "3.9"
# Sign in to Azure with a service principal. To generate the clientId and
# clientSecret used below, follow
# https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret
- name: "Azure login"
  uses: azure/login@v2
  with:
    creds: |
      {
      "clientId": "${{ secrets.ARM_CLIENT_ID }}",
      "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}",
      "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}",
      "tenantId": "${{ secrets.ARM_TENANT_ID }}"
      }
# Install pip, Poetry and the project's locked dependencies.
#
# Poetry is capped below 2.0 because `poetry lock --no-update` is a
# Poetry 1.x flag: Poetry 2 made that behaviour the default and no longer
# accepts the option, so an unpinned `pip install -U poetry` would make
# this step fail.
- name: dependencies
  run: |
    python -m pip install -U pip
    python -m pip install -U "poetry<2"
    poetry lock --no-update
    poetry install
# Write the EC2 key pair and the AWS credentials file for the runner user.
#
# The secrets reach the script through environment variables rather than
# being spliced into the script text, so a `"`, `$` or backtick inside a
# secret cannot be interpreted by the shell.
- name: setup cloud credentials
  env:
    EC2_KEYPAIR: "${{ secrets.COVALENT_EC2_EXECUTOR_KEYPAIR }}"
    AWS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
    AWS_SECRET_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
  run: |
    # Create every file and directory owner-only from the start.
    umask 077
    mkdir -p ~/.aws
    mkdir -p ~/.ssh/covalent
    # printf (unlike echo) never treats the data as an option or escape.
    printf '%s\n' "$EC2_KEYPAIR" >~/.ssh/covalent/covalent-ec2-executor-keypair.pem
    {
      printf '%s\n' "[default]"
      printf '%s\n' "aws_access_key_id=$AWS_KEY_ID"
      printf '%s\n' "aws_secret_access_key=$AWS_SECRET_KEY"
    } >~/.aws/credentials
    # Also tighten anything that already existed under these directories.
    chmod -R a-rwx,u+rwX ~/.aws ~/.ssh
# Start the Covalent server through milabench's helper script, passing
# the --develop flag.
- name: "start covalent server"
  run: |
    poetry run -- python3 -m milabench.scripts.covalent serve start --develop
# Run the cloud setup for the matrix target. The command's stdout is saved
# as "<system file>.<target>", and MILABENCH_SYSTEM is repointed at that
# file for the remaining steps.
- name: setup cloud
  run: |
    # Computed once and always quoted, so a path containing whitespace or
    # glob characters cannot cause an ambiguous redirect.
    system_out="$MILABENCH_SYSTEM.${{ matrix.run_on }}"
    poetry run milabench cloud \
      --setup \
      --run-on "${{ matrix.run_on }}" \
      --system "$MILABENCH_SYSTEM" >"$system_out"
    echo "MILABENCH_SYSTEM=$system_out" >>"$GITHUB_ENV"
# Install the benchmarks for the matrix architecture variant, leaving out
# the llm-* entries listed after --exclude.
- name: "install benchmarks"
  run: |
    poetry run milabench install --variant ${{ matrix.arch }} \
    --exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single
# Run `milabench prepare` with the same exclusion list as the install step.
- name: "prepare benchmarks"
  run: |
    poetry run milabench prepare \
    --exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single
# Execute the benchmarks, skipping the llm-* entries listed after --exclude.
- name: "run benchmarks"
  run: |
    poetry run milabench run \
    --exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single
# Build the benchmark report and push it. Git is set up to answer
# credential requests with the workflow token and to commit under a CI
# identity.
# NOTE(review): the user.email value below looks like a redaction
# placeholder rather than a real address - confirm against the repository.
- name: Summary
  env:
    GITHUB_TOKEN: "${{ github.token }}"
  run: |
    git config credential.${{ github.server_url }}.username ${{ github.actor }}
    git config credential.helper '!f() { test "$1" = get && echo "password=$GITHUB_TOKEN"; }; f'
    git config --global user.email "[email protected]"
    git config --global user.name "GitHub CI"
    poetry run milabench report --push
# Dump the Terraform state files of the Covalent Azure plugin to the job
# log. Runs whether or not earlier steps succeeded.
# NOTE(review): Terraform state can contain sensitive values; confirm that
# printing it to the log is acceptable for this repository.
- name: "DEBUG state file"
  if: always()
  run: |
    cat /tmp/milabench/covalent_venv/lib/python*/site-packages/covalent_azure_plugin/infra/*.tfstate
# Tear down the cloud resources for the matrix target, whether or not
# earlier steps succeeded. If removing the last ".suffix" from
# MILABENCH_SYSTEM yields the path of an existing file, that path is
# exported in its place before the teardown command runs.
- name: "teardown cloud"
  if: always()
  run: |
    if [[ -f "${MILABENCH_SYSTEM%.*}" ]]
    then
    export MILABENCH_SYSTEM=${MILABENCH_SYSTEM%.*}
    fi
    poetry run milabench cloud \
    --teardown \
    --run-on ${{ matrix.run_on }} \
    --all
# Print the Covalent UI log to the job log, whether or not earlier steps
# succeeded.
- name: "DEBUG logs"
  if: always()
  run: |
    cat ~/.cache/covalent/covalent_ui.log