-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add covalent cloud aws ec2 infra and report --push
covalent is not compatible with milabench as it requires sqlalchemy<2.0.0 Update .github/workflows/cloud-ci.yml Apply suggestions from code review Update .github/workflows/cloud-ci.yml Add azure covalent cloud infra Fix reports Fix cloud-ci with gpu arch Add multi-node on cloud Fix cloud data dir * VM on the cloud might not have enough space on all partitions. Add a workaround which should cover most cases * Use branch and commit name to versionize reports directories * Fix parsing error when temperature is not available in nvidia-smi outputs * export MILABENCH_* env vars to remote Add docs Update cloud-ci.yml Update cloud-ci.yml Update cloud-ci.yml Update cloud-ci.yml Update cloud-ci.yml Update cloud-ci.yml Update cloud-ci.yml Update cloud-ci.yml Fix cloud instance name conflict This would prevent the CI or multiple contributors to run tests with the same config Fix github push in CI Cleaner and tested azure plugin Fix cloud-ci Fix cloud multi-nodes * Copy ssh key to allow connections from master to workers * Use local ip for manager's ip such that workers can find it and connect to it * Fix incompatibility between pandas and numpy 2.0.0 * Fix diffusion benches permission Fix llm multi gpus and nodes in ci Update hrepr Fix ci Fix cloud ci
- Loading branch information
Showing
28 changed files
with
1,662 additions
and
287 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
# Workflow that sets up cloud machines, runs milabench on them, pushes the | ||
# report and tears the machines down again (see the steps of `cloud-tests`). | ||
name: cloud-tests | ||
|
||
on: | ||
  # Runs for pull requests | ||
  pull_request: | ||
    branches: | ||
      - master | ||
|
||
# NOTE(review): `contents: write` is presumably what lets the "Summary" step | ||
# run `milabench report --push`; confirm `id-token: write` is still needed. | ||
permissions: | ||
  id-token: write | ||
  contents: write | ||
|
||
jobs: | ||
  cloud-tests: | ||
    strategy: | ||
      fail-fast: true | ||
      # Run one system at a time | ||
      max-parallel: 1 | ||
      matrix: | ||
        # Each entry is "<gpus>g:<nodes>n". The "setup cloud" step reads | ||
        # field 1 as the GPU count (only "1g" and "4g" are handled) and | ||
        # field 2 as the node count (only "1n" and "2n" are handled); any | ||
        # other value makes that step `exit 1`. | ||
        system: ["1g:1n", "4g:1n", "4g:2n"] | ||
        include: | ||
          - arch: cuda | ||
            exclude: "no-cuda" | ||
          # - arch: rocm | ||
          #   exclude : "no-rocm" | ||
|
||
    runs-on: ubuntu-latest | ||
    environment: cloud-ci | ||
|
||
    # Cancel previous jobs if a new version was pushed | ||
    concurrency: | ||
      group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.system }}" | ||
      cancel-in-progress: true | ||
|
||
    # Every `run` step uses a login bash shell (-l) that aborts on the first | ||
    # failing command (-e). | ||
    defaults: | ||
      run: | ||
        shell: bash -el {0} | ||
|
||
    # Job-wide environment. MILABENCH_SYSTEM is only a default: the | ||
    # "setup cloud" step rewrites it through $GITHUB_ENV. | ||
    # _MULTI_GPUS and _MULTI_NODES are comma-separated benchmark names that | ||
    # the "setup cloud" step appends to its SELECT / EXCLUDE lists. | ||
    env: | ||
      MILABENCH_CONFIG: "config/standard.yaml" | ||
      MILABENCH_SYSTEM: "config/cloud-multinodes-system.yaml" | ||
      MILABENCH_BASE: "output" | ||
      MILABENCH_ARGS: "" | ||
      MILABENCH_DASH: "no" | ||
      MILABENCH_HF_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN}} | ||
      ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}" | ||
      ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}" | ||
      AZURE_CORE_OUTPUT: none | ||
      _MULTI_GPUS: "diffusion-gpus,dinov2-giant-gpus,lightning-gpus,resnet152-ddp-gpus,llm-lora-ddp-gpus,llm-lora-mp-gpus,llm-full-mp-gpus" | ||
      _MULTI_NODES: "diffusion-nodes,dinov2-giant-nodes,llm-lora-ddp-nodes,llm-full-mp-nodes" | ||
|
||
    steps: | ||
      # Check out the repository with the workflow's own token | ||
      - uses: actions/checkout@v3 | ||
        with: | ||
          token: ${{ github.token }} | ||
|
||
      # The version is quoted so YAML does not read 3.10 as the float 3.1 | ||
      - uses: actions/setup-python@v2 | ||
        with: | ||
          python-version: '3.10' | ||
|
||
      # Follow | ||
      # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret | ||
      # to generate a clientId as well as a clientSecret | ||
      # The four values below are read from repository secrets. | ||
      - name: Azure login | ||
        uses: azure/login@v2 | ||
        with: | ||
          creds: | | ||
            { | ||
              "clientId": "${{ secrets.ARM_CLIENT_ID }}", | ||
              "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}", | ||
              "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}", | ||
              "tenantId": "${{ secrets.ARM_TENANT_ID }}" | ||
            } | ||
      # Upgrade pip and poetry, refresh the lock file without upgrading the | ||
      # locked versions, then install the project. | ||
      - name: dependencies | ||
        run: | | ||
          python -m pip install -U pip | ||
          python -m pip install -U poetry | ||
          poetry lock --no-update | ||
          poetry install | ||
      # Write the EC2 key pair and the AWS credentials file from repository | ||
      # secrets, then strip all permissions from ~/.aws and ~/.ssh except the | ||
      # owner's (read/write, plus execute on directories). | ||
      - name: setup cloud credentials | ||
        run: | | ||
          mkdir -p ~/.aws | ||
          mkdir -p ~/.ssh/covalent | ||
          echo "${{ secrets.COVALENT_EC2_EXECUTOR_KEYPAIR }}" >~/.ssh/covalent/covalent-ec2-executor-keypair.pem | ||
          echo "[default]" >~/.aws/credentials | ||
          echo "aws_access_key_id=${{ secrets.AWS_ACCESS_KEY_ID }}" >>~/.aws/credentials | ||
          echo "aws_secret_access_key=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >>~/.aws/credentials | ||
          chmod -R a-rwx,u+rwX ~/.aws ~/.ssh | ||
      # Start the covalent server through milabench's wrapper script | ||
      - name: start covalent server | ||
        run: | | ||
          poetry run -- python3 -m milabench.scripts.covalent serve start --develop | ||
- name: setup cloud | ||
run: | | ||
gpus=$(echo "${{ matrix.system }}" | cut -d":" -f1") | ||
nodes=$(echo "${{ matrix.system }}" | cut -d":" -f2") | ||
case "$nodes" in | ||
"1n") | ||
MILABENCH_SYSTEM="config/cloud-system.yaml" | ||
EXCLUDE="$EXCLUDE,$_MULTI_NODES" | ||
;; | ||
"2n") | ||
MILABENCH_SYSTEM="config/cloud-multinodes-system.yaml" | ||
SELECT="$SELECT,$_MULTI_NODES" | ||
EXCLUDE="$EXCLUDE,$_MULTI_GPUS" | ||
;; | ||
*) | ||
exit 1 | ||
;; | ||
esac | ||
case "$gpus" in | ||
"1g") | ||
RUN_ON="azure__a100" | ||
EXCLUDE="$EXCLUDE,$_MULTI_GPUS,$_MULTI_NODES" | ||
;; | ||
# "2g") | ||
# RUN_ON="azure__a100_x2" | ||
# SELECT="$SELECT,$_MULTI_GPUS" | ||
# ;; | ||
"4g") | ||
RUN_ON="azure__a100_x4" | ||
SELECT="$SELECT,$_MULTI_GPUS" | ||
;; | ||
*) | ||
exit 1 | ||
;; | ||
esac | ||
if [[ -z "$(echo "$SELECT" | cut -d"," -f1)" ]] | ||
then | ||
SELECT="$(echo "$SELECT" | cut -d"," -f2-)" | ||
fi | ||
if [[ -z "$(echo "$EXCLUDE" | cut -d"," -f1)" ]] | ||
then | ||
EXCLUDE="$(echo "$EXCLUDE" | cut -d"," -f2-)" | ||
fi | ||
if [[ ! -z "$SELECT" ]] | ||
then | ||
export SELECT="--select $SELECT" | ||
fi | ||
if [[ ! -z "$EXCLUDE" ]] | ||
then | ||
export EXCLUDE="--exclude $EXCLUDE" | ||
fi | ||
poetry run milabench cloud \ | ||
--setup \ | ||
--run-on $RUN_ON \ | ||
--system "$MILABENCH_SYSTEM" >$MILABENCH_SYSTEM.$RUN_ON | ||
echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$RUN_ON" >>$GITHUB_ENV | ||
echo "RUN_ON=$RUN_ON" >>$GITHUB_ENV | ||
echo "SELECT=$SELECT" >>$GITHUB_ENV | ||
echo "EXCLUDE=$EXCLUDE" >>$GITHUB_ENV | ||
      # $SELECT and $EXCLUDE are the "--select ..." / "--exclude ..." | ||
      # arguments exported to $GITHUB_ENV by the "setup cloud" step; they are | ||
      # left unquoted on purpose so they split into separate arguments. | ||
      - name: install benchmarks | ||
        run: | | ||
          poetry run milabench install --variant ${{ matrix.arch }} $SELECT $EXCLUDE | ||
      - name: prepare benchmarks | ||
        run: | | ||
          poetry run milabench prepare $SELECT $EXCLUDE | ||
      - name: run benchmarks | ||
        run: | | ||
          poetry run milabench run $SELECT $EXCLUDE | ||
      # Configure git to authenticate as the workflow actor, with a | ||
      # credential helper that answers "get" with $GITHUB_TOKEN, then push | ||
      # the report. | ||
      # NOTE(review): the user.email value below looks like an e-mail | ||
      # obfuscation placeholder rather than a real address; confirm it. | ||
      - name: Summary | ||
        run: | | ||
          git config credential.${{ github.server_url }}.username ${{ github.actor }} | ||
          git config credential.helper '!f() { test "$1" = get && echo "password=$GITHUB_TOKEN"; }; f' | ||
          git config --global user.email "[email protected]" | ||
          git config --global user.name "GitHub CI" | ||
          poetry run milabench report --push | ||
        env: | ||
          GITHUB_TOKEN: ${{ github.token }} | ||
|
||
      # The three steps below use `if: always()` so they still run when an | ||
      # earlier step has failed. | ||
      # Print the Terraform state files found in the covalent Azure plugin | ||
      # directory. | ||
      - name: DEBUG state file | ||
        if: always() | ||
        run: | | ||
          cat /tmp/milabench/covalent_venv/lib/python*/site-packages/covalent_azure_plugin/infra/*.tfstate | ||
      # "setup cloud" exports MILABENCH_SYSTEM as "<system file>.<RUN_ON>". | ||
      # Strip that last ".suffix" when the resulting path is an existing | ||
      # file, then tear down everything created for $RUN_ON. | ||
      - name: teardown cloud | ||
        if: always() | ||
        run: | | ||
          if [[ -f "${MILABENCH_SYSTEM%.*}" ]] | ||
          then | ||
            export MILABENCH_SYSTEM=${MILABENCH_SYSTEM%.*} | ||
          fi | ||
          poetry run milabench cloud \ | ||
            --teardown \ | ||
            --run-on $RUN_ON \ | ||
            --all | ||
      # Print the covalent UI log | ||
      - name: DEBUG logs | ||
        if: always() | ||
        run: | | ||
          cat ~/.cache/covalent/covalent_ui.log |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
#!/usr/bin/env python | ||
|
||
from dataclasses import dataclass | ||
|
||
from accelerate import Accelerator | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# System description with two nodes (a main "manager" and a worker "node1") | ||
# plus Azure cloud profiles. | ||
# NOTE(review): presumably config/cloud-multinodes-system.yaml, the file the | ||
# workflow's "setup cloud" step selects for "2n"; the file name is not shown. | ||
system: | ||
  # Nodes list | ||
  nodes: | ||
    # Alias used to reference the node | ||
    - name: manager | ||
      # Use 1.1.1.1 as an ip placeholder | ||
      ip: 1.1.1.1 | ||
      port: 5000 | ||
      # Use this node as the master node or not | ||
      main: true | ||
      # User to use in remote milabench operations | ||
      user: user | ||
|
||
    - name: node1 | ||
      ip: 1.1.1.1 | ||
      main: false | ||
      user: username | ||
|
||
  # Cloud instances profiles | ||
  # Keys are profile names; "azure__a100" and "azure__a100_x4" are the ones | ||
  # the cloud-tests workflow passes to `milabench cloud --run-on`. | ||
  # NOTE(review): disk_size has no unit here (presumably GB); confirm. | ||
  cloud_profiles: | ||
    azure__a100: | ||
      username: ubuntu | ||
      size: Standard_NC24ads_A100_v4 | ||
      location: eastus2 | ||
      disk_size: 512 | ||
    azure__a100_x2: | ||
      username: ubuntu | ||
      size: Standard_NC48ads_A100_v4 | ||
      location: eastus2 | ||
      disk_size: 512 | ||
    azure__a100_x4: | ||
      username: ubuntu | ||
      size: Standard_NC96ads_A100_v4 | ||
      location: eastus2 | ||
      disk_size: 512 | ||
    azure__a10_x2: | ||
      username: ubuntu | ||
      size: Standard_NV72ads_A10_v5 | ||
      location: eastus2 | ||
      disk_size: 512 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# System description with a single main node ("manager") plus Azure cloud | ||
# profiles. | ||
# NOTE(review): presumably config/cloud-system.yaml, the file the workflow's | ||
# "setup cloud" step selects for "1n"; the file name is not shown. | ||
system: | ||
  # Nodes list | ||
  nodes: | ||
    # Alias used to reference the node | ||
    - name: manager | ||
      # Use 1.1.1.1 as an ip placeholder | ||
      ip: 1.1.1.1 | ||
      port: 5000 | ||
      # Use this node as the master node or not | ||
      main: true | ||
      # User to use in remote milabench operations | ||
      user: user | ||
|
||
  # Cloud instances profiles | ||
  # Keys are profile names; "azure__a100" and "azure__a100_x4" are the ones | ||
  # the cloud-tests workflow passes to `milabench cloud --run-on`. | ||
  # NOTE(review): disk_size has no unit here (presumably GB); confirm. | ||
  cloud_profiles: | ||
    azure__a100: | ||
      username: ubuntu | ||
      size: Standard_NC24ads_A100_v4 | ||
      location: eastus2 | ||
      disk_size: 512 | ||
    azure__a100_x2: | ||
      username: ubuntu | ||
      size: Standard_NC48ads_A100_v4 | ||
      location: eastus2 | ||
      disk_size: 512 | ||
    azure__a100_x4: | ||
      username: ubuntu | ||
      size: Standard_NC96ads_A100_v4 | ||
      location: eastus2 | ||
      disk_size: 512 | ||
    azure__a10: | ||
      username: ubuntu | ||
      size: Standard_NV36ads_A10_v5 | ||
      location: eastus2 | ||
      disk_size: 512 | ||
    azure__a10_x2: | ||
      username: ubuntu | ||
      size: Standard_NV72ads_A10_v5 | ||
      location: eastus2 | ||
      disk_size: 512 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# System description with two nodes and small Azure / EC2 cloud profiles. | ||
# NOTE(review): looks like an example or test configuration; the file name | ||
# is not shown. | ||
system: | ||
  # Nodes list | ||
  nodes: | ||
    # Alias used to reference the node | ||
    - name: manager | ||
      # Use 1.1.1.1 as an ip placeholder | ||
      ip: 1.1.1.1 | ||
      # Use this node as the master node or not | ||
      main: true | ||
      # User to use in remote milabench operations | ||
      user: user | ||
|
||
    - name: node1 | ||
      ip: 1.1.1.1 | ||
      main: false | ||
      user: username | ||
|
||
  # Cloud instances profiles | ||
  cloud_profiles: | ||
    # The cloud platform to use in the form of {PLATFORM} or | ||
    # {PLATFORM}__{PROFILE_NAME} | ||
    azure: | ||
      # covalent-azure-plugin args | ||
      username: ubuntu | ||
      size: Standard_B1s | ||
      location: eastus2 | ||
    azure__free: | ||
      username: ubuntu | ||
      size: Standard_B2ats_v2 | ||
      location: eastus2 | ||
    ec2: | ||
      # covalent-ec2-plugin args | ||
      username: ubuntu | ||
      instance_type: t2.micro | ||
      volume_size: 8 | ||
      region: us-east-2 | ||
      # NOTE(review): hard-coded identifier whose meaning is not visible | ||
      # here; confirm it is meant to be committed. | ||
      state_id: 71669879043a3864225aabb94f91a2d4 |
Oops, something went wrong.