From b46219cb2b00e81be3ba7e33a9ca22e8bc3bd375 Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Wed, 4 Sep 2024 02:35:00 -0400 Subject: [PATCH] Update ci --- .github/workflows/cloud-ci.yml | 19 ++++++++++--------- milabench/cli/cloud.py | 4 ++-- milabench/remote.py | 3 +-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index 7111f2597..1168920a7 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -16,7 +16,7 @@ jobs: fail-fast: true max-parallel: 1 matrix: - system: ["1g:1n", "1g:4n", "2g:4n"] + system: ["1n:1g", "1n:4g", "2n:4g"] include: - arch: cuda exclude: "no-cuda" @@ -38,15 +38,15 @@ jobs: env: MILABENCH_CONFIG: "config/standard.yaml" MILABENCH_SYSTEM: "config/cloud-multinodes-system.yaml" - MILABENCH_BASE: "output" + MILABENCH_BASE: "../output" MILABENCH_ARGS: "" MILABENCH_DASH: "no" MILABENCH_HF_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN}} ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}" ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}" AZURE_CORE_OUTPUT: none - _MULTI_GPUS: "diffusion-gpus,dinov2-giant-gpus,lightning-gpus,resnet152-ddp-gpus,llm-lora-ddp-gpus,llm-lora-mp-gpus,llm-full-mp-gpus" - _MULTI_NODES: "diffusion-nodes,dinov2-giant-nodes,llm-lora-ddp-nodes,llm-full-mp-nodes" + _MULTI_GPUS: "diffusion-gpus,dinov2-giant-gpus,lightning-gpus,resnet152-ddp-gpus,llm-full-mp-gpus,llm-lora-ddp-gpus,llm-lora-mp-gpus" + _MULTI_NODES: "multinode" steps: - uses: actions/checkout@v3 @@ -94,8 +94,8 @@ jobs: - name: setup cloud run: | - gpus=$(echo "${{ matrix.system }}" | cut -d":" -f1") - nodes=$(echo "${{ matrix.system }}" | cut -d":" -f2") + nodes=$(echo "${{ matrix.system }}" | cut -d":" -f1) + gpus=$(echo "${{ matrix.system }}" | cut -d":" -f2) case "$nodes" in "1n") MILABENCH_SYSTEM="config/cloud-system.yaml" @@ -140,21 +140,22 @@ jobs: if [[ ! -z "$SELECT" ]] then - export SELECT="--select $SELECT" + SELECT="--select $SELECT" fi if [[ ! -z "$EXCLUDE" ]] then - export EXCLUDE="--exclude $EXCLUDE" + EXCLUDE="--exclude $EXCLUDE" fi + echo "RUN_ON=$RUN_ON" >>$GITHUB_ENV + poetry run milabench cloud \ --setup \ --run-on $RUN_ON \ --system "$MILABENCH_SYSTEM" >$MILABENCH_SYSTEM.$RUN_ON echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$RUN_ON" >>$GITHUB_ENV - echo "RUN_ON=$RUN_ON" >>$GITHUB_ENV echo "SELECT=$SELECT" >>$GITHUB_ENV echo "EXCLUDE=$EXCLUDE" >>$GITHUB_ENV diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py index 5c69834df..993da34b5 100644 --- a/milabench/cli/cloud.py +++ b/milabench/cli/cloud.py @@ -48,8 +48,8 @@ def manage_cloud(pack, run_on, action="setup"): default_state_prefix = profile or run_on default_state_id = "_".join((pack.config["hash"][:6], blabla())) - remote_base = XPath("/data") / pack.dirs.base.name - local_base = pack.dirs.base.absolute().parent + local_base = pack.dirs.base.absolute() + remote_base = XPath("/data") / local_base.parent.name / local_base.name nodes = iter(enumerate(pack.config["system"]["nodes"])) for i, n in nodes: diff --git a/milabench/remote.py b/milabench/remote.py index 3a4b348d4..d7c71fe7d 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -66,9 +66,8 @@ def rsync(node, src=None, remote_src=None, dest=None, force=False) -> list: return [ "rsync", - *(["--force"] if force else []), + *(["--force", "--del"] if force else []), "-aHv", - "--del", "-e", f"ssh {key} -oCheckHostIP=no -oStrictHostKeyChecking=no", "--include=*/.git/*",