From 27bb6ca1ab84346d941b1349b4cdc1af6f74c935 Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Wed, 22 May 2024 14:27:51 -0400 Subject: [PATCH] Fix cloud-ci --- .github/workflows/cleanup.yml | 23 ---- .github/workflows/cloud-ci.yml | 141 ------------------------- .github/workflows/docker-run.yml | 50 --------- .github/workflows/docker.yml | 91 ---------------- .github/workflows/docs.yml | 33 ------ .github/workflows/integration.yml | 77 -------------- .github/workflows/report_container.yml | 60 ----------- .github/workflows/show_config.yml | 26 ----- .github/workflows/test-cloud-ci.yml | 38 +++++++ .github/workflows/tests.yml | 113 -------------------- 10 files changed, 38 insertions(+), 614 deletions(-) delete mode 100644 .github/workflows/cleanup.yml delete mode 100644 .github/workflows/cloud-ci.yml delete mode 100644 .github/workflows/docker-run.yml delete mode 100644 .github/workflows/docker.yml delete mode 100644 .github/workflows/docs.yml delete mode 100644 .github/workflows/integration.yml delete mode 100644 .github/workflows/report_container.yml delete mode 100644 .github/workflows/show_config.yml create mode 100644 .github/workflows/test-cloud-ci.yml delete mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/cleanup.yml b/.github/workflows/cleanup.yml deleted file mode 100644 index e205cb001..000000000 --- a/.github/workflows/cleanup.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Clean space on the CI node - -on: - # Allow manual runs - workflow_dispatch: - -# define build arguments - -jobs: - clean: - strategy: - matrix: - include: - - arch: cuda - - arch: rocm - - runs-on: [self-hosted, "${{ matrix.arch }}"] - - steps: - - name: Get an overview of available space - run: | - df -h - docker image ls diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml deleted file mode 100644 index c19954c71..000000000 --- a/.github/workflows/cloud-ci.yml +++ /dev/null @@ -1,141 +0,0 @@ -name: tests - -on: - # Runs for pull requests - pull_request: - branches: - - master - -permissions: - id-token: write - -jobs: - cloud-tests: - strategy: - fail-fast: true - matrix: - include: - - arch: cuda - exclude: "no-cuda" - run_on: azure__a100 - # - arch: rocm - # exclude : "no-rocm" - - runs-on: ubuntu-latest - environment: cloud-ci - - # Cancel previous jobs if a new version was pushed - concurrency: - group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.run_on }}" - cancel-in-progress: true - - defaults: - run: - shell: bash -el {0} - - env: - MILABENCH_CONFIG: "config/standard.yaml" - MILABENCH_SYSTEM: "config/cloud-system.yaml" - MILABENCH_BASE: "output" - MILABENCH_ARGS: "" - MILABENCH_GPU_ARCH: "${{ matrix.arch }}" - MILABENCH_DASH: "no" - ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}" - ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}" - AZURE_CORE_OUTPUT: none - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ github.token }} - - - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - # Follow - # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret - # to generate a clientId as well as a clientSecret - - name: Azure login - uses: azure/login@v2 - with: - creds: | - { - "clientId": "${{ secrets.ARM_CLIENT_ID }}", - "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}", - "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}", - "tenantId": "${{ secrets.ARM_TENANT_ID }}" - } - - - name: dependencies - run: | - python -m pip install -U pip - python -m pip install -U poetry - poetry lock --no-update - poetry install - - - name: setup cloud credentials - run: | - mkdir -p ~/.aws - mkdir -p ~/.ssh/covalent - echo "${{ secrets.COVALENT_EC2_EXECUTOR_KEYPAIR }}" >~/.ssh/covalent/covalent-ec2-executor-keypair.pem - echo "[default]" >~/.aws/credentials - echo "aws_access_key_id=${{ secrets.AWS_ACCESS_KEY_ID }}" >>~/.aws/credentials - echo "aws_secret_access_key=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >>~/.aws/credentials - chmod -R a-rwx,u+rwX ~/.aws ~/.ssh - - - name: setup cloud - run: | - _system=$( - poetry run milabench cloud \ - --setup \ - --run-on ${{ matrix.run_on }} - ) - { read _hash ; }< <( - echo -n "$_system" | while read l - do - if [[ "$l" == "# hash::>"* ]] - then - echo -n "${l#*::>}" - fi - done - echo - ) - if [[ -z "${_hash}" ]] - then - >&2 echo "Failed to fetch system config hash" - exit 1 - fi - echo -n "$_system" >$MILABENCH_SYSTEM.$_hash - echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$_hash" >>$GITHUB_ENV - - - name: install benchmarks - run: | - poetry run milabench install --variant ${{ matrix.arch }} - - - name: prepare benchmarks - run: | - poetry run milabench prepare - - - name: run benchmarks - run: | - poetry run milabench run - - - name: Summary - run: | - # git remote set-url origin "https://${{ vars.REPORTS_USERNAME }}:${{ secrets.REPORTS_PAT }}@$(git remote get-url origin | cut -d'/' -f3-)" - git config --global user.email "github-ci@example.com" - git config --global user.name "GitHub CI" - poetry run milabench report --push - - - name: teardown cloud - if: always() - run: | - if [[ -f "${MILABENCH_SYSTEM%.*}" ]] - then - export MILABENCH_SYSTEM=${MILABENCH_SYSTEM%.*} - fi - poetry run milabench cloud \ - --teardown \ - --run-on ${{ matrix.run_on }} \ - --all diff --git a/.github/workflows/docker-run.yml b/.github/workflows/docker-run.yml deleted file mode 100644 index 35c72fe59..000000000 --- a/.github/workflows/docker-run.yml +++ /dev/null @@ -1,50 +0,0 @@ -# Run Milabench using nightly docker images -name: docker-run - -on: - # Only works on manual runs - workflow_dispatch: - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - IMAGE_PATH: ghcr.io/mila-iqia/milabench:cuda-nightly - -jobs: - build-image: - strategy: - matrix: - - arch: [cuda, rocm] - - runs-on: [self-hosted, "${{ matrix.arch }}"] - - permissions: - contents: read - - env: - IMAGE_NAME: "ghcr.io/mila-iqia/milabench:${{ matrix.arch }}-nightly" - - steps: - - name: pull - run: | - docker pull $IMAGE_NAME - - - name: run - run: | - OUTPUT="$(pwd)/../results" - mkdir -p $OUTPUT - docker run --rm --shm-size=256M \ - --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all \ - -v $OUTPUT:/milabench/envs/runs \ - $IMAGE_NAME milabench run - - - name: Check out the repo - uses: actions/checkout@v3 - - - name: summary - run: | - python -m pip install -U pip - python -m pip install -U poetry - poetry lock --no-update - poetry install - milabench summary $OUTPUT diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml deleted file mode 100644 index fb8c75f26..000000000 --- a/.github/workflows/docker.yml +++ /dev/null @@ -1,91 +0,0 @@ -name: Publish Docker image - -on: - # Allow manual runs - workflow_dispatch: - - # Only run for push on the main branch or for tagged version - push: - branches: - - master - tags: - - v*.*.* - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - - -permissions: - packages: write - - -# define build arguments - -jobs: - build-image: - strategy: - fail-fast: false - matrix: - include: - - arch: cuda - - arch: rocm - - runs-on: [self-hosted, "${{ matrix.arch }}"] - - permissions: - contents: read - packages: write - - steps: - - name: Show all images - run: | - docker image ls - - - name: Prune - run: | - # Prune all images older than 2 weeks - # The images are still on github registry - docker image prune -f -a --filter "until=336h" - docker system prune -f - - - name: Check out the repo - uses: actions/checkout@v3 - - - name: Get Image Tag Name - env: - GITHUB_REF_NAME_ENV: ${{ github.ref_name }} - run: | - REGEX="(.*)v(.*)\.(.*)\.(.*)" - IMAGE_TAG="nightly" - if [[ "${GITHUB_REF_NAME_ENV}" =~ $REGEX ]]; then - IMAGE_TAG="${GITHUB_REF_NAME##*/}" - fi - echo "IMAGE_TAG=$IMAGE_TAG" >> $GITHUB_ENV - - - name: Log in to the registry - uses: docker/login-action@v2 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata (tags, labels) for the image - id: meta - uses: docker/metadata-action@v4 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - tags: | - type=raw,value=${{ matrix.arch }}-${{ env.IMAGE_TAG }} - - - name: Build and push the image - uses: docker/build-push-action@v3 - with: - context: . - push: true - file: docker/Dockerfile-${{ matrix.arch }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - build-args: | - ARCH=${{ matrix.arch }} - CONFIG=standard.yaml diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index 99ac4253d..000000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: docs - -on: - push: - branches: - - master - -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - - uses: actions/checkout@master - with: - # otherwise, you will failed to push refs to dest repo - fetch-depth: 0 - - - name: Install Dependencies - run: | - pip install -e . - pip install sphinx sphinx-rtd-theme - - - name: Build and Commit - uses: sphinx-notes/pages@v2 - - - name: Push changes - uses: ad-m/github-push-action@master - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - branch: gh-pages diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml deleted file mode 100644 index 717ca7e6d..000000000 --- a/.github/workflows/integration.yml +++ /dev/null @@ -1,77 +0,0 @@ -name: integration - -on: - # Runs every sunday - schedule: - - cron: '0 0 * * SUN' - - # Runs for pull requests - pull_request: - branches: - - master - - # Runs on publish - release: - types: - [published] - - # Allow manual triggers - workflow_dispatch: - -jobs: - # Label of the container job - postgresql: - runs-on: ubuntu-latest - - concurrency: - group: "${{ github.ref }}" - cancel-in-progress: true - - services: - # The hostname of the PostgreSQL service is the label - postgres: - image: postgres - env: - POSTGRES_PASSWORD: password - POSTGRES_USER: username - POSTGRES_DB: milabench - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - # Maps tcp port 5432 on service container to the host - - 5432:5432 - - steps: - - uses: actions/setup-python@v4 - with: - python-version: '3.9' - - - name: Check out repository code - uses: actions/checkout@v3 - - - name: dependencies - run: | - if [[ ! -d "~/.cargo/bin" ]]; then - wget --no-check-certificate --secure-protocol=TLSv1_2 -qO- https://sh.rustup.rs | sh -s -- -y - fi - export PATH="~/.cargo/bin:${PATH}" - python -m pip install -U pip - python -m pip install -U poetry - - - name: install - run: | - pip install pytest - poetry lock --no-update - pip install -e . - - - name: tests - env: - POSTGRES_USER: username - POSTGRES_PSWD: password - POSTGRES_DB: milabench - POSTGRES_HOST: localhost - POSTGRES_PORT: 5432 - run: pytest tests/integration diff --git a/.github/workflows/report_container.yml b/.github/workflows/report_container.yml deleted file mode 100644 index 1d48daedd..000000000 --- a/.github/workflows/report_container.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: Publish Docker image for reports - -on: - # Allow manual runs - workflow_dispatch: - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - -permissions: - packages: write - -# define build arguments -jobs: - build-image: - runs-on: ubuntu-22.04 - - strategy: - fail-fast: false - - permissions: - contents: read - packages: write - - steps: - - name: Check out the repo - uses: actions/checkout@v3 - - - name: Get Image Tag Name - env: - GITHUB_REF_NAME_ENV: ${{ github.ref_name }} - run: | - echo "IMAGE_TAG=$GITHUB_REF_NAME_ENV" >> $GITHUB_ENV - - - name: Log in to the registry - uses: docker/login-action@v2 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata (tags, labels) for the image - id: meta - uses: docker/metadata-action@v4 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - tags: | - type=raw,value=report-${{ env.IMAGE_TAG }} - - - name: Build and push the image - uses: docker/build-push-action@v3 - with: - context: . - push: true - file: docker/Dockerfile-report - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - build-args: | - CONFIG=standard.yaml diff --git a/.github/workflows/show_config.yml b/.github/workflows/show_config.yml deleted file mode 100644 index 1d2dd8096..000000000 --- a/.github/workflows/show_config.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Run Milabench using nightly docker images -name: show-config - -on: - # Only works on manual runs - workflow_dispatch: - -jobs: - execute: - runs-on: [self-hosted, rocm] - - steps: - - name: Set up Python 3.9 - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - - name: Install Dependencies - run: | - python -m pip install --upgrade pip - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.4.2 - - - name: Show Pytorch Config - run: | - python -c "import torch; print(torch.__config__.show())" - diff --git a/.github/workflows/test-cloud-ci.yml b/.github/workflows/test-cloud-ci.yml new file mode 100644 index 000000000..3feaf659e --- /dev/null +++ b/.github/workflows/test-cloud-ci.yml @@ -0,0 +1,38 @@ +name: azure-tests + +jobs: + cloud-tests: + environment: test-cloud-ci + + env: + ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}" + ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}" + AZURE_CORE_OUTPUT: none + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ github.token }} + + - uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - name: echo + run: | + echo "1${{ secrets.ARM_CLIENT_ID }}", "2${{ secrets.ARM_CLIENT_SECRET }}", "3${{ secrets.ARM_SUBSCRIPTION_ID }}", "4${{ secrets.ARM_TENANT_ID }}" + echo "1${{ secrets }}" + + # Follow + # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret + # to generate a clientId as well as a clientSecret + - name: Azure login + uses: azure/login@v2 + with: + creds: | + { + "clientId": "1${{ secrets.ARM_CLIENT_ID }}", + "clientSecret": "2${{ secrets.ARM_CLIENT_SECRET }}", + "subscriptionId": "3${{ secrets.ARM_SUBSCRIPTION_ID }}", + "tenantId": "4${{ secrets.ARM_TENANT_ID }}" + } diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml deleted file mode 100644 index 7d456f9bb..000000000 --- a/.github/workflows/tests.yml +++ /dev/null @@ -1,113 +0,0 @@ -name: tests - -on: - # Runs every sunday - schedule: - - cron: '0 0 * * SUN' - - # Runs for pull requests - pull_request: - branches: - - master - - # Runs on publish - release: - types: - [published] - - # Allow manual triggers - workflow_dispatch: - - -jobs: - tests: - strategy: - fail-fast: false - matrix: - include: - - arch: cuda - exclude : "unsupported-cuda" - # - arch: rocm - # exclude : "unsupported-rocm" - - runs-on: [self-hosted, "${{ matrix.arch }}"] - - # Cancel previous jobs if a new version was pushed - concurrency: - group: "${{ github.ref }}-${{ matrix.arch }}" - cancel-in-progress: true - - defaults: - run: - shell: bash -el {0} - - env: - MILABENCH_CONFIG: "config/ci.yaml" - MILABENCH_BASE: "output" - MILABENCH_ARGS: "" - MILABENCH_GPU_ARCH: "${{ matrix.arch }}" - MILABENCH_DASH: "no" - MILABENCH_EXCLUDE: "${{ matrix.exclude }}" - - steps: - - uses: actions/checkout@v3 - - - uses: conda-incubator/setup-miniconda@v2 - with: - auto-activate-base: false - python-version: 3.9 - miniconda-version: "latest" - activate-environment: test - - - name: Pytorch Sanity - run: | - if [[ "${MILABENCH_GPU_ARCH}" == "rocm" ]]; then - groups - /opt/rocm/bin/rocminfo - fi - - - name: dependencies - run: | - if [[ ! -d "~/.cargo/bin" ]]; then - wget --no-check-certificate --secure-protocol=TLSv1_2 -qO- https://sh.rustup.rs | sh -s -- -y - fi - export PATH="~/.cargo/bin:${PATH}" - python -m pip install -U pip - python -m pip install -U poetry - poetry lock --no-update - # poetry v1.7 has a bug where it can't find pip during the first - # install attempt: - # Output: - # [...]/.venv/bin/python: can't open file - # '[...]/lib/python3.9/site-packages/virtualenv/seed/wheels/embed/pip-23.3.1-py3-none-any.whl/pip': - # [Errno 2] No such file or directory - ! poetry install - poetry install - - - name: pin - run: | - MILABENCH_GPU_ARCH=cuda poetry run milabench pin -c constraints/cuda.txt --config config/standard.yaml - MILABENCH_GPU_ARCH=rocm poetry run milabench pin -c constraints/rocm.txt --config config/standard.yaml - git diff --stat - - - name: tests - run: | - export PATH="/opt/rocm/bin:$PATH" - pytest --ignore=tests/integration tests/ - - - name: install benchmarks - run: | - milabench install --exclude "${MILABENCH_EXCLUDE}" - - - name: prepare benchmarks - run: | - milabench prepare --exclude "${MILABENCH_EXCLUDE}" - - - name: run benchmarks - run: | - export PATH="/opt/rocm/bin:$PATH" - milabench run --validations all --exclude "${MILABENCH_EXCLUDE}" - - - name: Summary - run: | - milabench summary $MILABENCH_BASE/runs/