diff --git a/.github/workflows/report_container.yml b/.github/workflows/report_container.yml new file mode 100644 index 000000000..1d48daedd --- /dev/null +++ b/.github/workflows/report_container.yml @@ -0,0 +1,60 @@ +name: Publish Docker image for reports + +on: + # Allow manual runs + workflow_dispatch: + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +permissions: + packages: write + +# define build arguments +jobs: + build-image: + runs-on: ubuntu-22.04 + + strategy: + fail-fast: false + + permissions: + contents: read + packages: write + + steps: + - name: Check out the repo + uses: actions/checkout@v3 + + - name: Get Image Tag Name + env: + GITHUB_REF_NAME_ENV: ${{ github.ref_name }} + run: | + echo "IMAGE_TAG=$GITHUB_REF_NAME_ENV" >> $GITHUB_ENV + + - name: Log in to the registry + uses: docker/login-action@v2 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for the image + id: meta + uses: docker/metadata-action@v4 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=raw,value=report-${{ env.IMAGE_TAG }} + + - name: Build and push the image + uses: docker/build-push-action@v3 + with: + context: . + push: true + file: docker/Dockerfile-report + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + build-args: | + CONFIG=standard.yaml diff --git a/README.md b/README.md index 71bad28cc..e71634e1c 100644 --- a/README.md +++ b/README.md @@ -1,197 +1,94 @@ # Milabench -[Documentation](https://mila-iqia.github.io/milabench) +[Documentation](https://milabench.readthedocs.io/en/stable/) + +Benchmarking framework for Machine learning and Artificial Intelligence, geared toward +evaluating current and future hardware in a research environment. 
+ +* Simple / Hands-off +* Wide selection of models on diverse applications + * Multi GPUs + * Multi node + * nlp / transformer / llm / rl / rnn + * vision / classification / convnet / resnet / transformer + * audio +* Docker Container +* Works on Slurm +* Automatic batch resize +* Focused on training +* Ease of use +* PyTorch focused +* ROCm & NVIDIA +* Independent + +## Getting Started + +The easiest way to run milabench is to run it with one of its Docker images. +It will include all of the necessary data. + + + # Choose the image you want to use + export MILABENCH_IMAGE=ghcr.io/mila-iqia/milabench:cuda-nightly + + # Pull the image we are going to run + docker pull $MILABENCH_IMAGE + + # Run milabench + docker run -it --rm --ipc=host --gpus=all \ + -v $(pwd)/results:/milabench/envs/runs \ + $MILABENCH_IMAGE \ + milabench run + + ================= + Benchmark results + ================= + fail n perf sem% std% peak_memory score weight + bert-fp16 0 8 155.08 0.3% 4.3% 24552 1241.260310 0.00 + bert-fp32 0 8 29.52 0.0% 0.5% 31524 236.337218 0.00 + bert-tf32 0 8 120.46 0.4% 6.1% 31524 964.713297 0.00 + bert-tf32-fp16 0 8 154.76 0.3% 4.1% 24552 1238.477257 3.00 + convnext_large-fp16 0 8 337.48 0.9% 14.0% 27658 2741.604444 0.00 + convnext_large-fp32 0 8 44.61 0.8% 12.6% 49786 354.207225 0.00 + convnext_large-tf32 0 8 135.99 0.7% 11.2% 49786 1089.394916 0.00 + convnext_large-tf32-fp16 0 8 338.58 0.8% 13.0% 27658 2744.325170 3.00 + davit_large 0 8 312.79 0.3% 6.7% 35058 2515.326450 1.00 + davit_large-multi 0 1 2401.65 1.0% 7.7% 42232 2401.651720 5.00 + dlrm 0 1 188777.20 1.8% 14.0% 3194 188777.203190 1.00 + focalnet 0 8 400.47 0.2% 5.4% 26604 3215.431924 2.00 + opt-1_3b 0 1 26.71 0.1% 0.4% 44116 26.714365 5.00 + opt-1_3b-multinode 0 2 34.62 0.2% 1.0% 43552 34.618292 10.00 + opt-6_7b 0 1 14.32 0.0% 0.1% 55750 14.319587 5.00 + opt-6_7b-multinode 0 2 10.79 0.1% 0.7% 49380 10.792595 10.00 + reformer 0 8 61.70 0.0% 0.9% 25376 494.110834 1.00 + regnet_y_128gf 0 8 99.96 0.2% 
5.0% 31840 803.012507 2.00 + resnet152 0 8 710.18 0.3% 6.2% 36732 5710.828608 1.00 + resnet152-multi 0 1 5367.34 1.0% 8.1% 38638 5367.338469 5.00 + resnet50 0 8 984.43 0.9% 19.1% 5026 7927.257351 1.00 + rwkv 0 8 428.65 0.2% 3.8% 5546 3435.097716 1.00 + stargan 0 8 51.32 1.8% 40.8% 37848 413.238870 1.00 + super-slomo 0 8 41.63 0.1% 2.3% 34082 332.395065 1.00 + t5 0 8 48.05 0.2% 3.9% 35466 384.317023 2.00 + whisper 0 8 248.16 0.0% 0.6% 37006 1985.861017 1.00 + + Scores + ------ + Failure rate: 0.00% (PASS) + Score: 219.06 + + +## Details The benchmark suite has been validated on the following configurations: -| Python version | GPU | Configuration file | -| - | - | - | -| 3.9.12 (conda) | 4x NVIDIA A100 80GB | config/standard.yaml | +| Python version | GPU | Configuration file | +| - | - | - | +| 3.9.12 (conda) | 4x NVIDIA A100 80GB | config/standard.yaml | | 3.9.12 (conda) | 4x NVIDIA RTX8000 48GB | config/standard.yaml | -| 3.9.16 (conda) | 2x NVIDIA K80 | config/ci.yaml | -| 3.9.16 (conda) | 2x AMD MI100 | config/ci.yaml | +| 3.9.16 (conda) | 2x NVIDIA K80 | config/ci.yaml | +| 3.9.16 (conda) | 2x AMD MI100 | config/ci.yaml | We are working on validating it on more configurations and will update the above table as we do. 
- diff --git a/benchmarks/accelerate_opt/benchfile.py b/benchmarks/accelerate_opt/benchfile.py index 56bd98bb9..8dfe62c30 100644 --- a/benchmarks/accelerate_opt/benchfile.py +++ b/benchmarks/accelerate_opt/benchfile.py @@ -44,6 +44,7 @@ def build_run_plan(self): for rank, node in enumerate(nodes): host = node["ip"] user = node["user"] + port = node.get("port", 22) options = dict() if rank == 0: @@ -63,6 +64,7 @@ def build_run_plan(self): host=host, user=user, key=key, + port=port, executor=DockerRunCommand( AccelerateLaunchCommand(pack, rank=rank), self.config["system"].get("docker_image"), diff --git a/benchmarks/accelerate_opt/main.py b/benchmarks/accelerate_opt/main.py index 7d1a33e61..bc2ead051 100644 --- a/benchmarks/accelerate_opt/main.py +++ b/benchmarks/accelerate_opt/main.py @@ -35,6 +35,7 @@ def arguments(): parser.add_argument("--validation_split_percentage", required=True, type=int) parser.add_argument("--dataset_name", required=True, type=str) parser.add_argument("--dataset_config_name", required=True, type=str) + parser.add_argument("--dataset_rev", required=True, type=str) parser.add_argument("--cache", required=True, type=str) parser.add_argument("--model_name", required=True, type=str) parser.add_argument("--prepare_only", action="store_true", default=False) @@ -180,17 +181,19 @@ def mblog(data): validation_split_percentage = config["validation_split_percentage"] dataset_name = config["dataset_name"] dataset_config_name = config["dataset_config_name"] - raw_datasets = load_dataset(dataset_name, dataset_config_name) + raw_datasets = load_dataset(dataset_name, dataset_config_name, revision=config["dataset_rev"]) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( dataset_name, dataset_config_name, - split=f"train[:{validation_split_percentage}%]", + split=f"train[:{validation_split_percentage}%]", + revision=config["dataset_rev"] ) raw_datasets["train"] = load_dataset( dataset_name, dataset_config_name, - 
split=f"train[{validation_split_percentage}%:]", + split=f"train[{validation_split_percentage}%:]", + revision=config["dataset_rev"] ) model_name = config["model_name"] diff --git a/config/base.yaml b/config/base.yaml index e5043e8e4..daa358f77 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -109,6 +109,7 @@ _accelerate_opt: --max_train_steps: 100 --dataset_name: "wikitext" --dataset_config_name: "wikitext-103-v1" + --dataset_rev: "b08601e" --validation_split_percentage: 5 --per_gpu_batch_size: 1 --cpus_per_gpu: 8 diff --git a/docker/Dockerfile-report b/docker/Dockerfile-report new file mode 100644 index 000000000..a967a1cfe --- /dev/null +++ b/docker/Dockerfile-report @@ -0,0 +1,48 @@ +FROM ubuntu:22.04 + + +# Arguments +# --------- + +ARG ARCH=cuda +ENV MILABENCH_GPU_ARCH=$ARCH + +ARG CONFIG=standard.yaml +ENV MILABENCH_CONFIG_NAME=$CONFIG +ENV MILABENCH_DOCKER=1 + +# Paths +# ----- + +ENV MILABENCH_CONFIG=/milabench/milabench/config/$MILABENCH_CONFIG_NAME +ENV MILABENCH_BASE=/milabench/envs +ENV MILABENCH_OUTPUT=/milabench/results/ +ENV MILABENCH_ARGS="" + +# Copy milabench +# -------------- + +WORKDIR /milabench +COPY . /milabench/milabench/ + +# Install Dependencies +# -------------------- + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update -y &&\ + apt-get install -y git python3 python-is-python3 python3-pip &&\ + apt-get update -y &&\ + apt-get clean &&\ + rm -rf /var/lib/apt/lists/* + +# Install Milabench +# ----------------- + +RUN python -m pip install -U pip &&\ + python -m pip install -U setuptools &&\ + python -m pip install -U poetry &&\ + python -m pip install -e /milabench/milabench/ &&\ + python -m pip cache purge + +CMD milabench report + diff --git a/milabench/config.py b/milabench/config.py index 5daca593c..7c289e0e9 100644 --- a/milabench/config.py +++ b/milabench/config.py @@ -173,7 +173,7 @@ def resolve_addresses(nodes): return self - + def get_gpu_capacity(strict=False): try: capacity = 0