Skip to content

Commit

Permalink
Update Dockerfile-cuda & avoid capacity fetch on prepare & install (#207)
Browse files Browse the repository at this point in the history
  • Loading branch information
Delaunay authored Mar 1, 2024
1 parent 0bf6348 commit a94fb7a
Show file tree
Hide file tree
Showing 10 changed files with 58 additions and 56 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ jobs:
matrix:
include:
- arch: cuda
exclude : "no-cuda"
exclude : "unsupported-cuda"
# - arch: rocm
# exclude : "no-rocm"
# exclude : "unsupported-rocm"

runs-on: [self-hosted, "${{ matrix.arch }}"]

Expand Down
2 changes: 1 addition & 1 deletion config/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,7 @@ rwkv:
tags:
- llm
- rnn
- no-rocm
- unsupported-rocm
plan:
method: per_gpu
argv:
Expand Down
26 changes: 9 additions & 17 deletions docker/Dockerfile-cuda
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
FROM ubuntu:22.04
# FROM ubuntu:22.04

# For cuda-gdb
# FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04

# Arguments
# ---------

# Use ofed_info -s to get your local version
ARG MOFED_VERSION=5.4-3.4.0.0
ARG CONFIG=standard.yaml
ARG ARCH=cuda
ENV MILABENCH_GPU_ARCH=$ARCH

ARG CONFIG=standard.yaml
ENV MILABENCH_GPU_ARCH=$ARCH
ENV MILABENCH_CONFIG_NAME=$CONFIG
ENV MILABENCH_DOCKER=1

Expand All @@ -36,24 +38,18 @@ COPY . /milabench/milabench/
# rustc: used by BERT models inside https://pypi.org/project/tokenizers/
# build-essential: for rust

# Use ofed_info -s to get your local version
ARG MOFED_VERSION=5.4-3.4.0.0

ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update -y &&\
apt-get install -y git build-essential curl python3 python-is-python3 python3-pip &&\
apt-get install -y --no-install-recommends git build-essential curl python3 python-is-python3 python3-pip &&\
curl -o /etc/apt/trusted.gpg.d/mellanox.asc https://content.mellanox.com/ofed/RPM-GPG-KEY-Mellanox &&\
curl -o /etc/apt/sources.list.d/mellanox.list https://linux.mellanox.com/public/repo/mlnx_ofed/${MOFED_VERSION}/ubuntu22.04/mellanox_mlnx_ofed.list &&\
curl -o cuda-keyring_1.1-1_all.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb &&\
dpkg -i cuda-keyring_1.1-1_all.deb &&\
apt-get update -y &&\
apt-get install -y libibverbs1 nvidia-compute-utils-535 nvidia-utils-535 cuda-11-8 &&\
apt-get install -y --no-install-recommends libibverbs1 &&\
apt-get clean &&\
rm -rf /var/lib/apt/lists/* &&\
rm cuda-keyring_1.1-1_all.deb
rm -rf /var/lib/apt/lists/*

# Install Rust

RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
ENV CUDA_HOME=/usr/local/cuda-11.8
Expand All @@ -78,8 +74,4 @@ RUN milabench install --config $MILABENCH_CONFIG --base $MILABENCH_BASE $MILABEN
milabench prepare --config $MILABENCH_CONFIG --base $MILABENCH_BASE $MILABENCH_ARGS &&\
python -m pip cache purge

# Patch for https://github.com/pytorch/pytorch/issues/97041
RUN cd /milabench/envs/venv/torch/lib/python3.10/site-packages/torch/lib &&\
ln -sfn libnvrtc-672ee683.so.11.2 libnvrtc.so

CMD milabench run
17 changes: 8 additions & 9 deletions milabench/cli/publish.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
import re
import json
import subprocess
from contextlib import contextmanager
import multiprocessing
from dataclasses import dataclass
from urllib.parse import urlparse, ParseResult
import time
import threading
import signal
import os
import re
import signal
import subprocess
import sys
import threading
import time
from contextlib import contextmanager
from dataclasses import dataclass
from urllib.parse import ParseResult, urlparse

from coleo import Option, tooled


SLEEP = 0.01
_INIT = 0
_READY = 1
Expand Down
9 changes: 5 additions & 4 deletions milabench/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def arguments():

# Define capabilities
capabilities: Option = ""

return CommonArguments(
config,
system,
Expand All @@ -91,7 +91,7 @@ def arguments():
def get_multipack(args = None, run_name=None, overrides={}):
if args is None:
args = arguments()

override = [
o if re.match(pattern=r"[.\w]+=", string=o) else f"={o}" for o in args.override
]
Expand Down Expand Up @@ -225,13 +225,14 @@ def _get_multipack(

arch = deduce_arch()
base_defaults = get_base_defaults(
base=args.base,
arch=arch,
base=args.base,
arch=arch,
run_name=run_name
)
system_config = build_system_config(
args.system,
defaults={"system": base_defaults["_defaults"]["system"]},
gpu=True
)
overrides = merge({"*": system_config}, overrides)

Expand Down
28 changes: 21 additions & 7 deletions milabench/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import contextvars
import os
import socket

import psutil
Expand Down Expand Up @@ -174,16 +175,28 @@ def resolve_addresses(nodes):
return self


def get_gpu_capacity():
capacity = float(0)
def get_gpu_capacity(strict=False):
try:
capacity = 0

for k, v in get_gpu_info()["gpus"].items():
capacity = min(v["memory"]["total"], capacity)

for k, v in get_gpu_info()["gpus"].items():
capacity = min(v["memory"]["total"], capacity)
return capacity
except:
print("GPU not available, defaulting to 0 MiB")
if strict:
raise
return 0

return capacity
def is_autoscale_enabled():
return (
os.getenv("MILABENCH_SIZER_AUTO", False)
or os.getenv("MILABENCH_SIZER_MULTIPLE") is not None
)


def build_system_config(config_file, defaults=None):
def build_system_config(config_file, defaults=None, gpu=True):
"""Load the system configuration, verify its validity and resolve ip addresses
Notes
Expand All @@ -204,7 +217,8 @@ def build_system_config(config_file, defaults=None):

system = config.get("system", {})

if "gpu" not in system:
# capacity is only required if batch resizer is enabled
if (gpu or is_autoscale_enabled()) and "gpu" not in system:
system["gpu"] = {"capacity": f"{int(get_gpu_capacity())} MiB"}

if system.get("sshkey") is not None:
Expand Down
12 changes: 6 additions & 6 deletions milabench/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,9 +300,9 @@ def on_data(self, entry, data, row):
load = int(data.get("load", 0) * 100)
currm, totalm = data.get("memory", [0, 0])
temp = int(data.get("temperature", 0))
row[
f"gpu:{gpuid}"
] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
row[f"gpu:{gpuid}"] = (
f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
)
row["gpu_load"] = f"{load}%"
row["gpu_mem"] = f"{currm:.0f}/{totalm:.0f} MB"
row["gpu_temp"] = f"{temp}C"
Expand Down Expand Up @@ -376,9 +376,9 @@ def on_data(self, entry, data, row):
load = int(data.get("load", 0) * 100)
currm, totalm = data.get("memory", [0, 0])
temp = int(data.get("temperature", 0))
row[
f"gpu:{gpuid}"
] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
row[f"gpu:{gpuid}"] = (
f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
)
else:
task = data.pop("task", "")
units = data.pop("units", "")
Expand Down
1 change: 0 additions & 1 deletion milabench/merge.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Utilities to merge dictionaries and other data structures."""


from collections import deque
from functools import reduce
from typing import Union
Expand Down
1 change: 1 addition & 0 deletions milabench/scripts/vcs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Use to retrieve GIT version info, this file cannot import milabench modules
as it is executed as part of the installation process"""

import os
import subprocess
import warnings
Expand Down
14 changes: 5 additions & 9 deletions milabench/sizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,14 @@
import numpy as np
import yaml

from .config import system_global
from .config import is_autoscale_enabled, system_global
from .validation.validation import ValidationLayer

ROOT = os.path.dirname(__file__)

default_scaling_config = os.path.join(ROOT, "..", "config", "scaling.yaml")


def is_autoscale_enabled():
return (
os.getenv("MILABENCH_SIZER_AUTO", False)
or os.getenv("MILABENCH_SIZER_MULTIPLE") is not None
)


def getenv(name, type):
value = os.getenv(name)

Expand Down Expand Up @@ -109,6 +102,9 @@ def get_capacity(self, capacity):
def auto_size(self, benchmark, capacity):
capacity = self.get_capacity(capacity)

if capacity is None:
return None

config = self.benchscaling(benchmark)

data = list(sorted(config["model"].items(), key=lambda x: x[0]))
Expand Down Expand Up @@ -182,7 +178,7 @@ def scale_argv(pack, argv):
sizer = sizer_global.get()
system = system_global.get()

capacity = system["gpu"]["capacity"]
capacity = system.get("gpu", dict()).get("capacity")

return sizer.argv(pack, capacity, argv)

Expand Down

0 comments on commit a94fb7a

Please sign in to comment.