
Commit

Merge branch 'hpu_tweaks' of github.com:mila-iqia/milabench into phase_lock
pierre.delaunay committed Jun 25, 2024
2 parents 4316cd1 + 3586203 commit 94b27a7
Showing 24 changed files with 332 additions and 30 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -44,3 +44,6 @@ output/
workspace/
.pin/tmp-*
dry/

stderr.txt
stdout.txt
11 changes: 9 additions & 2 deletions benchmarks/accelerate_opt/benchfile.py
@@ -7,22 +7,29 @@
)
from milabench.pack import Package
from milabench.utils import select_nodes
from milabench.sizer import resolve_argv


def resolve_placeholder(pack, name):
placeholder = pack.config["argv"][name]
return resolve_argv(pack, [placeholder])


class AccelerateBenchmark(Package):
base_requirements = "requirements.in"

def make_env(self):
env = super().make_env()
env["OMP_NUM_THREADS"] = str(self.config["argv"]["--cpus_per_gpu"])
        value = resolve_placeholder(self, "--cpus_per_gpu")
env["OMP_NUM_THREADS"] = str(value)
return env

def build_prepare_plan(self):
return CmdCommand(
self,
"accelerate",
"launch",
"--mixed_precision=fp16",
"--mixed_precision=bf16",
"--num_machines=1",
"--dynamo_backend=no",
"--num_processes=1",
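
For context, "--cpus_per_gpu" is typically declared in the YAML config as a placeholder (config/base.yaml below uses the same "auto({n_worker}, 8)" form for --num-workers), and resolve_placeholder turns it into a concrete value before it is exported as OMP_NUM_THREADS. A hypothetical stand-in for what the resolution amounts to; the auto(detected, cap) interpretation here is an assumption, not milabench's documented behaviour:

# Hypothetical stand-in -- illustration only, not milabench.sizer.resolve_argv.
def resolve_auto(placeholder: str, detected_workers: int) -> str:
    # e.g. "auto({n_worker}, 8)" -> use the detected worker count, capped at 8
    if placeholder.startswith("auto(") and placeholder.endswith(")"):
        _, cap = placeholder[len("auto("):-1].split(",")
        return str(min(detected_workers, int(cap)))
    return placeholder  # already a concrete value

print(resolve_auto("auto({n_worker}, 8)", detected_workers=16))  # -> "8"
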
6 changes: 6 additions & 0 deletions benchmarks/brax/voirfile.py
@@ -1,6 +1,7 @@
from dataclasses import dataclass

from voir import configurable
from voir.phase import StopProgram
from voir.instruments import dash, early_stop, gpu_monitor, log, rate


@@ -40,3 +41,8 @@ def instrument_main(ov, options: Config):
early_stop(n=options.stop, key="rate", task="train"),
gpu_monitor(poll_interval=3),
)

try:
yield ov.phases.run_script
except StopProgram:
print("early stopped")
6 changes: 6 additions & 0 deletions benchmarks/dlrm/voirfile.py
@@ -1,6 +1,7 @@
from dataclasses import dataclass

from voir import configurable
from voir.phase import StopProgram
from voir.instruments import dash, early_stop, gpu_monitor, log, rate


@@ -53,3 +54,8 @@ def instrument_main(ov, options: Config):
ov.probe(
"//run(inputBatch as batch, !#loop_inputBatch as step, !!#endloop_inputBatch as step_end)"
).augment(task=lambda: "train").give()

try:
yield ov.phases.run_script
except StopProgram:
print("early stopped")
6 changes: 6 additions & 0 deletions benchmarks/huggingface/voirfile.py
@@ -3,6 +3,7 @@


from voir import configurable
from voir.phase import StopProgram
from voir.instruments import dash, early_stop, gpu_monitor, log, rate


@@ -43,3 +44,8 @@ def instrument_main(ov, options: Config):

os.environ["VOIR_EARLYSTOP_COUNT"] = str(options.stop)
os.environ["VOIR_EARLYSTOP_SKIP"] = str(options.skip)

try:
yield ov.phases.run_script
except StopProgram:
print("early stopped")
1 change: 1 addition & 0 deletions benchmarks/llama/main.py
@@ -8,6 +8,7 @@

import torch

from voir.phase import StopProgram
from benchmate.monitor import setupvoir
import torchcompat.core as accelerator

9 changes: 9 additions & 0 deletions benchmarks/llama/voirfile.py
@@ -0,0 +1,9 @@
from voir import configurable
from voir.phase import StopProgram


@configurable
def instrument_main(ov):
try:
yield ov.phases.run_script
except StopProgram:
print("early stopped")
6 changes: 6 additions & 0 deletions benchmarks/stargan/voirfile.py
@@ -2,6 +2,7 @@
import os

from voir import configurable
from voir.phase import StopProgram
from voir.instruments import dash, early_stop, gpu_monitor, log, rate


@@ -43,3 +44,8 @@ def instrument_main(ov, options: Config):

os.environ["VOIR_EARLYSTOP_COUNT"] = str(options.stop)
os.environ["VOIR_EARLYSTOP_SKIP"] = str(options.skip)

try:
yield ov.phases.run_script
except StopProgram:
print("early stopped")
6 changes: 6 additions & 0 deletions benchmarks/super-slomo/voirfile.py
@@ -2,6 +2,7 @@
import os

from voir import configurable
from voir.phase import StopProgram
from voir.instruments import dash, early_stop, gpu_monitor, log, rate


@@ -42,3 +43,8 @@ def instrument_main(ov, options: Config):

os.environ["VOIR_EARLYSTOP_COUNT"] = str(options.stop)
os.environ["VOIR_EARLYSTOP_SKIP"] = str(options.skip)

try:
yield ov.phases.run_script
except StopProgram:
print("early stopped")
6 changes: 6 additions & 0 deletions benchmarks/timm/voirfile.py
@@ -1,6 +1,7 @@
from dataclasses import dataclass

from voir import configurable
from voir.phase import StopProgram
from voir.instruments import dash, early_stop, gpu_monitor, log, rate

import torchcompat.core as accelerator
@@ -70,3 +71,8 @@ def instrument_main(ov, options: Config):
instruments.append(early_stop(n=options.stop, key="rate", task="train", signal="stop"))

ov.require(*instruments)

try:
yield ov.phases.run_script
except StopProgram:
print("early stopped")
7 changes: 5 additions & 2 deletions benchmarks/torchvision/voirfile.py
@@ -1,6 +1,7 @@
from dataclasses import dataclass

from voir import configurable
from voir.phase import StopProgram
from voir.instruments import dash, early_stop, gpu_monitor, log
from benchmate.observer import BenchObserver

@@ -58,5 +59,7 @@ def instrument_main(ov, options: Config):
probe = ov.probe("//train_epoch > criterion", overridable=True)
probe['criterion'].override(observer.criterion)



try:
yield ov.phases.run_script
except StopProgram:
print("early stopped")
9 changes: 9 additions & 0 deletions benchmarks/torchvision_ddp/voirfile.py
@@ -0,0 +1,9 @@
from voir import configurable
from voir.phase import StopProgram


@configurable
def instrument_main(ov):
try:
yield ov.phases.run_script
except StopProgram:
print("early stopped")
9 changes: 6 additions & 3 deletions benchmate/benchmate/dataloader.py
@@ -34,8 +34,8 @@ def generate_tensors(batch_size, shapes, device):
return tensors


def generate_tensor_classification(model, batch_size, in_shape, device):
model = model.to(device)
def generate_tensor_classification(original_model, batch_size, in_shape, device):
model = original_model.to(device=device)
inp = torch.randn((batch_size, *in_shape), device=device)
out = torch.rand_like(model(inp))
return inp, out
@@ -195,7 +195,10 @@ def pytorch(folder, batch_size, num_workers, distributed=False, epochs=60):
def synthetic(model, batch_size, fixed_batch):
return SyntheticData(
tensors=generate_tensor_classification(
model, batch_size, (3, 244, 244), device=accelerator.fetch_device(0)
model,
batch_size,
(3, 244, 244),
device=accelerator.fetch_device(0)
),
n=1000,
fixed_batch=fixed_batch,
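
As a point of reference, generate_tensor_classification builds a single synthetic (input, target) pair shaped like a real classification batch. A hypothetical standalone use, assuming the module imports as benchmate.dataloader (the model and device choices are arbitrary):

# Hypothetical illustration -- model and device choices are arbitrary.
from torchvision.models import resnet18
from benchmate.dataloader import generate_tensor_classification

inp, out = generate_tensor_classification(
    resnet18(), batch_size=4, in_shape=(3, 244, 244), device="cpu"
)
print(inp.shape)   # torch.Size([4, 3, 244, 244])
print(out.shape)   # torch.Size([4, 1000]), i.e. shaped like the model's output
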
11 changes: 9 additions & 2 deletions benchmate/benchmate/metrics.py
@@ -79,12 +79,19 @@ def push(self, pusher):

class LazyLossPusher(LazyMetricPusher):
def record(self, loss):
value = loss
# no .item() we do not want to sync
self.append(loss.detach())
if hasattr(loss, "detach"):
value = loss.detach()
self.append(value)

def materialize(self, loss):
value = loss
# synch here is fine
return {"loss": loss.item(), "task": self.task}
if hasattr(loss, "item"):
value = loss.item()

return {"loss": value, "task": self.task}


class CPUTimer:
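
The intent of this change: detach() keeps the loss on-device, so recording stays cheap on the hot path, while the .item() sync only happens in materialize() when metrics are flushed; the hasattr checks also let plain Python floats pass through. A hypothetical illustration of the two stages in isolation:

# Hypothetical illustration of the lazy-record / sync-on-materialize split.
import torch

recorded = []                       # stand-in for what LazyMetricPusher accumulates

def record(loss):
    value = loss
    if hasattr(loss, "detach"):     # tensor: keep it on-device, no sync yet
        value = loss.detach()
    recorded.append(value)

def materialize(loss):
    value = loss
    if hasattr(loss, "item"):       # sync to a Python float only at flush time
        value = loss.item()
    return {"loss": value, "task": "train"}

record(torch.tensor(0.25))          # cheap while training
record(0.5)                         # plain floats work too
print([materialize(v) for v in recorded])
# [{'loss': 0.25, 'task': 'train'}, {'loss': 0.5, 'task': 'train'}]
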
8 changes: 5 additions & 3 deletions benchmate/benchmate/observer.py
@@ -23,7 +23,7 @@ class BenchObserver:
"""

def __init__(
self, *args, backward_callback=None, step_callback=None, stdout=False, **kwargs
self, *args, backward_callback=None, step_callback=None, stdout=False, rank=None, **kwargs
):
self.wrapped = None
self.args = args
@@ -32,6 +32,7 @@ def __init__(
self.optimizer_step_callback = step_callback
self.stdout = stdout
self.task = "train"
self.rank = rank
self.losses = LazyLossPusher(self.task)

self.pusher = give_push()
@@ -43,7 +44,8 @@ def on_iterator_stop_iterator(self):
self.losses.push(self.pusher)

def record_loss(self, loss):
self.losses.record(loss)
if self.rank is None or self.rank == 1:
self.losses.record(loss)
return loss

def override_return_value(self, function, override):
@@ -62,7 +64,7 @@ def override_return_value(self, function, override):
def loader(self, loader):
"""Wrap a dataloader or an iterable which enable accurate measuring of time spent in the loop's body"""
self.wrapped = TimedIterator(
loader, *self.args, push=self.pusher, **self.kwargs
loader, *self.args, rank=self.rank, push=self.pusher, **self.kwargs
)
self.wrapped.task = self.task
self.wrapped.on_iterator_stop_iterator = self.on_iterator_stop_iterator
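
A hypothetical call site for the new rank argument; the dataloader, the training step, and the constructor's other arguments are placeholders rather than part of the diff. Each rank still times its iterations through the wrapped loader, but only the designated rank records losses:

# Hypothetical usage sketch -- only rank, loader() and record_loss() come from the diff.
import os
from benchmate.observer import BenchObserver

rank = int(os.environ.get("RANK", "0"))
observer = BenchObserver(rank=rank)          # benchmark-specific constructor args elided

for batch in observer.loader(train_loader):  # train_loader: placeholder dataloader
    loss = train_step(batch)                 # train_step: placeholder training step
    observer.record_loss(loss)               # recorded only on the designated rank
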
102 changes: 102 additions & 0 deletions benchmate/benchmate/warden.py
@@ -0,0 +1,102 @@
from dataclasses import dataclass
import re
import os
import subprocess
import traceback
import signal

from voir.instruments.gpu import get_gpu_info
from milabench.syslog import syslog

@dataclass
class ProcessInfo:
gpu: int
pid: int
type: str
process_name: str
memory: int
unit: str


def _hpu_parse_processes():
output = subprocess.check_output(["hl-smi"], text=True)

line_format = re.compile(
r"\|(\s+)(?P<gpu>\d+)(\s+)(?P<pid>\d+)(\s+)(?P<type>\w+)(\s+)(?P<process_name>\w+)(\s+)(?P<memory>\d+)((?P<unit>\w+))(\s+)"
)
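    # Illustrative example of a line this pattern is written against (the exact
    # hl-smi column layout is an assumption):
    # |   0        12345     C        python           1024MiB     |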

info = []
for line in output.split("\n"):
        if match := line_format.match(line):
            fields = match.groupdict()
            # regex groups are strings; convert the numeric ones so that
            # os.kill() later receives an integer pid
            for key in ("gpu", "pid", "memory"):
                fields[key] = int(fields[key])
            info.append(ProcessInfo(**fields))

return info



def _default():
return []

backends = {
"hpu": _hpu_parse_processes,
"cpu": _default
}


class GPUProcessWarden:
"""Ensure all the process using the GPU are killed before & after the bench"""
def __init__(self, kill_on_start=True, kill_on_end=True):
self.gpus = get_gpu_info()
self.arch = self.gpus['arch']
self.fetch_fun = backends.get(self.arch, _default)
self.kill_on_start = kill_on_start
self.kill_on_end = kill_on_end
self.dead_processes = []

def __enter__(self):
if self.kill_on_start:
self.ensure_free()

return self

def __exit__(self, *args):
if self.kill_on_end:
self.ensure_free()

return None

def fetch_processes(self):
try:
return self.fetch_fun()
        except:
traceback.print_exc()
return []

def kill(self, pid, signal):
if pid in self.dead_processes:
return

try:
            os.kill(pid, signal)
except ProcessLookupError:
self.dead_processes.append(pid)

def ensure_free(self):
processes = self.fetch_processes()
if len(processes) == 0:
return

syslog("Found {0} still using devices after bench ended", len(processes))

# Keyboard interrupt
for process in processes:
self.kill(process.pid, signal.SIGINT)

# Sig Term, please close now
for process in processes:
self.kill(process.pid, signal.SIGTERM)

# Sig Kill, just die
for process in processes:
self.kill(process.pid, signal.SIGKILL)
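
A hypothetical call site for the warden; the commit only adds the class, and the wrapping function shown here (run_bench) is illustrative:

# Hypothetical usage -- the call site is not part of this commit.
from benchmate.warden import GPUProcessWarden

def run_bench(bench):
    # Kill stray processes holding the accelerator before the bench starts,
    # and again after it ends, so consecutive runs do not fight over memory.
    with GPUProcessWarden(kill_on_start=True, kill_on_end=True):
        bench.run()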

2 changes: 1 addition & 1 deletion config/base.yaml
@@ -232,7 +232,7 @@ resnet152-ddp:
--model: resnet152
--batch-size: 256
--num-workers: "auto({n_worker}, 8)"
--loader: dali
--loader: torch

efficientnet_b4:
inherits: _torchvision