Skip to content

Commit

Permalink
Add metric gathering utility for batch x worker run
Browse files Browse the repository at this point in the history
  • Loading branch information
pierre.delaunay committed Jun 14, 2024
1 parent 58ab90b commit 1fad24c
Show file tree
Hide file tree
Showing 10 changed files with 290 additions and 76 deletions.
124 changes: 124 additions & 0 deletions milabench/cli/gather.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import argparse
import os
import re
from dataclasses import dataclass, field

import pandas as pd

from ..common import _read_reports
from ..report import make_dataframe, pandas_to_string
from ..summary import make_summary


def default_tags():
return [
"worker=w([a-z0-9]*)",
"multiple=m([0-9]*)",
"power=p([0-9]*)",
"capacity=c([A-Za-z0-9]*(Go)?)",
]


# fmt: off
@dataclass
class Arguments:
runs: str
tags: list = field(default_factory=default_tags)
# fmt: on


def arguments():
parser = argparse.ArgumentParser()
parser.add_argument(
"--runs",
type=str,
help="Run folder",
default="/home/mila/d/delaunap/batch_x_worker/",
)
parser.add_argument(
"--tags",
type=str,
help="Tags defined in run names",
default=default_tags(),
)
return parser.parse_args() # Arguments()


def get_config(reports):
k = list(reports.keys())[0]
config = None
for line in reports[k]:
if line["event"] == "config":
config = line["data"]
break
return config


def extract_tags(name, tags):
for tag, pat in tags.items():
if m := pat.search(name):
value = m.group(1)
yield tag, value
else:
print(f"{tag} not found in {name}")
yield tag, "NA"


def gather_cli(args=None):
"""Gather metrics from runs inside a folder in a neat format.
It can extract tags/flags from the runname and create new columns to uniquely identify runs.
Examples
--------
>>> python -m milabench.cli.gather --runs /home/mila/d/delaunap/batch_x_worker/
bench | fail | n | perf | sem% | std% | peak_memory | score | weight | elapsed | name | worker | multiple | power | capacity
brax | 0 | 1 | 722480.33 | 0.7% | 5.2% | 6448 | 722480.33 | 1.00 | 94 | w16-m8-c4Go | 16 | 8 | NA | 4Go
dlrm | 0 | 1 | 350641.30 | 0.6% | 4.6% | 7624 | 350641.30 | 1.00 | 124 | w16-m8-c4Go | 16 | 8 | NA | 4Go
....
brax | 0 | 1 | 723867.42 | 0.6% | 4.5% | 6448 | 723867.42 | 1.00 | 94 | w2-m8-c8Go | 2 | 8 | NA | 8Go
dlrm | 0 | 1 | 403113.36 | 0.7% | 5.1% | 7420 | 403113.36 | 1.00 | 258 | w2-m8-c8Go | 2 | 8 | NA | 8Go
bf16 | 0 | 8 | 293.08 | 0.3% | 7.5% | 5688 | 2361.09 | 0.00 | 18 | w2-m8-c8Go | 2 | 8 | NA | 8Go
fp16 | 0 | 8 | 290.58 | 0.2% | 4.9% | 5688 | 2335.63 | 0.00 | 29 | w2-m8-c8Go | 2 | 8 | NA | 8Go
"""
if args is None:
args = arguments()

runs = []
for folder in os.listdir(args.runs):
if folder.startswith("prepare"):
continue

if folder.startswith("install"):
continue

path = f"{args.runs}/{folder}"
if os.path.isdir(path):
runs.append(path)

tags = dict()
for tag in args.tags:
name, regex = tag.split("=")
tags[name] = re.compile(regex)

query = ("batch_size", "elapsed")
data = []
for run in runs:
reports = _read_reports(run)
summary = make_summary(reports.values(), query=query)
df = make_dataframe(summary, None, None, query=query)

name = run.split("/")[-1]
df["name"] = name.split(".", maxsplit=1)[0]
for tag, value in extract_tags(name, tags):
df[tag] = value

data.append(df)

gathered = pd.concat(data)
print(pandas_to_string(gathered))


if __name__ == "__main__":
gather_cli()
17 changes: 12 additions & 5 deletions milabench/cli/matrix.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
from dataclasses import dataclass
import sys
from dataclasses import dataclass

import yaml
from coleo import Option, tooled

from ..system import build_system_config
from ..common import deduce_arch, build_config, get_base_defaults, merge, is_selected
from ..common import (
build_config,
deduce_arch,
get_base_defaults,
is_selected,
merge,
)
from ..sizer import resolve_argv, scale_argv
from ..system import build_system_config


# fmt: off
Expand Down Expand Up @@ -78,13 +84,14 @@ def cli_matrix_run(args=None):

def resolve_args(conf, argv):
from ..pack import Package

pack = Package(conf)

args = []
for k, v in argv.items():
args.append(k)
args.append(v)

sized_args = scale_argv(pack, args)
final_args = resolve_argv(pack, sized_args)

Expand All @@ -94,7 +101,7 @@ def resolve_args(conf, argv):
argv[k] = final_args[i + 1]
i += 2
continue

print(f"Missing resolved argument {k}")

return argv
Expand Down
2 changes: 1 addition & 1 deletion milabench/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,7 +666,7 @@ def _argv(self, **_) -> List:

resolver = new_argument_resolver(self.pack)

cpu_per_process = resolver(str(self.pack.config['argv']['--cpus_per_gpu']))
cpu_per_process = resolver(str(self.pack.config["argv"]["--cpus_per_gpu"]))
return [
# -- Run the command in the right venv
# This could be inside the SSH Command
Expand Down
2 changes: 1 addition & 1 deletion milabench/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@
from milabench.alt_async import proceed
from milabench.utils import available_layers, blabla, multilogger

from .system import build_system_config
from .config import build_config
from .fs import XPath
from .log import TerminalFormatter
from .merge import merge
from .multi import MultiPackage
from .report import make_report
from .summary import aggregate, make_summary
from .system import build_system_config


def get_pack(defn):
Expand Down
3 changes: 0 additions & 3 deletions milabench/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,12 @@
from copy import deepcopy

import psutil
from copy import deepcopy
import yaml
from omegaconf import OmegaConf

from .fs import XPath
from .merge import merge


config_global = contextvars.ContextVar("config", default=None)


Expand Down Expand Up @@ -112,7 +110,6 @@ def build_matrix_bench(all_configs):

for name, bench_config in all_configs.items():
for k, v in expand_matrix(name, bench_config):

if k in expanded_config:
raise ValueError("Bench name is not unique")

Expand Down
15 changes: 8 additions & 7 deletions milabench/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,11 @@ def __call__(self, entry):
pass

elif event == "config":

def _show(k, entry):
if k in ("meta", "system"):
return

if isinstance(entry, dict):
for k2, v in entry.items():
_show(f"{k}.{k2}", v)
Expand Down Expand Up @@ -302,9 +303,9 @@ def on_data(self, entry, data, row):
load = int(data.get("load", 0) * 100)
currm, totalm = data.get("memory", [0, 0])
temp = int(data.get("temperature", 0))
row[f"gpu:{gpuid}"] = (
f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
)
row[
f"gpu:{gpuid}"
] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
row["gpu_load"] = f"{load}%"
row["gpu_mem"] = f"{currm:.0f}/{totalm:.0f} MB"
row["gpu_temp"] = f"{temp}C"
Expand Down Expand Up @@ -378,9 +379,9 @@ def on_data(self, entry, data, row):
load = int(data.get("load", 0) * 100)
currm, totalm = data.get("memory", [0, 0])
temp = int(data.get("temperature", 0))
row[f"gpu:{gpuid}"] = (
f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
)
row[
f"gpu:{gpuid}"
] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
else:
task = data.pop("task", "")
units = data.pop("units", "")
Expand Down
Loading

0 comments on commit 1fad24c

Please sign in to comment.