Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add worker resolution #221

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,4 @@ test.out
output/
workspace/
.pin/tmp-*
dry/
19 changes: 11 additions & 8 deletions config/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ _torchvision:
--lr: 0.01
--no-stdout: true
--epochs: 50
--num-workers: 8
--num-workers: "auto({n_worker}, 8)"
--loader: pytorch
--data: "{milabench_data}/FakeImageNet"

Expand All @@ -37,7 +37,7 @@ _torchvision_ddp:
n: 1
argv:
--epochs: 10
--num-workers: 8
--num-workers: "auto({n_worker}, 8)"
--loader: pytorch
--data: "{milabench_data}/FakeImageNet"

Expand Down Expand Up @@ -82,15 +82,14 @@ llama:
argv:
--pretrained: true


_hf:
inherits: _defaults
definition: ../benchmarks/huggingface
group: hf
install_group: torch
argv:
--precision: 'tf32-fp16'
--num-workers: 8
--num-workers: "auto({n_worker}, 8)"

plan:
method: per_gpu
Expand All @@ -111,6 +110,7 @@ _timm:
--val-split: ''
--data-dir: "{milabench_data}"
--dataset: "FakeImageNet"
--workers: "auto({n_worker}, 8)"

_sb3:
inherits: _defaults
Expand Down Expand Up @@ -143,7 +143,7 @@ _accelerate_opt:
--dataset_rev: "b08601e"
--validation_split_percentage: 5
--per_gpu_batch_size: 1
--cpus_per_gpu: 8
--cpus_per_gpu: "auto({n_worker}, 8)"
# --model_name: "facebook/opt-2.7b"
# --model_name: "facebook/opt-1.3b"
# --model_name: "facebook/opt-350m"
Expand Down Expand Up @@ -203,7 +203,7 @@ resnet50:
argv:
--model: resnet50
--batch-size: 256
--num-workers: "{cpu_per_gpu}"
--num-workers: "auto({n_worker}, 8)"
--loader: pytorch

resnet50-noio:
Expand Down Expand Up @@ -231,7 +231,7 @@ resnet152-ddp:
argv:
--model: resnet152
--batch-size: 256
--num-workers: 8
--num-workers: "auto({n_worker}, 8)"
--loader: dali

efficientnet_b4:
Expand Down Expand Up @@ -507,6 +507,7 @@ stargan:
--model_save_dir: "{milabench_extra}/models"
--sample_dir: "{milabench_extra}/samples"
--result_dir: "{milabench_extra}/results"
--num_workers: "auto({n_worker}, 8)"

super-slomo:
inherits: _defaults
Expand All @@ -524,7 +525,7 @@ super-slomo:
--train_batch_size: 64
--dataset_root: "{milabench_data}/FakeImageNet"
--loader: pytorch
--num_workers: 8
--num_workers: "auto({n_worker}, 8)"

ppo:
inherits: _sb3
Expand Down Expand Up @@ -588,6 +589,7 @@ dlrm:
--test-mini-batch-size: 16384
--test-num-workers: 0
--use-gpu: true
--num-workers: "auto({n_worker}, 8)"

rwkv:
inherits: _defaults
Expand Down Expand Up @@ -625,6 +627,7 @@ rwkv:
--grad_cp: 0
--random_seed: 1234
--enable_progress_bar: "False"

brax:
inherits: _defaults
tags:
Expand Down
6 changes: 6 additions & 0 deletions config/scaling.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,12 +122,18 @@ focalnet:
optimized: 128
opt-1_3b:
arg: --per_gpu_batch_size
model:
1: 42126 MiB
optimized: 1
opt-1_3b-multinode:
arg: --per_gpu_batch_size
model:
1: 42126 MiB
optimized: 1
opt-6_7b-multinode:
arg: --per_gpu_batch_size
model:
1: 55380 MiB
optimized: 1
reformer:
arg: --batch-size
Expand Down
6 changes: 3 additions & 3 deletions milabench/_version.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""This file is generated, do not modify"""

__tag__ = "v0.0.10-147-gc6540c3e"
__commit__ = "c6540c3e470222e44b4a841954593185db49b111"
__date__ = "2024-06-12 07:11:39 -0400"
__tag__ = "v0.0.10-147-g1ef648ee"
__commit__ = "1ef648eeb78233e53274058cd9cfcdc539f01bae"
__date__ = "2024-06-12 09:39:51 -0400"
124 changes: 124 additions & 0 deletions milabench/cli/gather.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import argparse
import os
import re
from dataclasses import dataclass, field

import pandas as pd

from ..common import _read_reports
from ..report import make_dataframe, pandas_to_string
from ..summary import make_summary


def default_tags():
    """Return the default ``name=regex`` tag specifications.

    Every entry has the form ``<column name>=<regular expression>``; the
    regular expression carries a capture group holding the tag value.
    A new list is built on each call.
    """
    tag_patterns = (
        ("worker", "w([a-z0-9]*)"),
        ("multiple", "m([0-9]*)"),
        ("power", "p([0-9]*)"),
        ("capacity", "c([A-Za-z0-9]*(Go)?)"),
    )
    return [f"{name}={pattern}" for name, pattern in tag_patterns]


# fmt: off
@dataclass
class Arguments:
    """Options of the gather command."""

    # Folder whose sub-directories are the runs to gather.
    runs: str
    # ``name=regex`` specifications; defaults to the list from default_tags().
    tags: list = field(default_factory=default_tags)
# fmt: on


def arguments():
    """Parse the command-line options of the gather command.

    Returns
    -------
    argparse.Namespace
        ``runs`` is the run folder (str) and ``tags`` is a list of
        ``name=regex`` specifications.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--runs",
        type=str,
        help="Run folder",
        # NOTE(review): user-specific default, kept for backward compatibility.
        default="/home/mila/d/delaunap/batch_x_worker/",
    )
    parser.add_argument(
        "--tags",
        type=str,
        # Collect one or more specifications into a list.  Without ``nargs``
        # a value given on the command line is a single string, which the
        # caller would then iterate character by character.
        nargs="+",
        help="Tags defined in run names",
        default=default_tags(),
    )
    return parser.parse_args()


def get_config(reports):
    """Return the data of the first ``config`` event of the first report.

    Parameters
    ----------
    reports : mapping
        Maps a report key to an iterable of event dictionaries, each of
        which has an ``event`` key.

    Returns
    -------
    The ``data`` entry of the first event whose ``event`` equals
    ``"config"``, or ``None`` when ``reports`` is empty or the first report
    has no such event.
    """
    # An empty mapping has no first report to inspect.
    if not reports:
        return None

    # Only the first report (in iteration order) is looked at.
    first_report = reports[next(iter(reports))]
    return next(
        (line["data"] for line in first_report if line["event"] == "config"),
        None,
    )


def extract_tags(name, tags):
    """Yield one ``(tag, value)`` pair per entry of ``tags``.

    Parameters
    ----------
    name : str
        Text searched by every pattern.
    tags : dict
        Maps a tag name to a compiled regular expression with at least one
        capture group.

    Yields
    ------
    tuple
        The tag name and the first capture group of the match, or ``"NA"``
        (after printing a notice) when the pattern is not found in ``name``.
    """
    for label, pattern in tags.items():
        match = pattern.search(name)
        if match is None:
            print(f"{label} not found in {name}")
            yield label, "NA"
            continue
        yield label, match.group(1)


def gather_cli(args=None):
    """Gather metrics from runs inside a folder in a neat format.
    It can extract tags/flags from the runname and create new columns to uniquely identify runs.

    Parameters
    ----------
    args : optional
        Object with a ``runs`` attribute (folder containing one sub-folder
        per run) and a ``tags`` attribute (iterable of ``name=regex``
        strings).  When ``None`` the command line is parsed.

    Sub-folders whose name starts with ``prepare`` or ``install`` are
    ignored.  The gathered table is printed; nothing is returned.

    Examples
    --------

    >>> python -m milabench.cli.gather --runs /home/mila/d/delaunap/batch_x_worker/
    bench | fail | n | perf | sem% | std% | peak_memory | score | weight | elapsed | name | worker | multiple | power | capacity
    brax | 0 | 1 | 722480.33 | 0.7% | 5.2% | 6448 | 722480.33 | 1.00 | 94 | w16-m8-c4Go | 16 | 8 | NA | 4Go
    dlrm | 0 | 1 | 350641.30 | 0.6% | 4.6% | 7624 | 350641.30 | 1.00 | 124 | w16-m8-c4Go | 16 | 8 | NA | 4Go
    ....
    brax | 0 | 1 | 723867.42 | 0.6% | 4.5% | 6448 | 723867.42 | 1.00 | 94 | w2-m8-c8Go | 2 | 8 | NA | 8Go
    dlrm | 0 | 1 | 403113.36 | 0.7% | 5.1% | 7420 | 403113.36 | 1.00 | 258 | w2-m8-c8Go | 2 | 8 | NA | 8Go
    bf16 | 0 | 8 | 293.08 | 0.3% | 7.5% | 5688 | 2361.09 | 0.00 | 18 | w2-m8-c8Go | 2 | 8 | NA | 8Go
    fp16 | 0 | 8 | 290.58 | 0.2% | 4.9% | 5688 | 2335.63 | 0.00 | 29 | w2-m8-c8Go | 2 | 8 | NA | 8Go

    """
    if args is None:
        args = arguments()

    # Split on the first "=" only so the regex itself may contain "=".
    tags = {}
    for spec in args.tags:
        tag_name, regex = spec.split("=", 1)
        tags[tag_name] = re.compile(regex)

    # Sorted so the order of the output rows does not depend on the
    # arbitrary order returned by os.listdir.
    runs = []
    for folder in sorted(os.listdir(args.runs)):
        if folder.startswith(("prepare", "install")):
            continue

        path = os.path.join(args.runs, folder)
        if os.path.isdir(path):
            runs.append(path)

    # pd.concat raises ValueError on an empty list; report it instead.
    if not runs:
        print(f"No runs found in {args.runs}")
        return

    query = ("batch_size", "elapsed")
    data = []
    for run in runs:
        reports = _read_reports(run)
        summary = make_summary(reports.values(), query=query)
        df = make_dataframe(summary, None, None, query=query)

        name = os.path.basename(run)
        # The run name is the folder name up to its first ".".
        df["name"] = name.split(".", maxsplit=1)[0]
        for tag, value in extract_tags(name, tags):
            df[tag] = value

        data.append(df)

    gathered = pd.concat(data)
    print(pandas_to_string(gathered))


# Allow running the module directly: ``python -m milabench.cli.gather``.
if __name__ == "__main__":
    gather_cli()
38 changes: 34 additions & 4 deletions milabench/cli/matrix.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import sys
from dataclasses import dataclass

import yaml
from coleo import Option, tooled

from ..common import (
build_config,
build_system_config,
deduce_arch,
get_base_defaults,
is_selected,
merge,
)
from ..sizer import resolve_argv, scale_argv
from ..system import build_system_config


# fmt: off
Expand Down Expand Up @@ -79,7 +82,34 @@ def cli_matrix_run(args=None):

clean_config(config, args)

for k in config:
print(k)
def resolve_args(conf, argv):
from ..pack import Package

# yaml.dump(config, sys.stdout)
pack = Package(conf)

args = []
for k, v in argv.items():
args.append(k)
args.append(v)

sized_args = scale_argv(pack, args)
final_args = resolve_argv(pack, sized_args)

i = 0
for k, v in argv.items():
if final_args[i] == k:
argv[k] = final_args[i + 1]
i += 2
continue

print(f"Missing resolved argument {k}")

return argv

for _, conf in config.items():
conf["argv"] = resolve_args(conf, conf["argv"])

# for k in config:
# print(k)

yaml.dump(config, sys.stdout)
10 changes: 9 additions & 1 deletion milabench/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,14 @@ def _argv(self, **_) -> List:
else ["--multi_gpu"]
)

#
# Can this logic be removed?
#
from ..sizer import new_argument_resolver

resolver = new_argument_resolver(self.pack)

cpu_per_process = resolver(str(self.pack.config["argv"]["--cpus_per_gpu"]))
return [
# -- Run the command in the right venv
# This could be inside the SSH Command
Expand All @@ -676,7 +684,7 @@ def _argv(self, **_) -> List:
f"--num_machines={num_machines}",
*deepspeed_argv,
f"--gradient_accumulation_steps={self.pack.config['gradient_accumulation_steps']}",
f"--num_cpu_threads_per_process={self.pack.config['argv']['--cpus_per_gpu']}",
f"--num_cpu_threads_per_process={cpu_per_process}",
f"--main_process_ip={manager['ip']}",
f"--main_process_port={manager['port']}",
f"--num_processes={nproc}",
Expand Down
3 changes: 2 additions & 1 deletion milabench/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,14 @@
from milabench.alt_async import proceed
from milabench.utils import available_layers, blabla, multilogger

from .config import build_config, build_system_config
from .config import build_config
from .fs import XPath
from .log import TerminalFormatter
from .merge import merge
from .multi import MultiPackage
from .report import make_report
from .summary import aggregate, make_summary
from .system import build_system_config


def get_pack(defn):
Expand Down
Loading
Loading