diff --git a/config/slurm.yaml b/config/slurm.yaml
index fe6c7a631..6d7fa8a5b 100644
--- a/config/slurm.yaml
+++ b/config/slurm.yaml
@@ -14,6 +14,7 @@ multi-node-full:
   - --time=2:00:00
   - --ntasks-per-node=1
   - --mem=0
+  - --export=ALL,MILABENCH_SIZER_AUTO=0
 
 single-node-full:
   # DGX run: 1 node x 8 A100 80Go SXM4
@@ -27,27 +28,79 @@ single-node-full:
   - --time=1:30:00
   - --ntasks-per-node=1
   - --mem=0
+  - --export=ALL,MILABENCH_SIZER_AUTO=0
 
-multi-node-small:
-  # Any GPU, 2 nodes x 2 GPU
+#
+#
+#
+single-node-small:
+  # Any GPU, 1 node x 2 GPU
   - --partition=staff-idt
   - --ntasks=1
   - --gpus-per-task=2
   - --exclusive
-  - --nodes=2
+  - --nodes=1
   - --cpus-per-task=16
   - --time=1:30:00
   - --ntasks-per-node=1
   - --mem=128G
+  - --export=ALL,MILABENCH_SIZER_AUTO=1,MILABENCH_SIZER_MULTIPLE=8
 
-single-node-small:
-  # Any GPU, 1 node x 2 GPU
+
+multi-node-small:
+  # rtx8000, 2 nodes x 2 GPU
   - --partition=staff-idt
+  - --gpus-per-task=rtx8000:2
   - --ntasks=1
-  - --gpus-per-task=2
   - --exclusive
-  - --nodes=1
+  - --nodes=2
   - --cpus-per-task=16
   - --time=1:30:00
   - --ntasks-per-node=1
   - --mem=128G
+  - --export=ALL,MILABENCH_SIZER_AUTO=1,MILABENCH_SIZER_MULTIPLE=8
+
+#
+# RTX 8000 - 48Go
+#
+multi-node-rtx:
+  - --partition=staff-idt
+  - --gpus-per-task=rtx8000:8
+  - --ntasks=1
+  - --exclusive
+  - --nodes=2
+  - --cpus-per-task=64
+  - --time=1:30:00
+  - --ntasks-per-node=1
+  - --mem=0
+  - --export=ALL,MILABENCH_SIZER_AUTO=1,MILABENCH_SIZER_MULTIPLE=8
+
+#
+# V100 - 32 Go
+#
+multi-node-v100:
+  - --partition=staff-idt
+  - --gpus-per-task=v100:8
+  - --ntasks=1
+  - --exclusive
+  - --nodes=2
+  - --cpus-per-task=40
+  - --time=1:30:00
+  - --ntasks-per-node=1
+  - --mem=0
+  - --export=ALL,MILABENCH_SIZER_AUTO=1,MILABENCH_SIZER_MULTIPLE=8
+
+#
+# Small A100 - 40Go
+#
+multi-node-a100:
+  - --partition=staff-idt
+  - --gpus-per-task=a100:8
+  - --ntasks=1
+  - --exclusive
+  - --nodes=2
+  - --cpus-per-task=128
+  - --time=1:30:00
+  - --ntasks-per-node=1
+  - --mem=0
+  - --export=ALL,MILABENCH_SIZER_AUTO=1,MILABENCH_SIZER_MULTIPLE=8
diff --git a/milabench/system.py b/milabench/system.py
index a8fc6b878..8c3e99def 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -40,6 +40,27 @@ def _print():
 warn_no_config = print_once("No system config found, using defaults")
 
 
+_global_options = {}
+
+
+def _track_options(name, type, default, value):
+    """Record an option's type, default and resolved value so the command line can display them."""
+    try:
+        _global_options[name] = {
+            "type": type,
+            "default": default,
+            "value": value,
+        }
+    except TypeError:
+        # Best effort: an unhashable name must not break option resolution
+        pass
+
+
+def as_environment_variable(name):
+    """Return the environment variable for a dotted option name ("a.b" -> "MILABENCH_A_B")."""
+    return "MILABENCH_" + "_".join(map(str.upper, name.split(".")))
+
+
 def option(name, etype, default=None):
     options = dict()
     system = system_global.get()
@@ -49,7 +70,7 @@ def option(name, etype, default=None):
         warn_no_config()
 
     frags = name.split(".")
-    env_name = "MILABENCH_" + "_".join(map(str.upper, frags))
+    env_name = as_environment_variable(name)
     env_value = getenv(env_name, etype)
 
     lookup = options
@@ -59,6 +80,8 @@ def option(name, etype, default=None):
     system_value = lookup.get(frags[-1], None)
     final_value = env_value or system_value or default
 
+    _track_options(name, etype, default, final_value)
+
     if final_value is None:
         return None
     try:
@@ -169,8 +192,14 @@ class Nodes:
     user: str
 
 
+@dataclass
+class Github:
+    pat: str = option("github.path", str, None)  # NOTE(review): key is "github.path" but the field is `pat` - confirm
+
+
 @dataclass
 class SystemConfig:
+    """This is meant to be an exhaustive list of all the environment overrides"""
     arch: str = getenv("MILABENCH_GPU_ARCH", str)
     sshkey: str = None
     docker_image: str = None
@@ -178,6 +207,12 @@ class SystemConfig:
     gpu: GPUConfig = None
     options: Options = None
 
+    base: str = option("base", str, None)
+    config: str = option("config", str, None)
+    dash: bool = option("dash", bool, 1)
+    noterm: bool = option("noterm", bool, 0)
+    github: Github = None
+
 
 def check_node_config(nodes):
     mandatory_fields = ["name", "ip", "user"]
@@ -353,3 +388,56 @@ def build_system_config(config_file, defaults=None, gpu=True):
     system["self"] = self
 
     return config
+
+
+def show_overrides(to_json=False):
+    """Print every option recorded in `_global_options`.
+
+    Args:
+        to_json: when true, dump the nested option tree as JSON instead of
+            the indented human-readable listing.
+    """
+    import copy
+    import json
+
+    config = {}
+
+    for name, value in _global_options.items():
+        # A dotted name "a.b.c" is stored as config["a"]["b"]["c"]
+        *parents, leaf = name.split(".")
+
+        dct = config
+        for p in parents:
+            dct = dct.setdefault(p, dict())
+
+        val = copy.deepcopy(value)
+        # Keep the type by name so the record can be printed and serialized
+        val["type"] = getattr(val["type"], "__name__", str(val["type"]))
+        val["env_name"] = as_environment_variable(name)
+        dct[leaf] = val
+
+    def compact(d, depth):
+        idt = " " * depth
+
+        for k, v in d.items():
+            # Leaf records are the dicts that carry an "env_name" key
+            if "env_name" in v:
+                value = v["value"]
+                default = v["default"]
+                if value != default:
+                    print(f"{idt}{k:<{30 - len(idt)}}: {str(value):<40} (default={default})")
+                else:
+                    print(f"{idt}{k:<{30 - len(idt)}}: {str(value):<40} {v['env_name']}")
+            else:
+                print(f"{idt}{k}:")
+                compact(v, depth + 1)
+
+    if to_json:
+        # default=str keeps the dump from failing on values json cannot encode
+        print(json.dumps(config, indent=2, default=str))
+    else:
+        compact(config, 0)
+
+
+if __name__ == "__main__":
+    show_overrides()