Do a mock run where everything except run is executed

mila-iqia · Jul 8, 2024 · 3ce28f7 · 3ce28f7
1 parent dd7f388
commit 3ce28f7
Show file tree

Hide file tree

Showing 5 changed files with 135 additions and 8 deletions.
diff --git a/benchmate/benchmate/datagen.py b/benchmate/benchmate/datagen.py
@@ -7,6 +7,7 @@
 from collections import defaultdict
 from pathlib import Path
 
+import torchcompat.core as acc
 import torch
 from tqdm import tqdm
 
@@ -79,26 +80,39 @@ def generate_sets(root, sets, shape):
         json.dump(sets, fp)
 
 
+def device_count():
+    """Return the number of accelerator devices reported by ``torchcompat``.
+
+    Returns:
+        The value of ``acc.device_count()``, or ``1`` when that query fails
+        for any reason.
+    """
+    try:
+        return acc.device_count()
+    except Exception:
+        # Best-effort fallback. ``Exception`` (rather than a bare ``except``)
+        # lets KeyboardInterrupt and SystemExit propagate normally.
+        return 1
+
 def generate_fakeimagenet():
+    """Generate the FakeImageNet train/val/test sets under ``$MILABENCH_DIR_DATA``.
+
+    Sizes come from the command line: the train set holds
+    ``batch_size * batch_count * device_count`` images, and the val/test sets
+    hold the ``--val`` / ``--test`` fraction of that total. Unknown command
+    line arguments are ignored (``parse_known_args``).
+    """
+    # config = json.loads(os.environ["MILABENCH_CONFIG"])
+
     parser = argparse.ArgumentParser()
     parser.add_argument("--batch-size", default=512, type=int)
     parser.add_argument("--batch-count", default=60, type=int)
+    # Defaults to the number of visible devices so each device gets its own share
+    # of the generated images.
+    parser.add_argument("--device-count", default=device_count(), type=int)
     parser.add_argument("--image-size", default=[3, 384, 384], type=int, nargs="+")
+    # NOTE(review): with nargs="+", a --val/--test given on the command line is
+    # parsed into a list, which the int(total_images * ...) below would not
+    # accept -- confirm a single float is what is intended here.
     parser.add_argument("--val", default=0.1, type=float, nargs="+")
     parser.add_argument("--test", default=0.1, type=float, nargs="+")
+
     args, _ = parser.parse_known_args()
 
     data_directory = os.environ["MILABENCH_DIR_DATA"]
-    dest = os.path.join(data_directory, "FakeImageNet")
+
+    dest = os.path.join(data_directory, f"FakeImageNet")
     print(f"Generating fake data into {dest}...")
 
-    total_images = args.batch_size * args.batch_count
+    total_images = args.batch_size * args.batch_count * args.device_count
+    # Number of images to generate for each split.
     size_spec = {
-        "train": total_images,
-        "val": int(total_images * args.val),
-        "test": int(total_images * args.test),
+        f"train": total_images,
+        f"val": int(total_images * args.val),
+        f"test": int(total_images * args.test),
     }
 
+
+
     generate_sets(dest, size_spec, args.image_size)
     print("Done!")
 

diff --git a/benchmate/benchmate/dataset.py b/benchmate/benchmate/dataset.py
@@ -1,5 +1,9 @@
 import os
 from collections import defaultdict
+import math
+
+import torch
+from torch.utils.data.distributed import DistributedSampler
 
 
 def no_transform(args):
@@ -48,3 +52,16 @@ def __getitem__(self, item):
 
     def __len__(self):
+        """Return the length of ``self.clip``."""
         return len(self.clip)
+
+
+class ExclusiveSetSampler(DistributedSampler):
+    """Sampler that iterates over one of ``num_sets`` subsets of a dataset.
+
+    Thin wrapper over ``DistributedSampler``: ``num_sets`` is forwarded as
+    ``num_replicas`` and ``set_id`` as ``rank``; ``shuffle``, ``seed`` and
+    ``drop_last`` are passed through unchanged.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        num_sets: int,
+        set_id: int,
+        shuffle: bool = True,
+        seed: int = 0,
+        drop_last: bool = False,
+    ) -> None:
+        # Translate the "set" vocabulary into the replica/rank one of the base class.
+        sampler_options = dict(
+            num_replicas=num_sets,
+            rank=set_id,
+            shuffle=shuffle,
+            seed=seed,
+            drop_last=drop_last,
+        )
+        super().__init__(dataset, **sampler_options)
+
diff --git a/milabench/_version.py b/milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""
 
-__tag__ = "v0.1.0-20-gf2cc75f8"
-__commit__ = "f2cc75f8a4728dcac223c91153119b43fec05698"
-__date__ = "2024-07-03 12:37:48 -0400"
+__tag__ = "v0.1.0-24-gdd7f3888"
+__commit__ = "dd7f3888ac0524b3b587e415d1de0e2019cd751f"
+__date__ = "2024-07-03 16:07:47 -0400"
diff --git a/milabench/utils.py b/milabench/utils.py
@@ -242,3 +242,55 @@ def enumerate_rank(nodes):
         else:
             yield rank, node
             rank += 1
+
+
+def get_available_ram(leeway=1024):
+    """Return an estimate of the RAM (in bytes) that can still be allocated.
+
+    When ``SLURM_JOB_ID`` is set, the estimate is ``memory.max - memory.current``
+    read from the job's cgroup. When it is not set, or when the cgroup files
+    cannot be read or parsed, the estimate falls back to
+    ``psutil.virtual_memory().available``.
+
+    Args:
+        leeway: amount subtracted from the estimate as a safety margin.
+
+    Returns:
+        int: the estimate minus ``leeway``. Always a number: previously the
+        function returned ``None`` when ``SLURM_JOB_ID`` was not set.
+    """
+    import os
+    import psutil
+
+    # Note: if slurm does not give us access to the entire RAM we won't be able
+    # to do much
+    if jobid := os.getenv("SLURM_JOB_ID", None):
+        cgroup = f"/sys/fs/cgroup/system.slice/slurmstepd.scope/job_{jobid}"
+        try:
+            with open(cgroup + "/memory.max", "r") as fp:
+                mem_max = int(fp.read())
+            with open(cgroup + "/memory.current", "r") as fp:
+                mem_cur = int(fp.read())
+            return mem_max - mem_cur - leeway
+        except (OSError, ValueError):
+            # Missing/unreadable cgroup files, or a non-numeric limit:
+            # use the system-wide figure below instead.
+            pass
+
+    return psutil.virtual_memory().available - leeway
+
+
+def fill_ram():
+    """Allocate an int8 buffer as large as the available RAM, write to it, free it.
+
+    NOTE: this takes too long to be a viable option.
+    """
+    import numpy as np
+
+    nbytes = get_available_ram()
+    buffer = np.zeros((nbytes,), dtype=np.int8)
+    # Write to the buffer: unless it is used it won't be allocated.
+    buffer.fill(1)
+    del buffer
+
+
+def empty_cache():
+    """Empty the RAM cache before a bench.
+
+    Runs ``sync``, then ``sysctl vm.drop_caches=3`` both directly and through
+    ``sudo``. Best effort: any exception raised while launching a command is
+    ignored and the remaining commands are skipped.
+    """
+    import subprocess
+
+    # make users able to manage cgroup
+    # $SUDO chmod 777 -R /sys/fs/cgroup/*
+
+    # make sysctl executable by user, for cache cleaning
+    # $SUDO chmod u+s /sbin/sysctl
+
+    drop_caches = ["sysctl", "vm.drop_caches=3"]
+    commands = (
+        ["sync"],  # finish on the fly writes
+        drop_caches,  # drop the cache
+        ["sudo", *drop_caches],  # same, through sudo
+    )
+
+    try:
+        for command in commands:
+            subprocess.run(command)
+    except Exception:
+        pass
diff --git a/tests/test_mock.py b/tests/test_mock.py
@@ -0,0 +1,44 @@
+
+import milabench.commands.executors
+
+import traceback
+from pytest import fixture
+
+
+@fixture
+def args(standard_config, tmp_path):
+    """CLI arguments shared by every command: ``--base`` and ``--config``."""
+    base_dir = str(tmp_path)
+    config_file = str(standard_config)
+    return ["--base", base_dir, "--config", config_file]
+
+
+async def mock_exec(command, phase="run", timeout=False, timeout_delay=600, **kwargs):
+    """Stand-in for command execution: ignore every argument and return ``[0]``."""
+    results = [0]
+    return results
+
+
+def run_cli(*args):
+    """Call the milabench CLI ``main`` with ``args``, echoing the command line first.
+
+    A ``SystemExit`` raised by ``main`` is accepted only when its exit code is
+    falsy; otherwise the assertion fails.
+    """
+    from milabench.cli import main
+
+    command_line = " ".join(args)
+    print(command_line)
+
+    try:
+        main(args)
+    except SystemExit as exit_request:
+        assert not exit_request.code
+
+
+def test_milabench(monkeypatch, args):
+    """Run ``install``, ``prepare`` and ``run`` through the CLI with ``execute_command`` stubbed."""
+    monkeypatch.setenv("MILABENCH_GPU_ARCH", "cuda")
+    # Swap the command executor for a stub that returns [0] without running anything.
+    # NOTE(review): this file imports `milabench.commands.executors` but patches the
+    # attribute on `milabench.commands` -- confirm that is the name the CLI looks up.
+    monkeypatch.setattr(milabench.commands, "execute_command", mock_exec)
+
+    run_cli("install", *args)
+
+    run_cli("prepare", *args)
+
+    #
+    # use Mock GPU-SMI
+    #
+    monkeypatch.setenv("MILABENCH_GPU_ARCH", "mock")
+    # Imported here rather than at module level, after the environment has been set.
+    from milabench.cli.dry import assume_gpu
+    with assume_gpu(8):
+        run_cli("run", *args, "--no-report")