From 1d5ddb985e2c423126be5e3b6c5bdc7e22d9662b Mon Sep 17 00:00:00 2001
From: Fabrice Normandin
Date: Tue, 7 Nov 2023 13:25:24 -0500
Subject: [PATCH] (temp) save changes in test_examples.py

Signed-off-by: Fabrice Normandin
---
 tests/test_examples.py | 116 ++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 73 insertions(+), 43 deletions(-)

diff --git a/tests/test_examples.py b/tests/test_examples.py
index d1a598be..c1381c11 100644
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -1,5 +1,6 @@
 """Tests that launch the examples as jobs on the Mila cluster and check that they work correctly."""
 from __future__ import annotations
 
+import functools
 import logging
 import os
@@ -10,7 +11,8 @@
 import time
 from logging import getLogger as get_logger
 from pathlib import Path
-from typing import Any
+import enum
+from typing import Any, NamedTuple
 
 import pytest
 import rich.console
@@ -33,22 +35,61 @@
 
 SCRATCH = Path(os.environ["SCRATCH"])
 
-gpu_types = [
-    "1g.10gb",  # MIG-ed A100 GPU
-    "2g.20gb",  # MIG-ed A100 GPU
-    "3g.40gb",  # MIG-ed A100 GPU
-    # "a100",
-    # "a100l",  # Note: needs a reservation.
-    # "a6000",
-    "rtx8000",
-    pytest.param(
-        "v100",
-        marks=[
-            pytest.mark.xfail(reason="Can take a while to schedule"),
-            pytest.mark.timeout(120),
-        ],
-    ),
-]
+class GpuModel(enum.Enum):
+    # a100_10gb = "1g.10gb"
+    a100_20gb = "2g.20gb"
+    a100_3g40gb = "3g.40gb"
+    a100_4g40gb = "4g.40gb"
+    a100 = "a100"
+    a100l = "a100l"
+    a6000 = "a6000"
+    rtx8000 = "rtx8000"
+    v100 = "v100"
+
+
+gpu_memory_gb = {
+    "1g.10gb": 10,
+    "2g.20gb": 20,
+    "3g.40gb": 40,
+    "a100": 40,
+    "a100l": 80,
+    "a6000": 48,
+    "rtx8000": 48,
+    "v100": 16,
+}
+
+
+class AvailTotal(NamedTuple):
+    avail: int
+    total: int
+
+
+@functools.cache
+def savail() -> dict[str, AvailTotal]:
+    """Gets the output of the `savail` command in a Python dictionary.
+
+    ```
+    GPU       Avail / Total
+    ===============================
+    1g.10gb      38 / 40
+    2g.20gb      59 / 60
+    3g.40gb      39 / 40
+    a100          0 / 16
+    a100l         3 / 56
+    a6000         1 / 8
+    rtx8000     156 / 384
+    v100         10 / 50
+    ```
+    """
+    savail_output = subprocess.check_output(["savail"]).decode("utf-8")
+    lines = [line.strip() for line in savail_output.splitlines()[2:]]
+    return {
+        gpu_type: AvailTotal(int(avail), int(total))
+        for gpu_type, avail, _, total in [line.split() for line in lines]
+    }
+
+
+gpu_types = [v.value for v in GpuModel]
 
 
 @pytest.fixture(scope="session", autouse=True)
@@ -156,34 +197,23 @@ def _test_id(arg: Path | bool | dict) -> str:
     return "-".join(f"{k}={v}" for k, v in arg.items())
 
 
+@pytest.fixture(params=gpu_types)
+def sbatch_gpu_override(request: pytest.FixtureRequest) -> dict[str, str]:
+    gpu_type: str = request.param
+    gpu_availability = savail()
+
+    assert gpu_type in gpu_availability, f"{gpu_type} doesn't show up in the savail output!"
+    avail, total = gpu_availability[gpu_type]
+    if avail == 0:
+        pytest.skip(reason="Isn't available on the cluster at the moment.")
+
+    return {"gres": f"gpu:{gpu_type}:1"}
+
+
 @pytest.mark.parametrize(
     ("example_dir", "make_reproducible", "sbatch_overrides"),
     [
-        pytest.param(
-            EXAMPLES_DIR / "frameworks" / "pytorch_setup",
-            False,
-            {"gres": f"gpu:{gpu_type}:1"},
-            marks=(
-                [
-                    pytest.mark.xfail(reason="Can take a while to schedule"),
-                    pytest.mark.timeout(120),
-                ]
-                if gpu_type == "v100"
-                else []
-            ),
-        )
-        for gpu_type in [
-            "1g.10gb",  # MIG-ed A100 GPU
-            "2g.20gb",  # MIG-ed A100 GPU
-            "3g.40gb",  # MIG-ed A100 GPU
-            # "a100",
-            # "a100l",  # Note: needs a reservation.
-            # "a6000",
-            "rtx8000",
-            "v100",
-        ]
-    ]
-    + [
+        (EXAMPLES_DIR / "frameworks" / "pytorch_setup", False, {}),
         (EXAMPLES_DIR / "distributed" / "single_gpu", True, {}),
         (EXAMPLES_DIR / "distributed" / "multi_gpu", True, {}),
         pytest.param(
@@ -198,7 +228,7 @@ def _test_id(arg: Path | bool | dict) -> str:
     ],
     ids=_test_id,
 )
-def test_pytorch_example(
+def test_pytorch_example_on_all_gpus(
     example_dir: Path,
     make_reproducible: bool,
    sbatch_overrides: dict[str, Any] | None,