
Commit 7ade74f

added global memory tracking for when host pid namespaces are not accessible
IlyasMoutawwakil committed Mar 12, 2024
1 parent a72524b commit 7ade74f
Showing 8 changed files with 58 additions and 31 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test_api_cuda.yaml
@@ -42,9 +42,9 @@ jobs:
run: docker run
--rm
--gpus all
--pid host
--shm-size 64G
--env USE_CUDA="1"
--env TRACK_GLOBAL_VRAM="1"
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--entrypoint /bin/bash
4 changes: 2 additions & 2 deletions .github/workflows/test_cli_cuda_onnxruntime.yaml
@@ -33,11 +33,11 @@ jobs:
run: docker run
--rm
--gpus all
--pid host
--shm-size 64G
--env USE_CUDA="1"
--entrypoint /bin/bash
--env TRACK_GLOBAL_VRAM="1"
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--entrypoint /bin/bash
opt-bench-cuda:11.8.0
-c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and cuda and onnxruntime' -x"
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_cuda_pytorch.yaml
@@ -42,9 +42,9 @@ jobs:
run: docker run
--rm
--gpus all
--pid host
--shm-size 64G
--env USE_CUDA="1"
--env TRACK_GLOBAL_VRAM="1"
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--entrypoint /bin/bash
4 changes: 2 additions & 2 deletions .github/workflows/test_cli_cuda_torch_ort.yaml
@@ -33,11 +33,11 @@ jobs:
run: docker run
--rm
--gpus all
--pid host
--shm-size 64G
--env USE_CUDA="1"
--entrypoint /bin/bash
--env TRACK_GLOBAL_VRAM="1"
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--entrypoint /bin/bash
opt-bench-cuda:11.8.0
-c "pip install -e .[testing,torch-ort,peft] && python -m torch_ort.configure && pytest -k 'cli and cuda and torch_ort' -x"
1 change: 1 addition & 0 deletions .github/workflows/test_cli_tensorrt_llm.yaml
@@ -34,6 +34,7 @@ jobs:
--pid host
--shm-size 64G
--env USE_CUDA="1"
--env TRACK_GLOBAL_VRAM="1"
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--entrypoint /bin/bash
1 change: 1 addition & 0 deletions .github/workflows/test_cli_tensorrt_onnxruntime.yaml
@@ -34,6 +34,7 @@ jobs:
--pid host
--shm-size 64G
--env USE_CUDA="1"
--env TRACK_GLOBAL_VRAM="1"
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--entrypoint /bin/bash
1 change: 1 addition & 0 deletions examples/pytorch_timm.yaml
@@ -18,6 +18,7 @@ launcher:
device_isolation: true

benchmark:
memory: true
input_shapes:
batch_size: 1

74 changes: 49 additions & 25 deletions optimum_benchmark/trackers/memory.py
@@ -202,27 +202,55 @@ def monitor_gpu_vram_memory(process_id: int, device_ids: List[int], connection:
"Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`."
)
pynvml.nvmlInit()

devices_handles = [pynvml.nvmlDeviceGetHandleByIndex(device_id) for device_id in device_ids]
track_global_vram = os.environ.get("TRACK_GLOBAL_VRAM", "0") == "1"

if track_global_vram:
LOGGER.warning(
"Tracking global VRAM usage. This will track the memory usage of all processes using the device(s)."
)
else:
LOGGER.info(
"Tracking process-specific VRAM usage. This will track the memory usage of the monitored process and its children."
)

while not stop:
current_used_memory = 0
for device_id, device_handle in zip(device_ids, devices_handles):
try:
device_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle)
except Exception as e:
LOGGER.warning(f"\t\t+ Could not get process list for device {device_id}: {e}")
continue
for device_process in device_processes:
if device_process.pid == process_id:
current_used_memory += device_process.usedGpuMemory
else:
try:
cpu_process = psutil.Process(device_process.pid)
except Exception as e:
LOGGER.warning(f"\t\t+ Could not get process info for process {device_process.pid}: {e}")
continue
if cpu_process.parent() is not None and cpu_process.parent().pid == process_id:

if track_global_vram:
for device_id, device_handle in zip(device_ids, devices_handles):
try:
device_memory = pynvml.nvmlDeviceGetMemoryInfo(device_handle)
except Exception as e:
LOGGER.warning(f"Could not get memory info for device {device_id}: {e}")
continue
current_used_memory += device_memory.used
else:
for device_id, device_handle in zip(device_ids, devices_handles):
try:
device_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle)
except Exception as e:
LOGGER.warning(
f"Could not get process list for device {device_id}: {e}. "
"Something went wrong with the GPU device."
)
continue
for device_process in device_processes:
if device_process.pid == process_id:
current_used_memory += device_process.usedGpuMemory
else:
try:
cpu_process = psutil.Process(device_process.pid)
except Exception as e:
LOGGER.warning(
f"Could not get process info for process {device_process.pid}: {e}. "
"Please make sure your code has the necessary PID Namespaces permissions. "
"Or enable global VRAM usage tracking by setting the `GLOBAL_VRAM_USAGE` environment variable to `1`."
)
continue
if cpu_process.parent() is not None and cpu_process.parent().pid == process_id:
current_used_memory += device_process.usedGpuMemory

max_memory = max(max_memory, current_used_memory)
stop = connection.poll(interval)
@@ -253,7 +281,7 @@ def monitor_gpu_vram_memory(process_id: int, device_ids: List[int], connection:
try:
gpu_process_info = amdsmi.amdsmi_get_gpu_process_info(device_handle, process_handle)
except Exception as e:
LOGGER.warning(f"\t\t+ Could not get process info for process {process_handle}: {e}")
LOGGER.warning(f"Could not get process info for process {process_handle}: {e}")
continue
# only memory usage of the monitored process and its children is tracked
if gpu_process_info["pid"] == process_id:
@@ -262,9 +290,7 @@ def monitor_gpu_vram_memory(process_id: int, device_ids: List[int], connection:
try:
cpu_process_info = psutil.Process(gpu_process_info["pid"])
except Exception as e:
LOGGER.warning(
f"\t\t+ Could not get process info for process {gpu_process_info['pid']}: {e}"
)
LOGGER.warning(f"Could not get process info for process {gpu_process_info['pid']}: {e}")
continue
if cpu_process_info.parent() is not None and cpu_process_info.ppid() == process_id:
current_used_memory += gpu_process_info["memory_usage"]["vram_mem"]
@@ -280,13 +306,13 @@ def monitor_gpu_vram_memory(process_id: int, device_ids: List[int], connection:
try:
processes_handles = amdsmi.amdsmi_get_process_list(device_handle)
except Exception as e:
LOGGER.warning(f"\t\t+ Could not get process list for device {device_id}: {e}")
LOGGER.warning(f"Could not get process list for device {device_id}: {e}")
continue
for process_handle in processes_handles:
try:
gpu_process_info = amdsmi.amdsmi_get_process_info(device_handle, process_handle)
except Exception as e:
LOGGER.warning(f"\t\t+ Could not get process info for process {process_handle}: {e}")
LOGGER.warning(f"Could not get process info for process {process_handle}: {e}")
continue
# only memory usage of the monitored process and its children is tracked
if gpu_process_info["pid"] == process_id:
@@ -295,9 +321,7 @@ def monitor_gpu_vram_memory(process_id: int, device_ids: List[int], connection:
try:
cpu_process_info = psutil.Process(gpu_process_info["pid"])
except Exception as e:
LOGGER.warning(
f"\t\t+ Could not get process info for process {gpu_process_info['pid']}: {e}"
)
LOGGER.warning(f"Could not get process info for process {gpu_process_info['pid']}: {e}")
continue
if cpu_process_info.parent() is not None and cpu_process_info.ppid() == process_id:
current_used_memory += gpu_process_info["memory_usage"]["vram_mem"]
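
Read outside the diff, the new NVML logic amounts to the following self-contained sketch. `monitor_vram_once` is an illustrative helper name, not a function in optimum-benchmark, and the sketch assumes `nvidia-ml-py` and `psutil` are installed:

# Minimal sketch of the two NVML tracking modes introduced above (illustrative,
# not part of optimum-benchmark).
import os
from typing import List

import psutil
import pynvml


def monitor_vram_once(process_id: int, device_ids: List[int]) -> int:
    """Return VRAM usage in bytes for one sampling pass."""
    pynvml.nvmlInit()
    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in device_ids]
    track_global_vram = os.environ.get("TRACK_GLOBAL_VRAM", "0") == "1"

    used = 0
    if track_global_vram:
        # Device-wide usage: works even when host PIDs are not visible.
        for handle in handles:
            used += pynvml.nvmlDeviceGetMemoryInfo(handle).used
    else:
        # Per-process usage: count only the monitored process and its children,
        # which requires resolving the host PIDs reported by NVML.
        for handle in handles:
            for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
                if proc.pid == process_id:
                    used += proc.usedGpuMemory
                else:
                    try:
                        parent = psutil.Process(proc.pid).parent()
                    except psutil.Error:
                        continue
                    if parent is not None and parent.pid == process_id:
                        used += proc.usedGpuMemory

    pynvml.nvmlShutdown()
    return used

Global tracking over-counts whenever other workloads share the GPU, which is why the commit logs a warning when `TRACK_GLOBAL_VRAM=1` and keeps per-process attribution as the default.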
