diff --git a/.github/workflows/test_api_cuda.yaml b/.github/workflows/test_api_cuda.yaml
index 28d9b435..c64b7b11 100644
--- a/.github/workflows/test_api_cuda.yaml
+++ b/.github/workflows/test_api_cuda.yaml
@@ -22,7 +22,8 @@ jobs:
             { torch_cuda: cu121, torch_pre_release: 1, cuda_version: 12.1.1 },
           ]
 
-    runs-on: nvidia-gpu
+    runs-on: [multi-gpu, nvidia-gpu, 4-a10, ci]
+
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -37,18 +38,13 @@ jobs:
           --tag opt-bench-cuda:${{ matrix.image.cuda_version }}
           .
 
-      - name: Get GPUs with most free memory
-        id: get_devices
-        run: |
-          echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
-
       - name: Run tests
         run: docker run
           --rm
-          --pid host
+          --gpus all
           --shm-size 64G
           --env USE_CUDA="1"
-          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
+          --env PROCESS_SPECIFIC_VRAM="0"
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
           --entrypoint /bin/bash
diff --git a/.github/workflows/test_cli_cuda_onnxruntime.yaml b/.github/workflows/test_cli_cuda_onnxruntime.yaml
index adb31be3..df154deb 100644
--- a/.github/workflows/test_cli_cuda_onnxruntime.yaml
+++ b/.github/workflows/test_cli_cuda_onnxruntime.yaml
@@ -13,7 +13,8 @@ concurrency:
 
 jobs:
   build_image_and_run_cli_cuda_onnxruntime_tests:
-    runs-on: nvidia-gpu
+    runs-on: [single-gpu, nvidia-gpu, a10, ci]
+
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -28,20 +29,15 @@ jobs:
           --tag opt-bench-cuda:11.8.0
           .
 
-      - name: Get GPUs with most free memory
-        id: get_devices
-        run: |
-          echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
-
      - name: Run tests
         run: docker run
           --rm
-          --pid host
+          --gpus all
           --shm-size 64G
           --env USE_CUDA="1"
-          --entrypoint /bin/bash
-          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
+          --env PROCESS_SPECIFIC_VRAM="0"
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
+          --entrypoint /bin/bash
           opt-bench-cuda:11.8.0
           -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and cuda and onnxruntime' -x"
diff --git a/.github/workflows/test_cli_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml
index 204722db..43a541d0 100644
--- a/.github/workflows/test_cli_cuda_pytorch.yaml
+++ b/.github/workflows/test_cli_cuda_pytorch.yaml
@@ -22,7 +22,8 @@ jobs:
             { torch_cuda: cu121, torch_pre_release: 1, cuda_version: 12.1.1 },
           ]
 
-    runs-on: nvidia-gpu
+    runs-on: [multi-gpu, nvidia-gpu, 4-a10, ci]
+
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -37,18 +38,13 @@ jobs:
           --tag opt-bench-cuda:${{ matrix.image.cuda_version }}
           .
 
-      - name: Get GPUs with most free memory
-        id: get_devices
-        run: |
-          echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
-
       - name: Run tests
         run: docker run
           --rm
-          --pid host
+          --gpus all
           --shm-size 64G
           --env USE_CUDA="1"
-          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
+          --env PROCESS_SPECIFIC_VRAM="0"
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
           --entrypoint /bin/bash
diff --git a/.github/workflows/test_cli_cuda_torch_ort.yaml b/.github/workflows/test_cli_cuda_torch_ort.yaml
index 680f3f0f..4c939d29 100644
--- a/.github/workflows/test_cli_cuda_torch_ort.yaml
+++ b/.github/workflows/test_cli_cuda_torch_ort.yaml
@@ -13,7 +13,8 @@ concurrency:
 
 jobs:
   build_image_and_run_cli_cuda_torch_ort_tests:
-    runs-on: nvidia-gpu
+    runs-on: [multi-gpu, nvidia-gpu, 4-a10, ci]
+
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -28,20 +29,15 @@ jobs:
           --tag opt-bench-cuda:11.8.0
           .
 
-      - name: Get GPUs with most free memory
-        id: get_devices
-        run: |
-          echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
-
       - name: Run tests
         run: docker run
           --rm
-          --pid host
+          --gpus all
           --shm-size 64G
           --env USE_CUDA="1"
-          --entrypoint /bin/bash
-          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
+          --env PROCESS_SPECIFIC_VRAM="0"
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
+          --entrypoint /bin/bash
           opt-bench-cuda:11.8.0
           -c "pip install -e .[testing,torch-ort,peft] && python -m torch_ort.configure && pytest -k 'cli and cuda and torch_ort' -x"
diff --git a/.github/workflows/test_cli_tensorrt_llm.yaml b/.github/workflows/test_cli_tensorrt_llm.yaml
index 40438055..0a53d442 100644
--- a/.github/workflows/test_cli_tensorrt_llm.yaml
+++ b/.github/workflows/test_cli_tensorrt_llm.yaml
@@ -13,7 +13,8 @@ concurrency:
 
 jobs:
   pull_image_and_run_cli_tensorrt_llm_tests:
-    runs-on: nvidia-gpu
+    runs-on: [single-gpu, nvidia-gpu, a10, ci]
+
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -26,18 +27,14 @@ jobs:
           --tag opt-bench-tensorrt-llm:latest
           .
 
-      - name: Get GPUs with most free memory
-        id: get_devices
-        run: |
-          echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
-
       - name: Run tests
         run: docker run
           --rm
+          --gpus all
           --pid host
           --shm-size 64G
           --env USE_CUDA="1"
-          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
+          --env PROCESS_SPECIFIC_VRAM="0"
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
           --entrypoint /bin/bash
diff --git a/.github/workflows/test_cli_tensorrt_onnxruntime.yaml b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml
index 1d52ee33..80dcb486 100644
--- a/.github/workflows/test_cli_tensorrt_onnxruntime.yaml
+++ b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml
@@ -13,7 +13,8 @@ concurrency:
 
 jobs:
   build_image_and_run_cli_tensorrt_onnxruntime_tests:
-    runs-on: nvidia-gpu
+    runs-on: [single-gpu, nvidia-gpu, a10, ci]
+
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -26,18 +27,14 @@ jobs:
           --tag opt-bench-tensorrt:latest
           .
 
-      - name: Get GPUs with most free memory
-        id: get_devices
-        run: |
-          echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
-
       - name: Run tests
         run: docker run
           --rm
+          --gpus all
           --pid host
           --shm-size 64G
           --env USE_CUDA="1"
-          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
+          --env PROCESS_SPECIFIC_VRAM="0"
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
           --entrypoint /bin/bash
diff --git a/docker/cpu.dockerfile b/docker/cpu.dockerfile
index f15db72f..5bd00bd3 100644
--- a/docker/cpu.dockerfile
+++ b/docker/cpu.dockerfile
@@ -1,4 +1,6 @@
-FROM ubuntu:latest
+ARG UBUNTU_VERSION=20.04
+
+FROM ubuntu:${UBUNTU_VERSION}
 
 # Ignore interactive questions during `docker build`
 ENV DEBIAN_FRONTEND noninteractive
diff --git a/examples/pytorch_timm.yaml b/examples/pytorch_timm.yaml
index 4b2c5295..c5fa25b6 100644
--- a/examples/pytorch_timm.yaml
+++ b/examples/pytorch_timm.yaml
@@ -18,6 +18,7 @@ launcher:
   device_isolation: true
 
 benchmark:
+  memory: true
   input_shapes:
     batch_size: 1
 
diff --git a/optimum_benchmark/trackers/memory.py b/optimum_benchmark/trackers/memory.py
index 017c21fe..8f94be6d 100644
--- a/optimum_benchmark/trackers/memory.py
+++ b/optimum_benchmark/trackers/memory.py
@@ -29,6 +29,9 @@
 Memory_Unit_Literal = Literal["MB"]
 
 
+PROCESS_SPECIFIC_VRAM = os.environ.get("PROCESS_SPECIFIC_VRAM", "1") == "1"
+
+
 @dataclass
 class Memory:
     unit: Memory_Unit_Literal
@@ -47,11 +50,17 @@ def aggregate(memories: List["Memory"]) -> "Memory":
 
         unit = memories[0].unit
         max_ram = sum(memory.max_ram for memory in memories)
-        max_vram = sum(memory.max_vram for memory in memories) if memories[0].max_vram is not None else None
+
+        if PROCESS_SPECIFIC_VRAM:
+            max_vram = sum(memory.max_vram for memory in memories) if memories[0].max_vram is not None else None
+        else:
+            max_vram = max(memory.max_vram for memory in memories) if memories[0].max_vram is not None else None
+
         max_reserved = sum(memory.max_reserved for memory in memories) if memories[0].max_reserved is not None else None
         max_allocated = (
             sum(memory.max_allocated for memory in memories) if memories[0].max_allocated is not None else None
         )
+
         return Memory(
             unit=unit, max_ram=max_ram, max_vram=max_vram, max_reserved=max_reserved, max_allocated=max_allocated
         )
@@ -174,58 +183,82 @@ def get_max_memory(self):
         return Memory(unit=MEMORY_UNIT, max_ram=self.max_ram_memory)
 
 
-def monitor_cpu_ram_memory(process_id: int, connection: Connection, interval: float = 0.001):
+def monitor_cpu_ram_memory(monitored_pid: int, connection: Connection, interval: float = 0.001):
     stop = False
-    max_memory = 0
-    process = psutil.Process(process_id)
+    max_used_memory = 0
+    process = psutil.Process(monitored_pid)
     connection.send(0)
 
     while not stop:
         meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info"
-        current_used_memory = getattr(process, meminfo_attr)()[0]
-        max_memory = max(max_memory, current_used_memory)
+        used_memory = getattr(process, meminfo_attr)()[0]
+        max_used_memory = max(max_used_memory, used_memory)
         stop = connection.poll(interval)
 
-    connection.send(max_memory / 1e6)  # convert to MB
+    connection.send(max_used_memory / 1e6)  # convert to MB
     connection.close()
 
 
-def monitor_gpu_vram_memory(process_id: int, device_ids: List[int], connection: Connection, interval: float = 0.01):
+def monitor_gpu_vram_memory(monitored_pid: int, device_ids: List[int], connection: Connection, interval: float = 0.01):
     stop = False
-    max_memory = 0
+    max_used_memory = 0
+    monitored_process = psutil.Process(monitored_pid)
     connection.send(0)
 
+    if PROCESS_SPECIFIC_VRAM:
+        LOGGER.warning(
+            "Tracking process-specific VRAM usage. This will track the memory usage of the monitored process and its children only."
+        )
+    else:
+        LOGGER.warning(
+            "Tracking global-device VRAM usage. This will track the memory usage of monitored device(s). "
+            "Which may include memory used by other processes that are not relevant to the monitored process."
+        )
+
     if is_nvidia_system():
         if not is_pynvml_available():
             raise ValueError(
                 "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. "
                 "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`."
             )
+
         pynvml.nvmlInit()
         devices_handles = [pynvml.nvmlDeviceGetHandleByIndex(device_id) for device_id in device_ids]
 
-        while not stop:
-            current_used_memory = 0
-            for device_id, device_handle in zip(device_ids, devices_handles):
-                try:
-                    device_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle)
-                except Exception as e:
-                    LOGGER.warning(f"\t\t+ Could not get process list for device {device_id}: {e}")
-                    continue
-                for device_process in device_processes:
-                    if device_process.pid == process_id:
-                        current_used_memory += device_process.usedGpuMemory
-                    else:
-                        try:
-                            cpu_process = psutil.Process(device_process.pid)
-                        except Exception as e:
-                            LOGGER.warning(f"\t\t+ Could not get process info for process {device_process.pid}: {e}")
-                            continue
-                        if cpu_process.parent() is not None and cpu_process.parent().pid == process_id:
-                            current_used_memory += device_process.usedGpuMemory
+        if PROCESS_SPECIFIC_VRAM:
+            while not stop:
+                used_memory = 0
+                monitored_pids = [monitored_pid] + [child.pid for child in monitored_process.children(recursive=True)]
+
+                for device_id, device_handle in zip(device_ids, devices_handles):
+                    try:
+                        device_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle)
+                    except Exception as e:
+                        LOGGER.warning(f"Could not get process list for device {device_id}: {e}.")
+                        continue
+
+                    for device_process in device_processes:
+                        if device_process.pid in monitored_pids:
+                            used_memory += device_process.usedGpuMemory
 
-            max_memory = max(max_memory, current_used_memory)
-            stop = connection.poll(interval)
+                max_used_memory = max(max_used_memory, used_memory)
+                stop = connection.poll(interval)
+
+        else:
+            while not stop:
+                used_memory = 0
+
+                for device_id, device_handle in zip(device_ids, devices_handles):
+                    try:
+                        device_memory = pynvml.nvmlDeviceGetMemoryInfo(device_handle)
+                    except Exception as e:
+                        LOGGER.warning(f"Could not get memory info for device {device_id}: {e}")
+                        continue
+
+                    used_memory += device_memory.used
+
+                max_used_memory = max(max_used_memory, used_memory)
+                stop = connection.poll(interval)
 
         pynvml.nvmlShutdown()
@@ -241,73 +274,60 @@ def monitor_gpu_vram_memory(process_id: int, device_ids: List[int], connection:
         if rocm_version >= "5.7":
             devices_handles = amdsmi.amdsmi_get_processor_handles()
             while not stop:
-                current_used_memory = 0
+                used_memory = 0
+                monitored_pids = [monitored_pid] + [child.pid for child in monitored_process.children(recursive=True)]
+
                 for device_id in device_ids:
                     device_handle = devices_handles[device_id]
                     try:
                         processes_handles = amdsmi.amdsmi_get_gpu_process_list(device_handle)
                     except Exception as e:
-                        LOGGER.warning(f"\t\t+ Could not get process list for device {device_id}: {e}")
+                        LOGGER.warning(f"Could not get process list for device {device_id}: {e}")
                         continue
+
                     for process_handle in processes_handles:
                         try:
                             gpu_process_info = amdsmi.amdsmi_get_gpu_process_info(device_handle, process_handle)
                         except Exception as e:
-                            LOGGER.warning(f"\t\t+ Could not get process info for process {process_handle}: {e}")
+                            LOGGER.warning(f"Could not get process info for process {process_handle}: {e}")
                             continue
-                        # only memory usage of the monitored process and its children is tracked
-                        if gpu_process_info["pid"] == process_id:
-                            current_used_memory += gpu_process_info["memory_usage"]["vram_mem"]
-                        else:
-                            try:
-                                cpu_process_info = psutil.Process(gpu_process_info["pid"])
-                            except Exception as e:
-                                LOGGER.warning(
-                                    f"\t\t+ Could not get process info for process {gpu_process_info['pid']}: {e}"
-                                )
-                                continue
-                            if cpu_process_info.parent() is not None and cpu_process_info.ppid() == process_id:
-                                current_used_memory += gpu_process_info["memory_usage"]["vram_mem"]
-
-                max_memory = max(max_memory, current_used_memory)
+
+                        if gpu_process_info["pid"] in monitored_pids:
+                            used_memory += gpu_process_info["memory_usage"]["vram_mem"]
+
+                max_used_memory = max(max_used_memory, used_memory)
                 stop = connection.poll(interval)
+
         else:
             devices_handles = amdsmi.amdsmi_get_device_handles()
             while not stop:
-                current_used_memory = 0
+                used_memory = 0
+                monitored_pids = [monitored_pid] + [child.pid for child in monitored_process.children(recursive=True)]
+
                 for device_id in device_ids:
                     device_handle = devices_handles[device_id]
                     try:
                         processes_handles = amdsmi.amdsmi_get_process_list(device_handle)
                     except Exception as e:
-                        LOGGER.warning(f"\t\t+ Could not get process list for device {device_id}: {e}")
+                        LOGGER.warning(f"Could not get process list for device {device_id}: {e}")
                         continue
+
                     for process_handle in processes_handles:
                         try:
                             gpu_process_info = amdsmi.amdsmi_get_process_info(device_handle, process_handle)
                         except Exception as e:
-                            LOGGER.warning(f"\t\t+ Could not get process info for process {process_handle}: {e}")
+                            LOGGER.warning(f"Could not get process info for process {process_handle}: {e}")
                            continue
-                        # only memory usage of the monitored process and its children is tracked
-                        if gpu_process_info["pid"] == process_id:
-                            current_used_memory += gpu_process_info["memory_usage"]["vram_mem"]
-                        else:
-                            try:
-                                cpu_process_info = psutil.Process(gpu_process_info["pid"])
-                            except Exception as e:
-                                LOGGER.warning(
-                                    f"\t\t+ Could not get process info for process {gpu_process_info['pid']}: {e}"
-                                )
-                                continue
-                            if cpu_process_info.parent() is not None and cpu_process_info.ppid() == process_id:
-                                current_used_memory += gpu_process_info["memory_usage"]["vram_mem"]
-
-                max_memory = max(max_memory, current_used_memory)
+
+                        if gpu_process_info["pid"] in monitored_pids:
+                            used_memory += gpu_process_info["memory_usage"]["vram_mem"]
+
+                max_used_memory = max(max_used_memory, used_memory)
                 stop = connection.poll(interval)
 
         amdsmi.amdsmi_shut_down()
     else:
         raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for CUDA memory tracking.")
 
-    connection.send(max_memory / 1e6)  # convert to MB
+    connection.send(max_used_memory / 1e6)  # convert to MB
     connection.close()
diff --git a/tests/test_api.py b/tests/test_api.py
index da2fe35f..2516ef23 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -49,6 +49,8 @@
 BACKENDS = ["pytorch", "none"]
 DEVICES = ["cpu", "cuda"]
 
+CUDA_VISIBLE_DEVICES = ",".join([str(i) for i in range(torch.cuda.device_count())])
+
 
 @pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("backend", BACKENDS) @@ -113,10 +115,16 @@ def test_api_memory_tracker(device, backend): @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("launcher_config", LAUNCHER_CONFIGS) def test_api_launch(device, launcher_config): - benchmark_config = InferenceConfig(latency=True, memory=True) + device_ids = CUDA_VISIBLE_DEVICES if device == "cuda" else None + benchmark_config = InferenceConfig( + memory=True, + latency=True, + input_shapes={"batch_size": 4}, + ) + backend_config = PyTorchConfig( model="bert-base-uncased", - device_ids="0,1" if device == "cuda" else None, + device_ids=device_ids, no_weights=True, device=device, )