diff --git a/.github/workflows/test_api_cuda.yaml b/.github/workflows/test_api_cuda.yaml
index 28d9b435..c64b7b11 100644
--- a/.github/workflows/test_api_cuda.yaml
+++ b/.github/workflows/test_api_cuda.yaml
@@ -22,7 +22,8 @@ jobs:
             { torch_cuda: cu121, torch_pre_release: 1, cuda_version: 12.1.1 },
           ]
 
-    runs-on: nvidia-gpu
+    runs-on: [multi-gpu, nvidia-gpu, 4-a10, ci]
+
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -37,18 +38,13 @@ jobs:
           --tag opt-bench-cuda:${{ matrix.image.cuda_version }}
           .
 
-      - name: Get GPUs with most free memory
-        id: get_devices
-        run: |
-          echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
-
       - name: Run tests
         run: docker run
           --rm
-          --pid host
+          --gpus all
           --shm-size 64G
           --env USE_CUDA="1"
-          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
+          --env PROCESS_SPECIFIC_VRAM="0"
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
           --entrypoint /bin/bash
diff --git a/.github/workflows/test_cli_cuda_onnxruntime.yaml b/.github/workflows/test_cli_cuda_onnxruntime.yaml
index adb31be3..df154deb 100644
--- a/.github/workflows/test_cli_cuda_onnxruntime.yaml
+++ b/.github/workflows/test_cli_cuda_onnxruntime.yaml
@@ -13,7 +13,8 @@ concurrency:
 
 jobs:
   build_image_and_run_cli_cuda_onnxruntime_tests:
-    runs-on: nvidia-gpu
+    runs-on: [single-gpu, nvidia-gpu, a10, ci]
+
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -28,20 +29,15 @@ jobs:
           --tag opt-bench-cuda:11.8.0
           .
 
-      - name: Get GPUs with most free memory
-        id: get_devices
-        run: |
-          echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
-
      - name: Run tests
         run: docker run
           --rm
-          --pid host
+          --gpus all
           --shm-size 64G
           --env USE_CUDA="1"
-          --entrypoint /bin/bash
-          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
+          --env PROCESS_SPECIFIC_VRAM="0"
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
+          --entrypoint /bin/bash
           opt-bench-cuda:11.8.0
           -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and cuda and onnxruntime' -x"
diff --git a/.github/workflows/test_cli_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml
index 204722db..43a541d0 100644
--- a/.github/workflows/test_cli_cuda_pytorch.yaml
+++ b/.github/workflows/test_cli_cuda_pytorch.yaml
@@ -22,7 +22,8 @@ jobs:
             { torch_cuda: cu121, torch_pre_release: 1, cuda_version: 12.1.1 },
           ]
 
-    runs-on: nvidia-gpu
+    runs-on: [multi-gpu, nvidia-gpu, 4-a10, ci]
+
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -37,18 +38,13 @@ jobs:
           --tag opt-bench-cuda:${{ matrix.image.cuda_version }}
           .
 
-      - name: Get GPUs with most free memory
-        id: get_devices
-        run: |
-          echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
-
       - name: Run tests
         run: docker run
           --rm
-          --pid host
+          --gpus all
           --shm-size 64G
           --env USE_CUDA="1"
-          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
+          --env PROCESS_SPECIFIC_VRAM="0"
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
           --entrypoint /bin/bash
diff --git a/.github/workflows/test_cli_cuda_torch_ort.yaml b/.github/workflows/test_cli_cuda_torch_ort.yaml
index 680f3f0f..4c939d29 100644
--- a/.github/workflows/test_cli_cuda_torch_ort.yaml
+++ b/.github/workflows/test_cli_cuda_torch_ort.yaml
@@ -13,7 +13,8 @@ concurrency:
 
 jobs:
   build_image_and_run_cli_cuda_torch_ort_tests:
-    runs-on: nvidia-gpu
+    runs-on: [multi-gpu, nvidia-gpu, 4-a10, ci]
+
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -28,20 +29,15 @@ jobs:
           --tag opt-bench-cuda:11.8.0
           .
 
-      - name: Get GPUs with most free memory
-        id: get_devices
-        run: |
-          echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
-
       - name: Run tests
         run: docker run
           --rm
-          --pid host
+          --gpus all
           --shm-size 64G
           --env USE_CUDA="1"
-          --entrypoint /bin/bash
-          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
+          --env PROCESS_SPECIFIC_VRAM="0"
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
+          --entrypoint /bin/bash
           opt-bench-cuda:11.8.0
           -c "pip install -e .[testing,torch-ort,peft] && python -m torch_ort.configure && pytest -k 'cli and cuda and torch_ort' -x"
diff --git a/.github/workflows/test_cli_tensorrt_llm.yaml b/.github/workflows/test_cli_tensorrt_llm.yaml
index 40438055..0a53d442 100644
--- a/.github/workflows/test_cli_tensorrt_llm.yaml
+++ b/.github/workflows/test_cli_tensorrt_llm.yaml
@@ -13,7 +13,8 @@ concurrency:
 
 jobs:
   pull_image_and_run_cli_tensorrt_llm_tests:
-    runs-on: nvidia-gpu
+    runs-on: [single-gpu, nvidia-gpu, a10, ci]
+
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -26,18 +27,14 @@ jobs:
           --tag opt-bench-tensorrt-llm:latest
           .
 
-      - name: Get GPUs with most free memory
-        id: get_devices
-        run: |
-          echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
-
       - name: Run tests
         run: docker run
           --rm
+          --gpus all
           --pid host
           --shm-size 64G
           --env USE_CUDA="1"
-          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
+          --env PROCESS_SPECIFIC_VRAM="0"
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
           --entrypoint /bin/bash
diff --git a/.github/workflows/test_cli_tensorrt_onnxruntime.yaml b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml
index 1d52ee33..80dcb486 100644
--- a/.github/workflows/test_cli_tensorrt_onnxruntime.yaml
+++ b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml
@@ -13,7 +13,8 @@ concurrency:
 
 jobs:
   build_image_and_run_cli_tensorrt_onnxruntime_tests:
-    runs-on: nvidia-gpu
+    runs-on: [single-gpu, nvidia-gpu, a10, ci]
+
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -26,18 +27,14 @@ jobs:
           --tag opt-bench-tensorrt:latest
           .
 
-      - name: Get GPUs with most free memory
-        id: get_devices
-        run: |
-          echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
-
       - name: Run tests
         run: docker run
           --rm
+          --gpus all
           --pid host
           --shm-size 64G
           --env USE_CUDA="1"
-          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
+          --env PROCESS_SPECIFIC_VRAM="0"
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
           --entrypoint /bin/bash
diff --git a/docker/cpu.dockerfile b/docker/cpu.dockerfile
index f15db72f..5bd00bd3 100644
--- a/docker/cpu.dockerfile
+++ b/docker/cpu.dockerfile
@@ -1,4 +1,6 @@
-FROM ubuntu:latest
+ARG UBUNTU_VERSION=20.04
+
+FROM ubuntu:${UBUNTU_VERSION}
 
 # Ignore interactive questions during `docker build`
 ENV DEBIAN_FRONTEND noninteractive
diff --git a/examples/pytorch_timm.yaml b/examples/pytorch_timm.yaml
index 4b2c5295..c5fa25b6 100644
--- a/examples/pytorch_timm.yaml
+++ b/examples/pytorch_timm.yaml
@@ -18,6 +18,7 @@ launcher:
   device_isolation: true
 
 benchmark:
+  memory: true
   input_shapes:
     batch_size: 1
 
diff --git a/optimum_benchmark/trackers/memory.py b/optimum_benchmark/trackers/memory.py
index 017c21fe..8f94be6d 100644
--- a/optimum_benchmark/trackers/memory.py
+++ b/optimum_benchmark/trackers/memory.py
@@ -29,6 +29,9 @@
 Memory_Unit_Literal = Literal["MB"]
 
 
+PROCESS_SPECIFIC_VRAM = os.environ.get("PROCESS_SPECIFIC_VRAM", "1") == "1"
+
+
 @dataclass
 class Memory:
     unit: Memory_Unit_Literal
@@ -47,11 +50,17 @@ def aggregate(memories: List["Memory"]) -> "Memory":
 
         unit = memories[0].unit
         max_ram = sum(memory.max_ram for memory in memories)
-        max_vram = sum(memory.max_vram for memory in memories) if memories[0].max_vram is not None else None
+
+        if PROCESS_SPECIFIC_VRAM:
+            max_vram = sum(memory.max_vram for memory in memories) if memories[0].max_vram is not None else None
+        else:
+            max_vram = max(memory.max_vram for memory in memories) if memories[0].max_vram is not None else None
+
         max_reserved = sum(memory.max_reserved for memory in memories) if memories[0].max_reserved is not None else None
         max_allocated = (
             sum(memory.max_allocated for memory in memories) if memories[0].max_allocated is not None else None
         )
+
         return Memory(
             unit=unit, max_ram=max_ram, max_vram=max_vram, max_reserved=max_reserved, max_allocated=max_allocated
         )
@@ -174,58 +183,82 @@ def get_max_memory(self):
         return Memory(unit=MEMORY_UNIT, max_ram=self.max_ram_memory)
 
 
-def monitor_cpu_ram_memory(process_id: int, connection: Connection, interval: float = 0.001):
+def monitor_cpu_ram_memory(monitored_pid: int, connection: Connection, interval: float = 0.001):
     stop = False
-    max_memory = 0
-    process = psutil.Process(process_id)
+    max_used_memory = 0
+    process = psutil.Process(monitored_pid)
     connection.send(0)
 
     while not stop:
         meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info"
-        current_used_memory = getattr(process, meminfo_attr)()[0]
-        max_memory = max(max_memory, current_used_memory)
+        used_memory = getattr(process, meminfo_attr)()[0]
+        max_used_memory = max(max_used_memory, used_memory)
         stop = connection.poll(interval)
 
-    connection.send(max_memory / 1e6)  # convert to MB
+    connection.send(max_used_memory / 1e6)  # convert to MB
     connection.close()
 
 
-def monitor_gpu_vram_memory(process_id: int, device_ids: List[int], connection: Connection, interval: float = 0.01):
+def monitor_gpu_vram_memory(monitored_pid: int, device_ids: List[int], connection: Connection, interval: float = 0.01):
     stop = False
-    max_memory = 0
+    max_used_memory = 0
+    monitored_process = psutil.Process(monitored_pid)
     connection.send(0)
 
+    if PROCESS_SPECIFIC_VRAM:
+        LOGGER.warning(
+            "Tracking process-specific VRAM usage. This will track the memory usage of the monitored process and its children only."
+        )
+    else:
+        LOGGER.warning(
+            "Tracking global-device VRAM usage. This will track the memory usage of monitored device(s). "
+            "Which may include memory used by other processes that are not relevant to the monitored process."
+        )
+
     if is_nvidia_system():
         if not is_pynvml_available():
             raise ValueError(
                 "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. "
                 "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`."
             )
+
         pynvml.nvmlInit()
         devices_handles = [pynvml.nvmlDeviceGetHandleByIndex(device_id) for device_id in device_ids]
 
-        while not stop:
-            current_used_memory = 0
-            for device_id, device_handle in zip(device_ids, devices_handles):
-                try:
-                    device_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle)
-                except Exception as e:
-                    LOGGER.warning(f"\t\t+ Could not get process list for device {device_id}: {e}")
-                    continue
-                for device_process in device_processes:
-                    if device_process.pid == process_id:
-                        current_used_memory += device_process.usedGpuMemory
-                    else:
-                        try:
-                            cpu_process = psutil.Process(device_process.pid)
-                        except Exception as e:
-                            LOGGER.warning(f"\t\t+ Could not get process info for process {device_process.pid}: {e}")
-                            continue
-                        if cpu_process.parent() is not None and cpu_process.parent().pid == process_id:
-                            current_used_memory += device_process.usedGpuMemory
+        if PROCESS_SPECIFIC_VRAM:
+            while not stop:
+                used_memory = 0
+                monitored_pids = [monitored_pid] + [child.pid for child in monitored_process.children(recursive=True)]
+
+                for device_id, device_handle in zip(device_ids, devices_handles):
+                    try:
+                        device_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle)
+                    except Exception as e:
+                        LOGGER.warning(f"Could not get process list for device {device_id}: {e}.")
+                        continue
+
+                    for device_process in device_processes:
+                        if device_process.pid in monitored_pids:
+                            used_memory += device_process.usedGpuMemory
 
-            max_memory = max(max_memory, current_used_memory)
-            stop = connection.poll(interval)
+                max_used_memory = max(max_used_memory, used_memory)
+                stop = connection.poll(interval)
+
+        else:
+            while not stop:
+                used_memory = 0
+
+                for device_id, device_handle in zip(device_ids, devices_handles):
+                    try:
+                        device_memory = pynvml.nvmlDeviceGetMemoryInfo(device_handle)
+                    except Exception as e:
+                        LOGGER.warning(f"Could not get memory info for device {device_id}: {e}")
+                        continue
+
+                    used_memory += device_memory.used
+
+                max_used_memory = max(max_used_memory, used_memory)
+                stop = connection.poll(interval)
 
         pynvml.nvmlShutdown()
@@ -241,73 +274,60 @@ def monitor_gpu_vram_memory(process_id: int, device_ids: List[int], connection:
         if rocm_version >= "5.7":
             devices_handles = amdsmi.amdsmi_get_processor_handles()
             while not stop:
-                current_used_memory = 0
+                used_memory = 0
+                monitored_pids = [monitored_pid] + [child.pid for child in monitored_process.children(recursive=True)]
+
                 for device_id in device_ids:
                     device_handle = devices_handles[device_id]
                     try:
                         processes_handles = amdsmi.amdsmi_get_gpu_process_list(device_handle)
                     except Exception as e:
-                        LOGGER.warning(f"\t\t+ Could not get process list for device {device_id}: {e}")
+                        LOGGER.warning(f"Could not get process list for device {device_id}: {e}")
                         continue
+
                     for process_handle in processes_handles:
                         try:
                             gpu_process_info = amdsmi.amdsmi_get_gpu_process_info(device_handle, process_handle)
                         except Exception as e:
-                            LOGGER.warning(f"\t\t+ Could not get process info for process {process_handle}: {e}")
+                            LOGGER.warning(f"Could not get process info for process {process_handle}: {e}")
                             continue
-                        # only memory usage of the monitored process and its children is tracked
-                        if gpu_process_info["pid"] == process_id:
-                            current_used_memory += gpu_process_info["memory_usage"]["vram_mem"]
-                        else:
-                            try:
-                                cpu_process_info = psutil.Process(gpu_process_info["pid"])
-                            except Exception as e:
-                                LOGGER.warning(
-                                    f"\t\t+ Could not get process info for process {gpu_process_info['pid']}: {e}"
-                                )
-                                continue
-                            if cpu_process_info.parent() is not None and cpu_process_info.ppid() == process_id:
-                                current_used_memory += gpu_process_info["memory_usage"]["vram_mem"]
-
-                max_memory = max(max_memory, current_used_memory)
+
+                        if gpu_process_info["pid"] in monitored_pids:
+                            used_memory += gpu_process_info["memory_usage"]["vram_mem"]
+
+                max_used_memory = max(max_used_memory, used_memory)
                 stop = connection.poll(interval)
+
         else:
             devices_handles = amdsmi.amdsmi_get_device_handles()
             while not stop:
-                current_used_memory = 0
+                used_memory = 0
+                monitored_pids = [monitored_pid] + [child.pid for child in monitored_process.children(recursive=True)]
+
                 for device_id in device_ids:
                     device_handle = devices_handles[device_id]
                     try:
                         processes_handles = amdsmi.amdsmi_get_process_list(device_handle)
                     except Exception as e:
-                        LOGGER.warning(f"\t\t+ Could not get process list for device {device_id}: {e}")
+                        LOGGER.warning(f"Could not get process list for device {device_id}: {e}")
                         continue
+
                     for process_handle in processes_handles:
                         try:
                             gpu_process_info = amdsmi.amdsmi_get_process_info(device_handle, process_handle)
                         except Exception as e:
-                            LOGGER.warning(f"\t\t+ Could not get process info for process {process_handle}: {e}")
+                            LOGGER.warning(f"Could not get process info for process {process_handle}: {e}")
                            continue
-                        # only memory usage of the monitored process and its children is tracked
-                        if gpu_process_info["pid"] == process_id:
-                            current_used_memory += gpu_process_info["memory_usage"]["vram_mem"]
-                        else:
-                            try:
-                                cpu_process_info = psutil.Process(gpu_process_info["pid"])
-                            except Exception as e:
-                                LOGGER.warning(
-                                    f"\t\t+ Could not get process info for process {gpu_process_info['pid']}: {e}"
-                                )
-                                continue
-                            if cpu_process_info.parent() is not None and cpu_process_info.ppid() == process_id:
-                                current_used_memory += gpu_process_info["memory_usage"]["vram_mem"]
-
-                max_memory = max(max_memory, current_used_memory)
+
+                        if gpu_process_info["pid"] in monitored_pids:
+                            used_memory += gpu_process_info["memory_usage"]["vram_mem"]
+
+                max_used_memory = max(max_used_memory, used_memory)
                 stop = connection.poll(interval)
 
         amdsmi.amdsmi_shut_down()
     else:
         raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for CUDA memory tracking.")
 
-    connection.send(max_memory / 1e6)  # convert to MB
+    connection.send(max_used_memory / 1e6)  # convert to MB
     connection.close()
diff --git a/tests/test_api.py b/tests/test_api.py
index da2fe35f..2516ef23 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -49,6 +49,8 @@
 BACKENDS = ["pytorch", "none"]
 DEVICES = ["cpu", "cuda"]
 
+CUDA_VISIBLE_DEVICES = ",".join([str(i) for i in range(torch.cuda.device_count())])
+
 
 @pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("backend", BACKENDS) @@ -113,10 +115,16 @@ def test_api_memory_tracker(device, backend): @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("launcher_config", LAUNCHER_CONFIGS) def test_api_launch(device, launcher_config): - benchmark_config = InferenceConfig(latency=True, memory=True) + device_ids = CUDA_VISIBLE_DEVICES if device == "cuda" else None + benchmark_config = InferenceConfig( + memory=True, + latency=True, + input_shapes={"batch_size": 4}, + ) + backend_config = PyTorchConfig( model="bert-base-uncased", - device_ids="0,1" if device == "cuda" else None, + device_ids=device_ids, no_weights=True, device=device, )