Fix available_kv_cache_memory: also subtract baseline memory (result.baseline_memory_in_bytes)

vllm-project · Dec 21, 2024 · 8e87434 · 8e87434
1 parent 51ff216
commit 8e87434
Showing 1 changed file with 2 additions and 1 deletion.
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
@@ -207,7 +207,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         memory_for_current_instance = total_gpu_memory * \
             self.cache_config.gpu_memory_utilization
         available_kv_cache_memory = (memory_for_current_instance -
-                                     result.non_kv_cache_memory_in_bytes)
+                                     result.non_kv_cache_memory_in_bytes -
+                                     result.baseline_memory_in_bytes)
 
         # Calculate the number of blocks that can be allocated with the
         # profiled peak memory.