[Misc] Add logging for CUDA memory (vllm-project#10027)
Signed-off-by: Chenghao Yang <[email protected]>
Signed-off-by: youkaichao <[email protected]>
Co-authored-by: Chenghao Yang <[email protected]>
Co-authored-by: youkaichao <[email protected]>
3 people authored Nov 5, 2024
1 parent cd34029 commit 09d3550
Showing 1 changed file with 14 additions and 10 deletions.
24 changes: 14 additions & 10 deletions vllm/worker/model_runner.py
@@ -48,9 +48,10 @@
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 from vllm.transformers_utils.config import uses_mrope
-from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, async_tensor_h2d,
-                        flatten_2d_lists, is_pin_memory_available,
-                        supports_dynamo, weak_ref_tensor)
+from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache,
+                        async_tensor_h2d, flatten_2d_lists,
+                        is_pin_memory_available, supports_dynamo,
+                        weak_ref_tensor)
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase,
     _add_attn_metadata_broadcastable_dict,
@@ -1383,16 +1384,16 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
         per sequence in the batch.
         """
         assert not self.model_config.enforce_eager
-        logger.info("Capturing the model for CUDA graphs. This may lead to "
+        logger.info("Capturing cudagraphs for decoding. This may lead to "
                     "unexpected consequences if the model is not static. To "
                     "run the model in eager mode, set 'enforce_eager=True' or "
                     "use '--enforce-eager' in the CLI.")
-        logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. "
-                    "If you are running out of memory, consider decreasing "
-                    "`gpu_memory_utilization` or enforcing eager mode. "
-                    "You can also reduce the `max_num_seqs` as needed "
-                    "to decrease memory usage.")
+        logger.info("If out-of-memory error occurs during cudagraph capture,"
+                    " consider decreasing `gpu_memory_utilization` or "
+                    "switching to eager mode. You can also reduce the "
+                    "`max_num_seqs` as needed to decrease memory usage.")
         start_time = time.perf_counter()
+        start_free_gpu_memory = torch.cuda.mem_get_info()[0]
 
         # Prepare dummy inputs. These will be reused for all batch sizes.
         max_batch_size = self.max_batchsize_to_capture
@@ -1497,9 +1498,12 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                         graph_runner)
 
         end_time = time.perf_counter()
+        end_free_gpu_memory = torch.cuda.mem_get_info()[0]
         elapsed_time = end_time - start_time
+        cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
         # This usually takes < 10 seconds.
-        logger.info("Graph capturing finished in %.0f secs.", elapsed_time)
+        logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
+                    elapsed_time, cuda_graph_size / GiB_bytes)
 
     def _update_inputs_to_capture_for_enc_dec_model(self,
                                                     capture_inputs: Dict[str,
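For readers unfamiliar with the measurement pattern this commit uses: torch.cuda.mem_get_info() returns a (free_bytes, total_bytes) tuple for the current CUDA device, so snapshotting the free value before and after graph capture and taking the difference approximates the memory the captured CUDA graphs retain. Below is a minimal standalone sketch of that pattern, not the commit's code itself; the GiB_bytes constant and the allocate() workload are stand-ins for this example (in the commit, GiB_bytes comes from vllm.utils and the workload is CUDA graph capture).

    import time

    import torch

    GiB_bytes = 1 << 30  # assumption: stand-in for vllm.utils.GiB_bytes


    def log_cuda_memory_delta(fn) -> None:
        # torch.cuda.mem_get_info() -> (free_bytes, total_bytes); the
        # before/after difference in free memory approximates what `fn`
        # retained on the device (e.g. captured CUDA graphs).
        torch.cuda.synchronize()
        start_time = time.perf_counter()
        start_free_gpu_memory = torch.cuda.mem_get_info()[0]

        fn()

        torch.cuda.synchronize()
        end_free_gpu_memory = torch.cuda.mem_get_info()[0]
        elapsed_time = time.perf_counter() - start_time
        used_bytes = start_free_gpu_memory - end_free_gpu_memory
        print(f"finished in {elapsed_time:.0f} secs, "
              f"took {used_bytes / GiB_bytes:.2f} GiB")


    if torch.cuda.is_available():
        retained = []

        def allocate() -> None:
            # Hypothetical workload: hold on to ~1 GiB of fp32 data, much
            # as captured CUDA graphs hold on to their memory pool.
            retained.append(torch.empty(256 * 1024 * 1024, device="cuda"))

        log_cuda_memory_delta(allocate)

Note that this measures free device memory as seen by the driver, so other processes touching the same GPU during the timed region will skew the delta; for the single-process capture phase in the commit, that is an acceptable approximation.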
