diff --git a/benchmarks/profiling/benchmark_latency.py b/benchmarks/profiling/benchmark_latency.py
index 6acca00e2f9d9..73366f22f8de8 100644
--- a/benchmarks/profiling/benchmark_latency.py
+++ b/benchmarks/profiling/benchmark_latency.py
@@ -1,5 +1,6 @@
 """Benchmark the latency of processing a single batch of requests."""
 import argparse
+import dataclasses
 import json
 import os
 import time
@@ -12,9 +13,8 @@
 from tqdm import tqdm
 
 from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
+from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser
 
 
@@ -30,14 +30,14 @@ def rpd_profiler_context():
         rpd.top_totals()
 
     @contextmanager
-    def torch_profiler_context(profile_dir: Optional[str] = None):
+    def torch_profiler_context(profile_result_dir: Optional[str] = None):
         p = torch.profiler.profile(
             activities=[
                 torch.profiler.ProfilerActivity.CPU,
                 torch.profiler.ProfilerActivity.CUDA,
             ],
             on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                str(profile_dir)))
+                str(profile_result_dir)))
         p.start()
         try:
             with torch.no_grad():
@@ -47,63 +47,38 @@ def torch_profiler_context(profile_dir: Optional[str] = None):
             print(p.key_averages().table(sort_by="self_cuda_time_total",
                                          row_limit=-1))
 
-    def get_profiling_context(profile_dir: Optional[str] = None):
+    def get_profiling_context(profile_result_dir: Optional[str] = None):
         if args.profile_torch:
-            return torch_profiler_context(profile_dir)
+            return torch_profiler_context(profile_result_dir)
         elif args.profile_rpd:
             return rpd_profiler_context()
         else:
             return nullcontext()
 
     if args.profile_torch or args.profile_rpd:
-        profile_dir = Path(args.profile_dir
-                           or "./vllm_benchmark_latency_result")
-        profile_dir.mkdir(parents=True, exist_ok=True)
+        profile_result_dir = Path(args.profile_result_dir
+                                  or "./vllm_benchmark_latency_result")
+        profile_result_dir.mkdir(parents=True, exist_ok=True)
         name = os.path.basename(os.path.normpath(args.model))
         model_trace_name = (
            f"{name}_in_{args.input_len}_out_{args.output_len}_"
            f"batch_{args.batch_size}_tp_{args.tensor_parallel_size}")
-        print(f"Profiling (results will be saved to '{profile_dir}')...")
+        print(
+            f"Profiling (results will be saved to '{profile_result_dir}')...")
         if args.profile_rpd:
-            profile_dir /= f"{model_trace_name}.rpd"
-            os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_dir)
+            profile_result_dir /= f"{model_trace_name}.rpd"
+            os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_result_dir)
+
+    engine_args = EngineArgs.from_cli_args(args)
 
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
-    llm = LLM(
-        model=args.model,
-        speculative_model=args.speculative_model,
-        num_speculative_tokens=args.num_speculative_tokens,
-        speculative_draft_tensor_parallel_size=\
-            args.speculative_draft_tensor_parallel_size,
-        tokenizer=args.tokenizer,
-        quantization=args.quantization,
-        tensor_parallel_size=args.tensor_parallel_size,
-        trust_remote_code=args.trust_remote_code,
-        dtype=args.dtype,
-        max_model_len=args.max_model_len,
-        enforce_eager=args.enforce_eager,
-        kv_cache_dtype=args.kv_cache_dtype,
-        quantization_param_path=args.quantization_param_path,
-        device=args.device,
-        ray_workers_use_nsight=args.ray_workers_use_nsight,
-        use_v2_block_manager=args.use_v2_block_manager,
-        enable_chunked_prefill=args.enable_chunked_prefill,
-        download_dir=args.download_dir,
-        block_size=args.block_size,
-        gpu_memory_utilization=args.gpu_memory_utilization,
-        load_format=args.load_format,
-        distributed_executor_backend=args.distributed_executor_backend,
-        otlp_traces_endpoint=args.otlp_traces_endpoint,
-        enable_prefix_caching=args.enable_prefix_caching,
-        num_scheduler_steps=args.num_scheduler_steps,
-    )
+    llm = LLM(**dataclasses.asdict(engine_args))
 
     sampling_params = SamplingParams(
         n=args.n,
-        temperature=0.0 if args.use_beam_search else 1.0,
+        temperature=1.0,
         top_p=1.0,
-        use_beam_search=args.use_beam_search,
         ignore_eos=True,
         max_tokens=args.output_len,
     )
@@ -115,9 +90,9 @@ def get_profiling_context(profile_dir: Optional[str] = None):
         "prompt_token_ids": batch
     } for batch in dummy_prompt_token_ids.tolist()]
 
-    def run_to_completion(profile_dir: Optional[str] = None):
-        if profile_dir:
-            with get_profiling_context(profile_dir):
+    def run_to_completion(profile_result_dir: Optional[str] = None):
+        if profile_result_dir:
+            with get_profiling_context(profile_result_dir):
                 llm.generate(dummy_inputs,
                              sampling_params=sampling_params,
                              use_tqdm=False)
@@ -132,16 +107,16 @@ def run_to_completion(profile_dir: Optional[str] = None):
 
     print("Warming up...")
     for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
-        run_to_completion(profile_dir=None)
+        run_to_completion(profile_result_dir=None)
 
     if args.profile_torch or args.profile_rpd:
-        run_to_completion(profile_dir=profile_dir)
+        run_to_completion(profile_result_dir=profile_result_dir)
         return
 
     # Benchmark.
     latencies = []
     for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
-        latencies.append(run_to_completion(profile_dir=None))
+        latencies.append(run_to_completion(profile_result_dir=None))
     latencies = np.array(latencies)
     percentages = [10, 25, 50, 75, 90, 99]
     percentiles = np.percentile(latencies, percentages)
@@ -164,19 +139,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
     parser = FlexibleArgumentParser(
         description='Benchmark the latency of processing a single batch of '
        'requests till completion.')
-    parser.add_argument('--model', type=str, default='facebook/opt-125m')
-    parser.add_argument('--speculative-model', type=str, default=None)
-    parser.add_argument('--num-speculative-tokens', type=int, default=None)
-    parser.add_argument('--speculative-draft-tensor-parallel-size',
-                        '-spec-draft-tp',
-                        type=int,
-                        default=None)
-    parser.add_argument('--tokenizer', type=str, default=None)
-    parser.add_argument('--quantization',
-                        '-q',
-                        choices=[*QUANTIZATION_METHODS, None],
-                        default=None)
-    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--input-len', type=int, default=32)
     parser.add_argument('--output-len', type=int, default=128)
     parser.add_argument('--batch-size', type=int, default=8)
@@ -193,52 +155,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
                         type=int,
                         default=30,
                         help='Number of iterations to run.')
-    parser.add_argument('--trust-remote-code',
-                        action='store_true',
-                        help='trust remote code from huggingface')
-    parser.add_argument(
-        '--max-model-len',
-        type=int,
-        default=None,
-        help='Maximum length of a sequence (including prompt and output). '
-        'If None, will be derived from the model.')
-    parser.add_argument(
-        '--dtype',
-        type=str,
-        default='auto',
-        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
-        help='data type for model weights and activations. '
-        'The "auto" option will use FP16 precision '
-        'for FP32 and FP16 models, and BF16 precision '
-        'for BF16 models.')
-    parser.add_argument('--enforce-eager',
-                        action='store_true',
-                        help='enforce eager mode and disable CUDA graph')
-    parser.add_argument(
-        '--kv-cache-dtype',
-        type=str,
-        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
-        default="auto",
-        help='Data type for kv cache storage. If "auto", will use model '
-        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
-        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
-    parser.add_argument(
-        '--quantization-param-path',
-        type=str,
-        default=None,
-        help='Path to the JSON file containing the KV cache scaling factors. '
-        'This should generally be supplied, when KV cache dtype is FP8. '
-        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
-        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
-        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
-        'instead supported for common inference criteria.')
-    parser.add_argument(
-        '--quantized-weights-path',
-        type=str,
-        default=None,
-        help='Path to the safetensor file containing the quantized weights '
-        'and scaling factors. This should generally be supplied, when '
-        'quantization is FP8.')
     parser.add_argument(
         '--profile-torch',
         action='store_true',
         help='profile the generation process of a single batch')
@@ -248,89 +164,17 @@ def run_to_completion(profile_dir: Optional[str] = None):
         action='store_true',
         help='profile the generation process of a single batch')
     parser.add_argument(
-        '--profile-dir',
+        '--profile-result-dir',
         type=str,
         default=os.getenv('VLLM_RPD_PROFILER_DIR', default=None),
         help=('path to save the profiler output. Can be visualized '
               'with ui.perfetto.dev or Tensorboard.'))
-    parser.add_argument("--device",
-                        type=str,
-                        default="auto",
-                        choices=DEVICE_OPTIONS,
-                        help='device type for vLLM execution')
-    parser.add_argument('--block-size',
-                        type=int,
-                        default=16,
-                        help='block size of key/value cache')
-    parser.add_argument(
-        '--enable-chunked-prefill',
-        action='store_true',
-        help='If True, the prefill requests can be chunked based on the '
-        'max_num_batched_tokens')
-    parser.add_argument("--enable-prefix-caching",
-                        action='store_true',
-                        help="Enable automatic prefix caching")
-    parser.add_argument('--use-v2-block-manager', action='store_true')
-    parser.add_argument(
-        "--ray-workers-use-nsight",
-        action='store_true',
-        help="If specified, use nsight to profile ray workers",
-    )
-    parser.add_argument('--download-dir',
-                        type=str,
-                        default=None,
-                        help='directory to download and load the weights, '
-                        'default to the default cache dir of huggingface')
     parser.add_argument(
         '--output-json',
         type=str,
         default=None,
         help='Path to save the latency results in JSON format.')
-    parser.add_argument('--gpu-memory-utilization',
-                        type=float,
-                        default=0.9,
-                        help='the fraction of GPU memory to be used for '
-                        'the model executor, which can range from 0 to 1.'
-                        'If unspecified, will use the default value of 0.9.')
-    parser.add_argument(
-        '--load-format',
-        type=str,
-        default=EngineArgs.load_format,
-        choices=[
-            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
-            'bitsandbytes'
-        ],
-        help='The format of the model weights to load.\n\n'
-        '* "auto" will try to load the weights in the safetensors format '
-        'and fall back to the pytorch bin format if safetensors format '
-        'is not available.\n'
-        '* "pt" will load the weights in the pytorch bin format.\n'
-        '* "safetensors" will load the weights in the safetensors format.\n'
-        '* "npcache" will load the weights in pytorch format and store '
-        'a numpy cache to speed up the loading.\n'
-        '* "dummy" will initialize the weights with random values, '
-        'which is mainly for profiling.\n'
-        '* "tensorizer" will load the weights using tensorizer from '
-        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
-        'section for more information.\n'
-        '* "bitsandbytes" will load the weights using bitsandbytes '
-        'quantization.\n')
-    parser.add_argument(
-        '--distributed-executor-backend',
-        choices=['ray', 'mp'],
-        default=None,
-        help='Backend to use for distributed serving. When more than 1 GPU '
-        'is used, will be automatically set to "ray" if installed '
-        'or "mp" (multiprocessing) otherwise.')
-    parser.add_argument(
-        '--otlp-traces-endpoint',
-        type=str,
-        default=None,
-        help='Target URL to which OpenTelemetry traces will be sent.')
-    parser.add_argument(
-        "--num-scheduler-steps",
-        type=int,
-        default=1,
-        help="Maximum number of forward steps per scheduler call.")
+
+    parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
     main(args)
diff --git a/benchmarks/profiling/benchmark_throughput.py b/benchmarks/profiling/benchmark_throughput.py
index 81d126f456028..3bbdd7d4267ae 100644
--- a/benchmarks/profiling/benchmark_throughput.py
+++ b/benchmarks/profiling/benchmark_throughput.py
@@ -1,5 +1,6 @@
 """Benchmark offline inference throughput."""
 import argparse
+import dataclasses
 import json
 import os
 import random
@@ -14,10 +15,9 @@
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
 
-from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args)
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators
@@ -69,30 +69,8 @@ def sample_requests(
 
 def run_vllm(
     requests: List[Tuple[str, int, int]],
-    model: str,
-    tokenizer: str,
-    quantization: Optional[str],
-    tensor_parallel_size: int,
-    seed: int,
     n: int,
-    use_beam_search: bool,
-    trust_remote_code: bool,
-    dtype: str,
-    max_model_len: Optional[int],
-    enforce_eager: bool,
-    kv_cache_dtype: str,
-    quantization_param_path: Optional[str],
-    device: str,
-    enable_prefix_caching: bool,
-    enable_chunked_prefill: bool,
-    max_num_batched_tokens: int,
-    distributed_executor_backend: Optional[str],
-    gpu_memory_utilization: float = 0.9,
-    num_scheduler_steps: int = 1,
-    use_v2_block_manager: bool = False,
-    download_dir: Optional[str] = None,
-    load_format: str = EngineArgs.load_format,
-    disable_async_output_proc: bool = False,
+    engine_args: EngineArgs,
 ) -> float:
     from vllm import LLM, SamplingParams
@@ -143,30 +121,7 @@ def get_profiling_context(profile_dir: Optional[str] = None):
             profile_dir /= f"{model_trace_name}.rpd"
             os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_dir)
 
-    llm = LLM(
-        model=model,
-        tokenizer=tokenizer,
-        quantization=quantization,
-        tensor_parallel_size=tensor_parallel_size,
-        seed=seed,
-        trust_remote_code=trust_remote_code,
-        dtype=dtype,
-        max_model_len=max_model_len,
-        gpu_memory_utilization=gpu_memory_utilization,
-        enforce_eager=enforce_eager,
-        kv_cache_dtype=kv_cache_dtype,
-        quantization_param_path=quantization_param_path,
-        device=device,
-        enable_prefix_caching=enable_prefix_caching,
-        download_dir=download_dir,
-        enable_chunked_prefill=enable_chunked_prefill,
-        max_num_batched_tokens=max_num_batched_tokens,
-        distributed_executor_backend=distributed_executor_backend,
-        load_format=load_format,
-        num_scheduler_steps=num_scheduler_steps,
-        use_v2_block_manager=use_v2_block_manager,
-        disable_async_output_proc=disable_async_output_proc,
-    )
+    llm = LLM(**dataclasses.asdict(engine_args))
 
     # Add the requests to the engine.
     prompts: List[str] = []
@@ -176,9 +131,8 @@ def get_profiling_context(profile_dir: Optional[str] = None):
         sampling_params.append(
             SamplingParams(
                 n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                 top_p=1.0,
-                use_beam_search=use_beam_search,
                 ignore_eos=True,
                 max_tokens=output_len,
             ))
@@ -196,59 +150,11 @@ def get_profiling_context(profile_dir: Optional[str] = None):
 
 async def run_vllm_async(
     requests: List[Tuple[str, int, int]],
-    model: str,
-    tokenizer: str,
-    quantization: Optional[str],
-    tensor_parallel_size: int,
-    seed: int,
     n: int,
-    use_beam_search: bool,
-    trust_remote_code: bool,
-    dtype: str,
-    max_model_len: Optional[int],
-    enforce_eager: bool,
-    kv_cache_dtype: str,
-    quantization_param_path: Optional[str],
-    device: str,
-    enable_prefix_caching: bool,
-    enable_chunked_prefill: bool,
-    max_num_batched_tokens: int,
-    distributed_executor_backend: Optional[str],
-    gpu_memory_utilization: float = 0.9,
-    num_scheduler_steps: int = 1,
-    use_v2_block_manager: bool = False,
-    download_dir: Optional[str] = None,
-    load_format: str = EngineArgs.load_format,
-    disable_async_output_proc: bool = False,
+    engine_args: AsyncEngineArgs,
     disable_frontend_multiprocessing: bool = False,
 ) -> float:
     from vllm import SamplingParams
-    engine_args = AsyncEngineArgs(
-        model=model,
-        tokenizer=tokenizer,
-        quantization=quantization,
-        tensor_parallel_size=tensor_parallel_size,
-        seed=seed,
-        trust_remote_code=trust_remote_code,
-        dtype=dtype,
-        max_model_len=max_model_len,
-        gpu_memory_utilization=gpu_memory_utilization,
-        enforce_eager=enforce_eager,
-        kv_cache_dtype=kv_cache_dtype,
-        quantization_param_path=quantization_param_path,
-        device=device,
-        enable_prefix_caching=enable_prefix_caching,
-        download_dir=download_dir,
-        enable_chunked_prefill=enable_chunked_prefill,
-        max_num_batched_tokens=max_num_batched_tokens,
-        distributed_executor_backend=distributed_executor_backend,
-        load_format=load_format,
-        num_scheduler_steps=num_scheduler_steps,
-        use_v2_block_manager=use_v2_block_manager,
-        disable_async_output_proc=disable_async_output_proc,
-        worker_use_ray=False,
-        disable_log_requests=True,
-    )
 
     async with build_async_engine_client_from_engine_args(
             engine_args, disable_frontend_multiprocessing) as llm:
@@ -261,9 +167,8 @@ async def run_vllm_async(
         sampling_params.append(
             SamplingParams(
                 n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                 top_p=1.0,
-                use_beam_search=use_beam_search,
                 ignore_eos=True,
                 max_tokens=output_len,
             ))
@@ -375,24 +280,17 @@ def main(args: argparse.Namespace):
                                    args.output_len)
 
     if args.backend == "vllm":
-        run_args = [
-            requests, args.model, args.tokenizer, args.quantization,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-            args.trust_remote_code, args.dtype, args.max_model_len,
-            args.enforce_eager, args.kv_cache_dtype,
-            args.quantization_param_path, args.device,
-            args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.distributed_executor_backend,
-            args.gpu_memory_utilization, args.num_scheduler_steps,
-            args.use_v2_block_manager, args.download_dir, args.load_format,
-            args.disable_async_output_proc
-        ]
-        if args.async_engine:
-            run_args.append(args.disable_frontend_multiprocessing)
-        elapsed_time = uvloop.run(run_vllm_async(*run_args))
+        elapsed_time = uvloop.run(
+            run_vllm_async(
+                requests,
+                args.n,
+                AsyncEngineArgs.from_cli_args(args),
+                args.disable_frontend_multiprocessing,
+            ))
     else:
-        elapsed_time = run_vllm(*run_args)
+        elapsed_time = run_vllm(requests, args.n,
+                                EngineArgs.from_cli_args(args))
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -445,142 +343,23 @@ def main(args: argparse.Namespace):
                         default=None,
                         help="Output length for each request. Overrides the "
                         "output length from the dataset.")
-    parser.add_argument("--model", type=str, default="facebook/opt-125m")
-    parser.add_argument("--tokenizer", type=str, default=None)
-    parser.add_argument('--quantization',
-                        '-q',
-                        choices=[*QUANTIZATION_METHODS, None],
-                        default=None)
-    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
     parser.add_argument("--n",
                         type=int,
                         default=1,
                         help="Number of generated sequences per prompt.")
-    parser.add_argument("--use-beam-search", action="store_true")
     parser.add_argument("--num-prompts",
                         type=int,
                         default=1000,
                         help="Number of prompts to process.")
-    parser.add_argument("--seed", type=int, default=0)
     parser.add_argument("--hf-max-batch-size",
                         type=int,
                         default=None,
                         help="Maximum batch size for HF backend.")
-    parser.add_argument('--trust-remote-code',
-                        action='store_true',
-                        help='trust remote code from huggingface')
-    parser.add_argument(
-        '--max-model-len',
-        type=int,
-        default=None,
-        help='Maximum length of a sequence (including prompt and output). '
-        'If None, will be derived from the model.')
-    parser.add_argument(
-        '--dtype',
-        type=str,
-        default='auto',
-        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
-        help='data type for model weights and activations. '
-        'The "auto" option will use FP16 precision '
-        'for FP32 and FP16 models, and BF16 precision '
-        'for BF16 models.')
-    parser.add_argument('--gpu-memory-utilization',
-                        type=float,
-                        default=0.9,
-                        help='the fraction of GPU memory to be used for '
-                        'the model executor, which can range from 0 to 1.'
-                        'If unspecified, will use the default value of 0.9.')
-    parser.add_argument("--enforce-eager",
-                        action="store_true",
-                        help="enforce eager execution")
-    parser.add_argument(
-        '--kv-cache-dtype',
-        type=str,
-        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
-        default="auto",
-        help='Data type for kv cache storage. If "auto", will use model '
-        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
-        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
-    parser.add_argument(
-        '--quantization-param-path',
-        type=str,
-        default=None,
-        help='Path to the JSON file containing the KV cache scaling factors. '
-        'This should generally be supplied, when KV cache dtype is FP8. '
-        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
-        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
-        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
-        'instead supported for common inference criteria.')
-    parser.add_argument("--device",
-                        type=str,
-                        default="auto",
-                        choices=DEVICE_OPTIONS,
-                        help='device type for vLLM execution')
-    parser.add_argument(
-        "--num-scheduler-steps",
-        type=int,
-        default=1,
-        help="Maximum number of forward steps per scheduler call.")
-    parser.add_argument("--use-v2-block-manager",
-                        action='store_true',
-                        help="Enable block manager v2.")
-    parser.add_argument(
-        "--enable-prefix-caching",
-        action='store_true',
-        help="Enable automatic prefix caching for vLLM backend.")
-    parser.add_argument("--enable-chunked-prefill",
-                        action='store_true',
-                        help="enable chunked prefill for vLLM backend.")
-    parser.add_argument('--max-num-batched-tokens',
-                        type=int,
-                        default=None,
-                        help='maximum number of batched tokens per '
-                        'iteration')
-    parser.add_argument('--download-dir',
-                        type=str,
-                        default=None,
-                        help='directory to download and load the weights, '
-                        'default to the default cache dir of huggingface')
     parser.add_argument(
         '--output-json',
         type=str,
         default=None,
         help='Path to save the throughput results in JSON format.')
-    parser.add_argument(
-        '--distributed-executor-backend',
-        choices=['ray', 'mp'],
-        default=None,
-        help='Backend to use for distributed serving. When more than 1 GPU '
-        'is used, will be automatically set to "ray" if installed '
-        'or "mp" (multiprocessing) otherwise.')
-    parser.add_argument(
-        '--load-format',
-        type=str,
-        default=EngineArgs.load_format,
-        choices=[
-            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
-            'bitsandbytes'
-        ],
-        help='The format of the model weights to load.\n\n'
-        '* "auto" will try to load the weights in the safetensors format '
-        'and fall back to the pytorch bin format if safetensors format '
-        'is not available.\n'
-        '* "pt" will load the weights in the pytorch bin format.\n'
-        '* "safetensors" will load the weights in the safetensors format.\n'
-        '* "npcache" will load the weights in pytorch format and store '
-        'a numpy cache to speed up the loading.\n'
-        '* "dummy" will initialize the weights with random values, '
-        'which is mainly for profiling.\n'
-        '* "tensorizer" will load the weights using tensorizer from '
-        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
-        'section for more information.\n'
-        '* "bitsandbytes" will load the weights using bitsandbytes '
-        'quantization.\n')
-    parser.add_argument(
-        "--disable-async-output-proc",
-        action='store_true',
-        default=False,
-        help="Disable async output processor for vLLM backend.")
     parser.add_argument("--async-engine",
                         action='store_true',
                         default=False,
@@ -603,6 +382,7 @@ def main(args: argparse.Namespace):
         default=None,
         help=('path to save the profiler output. Can be visualized '
               'with ui.perfetto.dev or Tensorboard.'))
+    parser = AsyncEngineArgs.add_cli_args(parser)
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
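
Note: with this change, both profiling benchmark scripts take every engine option
from EngineArgs/AsyncEngineArgs (via add_cli_args/from_cli_args) instead of
re-declaring it with argparse, and the synchronous path builds the LLM with
LLM(**dataclasses.asdict(engine_args)). A minimal sketch of that pattern is below;
the example model name, prompt, and the benchmark-only --output-len flag are
illustrative, not part of this diff.

    # Sketch of the EngineArgs-driven CLI pattern introduced by this diff.
    import dataclasses

    from vllm import LLM, SamplingParams
    from vllm.engine.arg_utils import EngineArgs
    from vllm.utils import FlexibleArgumentParser

    parser = FlexibleArgumentParser(description="EngineArgs-driven benchmark")
    parser.add_argument("--output-len", type=int, default=128)  # benchmark-only flag
    parser = EngineArgs.add_cli_args(parser)  # adds --model, -tp, --dtype, ...
    args = parser.parse_args(["--model", "facebook/opt-125m"])

    engine_args = EngineArgs.from_cli_args(args)   # dataclass of engine options
    llm = LLM(**dataclasses.asdict(engine_args))   # same construction as the diff
    outputs = llm.generate(
        ["Hello, my name is"],
        SamplingParams(temperature=1.0, max_tokens=args.output_len))
    print(outputs[0].outputs[0].text)

The async throughput path follows the same idea: AsyncEngineArgs.from_cli_args(args)
is passed to build_async_engine_client_from_engine_args() instead of constructing
AsyncEngineArgs by hand.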