Skip to content

Commit

Permalink
Merge branch 'main' into rpd_build_arg
Browse files (browse the repository at this point in the history)
  • Loading branch information
gshtras authored Nov 8, 2024
2 parents 635f574 + 72ffb94 commit 024b047
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 419 deletions.
208 changes: 26 additions & 182 deletions benchmarks/profiling/benchmark_latency.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Benchmark the latency of processing a single batch of requests."""
import argparse
import dataclasses
import json
import os
import time
@@ -12,9 +13,8 @@
from tqdm import tqdm

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser


@@ -30,14 +30,14 @@ def rpd_profiler_context():
rpd.top_totals()

@contextmanager
def torch_profiler_context(profile_dir: Optional[str] = None):
def torch_profiler_context(profile_result_dir: Optional[str] = None):
p = torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir)))
str(profile_result_dir)))
p.start()
try:
with torch.no_grad():
@@ -47,63 +47,38 @@ def torch_profiler_context(profile_dir: Optional[str] = None):
print(p.key_averages().table(sort_by="self_cuda_time_total",
row_limit=-1))

def get_profiling_context(profile_dir: Optional[str] = None):
def get_profiling_context(profile_result_dir: Optional[str] = None):
if args.profile_torch:
return torch_profiler_context(profile_dir)
return torch_profiler_context(profile_result_dir)
elif args.profile_rpd:
return rpd_profiler_context()
else:
return nullcontext()

if args.profile_torch or args.profile_rpd:
profile_dir = Path(args.profile_dir
or "./vllm_benchmark_latency_result")
profile_dir.mkdir(parents=True, exist_ok=True)
profile_result_dir = Path(args.profile_result_dir
or "./vllm_benchmark_latency_result")
profile_result_dir.mkdir(parents=True, exist_ok=True)
name = os.path.basename(os.path.normpath(args.model))
model_trace_name = (
f"{name}_in_{args.input_len}_out_{args.output_len}_"
f"batch_{args.batch_size}_tp_{args.tensor_parallel_size}")
print(f"Profiling (results will be saved to '{profile_dir}')...")
print(
f"Profiling (results will be saved to '{profile_result_dir}')...")
if args.profile_rpd:
profile_dir /= f"{model_trace_name}.rpd"
os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_dir)
profile_result_dir /= f"{model_trace_name}.rpd"
os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_result_dir)

engine_args = EngineArgs.from_cli_args(args)

# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(
model=args.model,
speculative_model=args.speculative_model,
num_speculative_tokens=args.num_speculative_tokens,
speculative_draft_tensor_parallel_size=\
args.speculative_draft_tensor_parallel_size,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
max_model_len=args.max_model_len,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization,
load_format=args.load_format,
distributed_executor_backend=args.distributed_executor_backend,
otlp_traces_endpoint=args.otlp_traces_endpoint,
enable_prefix_caching=args.enable_prefix_caching,
num_scheduler_steps=args.num_scheduler_steps,
)
llm = LLM(**dataclasses.asdict(engine_args))

sampling_params = SamplingParams(
n=args.n,
temperature=0.0 if args.use_beam_search else 1.0,
temperature=1.0,
top_p=1.0,
use_beam_search=args.use_beam_search,
ignore_eos=True,
max_tokens=args.output_len,
)
@@ -115,9 +90,9 @@ def get_profiling_context(profile_dir: Optional[str] = None):
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]

def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
with get_profiling_context(profile_dir):
def run_to_completion(profile_result_dir: Optional[str] = None):
if profile_result_dir:
with get_profiling_context(profile_result_dir):
llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
@@ -132,16 +107,16 @@ def run_to_completion(profile_dir: Optional[str] = None):

print("Warming up...")
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
run_to_completion(profile_dir=None)
run_to_completion(profile_result_dir=None)

if args.profile_torch or args.profile_rpd:
run_to_completion(profile_dir=profile_dir)
run_to_completion(profile_result_dir=profile_result_dir)
return

# Benchmark.
latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None))
latencies.append(run_to_completion(profile_result_dir=None))
latencies = np.array(latencies)
percentages = [10, 25, 50, 75, 90, 99]
percentiles = np.percentile(latencies, percentages)
@@ -164,19 +139,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser = FlexibleArgumentParser(
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
parser.add_argument('--model', type=str, default='facebook/opt-125m')
parser.add_argument('--speculative-model', type=str, default=None)
parser.add_argument('--num-speculative-tokens', type=int, default=None)
parser.add_argument('--speculative-draft-tensor-parallel-size',
'-spec-draft-tp',
type=int,
default=None)
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=[*QUANTIZATION_METHODS, None],
default=None)
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--input-len', type=int, default=32)
parser.add_argument('--output-len', type=int, default=128)
parser.add_argument('--batch-size', type=int, default=8)
@@ -193,52 +155,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
type=int,
default=30,
help='Number of iterations to run.')
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len',
type=int,
default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype',
type=str,
default='auto',
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
parser.add_argument('--enforce-eager',
action='store_true',
help='enforce eager mode and disable CUDA graph')
parser.add_argument(
'--kv-cache-dtype',
type=str,
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
default="auto",
help='Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
parser.add_argument(
'--quantization-param-path',
type=str,
default=None,
help='Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument(
'--quantized-weights-path',
type=str,
default=None,
help='Path to the safetensor file containing the quantized weights '
'and scaling factors. This should generally be supplied, when '
'quantization is FP8.')
parser.add_argument(
'--profile-torch',
action='store_true',
@@ -248,89 +164,17 @@ def run_to_completion(profile_dir: Optional[str] = None):
action='store_true',
help='profile the generation process of a single batch')
parser.add_argument(
'--profile-dir',
'--profile-result-dir',
type=str,
default=os.getenv('VLLM_RPD_PROFILER_DIR', default=None),
help=('path to save the profiler output. Can be visualized '
'with ui.perfetto.dev or Tensorboard.'))
parser.add_argument("--device",
type=str,
default="auto",
choices=DEVICE_OPTIONS,
help='device type for vLLM execution')
parser.add_argument('--block-size',
type=int,
default=16,
help='block size of key/value cache')
parser.add_argument(
'--enable-chunked-prefill',
action='store_true',
help='If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens')
parser.add_argument("--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching")
parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
help="If specified, use nsight to profile ray workers",
)
parser.add_argument('--download-dir',
type=str,
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the latency results in JSON format.')
parser.add_argument('--gpu-memory-utilization',
type=float,
default=0.9,
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
parser.add_argument(
'--load-format',
type=str,
default=EngineArgs.load_format,
choices=[
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
'bitsandbytes'
],
help='The format of the model weights to load.\n\n'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available.\n'
'* "pt" will load the weights in the pytorch bin format.\n'
'* "safetensors" will load the weights in the safetensors format.\n'
'* "npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading.\n'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.\n'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
'section for more information.\n'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n')
parser.add_argument(
'--distributed-executor-backend',
choices=['ray', 'mp'],
default=None,
help='Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.')
parser.add_argument(
'--otlp-traces-endpoint',
type=str,
default=None,
help='Target URL to which OpenTelemetry traces will be sent.')
parser.add_argument(
"--num-scheduler-steps",
type=int,
default=1,
help="Maximum number of forward steps per scheduler call.")

parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
main(args)
Loading

0 comments on commit 024b047

Please sign in to comment.