diff --git a/benchmarks/cutlass_benchmarks/dense_mm/bench_v1.py b/benchmarks/cutlass_benchmarks/dense_mm/bench_v1.py
deleted file mode 100644
index d2f532c6bf18c..0000000000000
--- a/benchmarks/cutlass_benchmarks/dense_mm/bench_v1.py
+++ /dev/null
@@ -1,191 +0,0 @@
-## Cutlass benchmark V1
-
-from typing import Callable, Iterable
-
-import torch
-import torch.utils.benchmark as TBenchmark
-from torch.utils.benchmark import Measurement as TMeasurement
-from utils import make_rand_tensors
-
-import vllm._custom_ops as ops
-
-
-# bench
-def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
-             **kwargs) -> TMeasurement:
-    min_run_time = 1
-
-    globals = {
-        "args": args,
-        "kwargs": kwargs,
-        "fn": fn,
-    }
-    return TBenchmark.Timer(
-        stmt="fn(*args, **kwargs)",
-        globals=globals,
-        label=label,
-        sub_label=sub_label,
-        description=description,
-    ).blocked_autorange(min_run_time=min_run_time)
-
-
-def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-               sub_label: str) -> Iterable[TMeasurement]:
-    assert dtype == torch.int8
-    a, b = make_rand_tensors(torch.int8, m, n, k)
-    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
-    azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
-    azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)
-
-    timers = []
-    # pytorch impl - bfloat16
-    timers.append(
-        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
-                 torch.mm, a.to(dtype=torch.bfloat16),
-                 b.to(dtype=torch.bfloat16)))
-
-    # pytorch impl - float16
-    timers.append(
-        bench_fn(label, sub_label,
-                 "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
-                 a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
-
-    # cutlass impl
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
-                 torch.bfloat16))
-
-    # cutlass with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass with azp per-tensor
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj))
-
-    # cutlass with azp per-tensor + bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, None, bias))
-
-    # cutlass with azp per-token
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, azp))
-
-    # cutlass with azp per-token + bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, azp, bias))
-
-    return timers
-
-
-def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-              sub_label: str) -> Iterable[TMeasurement]:
-    assert dtype == torch.float8_e4m3fn
-    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
-    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
-
-    timers = []
-
-    # pytorch impl w. bf16
-    timers.append(
-        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
-                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
-                 b.to(dtype=torch.bfloat16, device="cuda")))
-
-    # pytorch impl: bf16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16))
-
-    # pytorch impl: bf16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16,
-                 use_fast_accum=True))
-
-    # pytorch impl: fp16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_fp16_scaled_mm",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.float16))
-
-    # pytorch impl: fp16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.float16,
-                 use_fast_accum=True))
-
-    # cutlass impl: bf16 output
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
-                 torch.bfloat16))
-    # cutlass impl: fp16 output
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16))
-
-    # cutlass impl: bf16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass impl: fp16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16,
-                 bias.to(dtype=torch.float16)))
-
-    return timers
-
-
-def bench_v1(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-             sub_label: str) -> Iterable[TMeasurement]:
-    if dtype == torch.int8:
-        return bench_int8(dtype, m, k, n, label, sub_label)
-    if dtype == torch.float8_e4m3fn:
-        return bench_fp8(dtype, m, k, n, label, sub_label)
-    raise ValueError("unsupported type")
diff --git a/benchmarks/cutlass_benchmarks/dense_mm/bench_v2.py b/benchmarks/cutlass_benchmarks/dense_mm/bench_v2.py
deleted file mode 100644
index 466887d94f957..0000000000000
--- a/benchmarks/cutlass_benchmarks/dense_mm/bench_v2.py
+++ /dev/null
@@ -1,293 +0,0 @@
-import dataclasses
-import random
-from typing import Any, Callable, Iterable, Optional
-
-import torch
-import torch.utils.benchmark as TBenchmark
-from torch.utils.benchmark import Measurement as TMeasurement
-from utils import make_n_rand_tensors
-
-import vllm._custom_ops as ops
-
-
-@dataclasses.dataclass
-class CudaGraphBenchParams:
-    num_ops_in_cuda_graph: int
-
-
-@dataclasses.dataclass
-class ArgPool:
-    '''
-    When some argument of the benchmarking function is annotated with this
-    type, the benchmarking class (BenchMM) will collapse the argument to a
-    single value picked from the given list of values, during function
-    invocation.
-
-    For every invocation during a benchmarking run, it will choose a
-    different value from the list.
-    '''
-    values: Iterable[Any]
-
-
-class BenchMM:
-
-    class ArgsIterator:
-
-        def __init__(self, args_list, kwargs_list):
-            assert len(args_list) == len(kwargs_list)
-            self.args_list = args_list
-            self.kwargs_list = kwargs_list
-            self.n = len(self.args_list)
-            self.idx = 0
-
-        def __next__(self):
-            while True:
-                yield (self.args_list[self.idx], self.kwargs_list[self.idx])
-                self.idx += 1
-                self.idx = self.idx % self.n
-
-        def reset(self):
-            self.idx = 0
-
-        @property
-        def n_args(self):
-            return self.n
-
-    def __init__(self, cuda_graph_params: Optional[CudaGraphBenchParams],
-                 label: str, sub_label: str, description: str, fn: Callable,
-                 *args, **kwargs):
-
-        self.cuda_graph_params = cuda_graph_params
-        self.use_cuda_graph = self.cuda_graph_params is not None
-        self.label = label
-        self.sub_label = sub_label
-        self.description = description
-        self.fn = fn
-
-        # Process args
-        self._args = args
-        self._kwargs = kwargs
-        self.args_list, self.kwargs_list = self.collapse_argpool(
-            *args, **kwargs)
-        self.args_iterator = self.ArgsIterator(self.args_list,
-                                               self.kwargs_list)
-
-        # Cudagraph runner
-        self.g = None
-        if self.use_cuda_graph:
-            self.g = self.get_cuda_graph_runner()
-
-        # benchmark run params
-        self.min_run_time = 1
-
-    def collapse_argpool(self, *args, **kwargs):
-        kwargs = kwargs if kwargs is not None else {}
-        assert kwargs is None or all([
-            not isinstance(v, ArgPool) for k, v in kwargs.items()
-        ]), 'ArgPools in kwargs are not supported yet'
-
-        arg_pool_indices = [
-            i for i, x in enumerate(args) if isinstance(x, ArgPool)
-        ]
-        if len(arg_pool_indices) == 0:
-            return [args], [kwargs]
-
-        # make sure all the Arg pools have the same number of choices
-        arg_pool_size = len(args[arg_pool_indices[0]].values)
-        assert all(
-            [len(args[i].values) == arg_pool_size for i in arg_pool_indices])
-
-        # create copies of the args
-        args_list = []
-        kwargs_list = []
-        for _ in range(arg_pool_size):
-            args_list.append(args)
-            kwargs_list.append(kwargs.copy())
-
-        # collapse the arg pools by simply choosing the ith value
-        for i in range(arg_pool_size):
-            assert isinstance(args_list[i], tuple)
-            # get as list
-            args_i = list(args_list[i])
-            # collapse - make replacements
-            for arg_pool_idx in arg_pool_indices:
-                val_from_pool = args_i[arg_pool_idx].values[i]
-                args_i[arg_pool_idx] = val_from_pool
-            # store back as tuple
-            args_list[i] = tuple(args_i)
-
-        return args_list, kwargs_list
-
-    def get_cuda_graph_runner(self):
-        assert self.use_cuda_graph
-        assert self.args_iterator is not None
-
-        num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph
-
-        # warmup
-        args_it = self.args_iterator.__next__()
-        for _ in range(5):
-            args, kwargs = next(args_it)
-            self.fn(*args, **kwargs)
-
-        self.args_iterator.reset()
-        args_it = self.args_iterator.__next__()
-
-        stream = torch.cuda.Stream()
-        with torch.cuda.stream(stream):
-            g = torch.cuda.CUDAGraph()
-            with torch.cuda.graph(g):
-                for _ in range(num_graph_ops):
-                    args, kwargs = next(args_it)
-                    self.fn(*args, **kwargs)
-        return g
-
-    def run_cudagraph(self) -> TMeasurement:
-        assert self.use_cuda_graph
-        globals = {'g': self.g}
-
-        return TBenchmark.Timer(
-            stmt="g.replay()",
-            globals=globals,
-            label=self.label,
-            sub_label=self.sub_label,
-            description=self.description,
-        ).blocked_autorange(min_run_time=self.min_run_time)
-
-    def run_eager(self) -> TMeasurement:
-        setup = None
-        stmt = None
-        globals = None
-
-        has_arg_pool = self.args_iterator.n_args > 1
-        if has_arg_pool:
-            setup = '''
-            args_iterator.reset()
-            args_it = args_iterator.__next__()
-            '''
-            stmt = '''
-            args, kwargs = next(args_it)
-            fn(*args, **kwargs)
-            '''
-            globals = {'fn': self.fn, 'args_iterator': self.args_iterator}
-        else:
-            # no arg pool. Just use the args and kwargs directly
-            self.args_iterator.reset()
-            args_it = self.args_iterator.__next__()
-            args, kwargs = next(args_it)
-
-            setup = ""
-            stmt = '''
-            fn(*args, **kwargs)
-            '''
-            globals = {'fn': self.fn, 'args': args, 'kwargs': kwargs}
-
-        return TBenchmark.Timer(
-            stmt=stmt,
-            setup=setup,
-            globals=globals,
-            label=self.label,
-            sub_label=self.sub_label,
-            description=self.description,
-        ).blocked_autorange(min_run_time=self.min_run_time)
-
-    def run(self) -> TMeasurement:
-        timer = None
-        if self.use_cuda_graph:  # noqa SIM108
-            timer = self.run_cudagraph()
-        else:
-            timer = self.run_eager()
-        #assert timer.meets_confidence()
-        #assert not timer.has_warnings, f"Warnings {timer._warnings}"
-        if not timer.meets_confidence() or timer.has_warnings:
-            print("Doesn't meet confidence - re-running bench ...")
-            return self.run()
-        return timer
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        if exc_type:
-            print(f"exc type {exc_type}")
-            print(f"exc value {exc_value}")
-            print(f"exc traceback {traceback}")
-
-
-def bench_fp8(dtype: torch.dtype, with_cuda_graph: Optional[int],
-              with_arg_pool: Optional[int], m: int, k: int, n: int, label: str,
-              sub_label: str) -> Iterable[TMeasurement]:
-
-    arg_pool_size = with_arg_pool if with_arg_pool else 1
-    cuda_graph_params: Optional[CudaGraphBenchParams] = None
-    if with_cuda_graph:
-        num_ops_in_cuda_graph = with_cuda_graph if with_cuda_graph else None
-        cuda_graph_params = CudaGraphBenchParams(num_ops_in_cuda_graph)
-
-    assert dtype == torch.float8_e4m3fn
-
-    # Make input As and Bs
-    As, Bs = make_n_rand_tensors(arg_pool_size, torch.float8_e4m3fn, m, n, k)
-    bf16_As = [x.to(dtype=torch.bfloat16) for x in As]
-    bf16_Bs = [x.to(dtype=torch.bfloat16) for x in Bs]
-    # shuffle As and Bs to prevent any suspicion of pattern exploitation
-    random.shuffle(As)
-    random.shuffle(Bs)
-    random.shuffle(bf16_As)
-    random.shuffle(bf16_Bs)
-
-    # Make scales and biases
-    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-
-    timers = []
-
-    # pytorch impl w. bf16
-    with BenchMM(cuda_graph_params, label, sub_label,
-                 "pytorch_bf16_bf16_bf16_matmul-no-scales", torch.mm,
-                 ArgPool(bf16_As), ArgPool(bf16_Bs)) as bench:
-        timers.append(bench.run())
-
-    ## pytorch impl: bf16 output, without fp8 fast accum
-    with BenchMM(cuda_graph_params,
-                 label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm",
-                 torch._scaled_mm,
-                 ArgPool(As),
-                 ArgPool(Bs),
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16) as bench:
-        timers.append(bench.run())
-
-    ## pytorch impl: bf16 output, with fp8 fast accum
-    with BenchMM(cuda_graph_params,
-                 label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 ArgPool(As),
-                 ArgPool(Bs),
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16,
-                 use_fast_accum=True) as bench:
-        timers.append(bench.run())
-
-    ## cutlass impl: bf16 output
-    with BenchMM(cuda_graph_params, label, sub_label,
-                 "cutlass_fp8_fp8_bf16_scaled_mm", ops.cutlass_scaled_mm,
-                 ArgPool(As), ArgPool(Bs), scale_a, scale_b,
-                 torch.bfloat16) as bench:
-        timers.append(bench.run())
-
-    return timers
-
-
-def bench_v2(dtype: torch.dtype, with_cuda_graph: Optional[int],
-             with_arg_pool: Optional[int], m: int, k: int, n: int, label: str,
-             sub_label: str) -> Iterable[TMeasurement]:
-    if dtype == torch.float8_e4m3fn:
-        return bench_fp8(dtype, with_cuda_graph, with_arg_pool, m, k, n, label,
-                         sub_label)
-    raise ValueError("unsupported type")
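For context on how the machinery in the deleted bench_v2.py was driven: a minimal usage sketch, not part of this diff, assuming bench_v2.py's ArgPool/BenchMM are importable and a CUDA device is present. Each ArgPool argument position is expanded so that the i-th invocation of the benchmarked fn receives the i-th element of every pool:

    import torch
    from bench_v2 import ArgPool, BenchMM, CudaGraphBenchParams

    pool_size = 8
    As = [torch.randn(256, 256, device="cuda") for _ in range(pool_size)]
    Bs = [torch.randn(256, 256, device="cuda") for _ in range(pool_size)]

    # Eager mode: cuda_graph_params=None. Passing CudaGraphBenchParams(N)
    # instead captures N matmuls into one CUDA graph and times graph replays.
    with BenchMM(None, "mm", "MKN=(256x256x256)", "torch_mm_argpool",
                 torch.mm, ArgPool(As), ArgPool(Bs)) as bench:
        measurement = bench.run()  # a torch.utils.benchmark Measurement

Rotating through a pool of pre-made tensors keeps any one input from staying cache-resident across iterations, which is the point of the ArgPool mechanism.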
diff --git a/benchmarks/cutlass_benchmarks/dense_mm/utils.py b/benchmarks/cutlass_benchmarks/dense_mm/utils.py
deleted file mode 100644
index c8fcd50a51d31..0000000000000
--- a/benchmarks/cutlass_benchmarks/dense_mm/utils.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Cutlass bench utils
-from typing import Iterable, Tuple
-
-import torch
-
-
-def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
-    finfo = torch.finfo(torch.float8_e4m3fn)
-    return torch.round(tensor.clamp(
-        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
-
-
-def to_int8(tensor: torch.Tensor) -> torch.Tensor:
-    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
-
-
-def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
-                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
-    a = torch.randn((m, k), device='cuda') * 5
-    b = torch.randn((n, k), device='cuda').t() * 5
-
-    if dtype == torch.int8:
-        return to_int8(a), to_int8(b)
-    if dtype == torch.float8_e4m3fn:
-        return to_fp8(a), to_fp8(b)
-
-    raise ValueError("unsupported dtype")
-
-def make_n_rand_tensors(num_tensors: int, dtype: torch.dtype,
-                        m: int, n: int, k: int) -> \
-        Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
-    ABs = []
-    for _ in range(num_tensors):
-        ABs.append(make_rand_tensors(dtype, m, n, k))
-    As, Bs = zip(*ABs)
-    return list(As), list(Bs)
diff --git a/benchmarks/cutlass_benchmarks/dense_mm/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/dense_mm/w8a8_benchmarks.py
deleted file mode 100644
index a597988cd2840..0000000000000
--- a/benchmarks/cutlass_benchmarks/dense_mm/w8a8_benchmarks.py
+++ /dev/null
@@ -1,211 +0,0 @@
-import argparse
-import copy
-import itertools
-import pickle as pkl
-import time
-from typing import Iterable, List, Tuple
-
-import torch
-import torch.utils.benchmark as TBenchmark
-from bench_v1 import bench_v1
-from bench_v2 import bench_v2
-from torch.utils.benchmark import Measurement as TMeasurement
-from weight_shapes import WEIGHT_SHAPES
-
-from vllm.utils import FlexibleArgumentParser
-
-DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
-DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
-DEFAULT_TP_SIZES = [1]
-
-
-# runner
-def print_timers(timers: Iterable[TMeasurement]):
-    compare = TBenchmark.Compare(timers)
-    compare.print()
-
-
-def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
-    results = []
-    dtype = args.dtype
-
-    use_bench_v2 = args.with_cuda_graph or args.with_arg_pool
-    for m, k, n in MKNs:
-        if use_bench_v2:
-            label = f"scaled-{dtype}-gemm"
-            label = f"{label}-cugraph_{args.with_cuda_graph}" \
-                if args.with_cuda_graph else label
-            label = f"{label}-argpool_{args.with_arg_pool}" \
-                if args.with_arg_pool else label
-            timers = bench_v2(args.dtype, args.with_cuda_graph,
-                              args.with_arg_pool, m, k, n, label,
-                              f"MKN=({m}x{k}x{n})")
-        else:
-            timers = bench_v1(args.dtype, m, k, n, f"scaled-{dtype}-gemm",
-                              f"MKN=({m}x{k}x{n})")
-
-        print_timers(timers)
-        results.extend(timers)
-
-    return results
-
-
-# output makers
-def make_output(data: Iterable[TMeasurement],
-                MKNs: Iterable[Tuple[int, int, int]],
-                base_description: str,
-                timestamp=None):
-    print(f"== All Results {base_description} ====")
-    print_timers(data)
-
-    # pickle all the results
-    timestamp = int(time.time()) if timestamp is None else timestamp
-    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
-        pkl.dump(data, f)
-
-
-# argparse runners
-
-
-def run_square_bench(args):
-    dim_sizes = list(
-        range(args.dim_start, args.dim_end + 1, args.dim_increment))
-    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
-    data = run(args, MKNs)
-
-    make_output(data, MKNs, f"square_bench-{args.dtype}")
-
-
-def run_range_bench(args):
-    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
-    n = len(dim_sizes)
-    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
-    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
-    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
-    MKNs = list(zip(Ms, Ks, Ns))
-    data = run(args, MKNs)
-
-    make_output(data, MKNs, f"range_bench-{args.dtype}")
-
-
-def run_model_bench(args):
-    print("Benchmarking models:")
-    for i, model in enumerate(args.models):
-        print(f"[{i}] {model}")
-
-    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
-        KNs = []
-        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
-            if tp_split_dim is not None:
-                KN[tp_split_dim] = KN[tp_split_dim] // tp_size
-            KNs.append(KN)
-        return KNs
-
-    model_bench_data = []
-    models_tps = list(itertools.product(args.models, args.tp_sizes))
-    for model, tp_size in models_tps:
-        Ms = args.batch_sizes
-        KNs = model_shapes(model, tp_size)
-        MKNs = []
-        for m in Ms:
-            for k, n in KNs:
-                MKNs.append((m, k, n))
-
-        data = run(args, MKNs)
-        model_bench_data.append(data)
-
-    # Print all results
-    for data, model_tp in zip(model_bench_data, models_tps):
-        model, tp_size = model_tp
-        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
-        print_timers(data)
-
-    timestamp = int(time.time())
-
-    all_data = []
-    for d in model_bench_data:
-        all_data.extend(d)
-    # pickle all data
-    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
-        pkl.dump(all_data, f)
-
-
-if __name__ == '__main__':
-
-    def to_torch_dtype(dt):
-        if dt == "int8":
-            return torch.int8
-        if dt == "fp8":
-            return torch.float8_e4m3fn
-        raise ValueError("unsupported dtype")
-
-    parser = FlexibleArgumentParser(
-        description="""
-Benchmark Cutlass GEMM.
-
-    To run square GEMMs:
-        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
-
-    To run constant N and K and sweep M:
-        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
-
-    To run dimensions from a model:
-        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
-
-    Output:
-        - a .pkl file that is a list of raw torch.utils.benchmark.Measurement objects for the pytorch and cutlass implementations of the various GEMMs.
-            """,  # noqa: E501
-        formatter_class=argparse.RawTextHelpFormatter)
-
-    parser.add_argument("--dtype",
-                        type=to_torch_dtype,
-                        required=True,
-                        help="Available options are ['int8', 'fp8']")
-    parser.add_argument(
-        '--with-cuda-graph',
-        type=int,
-        default=None,
-        help="Number of ops/matmuls in a cudagraph execution. When set, "
-        "cuda-graphs is enabled.")
-    parser.add_argument(
-        '--with-arg-pool',
-        type=int,
-        default=None,
-        help="Number of A and B tensors to use as the arg-pool. When not "
-        "set, it defaults to 1.")
-
-    subparsers = parser.add_subparsers(dest="cmd")
-
-    square_parser = subparsers.add_parser("square_bench")
-    square_parser.add_argument("--dim-start", type=int, required=True)
-    square_parser.add_argument("--dim-end", type=int, required=True)
-    square_parser.add_argument("--dim-increment", type=int, required=True)
-    square_parser.set_defaults(func=run_square_bench)
-
-    range_parser = subparsers.add_parser("range_bench")
-    range_parser.add_argument("--dim-start", type=int, required=True)
-    range_parser.add_argument("--dim-end", type=int, required=True)
-    range_parser.add_argument("--dim-increment", type=int, required=True)
-    range_parser.add_argument("--m-constant", type=int, default=None)
-    range_parser.add_argument("--n-constant", type=int, default=None)
-    range_parser.add_argument("--k-constant", type=int, default=None)
-    range_parser.set_defaults(func=run_range_bench)
-
-    model_parser = subparsers.add_parser("model_bench")
-    model_parser.add_argument("--models",
-                              nargs="+",
-                              type=str,
-                              default=DEFAULT_MODELS,
-                              choices=WEIGHT_SHAPES.keys())
-    model_parser.add_argument("--tp-sizes",
-                              nargs="+",
-                              type=int,
-                              default=DEFAULT_TP_SIZES)
-    model_parser.add_argument("--batch-sizes",
-                              nargs="+",
-                              type=int,
-                              default=DEFAULT_BATCH_SIZES)
-    model_parser.set_defaults(func=run_model_bench)
-
-    args = parser.parse_args()
-    args.func(args)
diff --git a/benchmarks/cutlass_benchmarks/dense_mm/weight_shapes.py b/benchmarks/cutlass_benchmarks/dense_mm/weight_shapes.py
deleted file mode 100644
index 77f15891d84b2..0000000000000
--- a/benchmarks/cutlass_benchmarks/dense_mm/weight_shapes.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Weight Shapes are in the format
-#   ([K, N], TP_SPLIT_DIM)
-# Example:
-#  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
-#   - TP1 : K = 14336, N = 4096
-#   - TP2 : K = 7168, N = 4096
-#  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
-#   - TP1 : K = 4096, N = 6144
-#   - TP4 : K = 4096, N = 1536
-
-# TP1 shapes
-WEIGHT_SHAPES = {
-    "mistralai/Mistral-7B-v0.1": [
-        ([4096, 6144], 1),
-        ([4096, 4096], 0),
-        ([4096, 28672], 1),
-        ([14336, 4096], 0),
-    ],
-    "meta-llama/Llama-2-7b-hf": [
-        ([4096, 12288], 1),
-        ([4096, 4096], 0),
-        ([4096, 22016], 1),
-        ([11008, 4096], 0),
-    ],
-    "meta-llama/Llama-3-8b": [
-        ([4096, 6144], 1),
-        ([4096, 4096], 0),
-        ([4096, 28672], 1),
-        ([14336, 4096], 0),
-    ],
-    "meta-llama/Llama-2-13b-hf": [
-        ([5120, 15360], 1),
-        ([5120, 5120], 0),
-        ([5120, 27648], 1),
-        ([13824, 5120], 0),
-    ],
-    "meta-llama/Llama-2-70b-hf": [
-        ([8192, 10240], 1),
-        ([8192, 8192], 0),
-        ([8192, 57344], 1),
-        ([28672, 8192], 0),
-    ],
-    "meta-llama/Llama-2-70b-tp4-hf": [
-        ([8192, 2560], None),
-        ([2048, 8192], None),
-        ([8192, 14336], None),
-        ([7168, 8192], None),
-    ],
-    # The shape space is very big when benchmarking a large set of kernels.
-    # For example: Let,
-    #  - #kernels to benchmark be 1700
-    #  - #models to benchmark be 4 (each model has 4 shapes)
-    #  - #batch sizes be 6 (16, 32, 64, 128, 256, 512)
-    # If 1 kernel, 1 shape and 1 batch-size takes approx. 1 second to run on
-    # an H100, then the benchmark suite would take,
-    #  1700 * (4 * 4) * 6 = 163200 seconds => ~45 hrs.
-    # Below, we exploit some observations on the benchmark shapes to create a
-    # representative set.
-    #
-    # From previous benchmarking runs, we observe that perf is stratified as,
-    # N - small, medium, large and K - small and large. We also observe that
-    # in the model shapes, when K is small, we have small, medium and large
-    # Ns. When K is large, we only have small Ns.
-    #
-    # models : ['meta-llama/Llama-2-7b-hf', 'meta-llama/Llama-3-8b',
-    #           'meta-llama/Llama-2-13b-hf', 'meta-llama/Llama-2-70b-tp4-hf']
-    # Ks : [2048, 4096, 5120, 7168, 8192, 11008, 13824, 14336]
-    # Ns : [2560, 4096, 5120, 6144, 8192, 12288, 14336, 15360,
-    #       22016, 27648, 28672]
-    "llama-representative-set": [
-        ([4096, 4096], None),  # small K, small N
-        ([4096, 8192], None),  # small K, medium N
-        ([4096, 22016], None),  # small K, large N
-        ([14336, 4096], None),  # large K, small N
-        ([8192, 14336], None),  # medium K, large N (from llama-2-70b-tp4-hf)
-    ],
-}
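The cost estimate in the comment above is easy to reproduce. A back-of-the-envelope check (the one-second-per-GEMM figure is the comment's own approximation):

    kernels = 1700
    shapes = 4 * 4  # 4 models x 4 shapes each
    batch_sizes = 6
    total_seconds = kernels * shapes * batch_sizes  # 163200
    print(total_seconds / 3600)  # ~45.3 hours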
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
new file mode 100644
index 0000000000000..abcde3b016a7b
--- /dev/null
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -0,0 +1,389 @@
+import argparse
+import copy
+import itertools
+import pickle as pkl
+import time
+from typing import Callable, Iterable, List, Tuple
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+from vllm.utils import FlexibleArgumentParser
+
+DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+DEFAULT_TP_SIZES = [1]
+
+# helpers
+
+
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    return torch.round(tensor.clamp(
+        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
+
+
+def to_int8(tensor: torch.Tensor) -> torch.Tensor:
+    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
+
+
+def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
+                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
+    a = torch.randn((m, k), device='cuda') * 5
+    b = torch.randn((n, k), device='cuda').t() * 5
+
+    if dtype == torch.int8:
+        return to_int8(a), to_int8(b)
+    if dtype == torch.float8_e4m3fn:
+        return to_fp8(a), to_fp8(b)
+
+    raise ValueError("unsupported dtype")
+
+
+# bench
+def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
+             **kwargs) -> TMeasurement:
+    min_run_time = 1
+
+    globals = {
+        "args": args,
+        "kwargs": kwargs,
+        "fn": fn,
+    }
+    return TBenchmark.Timer(
+        stmt="fn(*args, **kwargs)",
+        globals=globals,
+        label=label,
+        sub_label=sub_label,
+        description=description,
+    ).blocked_autorange(min_run_time=min_run_time)
+
+
+def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+               sub_label: str) -> Iterable[TMeasurement]:
+    assert dtype == torch.int8
+    a, b = make_rand_tensors(torch.int8, m, n, k)
+    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+    azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
+    azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)
+
+    timers = []
+    # pytorch impl - bfloat16
+    timers.append(
+        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
+                 torch.mm, a.to(dtype=torch.bfloat16),
+                 b.to(dtype=torch.bfloat16)))
+
+    # pytorch impl - float16
+    timers.append(
+        bench_fn(label, sub_label,
+                 "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
+                 a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
+
+    # cutlass impl
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
+                 torch.bfloat16))
+
+    # cutlass with bias
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
+                 bias))
+
+    # cutlass with azp per-tensor
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp",
+                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
+                 torch.bfloat16, azp_adj))
+
+    # cutlass with azp per-tensor + bias
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias",
+                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
+                 torch.bfloat16, azp_adj, None, bias))
+
+    # cutlass with azp per-token
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt",
+                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
+                 torch.bfloat16, azp_adj, azp))
+
+    # cutlass with azp per-token + bias
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias",
+                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
+                 torch.bfloat16, azp_adj, azp, bias))
+
+    return timers
+
+
+def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+              sub_label: str) -> Iterable[TMeasurement]:
+    assert dtype == torch.float8_e4m3fn
+    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
+    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+
+    timers = []
+
+    # pytorch impl w. bf16
+    timers.append(
+        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
+                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
+                 b.to(dtype=torch.bfloat16, device="cuda")))
+
+    # pytorch impl: bf16 output, without fp8 fast accum
+    timers.append(
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_bf16_scaled_mm",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.bfloat16))
+
+    # pytorch impl: bf16 output, with fp8 fast accum
+    timers.append(
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.bfloat16,
+                 use_fast_accum=True))
+
+    # pytorch impl: fp16 output, without fp8 fast accum
+    timers.append(
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_fp16_scaled_mm",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.float16))
+
+    # pytorch impl: fp16 output, with fp8 fast accum
+    timers.append(
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.float16,
+                 use_fast_accum=True))
+
+    # cutlass impl: bf16 output
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
+                 torch.bfloat16))
+    # cutlass impl: fp16 output
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16))
+
+    # cutlass impl: bf16 output, with bias
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
+                 bias))
+
+    # cutlass impl: fp16 output, with bias
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16,
+                 bias.to(dtype=torch.float16)))
+
+    return timers
+
+
+def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+          sub_label: str) -> Iterable[TMeasurement]:
+    if dtype == torch.int8:
+        return bench_int8(dtype, m, k, n, label, sub_label)
+    if dtype == torch.float8_e4m3fn:
+        return bench_fp8(dtype, m, k, n, label, sub_label)
+    raise ValueError("unsupported type")
+
+
+# runner
+def print_timers(timers: Iterable[TMeasurement]):
+    compare = TBenchmark.Compare(timers)
+    compare.print()
+
+
+def run(dtype: torch.dtype,
+        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+    results = []
+    for m, k, n in MKNs:
+        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
+                       f"MKN=({m}x{k}x{n})")
+        print_timers(timers)
+        results.extend(timers)
+
+    return results
+
+
+# output makers
+def make_output(data: Iterable[TMeasurement],
+                MKNs: Iterable[Tuple[int, int, int]],
+                base_description: str,
+                timestamp=None):
+    print(f"== All Results {base_description} ====")
+    print_timers(data)
+
+    # pickle all the results
+    timestamp = int(time.time()) if timestamp is None else timestamp
+    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
+        pkl.dump(data, f)
+
+
+# argparse runners
+
+
+def run_square_bench(args):
+    dim_sizes = list(
+        range(args.dim_start, args.dim_end + 1, args.dim_increment))
+    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
+    data = run(args.dtype, MKNs)
+
+    make_output(data, MKNs, f"square_bench-{args.dtype}")
+
+
+def run_range_bench(args):
+    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
+    n = len(dim_sizes)
+    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
+    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
+    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
+    MKNs = list(zip(Ms, Ks, Ns))
+    data = run(args.dtype, MKNs)
+
+    make_output(data, MKNs, f"range_bench-{args.dtype}")
+
+
+def run_model_bench(args):
+    print("Benchmarking models:")
+    for i, model in enumerate(args.models):
+        print(f"[{i}] {model}")
+
+    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+        KNs = []
+        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            KNs.append(KN)
+        return KNs
+
+    model_bench_data = []
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    for model, tp_size in models_tps:
+        Ms = args.batch_sizes
+        KNs = model_shapes(model, tp_size)
+        MKNs = []
+        for m in Ms:
+            for k, n in KNs:
+                MKNs.append((m, k, n))
+
+        data = run(args.dtype, MKNs)
+        model_bench_data.append(data)
+
+    # Print all results
+    for data, model_tp in zip(model_bench_data, models_tps):
+        model, tp_size = model_tp
+        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
+        print_timers(data)
+
+    timestamp = int(time.time())
+
+    all_data = []
+    for d in model_bench_data:
+        all_data.extend(d)
+    # pickle all data
+    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
+        pkl.dump(all_data, f)
+
+
+if __name__ == '__main__':
+
+    def to_torch_dtype(dt):
+        if dt == "int8":
+            return torch.int8
+        if dt == "fp8":
+            return torch.float8_e4m3fn
+        raise ValueError("unsupported dtype")
+
+    parser = FlexibleArgumentParser(
+        description="""
+Benchmark Cutlass GEMM.
+
+    To run square GEMMs:
+        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
+
+    To run constant N and K and sweep M:
+        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
+
+    To run dimensions from a model:
+        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
+
+    Output:
+        - a .pkl file that is a list of raw torch.utils.benchmark.Measurement objects for the pytorch and cutlass implementations of the various GEMMs.
+ """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument("--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['int8', 'fp8']") + subparsers = parser.add_subparsers(dest="cmd") + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--dim-start", type=int, required=True) + range_parser.add_argument("--dim-end", type=int, required=True) + range_parser.add_argument("--dim-increment", type=int, required=True) + range_parser.add_argument("--m-constant", type=int, default=None) + range_parser.add_argument("--n-constant", type=int, default=None) + range_parser.add_argument("--k-constant", type=int, default=None) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument("--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys()) + model_parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + model_parser.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) \ No newline at end of file diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py new file mode 100644 index 0000000000000..d58fb0bf86374 --- /dev/null +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -0,0 +1,43 @@ +# Weight Shapes are in the format +# ([K, N], TP_SPLIT_DIM) +# Example: +# A shape of ([14336, 4096], 0) indicates the following GEMM shape, +# - TP1 : K = 14336, N = 4096 +# - TP2 : K = 7168, N = 4096 +# A shape of ([4096, 6144], 1) indicates the following GEMM shape, +# - TP1 : K = 4096, N = 6144 +# - TP4 : K = 4096, N = 1536 + +# TP1 shapes +WEIGHT_SHAPES = { + "mistralai/Mistral-7B-v0.1": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-7b-hf": [ + ([4096, 12288], 1), + ([4096, 4096], 0), + ([4096, 22016], 1), + ([11008, 4096], 0), + ], + "meta-llama/Llama-3-8b": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-13b-hf": [ + ([5120, 15360], 1), + ([5120, 5120], 0), + ([5120, 27648], 1), + ([13824, 5120], 0), + ], + "meta-llama/Llama-2-70b-hf": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], +} \ No newline at end of file