diff --git a/benchmarks/cutlass_benchmarks/dense_mm/bench_v1.py b/benchmarks/cutlass_benchmarks/dense_mm/bench_v1.py
deleted file mode 100644
index d2f532c6bf18c..0000000000000
--- a/benchmarks/cutlass_benchmarks/dense_mm/bench_v1.py
+++ /dev/null
@@ -1,191 +0,0 @@
-## Cutlass benchmark V1
-
-from typing import Callable, Iterable
-
-import torch
-import torch.utils.benchmark as TBenchmark
-from torch.utils.benchmark import Measurement as TMeasurement
-from utils import make_rand_tensors
-
-import vllm._custom_ops as ops
-
-
-# bench
-def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
-             **kwargs) -> TMeasurement:
-    min_run_time = 1
-
-    globals = {
-        "args": args,
-        "kwargs": kwargs,
-        "fn": fn,
-    }
-    return TBenchmark.Timer(
-        stmt="fn(*args, **kwargs)",
-        globals=globals,
-        label=label,
-        sub_label=sub_label,
-        description=description,
-    ).blocked_autorange(min_run_time=min_run_time)
-
-
-def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-               sub_label: str) -> Iterable[TMeasurement]:
-    assert dtype == torch.int8
-    a, b = make_rand_tensors(torch.int8, m, n, k)
-    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
-    azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
-    azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)
-
-    timers = []
-    # pytorch impl - bfloat16
-    timers.append(
-        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
-                 torch.mm, a.to(dtype=torch.bfloat16),
-                 b.to(dtype=torch.bfloat16)))
-
-    # pytorch impl - float16
-    timers.append(
-        bench_fn(label, sub_label,
-                 "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
-                 a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
-
-    # cutlass impl
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
-                 torch.bfloat16))
-
-    # cutlass with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass with azp per-tensor
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj))
-
-    # cutlass with azp per-tensor + bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, None, bias))
-
-    # cutlass with azp per-token
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, azp))
-
-    # cutlass with azp per-token + bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, azp, bias))
-
-    return timers
-
-
-def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-              sub_label: str) -> Iterable[TMeasurement]:
-    assert dtype == torch.float8_e4m3fn
-    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
-    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
-
-    timers = []
-
-    # pytorch impl w. bf16
-    timers.append(
-        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
-                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
-                 b.to(dtype=torch.bfloat16, device="cuda")))
-
-    # pytorch impl: bf16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16))
-
-    # pytorch impl: bf16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16,
-                 use_fast_accum=True))
-
-    # pytorch impl: fp16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_fp16_scaled_mm",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.float16))
-
-    # pytorch impl: fp16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.float16,
-                 use_fast_accum=True))
-
-    # cutlass impl: bf16 output
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
-                 torch.bfloat16))
-    # cutlass impl: fp16 output
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16))
-
-    # cutlass impl: bf16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass impl: fp16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16,
-                 bias.to(dtype=torch.float16)))
-
-    return timers
-
-
-def bench_v1(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-             sub_label: str) -> Iterable[TMeasurement]:
-    if dtype == torch.int8:
-        return bench_int8(dtype, m, k, n, label, sub_label)
-    if dtype == torch.float8_e4m3fn:
-        return bench_fp8(dtype, m, k, n, label, sub_label)
-    raise ValueError("unsupported type")
diff --git a/benchmarks/cutlass_benchmarks/dense_mm/bench_v2.py b/benchmarks/cutlass_benchmarks/dense_mm/bench_v2.py
deleted file mode 100644
index 466887d94f957..0000000000000
--- a/benchmarks/cutlass_benchmarks/dense_mm/bench_v2.py
+++ /dev/null
@@ -1,293 +0,0 @@
-import dataclasses
-import random
-from typing import Any, Callable, Iterable, Optional
-
-import torch
-import torch.utils.benchmark as TBenchmark
-from torch.utils.benchmark import Measurement as TMeasurement
-from utils import make_n_rand_tensors
-
-import vllm._custom_ops as ops
-
-
-@dataclasses.dataclass
-class CudaGraphBenchParams:
-    num_ops_in_cuda_graph: int
-
-
-@dataclasses.dataclass
-class ArgPool:
-    '''
-    When some argument of the benchmarking function is annotated with this
-    type, the benchmarking class (BenchMM) will collapse the argument to a
-    single value picked from the given list of values, during function
-    invocation.
-
-    For every invocation during a benchmarking run, it will choose a
-    different value from the list.
-    '''
-    values: Iterable[Any]
-
-
-class BenchMM:
-
-    class ArgsIterator:
-
-        def __init__(self, args_list, kwargs_list):
-            assert len(args_list) == len(kwargs_list)
-            self.args_list = args_list
-            self.kwargs_list = kwargs_list
-            self.n = len(self.args_list)
-            self.idx = 0
-
-        def __next__(self):
-            while True:
-                yield (self.args_list[self.idx], self.kwargs_list[self.idx])
-                self.idx += 1
-                self.idx = self.idx % self.n
-
-        def reset(self):
-            self.idx = 0
-
-        @property
-        def n_args(self):
-            return self.n
-
-    def __init__(self, cuda_graph_params: Optional[CudaGraphBenchParams],
-                 label: str, sub_label: str, description: str, fn: Callable,
-                 *args, **kwargs):
-
-        self.cuda_graph_params = cuda_graph_params
-        self.use_cuda_graph = self.cuda_graph_params is not None
-        self.label = label
-        self.sub_label = sub_label
-        self.description = description
-        self.fn = fn
-
-        # Process args
-        self._args = args
-        self._kwargs = kwargs
-        self.args_list, self.kwargs_list = self.collapse_argpool(
-            *args, **kwargs)
-        self.args_iterator = self.ArgsIterator(self.args_list,
-                                               self.kwargs_list)
-
-        # Cudagraph runner
-        self.g = None
-        if self.use_cuda_graph:
-            self.g = self.get_cuda_graph_runner()
-
-        # benchmark run params
-        self.min_run_time = 1
-
-    def collapse_argpool(self, *args, **kwargs):
-        kwargs = kwargs if kwargs is not None else {}
-        assert kwargs is None or all([
-            not isinstance(v, ArgPool) for k, v in kwargs.items()
-        ]), 'ArgPools in kwargs are not supported yet'
-
-        arg_pool_indices = [
-            i for i, x in enumerate(args) if isinstance(x, ArgPool)
-        ]
-        if len(arg_pool_indices) == 0:
-            return [args], [kwargs]
-
-        # make sure all the Arg pools have the same number of choices
-        arg_pool_size = len(args[arg_pool_indices[0]].values)
-        assert all(
-            [len(args[i].values) == arg_pool_size for i in arg_pool_indices])
-
-        # create copies of the args
-        args_list = []
-        kwargs_list = []
-        for _ in range(arg_pool_size):
-            args_list.append(args)
-            kwargs_list.append(kwargs.copy())
-
-        # collapse the arg pools by simply choosing the ith value
-        for i in range(arg_pool_size):
-            assert isinstance(args_list[i], tuple)
-            # get as list
-            args_i = list(args_list[i])
-            # collapse - make replacements
-            for arg_pool_idx in arg_pool_indices:
-                val_from_pool = args_i[arg_pool_idx].values[i]
-                args_i[arg_pool_idx] = val_from_pool
-            # store back as tuple
-            args_list[i] = tuple(args_i)
-
-        return args_list, kwargs_list
-
-    def get_cuda_graph_runner(self):
-        assert self.use_cuda_graph
-        assert self.args_iterator is not None
-
-        num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph
-
-        # warmup
-        args_it = self.args_iterator.__next__()
-        for _ in range(5):
-            args, kwargs = next(args_it)
-            self.fn(*args, **kwargs)
-
-        self.args_iterator.reset()
-        args_it = self.args_iterator.__next__()
-
-        stream = torch.cuda.Stream()
-        with torch.cuda.stream(stream):
-            g = torch.cuda.CUDAGraph()
-            with torch.cuda.graph(g):
-                for _ in range(num_graph_ops):
-                    args, kwargs = next(args_it)
-                    self.fn(*args, **kwargs)
-        return g
-
-    def run_cudagraph(self) -> TMeasurement:
-        assert self.use_cuda_graph
-        globals = {'g': self.g}
-
-        return TBenchmark.Timer(
-            stmt="g.replay()",
-            globals=globals,
-            label=self.label,
-            sub_label=self.sub_label,
-            description=self.description,
-        ).blocked_autorange(min_run_time=self.min_run_time)
-
-    def run_eager(self) -> TMeasurement:
-        setup = None
-        stmt = None
-        globals = None
-
-        has_arg_pool = self.args_iterator.n_args > 1
-        if has_arg_pool:
-            setup = '''
-            args_iterator.reset()
-            args_it = args_iterator.__next__()
-            '''
-            stmt = '''
-            args, kwargs = next(args_it)
-            fn(*args, **kwargs)
-            '''
-            globals = {'fn': self.fn, 'args_iterator': self.args_iterator}
-        else:
-            # no arg pool. Just use the args and kwargs directly
-            self.args_iterator.reset()
-            args_it = self.args_iterator.__next__()
-            args, kwargs = next(args_it)
-
-            setup = ""
-            stmt = '''
-            fn(*args, **kwargs)
-            '''
-            globals = {'fn': self.fn, 'args': args, 'kwargs': kwargs}
-
-        return TBenchmark.Timer(
-            stmt=stmt,
-            setup=setup,
-            globals=globals,
-            label=self.label,
-            sub_label=self.sub_label,
-            description=self.description,
-        ).blocked_autorange(min_run_time=self.min_run_time)
-
-    def run(self) -> TMeasurement:
-        timer = None
-        if self.use_cuda_graph:  # noqa SIM108
-            timer = self.run_cudagraph()
-        else:
-            timer = self.run_eager()
-        #assert timer.meets_confidence()
-        #assert not timer.has_warnings, f"Warnings {timer._warnings}"
-        if not timer.meets_confidence() or timer.has_warnings:
-            print("Doesn't meet confidence - re-running bench ...")
-            return self.run()
-        return timer
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        if exc_type:
-            print(f"exc type {exc_type}")
-            print(f"exc value {exc_value}")
-            print(f"exc traceback {traceback}")
-
-
-def bench_fp8(dtype: torch.dtype, with_cuda_graph: Optional[int],
-              with_arg_pool: Optional[int], m: int, k: int, n: int, label: str,
-              sub_label: str) -> Iterable[TMeasurement]:
-
-    arg_pool_size = with_arg_pool if with_arg_pool else 1
-    cuda_graph_params: Optional[CudaGraphBenchParams] = None
-    if with_cuda_graph:
-        num_ops_in_cuda_graph = with_cuda_graph if with_cuda_graph else None
-        cuda_graph_params = CudaGraphBenchParams(num_ops_in_cuda_graph)
-
-    assert dtype == torch.float8_e4m3fn
-
-    # Make input As and Bs
-    As, Bs = make_n_rand_tensors(arg_pool_size, torch.float8_e4m3fn, m, n, k)
-    bf16_As = [x.to(dtype=torch.bfloat16) for x in As]
-    bf16_Bs = [x.to(dtype=torch.bfloat16) for x in Bs]
-    # shuffle As and Bs to prevent any suspicion of pattern exploitation
-    random.shuffle(As)
-    random.shuffle(Bs)
-    random.shuffle(bf16_As)
-    random.shuffle(bf16_Bs)
-
-    # Make scales and biases
-    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-
-    timers = []
-
-    # pytorch impl w. bf16
-    with BenchMM(cuda_graph_params, label, sub_label,
-                 "pytorch_bf16_bf16_bf16_matmul-no-scales", torch.mm,
-                 ArgPool(bf16_As), ArgPool(bf16_Bs)) as bench:
-        timers.append(bench.run())
-
-    ## pytorch impl: bf16 output, without fp8 fast accum
-    with BenchMM(cuda_graph_params,
-                 label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm",
-                 torch._scaled_mm,
-                 ArgPool(As),
-                 ArgPool(Bs),
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16) as bench:
-        timers.append(bench.run())
-
-    ## pytorch impl: bf16 output, with fp8 fast accum
-    with BenchMM(cuda_graph_params,
-                 label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 ArgPool(As),
-                 ArgPool(Bs),
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16,
-                 use_fast_accum=True) as bench:
-        timers.append(bench.run())
-
-    ## cutlass impl: bf16 output
-    with BenchMM(cuda_graph_params, label, sub_label,
-                 "cutlass_fp8_fp8_bf16_scaled_mm", ops.cutlass_scaled_mm,
-                 ArgPool(As), ArgPool(Bs), scale_a, scale_b,
-                 torch.bfloat16) as bench:
-        timers.append(bench.run())
-
-    return timers
-
-
-def bench_v2(dtype: torch.dtype, with_cuda_graph: Optional[int],
-             with_arg_pool: Optional[int], m: int, k: int, n: int, label: str,
-             sub_label: str) -> Iterable[TMeasurement]:
-    if dtype == torch.float8_e4m3fn:
-        return bench_fp8(dtype, with_cuda_graph, with_arg_pool, m, k, n, label,
-                         sub_label)
-    raise ValueError("unsupported type")
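For context on how the machinery in the deleted bench_v2.py was driven: a minimal usage sketch, not part of this diff, assuming bench_v2.py's ArgPool/BenchMM are importable and a CUDA device is present. Each ArgPool argument position is expanded so that the i-th invocation of the benchmarked fn receives the i-th element of every pool:

    import torch
    from bench_v2 import ArgPool, BenchMM, CudaGraphBenchParams

    pool_size = 8
    As = [torch.randn(256, 256, device="cuda") for _ in range(pool_size)]
    Bs = [torch.randn(256, 256, device="cuda") for _ in range(pool_size)]

    # Eager mode: cuda_graph_params=None. Passing CudaGraphBenchParams(N)
    # instead captures N matmuls into one CUDA graph and times graph replays.
    with BenchMM(None, "mm", "MKN=(256x256x256)", "torch_mm_argpool",
                 torch.mm, ArgPool(As), ArgPool(Bs)) as bench:
        measurement = bench.run()  # a torch.utils.benchmark Measurement

Rotating through a pool of pre-made tensors keeps any one input from staying cache-resident across iterations, which is the point of the ArgPool mechanism.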
diff --git a/benchmarks/cutlass_benchmarks/dense_mm/utils.py b/benchmarks/cutlass_benchmarks/dense_mm/utils.py
deleted file mode 100644
index c8fcd50a51d31..0000000000000
--- a/benchmarks/cutlass_benchmarks/dense_mm/utils.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Cutlass bench utils
-from typing import Iterable, Tuple
-
-import torch
-
-
-def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
-    finfo = torch.finfo(torch.float8_e4m3fn)
-    return torch.round(tensor.clamp(
-        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
-
-
-def to_int8(tensor: torch.Tensor) -> torch.Tensor:
-    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
-
-
-def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
-                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
-    a = torch.randn((m, k), device='cuda') * 5
-    b = torch.randn((n, k), device='cuda').t() * 5
-
-    if dtype == torch.int8:
-        return to_int8(a), to_int8(b)
-    if dtype == torch.float8_e4m3fn:
-        return to_fp8(a), to_fp8(b)
-
-    raise ValueError("unsupported dtype")
-
-def make_n_rand_tensors(num_tensors: int, dtype: torch.dtype,
-                        m: int, n: int, k: int) -> \
-        Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
-    ABs = []
-    for _ in range(num_tensors):
-        ABs.append(make_rand_tensors(dtype, m, n, k))
-    As, Bs = zip(*ABs)
-    return list(As), list(Bs)
diff --git a/benchmarks/cutlass_benchmarks/dense_mm/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/dense_mm/w8a8_benchmarks.py
deleted file mode 100644
index a597988cd2840..0000000000000
--- a/benchmarks/cutlass_benchmarks/dense_mm/w8a8_benchmarks.py
+++ /dev/null
@@ -1,211 +0,0 @@
-import argparse
-import copy
-import itertools
-import pickle as pkl
-import time
-from typing import Iterable, List, Tuple
-
-import torch
-import torch.utils.benchmark as TBenchmark
-from bench_v1 import bench_v1
-from bench_v2 import bench_v2
-from torch.utils.benchmark import Measurement as TMeasurement
-from weight_shapes import WEIGHT_SHAPES
-
-from vllm.utils import FlexibleArgumentParser
-
-DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
-DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
-DEFAULT_TP_SIZES = [1]
-
-
-# runner
-def print_timers(timers: Iterable[TMeasurement]):
-    compare = TBenchmark.Compare(timers)
-    compare.print()
-
-
-def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
-    results = []
-    dtype = args.dtype
-
-    use_bench_v2 = args.with_cuda_graph or args.with_arg_pool
-    for m, k, n in MKNs:
-        if use_bench_v2:
-            label = f"scaled-{dtype}-gemm"
-            label = f"{label}-cugraph_{args.with_cuda_graph}" \
-                if args.with_cuda_graph else label
-            label = f"{label}-argpool_{args.with_arg_pool}" \
-                if args.with_arg_pool else label
-            timers = bench_v2(args.dtype, args.with_cuda_graph,
-                              args.with_arg_pool, m, k, n, label,
-                              f"MKN=({m}x{k}x{n})")
-        else:
-            timers = bench_v1(args.dtype, m, k, n, f"scaled-{dtype}-gemm",
-                              f"MKN=({m}x{k}x{n})")
-
-        print_timers(timers)
-        results.extend(timers)
-
-    return results
-
-
-# output makers
-def make_output(data: Iterable[TMeasurement],
-                MKNs: Iterable[Tuple[int, int, int]],
-                base_description: str,
-                timestamp=None):
-    print(f"== All Results {base_description} ====")
-    print_timers(data)
-
-    # pickle all the results
-    timestamp = int(time.time()) if timestamp is None else timestamp
-    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
-        pkl.dump(data, f)
-
-
-# argparse runners
-
-
-def run_square_bench(args):
-    dim_sizes = list(
-        range(args.dim_start, args.dim_end + 1, args.dim_increment))
-    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
-    data = run(args, MKNs)
-
-    make_output(data, MKNs, f"square_bench-{args.dtype}")
-
-
-def run_range_bench(args):
-    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
-    n = len(dim_sizes)
-    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
-    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
-    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
-    MKNs = list(zip(Ms, Ks, Ns))
-    data = run(args, MKNs)
-
-    make_output(data, MKNs, f"range_bench-{args.dtype}")
-
-
-def run_model_bench(args):
-    print("Benchmarking models:")
-    for i, model in enumerate(args.models):
-        print(f"[{i}] {model}")
-
-    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
-        KNs = []
-        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
-            if tp_split_dim is not None:
-                KN[tp_split_dim] = KN[tp_split_dim] // tp_size
-            KNs.append(KN)
-        return KNs
-
-    model_bench_data = []
-    models_tps = list(itertools.product(args.models, args.tp_sizes))
-    for model, tp_size in models_tps:
-        Ms = args.batch_sizes
-        KNs = model_shapes(model, tp_size)
-        MKNs = []
-        for m in Ms:
-            for k, n in KNs:
-                MKNs.append((m, k, n))
-
-        data = run(args, MKNs)
-        model_bench_data.append(data)
-
-    # Print all results
-    for data, model_tp in zip(model_bench_data, models_tps):
-        model, tp_size = model_tp
-        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
-        print_timers(data)
-
-    timestamp = int(time.time())
-
-    all_data = []
-    for d in model_bench_data:
-        all_data.extend(d)
-    # pickle all data
-    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
-        pkl.dump(all_data, f)
-
-
-if __name__ == '__main__':
-
-    def to_torch_dtype(dt):
-        if dt == "int8":
-            return torch.int8
-        if dt == "fp8":
-            return torch.float8_e4m3fn
-        raise ValueError("unsupported dtype")
-
-    parser = FlexibleArgumentParser(
-        description="""
-Benchmark Cutlass GEMM.
-
-    To run square GEMMs:
-        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
-
-    To run constant N and K and sweep M:
-        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
-
-    To run dimensions from a model:
-        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
-
-    Output:
-        - a .pkl file that is a list of raw torch.utils.benchmark.Measurement objects for the pytorch and cutlass implementations of the various GEMMs.
-            """,  # noqa: E501
-        formatter_class=argparse.RawTextHelpFormatter)
-
-    parser.add_argument("--dtype",
-                        type=to_torch_dtype,
-                        required=True,
-                        help="Available options are ['int8', 'fp8']")
-    parser.add_argument(
-        '--with-cuda-graph',
-        type=int,
-        default=None,
-        help="Number of ops/matmuls in a cudagraph execution. When set, "
-        "cuda-graphs is enabled.")
-    parser.add_argument(
-        '--with-arg-pool',
-        type=int,
-        default=None,
-        help="Number of A and B tensors to use as the arg-pool. When not "
-        "set, it defaults to 1.")
-
-    subparsers = parser.add_subparsers(dest="cmd")
-
-    square_parser = subparsers.add_parser("square_bench")
-    square_parser.add_argument("--dim-start", type=int, required=True)
-    square_parser.add_argument("--dim-end", type=int, required=True)
-    square_parser.add_argument("--dim-increment", type=int, required=True)
-    square_parser.set_defaults(func=run_square_bench)
-
-    range_parser = subparsers.add_parser("range_bench")
-    range_parser.add_argument("--dim-start", type=int, required=True)
-    range_parser.add_argument("--dim-end", type=int, required=True)
-    range_parser.add_argument("--dim-increment", type=int, required=True)
-    range_parser.add_argument("--m-constant", type=int, default=None)
-    range_parser.add_argument("--n-constant", type=int, default=None)
-    range_parser.add_argument("--k-constant", type=int, default=None)
-    range_parser.set_defaults(func=run_range_bench)
-
-    model_parser = subparsers.add_parser("model_bench")
-    model_parser.add_argument("--models",
-                              nargs="+",
-                              type=str,
-                              default=DEFAULT_MODELS,
-                              choices=WEIGHT_SHAPES.keys())
-    model_parser.add_argument("--tp-sizes",
-                              nargs="+",
-                              type=int,
-                              default=DEFAULT_TP_SIZES)
-    model_parser.add_argument("--batch-sizes",
-                              nargs="+",
-                              type=int,
-                              default=DEFAULT_BATCH_SIZES)
-    model_parser.set_defaults(func=run_model_bench)
-
-    args = parser.parse_args()
-    args.func(args)
diff --git a/benchmarks/cutlass_benchmarks/dense_mm/weight_shapes.py b/benchmarks/cutlass_benchmarks/dense_mm/weight_shapes.py
deleted file mode 100644
index 77f15891d84b2..0000000000000
--- a/benchmarks/cutlass_benchmarks/dense_mm/weight_shapes.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Weight Shapes are in the format
-#   ([K, N], TP_SPLIT_DIM)
-# Example:
-#  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
-#   - TP1 : K = 14336, N = 4096
-#   - TP2 : K = 7168, N = 4096
-#  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
-#   - TP1 : K = 4096, N = 6144
-#   - TP4 : K = 4096, N = 1536
-
-# TP1 shapes
-WEIGHT_SHAPES = {
-    "mistralai/Mistral-7B-v0.1": [
-        ([4096, 6144], 1),
-        ([4096, 4096], 0),
-        ([4096, 28672], 1),
-        ([14336, 4096], 0),
-    ],
-    "meta-llama/Llama-2-7b-hf": [
-        ([4096, 12288], 1),
-        ([4096, 4096], 0),
-        ([4096, 22016], 1),
-        ([11008, 4096], 0),
-    ],
-    "meta-llama/Llama-3-8b": [
-        ([4096, 6144], 1),
-        ([4096, 4096], 0),
-        ([4096, 28672], 1),
-        ([14336, 4096], 0),
-    ],
-    "meta-llama/Llama-2-13b-hf": [
-        ([5120, 15360], 1),
-        ([5120, 5120], 0),
-        ([5120, 27648], 1),
-        ([13824, 5120], 0),
-    ],
-    "meta-llama/Llama-2-70b-hf": [
-        ([8192, 10240], 1),
-        ([8192, 8192], 0),
-        ([8192, 57344], 1),
-        ([28672, 8192], 0),
-    ],
-    "meta-llama/Llama-2-70b-tp4-hf": [
-        ([8192, 2560], None),
-        ([2048, 8192], None),
-        ([8192, 14336], None),
-        ([7168, 8192], None),
-    ],
-    # The shape space is very big when benchmarking a large set of kernels.
-    # For example: Let,
-    #  - #kernels to benchmark be 1700
-    #  - #models to benchmark be 4 (each model has 4 shapes)
-    #  - #batch sizes be 6 (16, 32, 64, 128, 256, 512)
-    # If 1 kernel, 1 shape and 1 batch-size takes approx. 1 second to run on
-    # an H100, then the benchmark suite would take,
-    #  1700 * (4 * 4) * 6 = 163200 seconds => ~45 hrs.
-    # Below, we exploit some observations on the benchmark shapes to create a
-    # representative set.
-    #
-    # From previous benchmarking runs, we observe that perf is stratified as,
-    # N - small, medium, large and K - small and large. We also observe that
-    # in the model shapes, when K is small, we have small, medium and large
-    # Ns. When K is large, we only have small Ns.
-    #
-    # models : ['meta-llama/Llama-2-7b-hf', 'meta-llama/Llama-3-8b',
-    #           'meta-llama/Llama-2-13b-hf', 'meta-llama/Llama-2-70b-tp4-hf']
-    # Ks : [2048, 4096, 5120, 7168, 8192, 11008, 13824, 14336]
-    # Ns : [2560, 4096, 5120, 6144, 8192, 12288, 14336, 15360,
-    #       22016, 27648, 28672]
-    "llama-representative-set": [
-        ([4096, 4096], None),  # small K, small N
-        ([4096, 8192], None),  # small K, medium N
-        ([4096, 22016], None),  # small K, large N
-        ([14336, 4096], None),  # large K, small N
-        ([8192, 14336], None),  # medium K, large N (from llama-2-70b-tp4-hf)
-    ],
-}
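The cost estimate in the comment above is easy to reproduce. A back-of-the-envelope check (the one-second-per-GEMM figure is the comment's own approximation):

    kernels = 1700
    shapes = 4 * 4  # 4 models x 4 shapes each
    batch_sizes = 6
    total_seconds = kernels * shapes * batch_sizes  # 163200
    print(total_seconds / 3600)  # ~45.3 hours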
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
new file mode 100644
index 0000000000000..abcde3b016a7b
--- /dev/null
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -0,0 +1,389 @@
+import argparse
+import copy
+import itertools
+import pickle as pkl
+import time
+from typing import Callable, Iterable, List, Tuple
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+from vllm.utils import FlexibleArgumentParser
+
+DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+DEFAULT_TP_SIZES = [1]
+
+# helpers
+
+
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    return torch.round(tensor.clamp(
+        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
+
+
+def to_int8(tensor: torch.Tensor) -> torch.Tensor:
+    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
+
+
+def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
+                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
+    a = torch.randn((m, k), device='cuda') * 5
+    b = torch.randn((n, k), device='cuda').t() * 5
+
+    if dtype == torch.int8:
+        return to_int8(a), to_int8(b)
+    if dtype == torch.float8_e4m3fn:
+        return to_fp8(a), to_fp8(b)
+
+    raise ValueError("unsupported dtype")
+
+
+# bench
+def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
+             **kwargs) -> TMeasurement:
+    min_run_time = 1
+
+    globals = {
+        "args": args,
+        "kwargs": kwargs,
+        "fn": fn,
+    }
+    return TBenchmark.Timer(
+        stmt="fn(*args, **kwargs)",
+        globals=globals,
+        label=label,
+        sub_label=sub_label,
+        description=description,
+    ).blocked_autorange(min_run_time=min_run_time)
+
+
+def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+               sub_label: str) -> Iterable[TMeasurement]:
+    assert dtype == torch.int8
+    a, b = make_rand_tensors(torch.int8, m, n, k)
+    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+    azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
+    azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)
+
+    timers = []
+    # pytorch impl - bfloat16
+    timers.append(
+        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
+                 torch.mm, a.to(dtype=torch.bfloat16),
+                 b.to(dtype=torch.bfloat16)))
+
+    # pytorch impl - float16
+    timers.append(
+        bench_fn(label, sub_label,
+                 "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
+                 a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
+
+    # cutlass impl
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
+                 torch.bfloat16))
+
+    # cutlass with bias
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
+                 bias))
+
+    # cutlass with azp per-tensor
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp",
+                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
+                 torch.bfloat16, azp_adj))
+
+    # cutlass with azp per-tensor + bias
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias",
+                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
+                 torch.bfloat16, azp_adj, None, bias))
+
+    # cutlass with azp per-token
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt",
+                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
+                 torch.bfloat16, azp_adj, azp))
+
+    # cutlass with azp per-token + bias
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias",
+                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
+                 torch.bfloat16, azp_adj, azp, bias))
+
+    return timers
+
+
+def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+              sub_label: str) -> Iterable[TMeasurement]:
+    assert dtype == torch.float8_e4m3fn
+    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
+    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+
+    timers = []
+
+    # pytorch impl w. bf16
+    timers.append(
+        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
+                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
+                 b.to(dtype=torch.bfloat16, device="cuda")))
+
+    # pytorch impl: bf16 output, without fp8 fast accum
+    timers.append(
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_bf16_scaled_mm",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.bfloat16))
+
+    # pytorch impl: bf16 output, with fp8 fast accum
+    timers.append(
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.bfloat16,
+                 use_fast_accum=True))
+
+    # pytorch impl: fp16 output, without fp8 fast accum
+    timers.append(
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_fp16_scaled_mm",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.float16))
+
+    # pytorch impl: fp16 output, with fp8 fast accum
+    timers.append(
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.float16,
+                 use_fast_accum=True))
+
+    # cutlass impl: bf16 output
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
+                 torch.bfloat16))
+    # cutlass impl: fp16 output
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16))
+
+    # cutlass impl: bf16 output, with bias
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
+                 bias))
+
+    # cutlass impl: fp16 output, with bias
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16,
+                 bias.to(dtype=torch.float16)))
+
+    return timers
+
+
+def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+          sub_label: str) -> Iterable[TMeasurement]:
+    if dtype == torch.int8:
+        return bench_int8(dtype, m, k, n, label, sub_label)
+    if dtype == torch.float8_e4m3fn:
+        return bench_fp8(dtype, m, k, n, label, sub_label)
+    raise ValueError("unsupported type")
+
+
+# runner
+def print_timers(timers: Iterable[TMeasurement]):
+    compare = TBenchmark.Compare(timers)
+    compare.print()
+
+
+def run(dtype: torch.dtype,
+        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+    results = []
+    for m, k, n in MKNs:
+        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
+                       f"MKN=({m}x{k}x{n})")
+        print_timers(timers)
+        results.extend(timers)
+
+    return results
+
+
+# output makers
+def make_output(data: Iterable[TMeasurement],
+                MKNs: Iterable[Tuple[int, int, int]],
+                base_description: str,
+                timestamp=None):
+    print(f"== All Results {base_description} ====")
+    print_timers(data)
+
+    # pickle all the results
+    timestamp = int(time.time()) if timestamp is None else timestamp
+    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
+        pkl.dump(data, f)
+
+
+# argparse runners
+
+
+def run_square_bench(args):
+    dim_sizes = list(
+        range(args.dim_start, args.dim_end + 1, args.dim_increment))
+    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
+    data = run(args.dtype, MKNs)
+
+    make_output(data, MKNs, f"square_bench-{args.dtype}")
+
+
+def run_range_bench(args):
+    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
+    n = len(dim_sizes)
+    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
+    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
+    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
+    MKNs = list(zip(Ms, Ks, Ns))
+    data = run(args.dtype, MKNs)
+
+    make_output(data, MKNs, f"range_bench-{args.dtype}")
+
+
+def run_model_bench(args):
+    print("Benchmarking models:")
+    for i, model in enumerate(args.models):
+        print(f"[{i}] {model}")
+
+    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+        KNs = []
+        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            KNs.append(KN)
+        return KNs
+
+    model_bench_data = []
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    for model, tp_size in models_tps:
+        Ms = args.batch_sizes
+        KNs = model_shapes(model, tp_size)
+        MKNs = []
+        for m in Ms:
+            for k, n in KNs:
+                MKNs.append((m, k, n))
+
+        data = run(args.dtype, MKNs)
+        model_bench_data.append(data)
+
+    # Print all results
+    for data, model_tp in zip(model_bench_data, models_tps):
+        model, tp_size = model_tp
+        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
+        print_timers(data)
+
+    timestamp = int(time.time())
+
+    all_data = []
+    for d in model_bench_data:
+        all_data.extend(d)
+    # pickle all data
+    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
+        pkl.dump(all_data, f)
+
+
+if __name__ == '__main__':
+
+    def to_torch_dtype(dt):
+        if dt == "int8":
+            return torch.int8
+        if dt == "fp8":
+            return torch.float8_e4m3fn
+        raise ValueError("unsupported dtype")
+
+    parser = FlexibleArgumentParser(
+        description="""
+Benchmark Cutlass GEMM.
+
+    To run square GEMMs:
+        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
+
+    To run constant N and K and sweep M:
+        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
+
+    To run dimensions from a model:
+        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
+
+    Output:
+        - a .pkl file that is a list of raw torch.utils.benchmark.Measurement objects for the pytorch and cutlass implementations of the various GEMMs.
+ """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument("--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['int8', 'fp8']") + subparsers = parser.add_subparsers(dest="cmd") + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--dim-start", type=int, required=True) + range_parser.add_argument("--dim-end", type=int, required=True) + range_parser.add_argument("--dim-increment", type=int, required=True) + range_parser.add_argument("--m-constant", type=int, default=None) + range_parser.add_argument("--n-constant", type=int, default=None) + range_parser.add_argument("--k-constant", type=int, default=None) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument("--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys()) + model_parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + model_parser.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) \ No newline at end of file diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py new file mode 100644 index 0000000000000..d58fb0bf86374 --- /dev/null +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -0,0 +1,43 @@ +# Weight Shapes are in the format +# ([K, N], TP_SPLIT_DIM) +# Example: +# A shape of ([14336, 4096], 0) indicates the following GEMM shape, +# - TP1 : K = 14336, N = 4096 +# - TP2 : K = 7168, N = 4096 +# A shape of ([4096, 6144], 1) indicates the following GEMM shape, +# - TP1 : K = 4096, N = 6144 +# - TP4 : K = 4096, N = 1536 + +# TP1 shapes +WEIGHT_SHAPES = { + "mistralai/Mistral-7B-v0.1": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-7b-hf": [ + ([4096, 12288], 1), + ([4096, 4096], 0), + ([4096, 22016], 1), + ([11008, 4096], 0), + ], + "meta-llama/Llama-3-8b": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-13b-hf": [ + ([5120, 15360], 1), + ([5120, 5120], 0), + ([5120, 27648], 1), + ([13824, 5120], 0), + ], + "meta-llama/Llama-2-70b-hf": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], +} \ No newline at end of file