From e98bc45f96df15f64fee44f04d2a962dfb488ff0 Mon Sep 17 00:00:00 2001
From: ElizaWszola
Date: Wed, 25 Sep 2024 02:43:33 -0400
Subject: [PATCH] clean up unit tests, disable single awq test

---
 tests/kernels/test_awq_marlin.py | 50 +++-----------------------------
 tests/kernels/test_moe.py        | 46 ++---------------------------
 tests/kernels/utils.py           | 45 ++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 90 deletions(-)

diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/test_awq_marlin.py
index e408636bdb2d2..4481ebaf9c1ff 100644
--- a/tests/kernels/test_awq_marlin.py
+++ b/tests/kernels/test_awq_marlin.py
@@ -2,12 +2,11 @@
 
 Run `pytest tests/kernels/test_awq_marlin.py`.
 """
-from typing import List
-
 import pytest
 import torch
 
-from vllm.model_executor.layers.activation import SiluAndMul
+from tests.kernels.utils import (compute_max_diff, stack_and_dev, torch_moe,
+                                 torch_moe_single)
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
     fused_marlin_moe, single_marlin_moe)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
@@ -16,47 +15,6 @@
 from vllm.scalar_type import scalar_types
 
 
-def stack_and_dev(tensors: List[torch.Tensor]):
-    dev = tensors[0].device
-    return torch.stack(tensors, dim=0).to(dev)
-
-
-def compute_max_diff(output, output_ref):
-    return torch.mean(torch.abs(output - output_ref)) / torch.mean(
-        torch.abs(output_ref))
-
-
-def torch_moe(a, w1, w2, score, topk):
-    B, D = a.shape
-    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
-    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
-    score = torch.softmax(score, dim=-1, dtype=torch.float32)
-    topk_weight, topk_ids = torch.topk(score, topk)
-    topk_weight = topk_weight.view(-1)
-    topk_ids = topk_ids.view(-1)
-    for i in range(w1.shape[0]):
-        mask = topk_ids == i
-        if mask.sum():
-            out[mask] = SiluAndMul()(
-                a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
-    return (out.view(B, -1, w2.shape[1]) *
-            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
-
-
-def torch_moe_single(a, w, score, topk):
-    B, D = a.shape
-    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
-    out = torch.zeros(B * topk, w.shape[1], dtype=a.dtype, device=a.device)
-    score = torch.softmax(score, dim=-1, dtype=torch.float32)
-    _, topk_ids = torch.topk(score, topk)
-    topk_ids = topk_ids.view(-1)
-    for i in range(w.shape[0]):
-        mask = topk_ids == i
-        if mask.sum():
-            out[mask] = a[mask] @ w[i].transpose(0, 1)
-    return (out.view(B, -1, w.shape[1])).sum(dim=1)
-
-
 @pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
 @pytest.mark.parametrize("n", [128, 2048, 256, 1024])
 @pytest.mark.parametrize("k", [128, 1024, 512])
@@ -148,8 +106,8 @@ def test_fused_marlin_moe_awq(
     assert compute_max_diff(marlin_output, torch_output) < 4e-2
 
 
-# @pytest.mark.skip("This test is here for the sake of debugging, "
-#                   "don't run it in automated tests.")
+@pytest.mark.skip("This test is here for the sake of debugging, "
+                  "don't run it in automated tests.")
 @pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
 @pytest.mark.parametrize("n", [128, 2048, 256, 1024])
 @pytest.mark.parametrize("k", [128, 1024, 512])
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index f5e8b3e270a91..5f03a65c62f95 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -2,14 +2,13 @@
 
 Run `pytest tests/kernels/test_moe.py`.
 """
-from typing import List
-
 import pytest
 import torch
 from transformers import MixtralConfig
 from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
 
-from vllm.model_executor.layers.activation import SiluAndMul
+from tests.kernels.utils import (compute_max_diff, stack_and_dev, torch_moe,
+                                 torch_moe_single)
 from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
     fused_marlin_moe, single_marlin_moe)
@@ -21,37 +20,6 @@
 from vllm.utils import seed_everything
 
 
-def torch_moe(a, w1, w2, score, topk):
-    B, D = a.shape
-    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
-    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
-    score = torch.softmax(score, dim=-1, dtype=torch.float32)
-    topk_weight, topk_ids = torch.topk(score, topk)
-    topk_weight = topk_weight.view(-1)
-    topk_ids = topk_ids.view(-1)
-    for i in range(w1.shape[0]):
-        mask = topk_ids == i
-        if mask.sum():
-            out[mask] = SiluAndMul()(
-                a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
-    return (out.view(B, -1, w2.shape[1]) *
-            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
-
-
-def torch_moe_single(a, w, score, topk):
-    B, D = a.shape
-    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
-    out = torch.zeros(B * topk, w.shape[1], dtype=a.dtype, device=a.device)
-    score = torch.softmax(score, dim=-1, dtype=torch.float32)
-    _, topk_ids = torch.topk(score, topk)
-    topk_ids = topk_ids.view(-1)
-    for i in range(w.shape[0]):
-        mask = topk_ids == i
-        if mask.sum():
-            out[mask] = a[mask] @ w[i].transpose(0, 1)
-    return (out.view(B, -1, w.shape[1])).sum(dim=1)
-
-
 @pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
 @pytest.mark.parametrize("n", [2048, 256, 1024])
 @pytest.mark.parametrize("k", [128, 511, 1024])
@@ -124,16 +92,6 @@ def test_mixtral_moe(dtype: torch.dtype):
                                atol=mixtral_moe_tol[dtype])
 
 
-def stack_and_dev(tensors: List[torch.Tensor]):
-    dev = tensors[0].device
-    return torch.stack(tensors, dim=0).to(dev)
-
-
-def compute_max_diff(output, output_ref):
-    return torch.mean(torch.abs(output - output_ref)) / torch.mean(
-        torch.abs(output_ref))
-
-
 @pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
 @pytest.mark.parametrize("n", [128, 2048, 256, 1024])
 @pytest.mark.parametrize("k", [128, 1024, 512])
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index 5746932c30a45..41dfdcd08ff08 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -10,6 +10,7 @@
 import torch
 
 from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
+from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
                         make_tensor_with_pad)
 
@@ -960,3 +961,47 @@ def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket,
         kwargs,
         test_utils=test_utils,
         raise_exception=raise_exception) if cond else {}
+
+
+# Marlin MoE test utils
+
+
+def stack_and_dev(tensors: List[torch.Tensor]):
+    dev = tensors[0].device
+    return torch.stack(tensors, dim=0).to(dev)
+
+
+def compute_max_diff(output, output_ref):
+    return torch.mean(torch.abs(output - output_ref)) / torch.mean(
+        torch.abs(output_ref))
+
+
+def torch_moe(a, w1, w2, score, topk):
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            out[mask] = SiluAndMul()(
+                a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
+    return (out.view(B, -1, w2.shape[1]) *
+            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
+
+
+def torch_moe_single(a, w, score, topk):
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    _, topk_ids = torch.topk(score, topk)
+    topk_ids = topk_ids.view(-1)
+    for i in range(w.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            out[mask] = a[mask] @ w[i].transpose(0, 1)
+    return (out.view(B, -1, w.shape[1])).sum(dim=1)