Commit a966417

Make workspace smaller, add very small thread config
ElizaWszola committed Oct 2, 2024
1 parent 8fe6da4 commit a966417
Showing 2 changed files with 3 additions and 1 deletion.
csrc/moe/marlin_moe_ops.cu: 2 additions, 0 deletions

@@ -157,6 +157,7 @@ thread_config_t small_batch_thread_configs[] = {
     {128, 64, 128},   // Reduce N 2X, same K
     {64, 256, 256},   // Reduce K 2X, increase N 2X
     {64, 128, 128},   // Reduce K 2X, same N
+    {64, 64, 128},    // Reduce both 2X
 };
 
 thread_config_t large_batch_thread_configs[] = {
@@ -167,6 +168,7 @@ thread_config_t large_batch_thread_configs[] = {
     {128, 128, 256},  // Reduce N 2X, increase K 2X
     {64, 128, 128},   // Reduce N 2X, same K
     {128, 64, 128},   // Reduce N 4X, increase K 2X
+    {64, 64, 128},    // Reduce N 4X, same K
 };
 
 int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
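
Note (not part of the diff): judging from the comments above, each thread_config_t entry appears to be a {thread_k, thread_n, num_threads} triple, and the launcher tries the entries in priority order until one fits the problem shape. The Python sketch below only illustrates that selection idea; the field order, the divisibility rule, and the pick_config helper are assumptions made for illustration, not the kernel's actual logic.

    # Hedged sketch of how a config list like the one above could be consumed.
    # Field order (thread_k, thread_n, num_threads) and the divisibility check
    # are assumptions for illustration, not vLLM's exact selection logic.
    SMALL_BATCH_CONFIGS = [
        (128, 64, 128),   # Reduce N 2X, same K
        (64, 256, 256),   # Reduce K 2X, increase N 2X
        (64, 128, 128),   # Reduce K 2X, same N
        (64, 64, 128),    # Reduce both 2X (entry added by this commit)
    ]

    def pick_config(prob_n: int, prob_k: int):
        """Return the first (thread_k, thread_n, num_threads) tile that divides the shape."""
        for thread_k, thread_n, num_threads in SMALL_BATCH_CONFIGS:
            if prob_k % thread_k == 0 and prob_n % thread_n == 0:
                return thread_k, thread_n, num_threads
        return None

    # With the new {64, 64, 128} entry, a very small shape such as N = 64, K = 64
    # now gets a matching tile; none of the earlier entries fit it.
    print(pick_config(64, 64))  # -> (64, 64, 128)
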
vllm/model_executor/layers/fused_moe/fused_marlin_moe.py: 1 addition, 1 deletion

@@ -218,7 +218,7 @@ def fused_marlin_moe(
 
     sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E)
 
-    max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16
+    max_workspace_size = (max(2 * N, K) // 64) * 16
     workspace = torch.zeros(max_workspace_size,
                             dtype=torch.int,
                             device="cuda",
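
The substantive change in this file is that the workspace size no longer scales with the batch dimension M. The short Python sketch below simply compares the removed and added formulas on made-up shapes (the values of M, N, K are illustrative, not taken from the repository):

    # Hedged before/after comparison of the workspace-size formulas from the diff.
    # M, N, K are illustrative shapes only.
    M, N, K = 1024, 4096, 4096

    old_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16  # removed line
    new_size = (max(2 * N, K) // 64) * 16                       # added line

    print(old_size)  # 8192, grows with ceil(M / 256)
    print(new_size)  # 2048, independent of M

Together with the new {64, 64, 128} thread configs, this keeps the torch.zeros workspace allocation proportional to max(2 * N, K) alone, which matches the commit title's "make workspace smaller".
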
