diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu
index b3cccd4c566fb..69d66b5d7101e 100644
--- a/csrc/moe/marlin_moe_ops.cu
+++ b/csrc/moe/marlin_moe_ops.cu
@@ -157,6 +157,7 @@ thread_config_t small_batch_thread_configs[] = {
     {128, 64, 128},   // Reduce N 2X, same K
     {64, 256, 256},   // Reduce K 2X, increase N 2X
     {64, 128, 128},   // Reduce K 2X, same N
+    {64, 64, 128},    // Reduce both 2X
 };
 
 thread_config_t large_batch_thread_configs[] = {
@@ -167,6 +168,7 @@ thread_config_t large_batch_thread_configs[] = {
     {128, 128, 256},  // Reduce N 2X, increase K 2X
     {64, 128, 128},   // Reduce N 2X, same K
     {128, 64, 128},   // Reduce N 4X, increase K 2X
+    {64, 64, 128},    // Reduce N 4X, same K
 };
 
 int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 66f589dba7851..5964d5a5465fd 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -218,7 +218,7 @@ def fused_marlin_moe(
 
     sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E)
 
-    max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16
+    max_workspace_size = (max(2 * N, K) // 64) * 16
     workspace = torch.zeros(max_workspace_size,
                             dtype=torch.int,
                             device="cuda",