From 3bf9a0b672b9616d4508c77a0c2e2dcb2c954b5f Mon Sep 17 00:00:00 2001
From: yisheng
Date: Mon, 22 Jul 2024 12:03:00 +0800
Subject: [PATCH] grad_weight can't be None when running with DeepSpeed,
 since ZeRO-3 partitions the gradient

---
 megatron/core/tensor_parallel/layers.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py
index 2245113c9c..23200d71d3 100644
--- a/megatron/core/tensor_parallel/layers.py
+++ b/megatron/core/tensor_parallel/layers.py
@@ -285,6 +285,7 @@ def forward(ctx, input, weight, bias, gradient_accumulation_fusion,
     @staticmethod
     @custom_bwd
     def backward(ctx, grad_output):
+        args = get_args()
         input, weight = ctx.saved_tensors
         use_bias = ctx.use_bias
 
@@ -366,9 +367,13 @@ def backward(ctx, grad_output):
         #     grad_weight = None
         # else:
         #     grad_weight = grad_output.t().matmul(total_input)
-        from megatron.core.tensor_parallel.weight_grad_store import WeightGradStore
-        WeightGradStore.put(total_input, grad_output, weight, gradientUpdateFunction)
-        grad_weight = None
+        if args.enable_zbh1_pipeline:
+            from megatron.core.tensor_parallel.weight_grad_store import WeightGradStore
+            WeightGradStore.put(total_input, grad_output, weight, gradientUpdateFunction)
+            grad_weight = None
+        else:
+            grad_weight = grad_output.t().matmul(total_input)
+
         grad_bias = grad_output.sum(dim=0) if use_bias else None
 
         if ctx.sequence_parallel:
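
For context, the following is a minimal, self-contained sketch of the control flow this patch introduces; it is not the Megatron-DeepSpeed implementation. SimpleWeightGradStore, gradient_update, and backward_weight_grad are hypothetical stand-ins; only the enable_zbh1_pipeline flag and the WeightGradStore.put call pattern come from the patch itself. The point is that the deferred weight-gradient path (which leaves grad_weight as None) is taken only under the zero-bubble H1 pipeline; on the default path the weight gradient is computed eagerly, so DeepSpeed ZeRO-3, which partitions and reduces each gradient right after backward, always receives a real tensor.

# Minimal sketch (not the Megatron-DeepSpeed code) of the branch added by the patch.
import queue
import torch


class SimpleWeightGradStore:
    """Queues deferred weight-gradient computations (zero-bubble H1 style)."""
    cache = queue.Queue()

    @classmethod
    def put(cls, total_input, grad_output, weight, func):
        cls.cache.put((total_input, grad_output, weight, func))

    @classmethod
    def pop_all(cls):
        # Later pipeline phase: actually compute the queued weight gradients.
        while not cls.cache.empty():
            total_input, grad_output, weight, func = cls.cache.get()
            func(total_input, grad_output, weight)


def gradient_update(total_input, grad_output, weight):
    # Accumulate the weight gradient in place, assuming the weight carries a
    # pre-allocated .main_grad buffer (an assumption of this sketch).
    weight.main_grad.add_(grad_output.t().matmul(total_input))


def backward_weight_grad(total_input, grad_output, weight, enable_zbh1_pipeline):
    """Mirrors the patched branch: defer only when zero-bubble H1 is enabled."""
    if enable_zbh1_pipeline:
        # Zero-bubble schedule: postpone the weight-grad GEMM; returning None
        # is acceptable because the pipeline computes it before the optimizer step.
        SimpleWeightGradStore.put(total_input, grad_output, weight, gradient_update)
        return None
    # Default path (e.g. DeepSpeed ZeRO-3): the gradient tensor is partitioned
    # and reduced immediately after backward, so it must exist here.
    return grad_output.t().matmul(total_input)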