From 3bf9a0b672b9616d4508c77a0c2e2dcb2c954b5f Mon Sep 17 00:00:00 2001
From: yisheng
Date: Mon, 22 Jul 2024 12:03:00 +0800
Subject: [PATCH] grad_weight can't be None when running with DeepSpeed,
 since ZeRO-3 partitions the gradient

---
 megatron/core/tensor_parallel/layers.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py
index 2245113c9c..23200d71d3 100644
--- a/megatron/core/tensor_parallel/layers.py
+++ b/megatron/core/tensor_parallel/layers.py
@@ -285,6 +285,7 @@ def forward(ctx, input, weight, bias, gradient_accumulation_fusion,
     @staticmethod
     @custom_bwd
     def backward(ctx, grad_output):
+        args = get_args()
         input, weight = ctx.saved_tensors
         use_bias = ctx.use_bias
 
@@ -366,9 +367,13 @@ def backward(ctx, grad_output):
         #     grad_weight = None
         # else:
         #     grad_weight = grad_output.t().matmul(total_input)
-        from megatron.core.tensor_parallel.weight_grad_store import WeightGradStore
-        WeightGradStore.put(total_input, grad_output, weight, gradientUpdateFunction)
-        grad_weight = None
+        if args.enable_zbh1_pipeline:
+            from megatron.core.tensor_parallel.weight_grad_store import WeightGradStore
+            WeightGradStore.put(total_input, grad_output, weight, gradientUpdateFunction)
+            grad_weight = None
+        else:
+            grad_weight = grad_output.t().matmul(total_input)
+
         grad_bias = grad_output.sum(dim=0) if use_bias else None
 
         if ctx.sequence_parallel:
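
For context, the following is a minimal, self-contained sketch of the control flow this patch introduces; it is not the Megatron-DeepSpeed implementation. SimpleWeightGradStore, gradient_update, and backward_weight_grad are hypothetical stand-ins; only the enable_zbh1_pipeline flag and the WeightGradStore.put call pattern come from the patch itself. The point is that the deferred weight-gradient path (which leaves grad_weight as None) is taken only under the zero-bubble H1 pipeline; on the default path the weight gradient is computed eagerly, so DeepSpeed ZeRO-3, which partitions and reduces each gradient right after backward, always receives a real tensor.

# Minimal sketch (not the Megatron-DeepSpeed code) of the branch added by the patch.
import queue
import torch


class SimpleWeightGradStore:
    """Queues deferred weight-gradient computations (zero-bubble H1 style)."""
    cache = queue.Queue()

    @classmethod
    def put(cls, total_input, grad_output, weight, func):
        cls.cache.put((total_input, grad_output, weight, func))

    @classmethod
    def pop_all(cls):
        # Later pipeline phase: actually compute the queued weight gradients.
        while not cls.cache.empty():
            total_input, grad_output, weight, func = cls.cache.get()
            func(total_input, grad_output, weight)


def gradient_update(total_input, grad_output, weight):
    # Accumulate the weight gradient in place, assuming the weight carries a
    # pre-allocated .main_grad buffer (an assumption of this sketch).
    weight.main_grad.add_(grad_output.t().matmul(total_input))


def backward_weight_grad(total_input, grad_output, weight, enable_zbh1_pipeline):
    """Mirrors the patched branch: defer only when zero-bubble H1 is enabled."""
    if enable_zbh1_pipeline:
        # Zero-bubble schedule: postpone the weight-grad GEMM; returning None
        # is acceptable because the pipeline computes it before the optimizer step.
        SimpleWeightGradStore.put(total_input, grad_output, weight, gradient_update)
        return None
    # Default path (e.g. DeepSpeed ZeRO-3): the gradient tensor is partitioned
    # and reduced immediately after backward, so it must exist here.
    return grad_output.t().matmul(total_input)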