diff --git a/cd-moe/modeling_deepseek_v2.py b/cd-moe/modeling_deepseek_v2.py index d51262af8..db31cc8d2 100644 --- a/cd-moe/modeling_deepseek_v2.py +++ b/cd-moe/modeling_deepseek_v2.py @@ -401,14 +401,6 @@ def backward(ctx, grad_output): condense_layer_order_path = os.path.join( current_dir, "layer_idx_order.e6.json") condense_layer_order = json.load(open(condense_layer_order_path, 'r')) - -trim_layer_idxs = [] -layer_map_trim = {} -new_layer_idx = 0 -for origin_layer_idx in range(27): - layer_map_trim[origin_layer_idx] = new_layer_idx - new_layer_idx += 1 -condense_layer_order = list(filter(lambda x: x not in trim_layer_idxs, condense_layer_order)) prune_layer_idxs = condense_layer_order[:prune_layer_num] print("condense layer idx {}".format(prune_layer_idxs)) @@ -690,7 +682,6 @@ def forward( if past_key_value is not None: # print(self.layer_idx) cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - # cache_idx = layer_map_trim[self.layer_idx-1] + 1 if self.layer_idx > 0 else 0 key_states, value_states = past_key_value.update( key_states, value_states, self.layer_idx, cache_kwargs)