diff --git a/cd-moe/modeling_deepseek_v2.py b/cd-moe/modeling_deepseek_v2.py index db31cc8d2..69ae36bfb 100644 --- a/cd-moe/modeling_deepseek_v2.py +++ b/cd-moe/modeling_deepseek_v2.py @@ -1276,13 +1276,6 @@ def __init__(self, config: DeepseekConfig): self.gradient_checkpointing = False # Initialize weights and apply final processing - - global layer_num, trim_layer_idxs - self.layer_num = layer_num - self.trim_layer_idxs = trim_layer_idxs - - # self.trim_layer_idxs = [i+1 for i in trim_layer_idxs] # add first ffn layer - self.post_init() def get_input_embeddings(self): @@ -1379,15 +1372,6 @@ def forward( next_decoder_cache = None for tmp_layer_idx, decoder_layer in enumerate(self.layers): - if tmp_layer_idx > 0: - global global_layer - relative_layer = global_layer % self.layer_num - if relative_layer in self.trim_layer_idxs: - # print("layer_num {} current_layer {}, BLOCK_TRIM layer".format( - # self.layer_num, relative_layer)) - global_layer +=1 - continue - if output_hidden_states: all_hidden_states += (hidden_states,)