From e133fc69ca63081d65a503f9042003c97207da88 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sun, 15 Dec 2024 18:50:29 -0500 Subject: [PATCH] cleanup mixtral.cc --- inference/models/mixtral.cc | 61 +++++++++++-------------------------- 1 file changed, 18 insertions(+), 43 deletions(-) diff --git a/inference/models/mixtral.cc b/inference/models/mixtral.cc index 942c0f421b..3d17a5f584 100644 --- a/inference/models/mixtral.cc +++ b/inference/models/mixtral.cc @@ -66,7 +66,6 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, NULL, embed_init, "embed_tokens"); - // token has dimensions (hidden_size, 1, 128) Tensor mlp_out = nullptr; @@ -88,13 +87,9 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { -// printf("before first rms norm in layer %d token has %d dims\n",i, token->num_dims); -// printf("before first rms norm in layer %d mlp_out has %d dims\n",i, token->num_dims); -// printf("before first rms norm in layer %d token dims are %d %d %d %d\n",i, token->dims[0], token->dims[1], token->dims[2], token->dims[3]); -// printf("before first rms norm in layer %d, mlp_out dims are %d %d %d %d\n",i, mlp_out->dims[0], mlp_out->dims[1], mlp_out->dims[2], mlp_out->dims[3]); ff.residual_rms_norm( - token, // (1024, 1, 128) confirmed 3 dims - mlp_out, // (1024, 1, 128) confirmed 3 dims + token, // (1024, batch, sequence) + mlp_out, // (1024, batch, sequence) token_att_norm, mixtral_config.rms_norm_eps, mixtral_config.hidden_size, @@ -105,9 +100,7 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, token = token_att_norm[0]; att_norm = token_att_norm[1]; } - // token has dimensions (hidden_size, 1, 128) - - + // token has dimensions (hidden_size, batch, sequence) Tensor qkv_proj = ff.dense( att_norm, @@ -225,12 +218,12 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, DT_NONE, std::string("layers." + std::to_string(i) + ".post_attention_layernorm") .c_str()); - token = token_ff_norm[0]; // token has dimensions (hidden_size, 1, 128) + token = token_ff_norm[0]; // token has dimensions (hidden_size, batch, sequence) Tensor ff_norm = token_ff_norm[1]; // MoE Tensor gate = ff.dense( - ff_norm, // (hidden_size, 1, 128) + ff_norm, // (hidden_size, batch, sequence) mixtral_config.num_local_experts, AC_MODE_NONE, false, @@ -243,7 +236,7 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, std::string("layers." + std::to_string(i) + ".block_sparse_moe_gate") .c_str()); gate = ff.softmax( - gate, // (num_experts, 1, 128) + gate, // (num_experts, batch, sequence) 0, DT_NONE, std::string("layers." + std::to_string(i) + ".block_sparse_moe_softmax") @@ -252,43 +245,30 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, Tensor topk_out[2] = {nullptr, nullptr}; ff.top_k( - gate, // (num_experts, 1, 128) + gate, // (num_experts, batch, sequence) topk_out, mixtral_config.num_experts_per_tok, false, std::string("layers." + std::to_string(i) + ".block_sparse_moe_topk") .c_str()); - Tensor topk_values = topk_out[0]; // (experts_per_tok, 1, 128) (confirmed 3 dims) - Tensor topk_indices = topk_out[1]; // (experts_per_tok, 1, 128) (confirmed 3 dims) + Tensor topk_values = topk_out[0]; // (experts_per_tok, batch, sequence) + Tensor topk_indices = topk_out[1]; // (experts_per_tok, batch, sequence) Tensor grouped_tokens[mixtral_config.num_local_experts] = {nullptr}; - ff.group_by( // TODO this group_by does not crash, but it sets all tokens to 0 or something! 
Need to figure out why it make outptu tokens all the same - ff_norm, // (hidden_size, 1, 128) + ff.group_by( + ff_norm, // (hidden_size, batch, sequence) topk_indices, grouped_tokens, mixtral_config.num_local_experts, - 0.0f, // TODO understand why this does not cause a dimension of 128? maybe the 128 is never set? + 0.0f, std::string("layers." + std::to_string(i) + ".block_sparse_moe_groupby") .c_str()); - // Can use this to create a grouped_tokens2 used no where just to see if group_by can run successfully -// Tensor grouped_tokens2[mixtral_config.num_local_experts] = {nullptr}; -// ff.group_by( -// ff_norm, // (hidden_size, 1, 128) -// topk_indices, -// grouped_tokens2, -// mixtral_config.num_local_experts, -// 1.0f, // TODO understand why this does not cause a dimension of 128? maybe the 128 is never set? -// std::string("layers." + std::to_string(i) + ".block_sparse_moe_groupby") -// .c_str()); - - Tensor aggregate_inputs[4 + mixtral_config.num_local_experts] = {nullptr}; for (int expert_idx = 0; expert_idx < mixtral_config.num_local_experts; expert_idx++) { - // grouped_tokens[expert_idx] = ff_norm; // TODO this is a dirty fix. Restore using group_by! - Tensor w1 = ff.dense(grouped_tokens[expert_idx], // (hidden_size, 1, result of calc in groupby) + Tensor w1 = ff.dense(grouped_tokens[expert_idx], // (hidden_size, batch, max tokens per expert) mixtral_config.intermediate_size, AC_MODE_NONE, false, @@ -348,10 +328,7 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, // Tensor topk_values_reduced = ff.reduce_sum(topk_values, {0}, true); // topk_values = ff.divide(topk_values, topk_values_reduced); -// mlp_out = aggregate_inputs[5]; // TODO don't use only one expert - -// Everything below is needed to use aggregate // TODO try not needing the _dummy stuff - + // TODO have 2 fixed inputs instead of 4 Tensor topk_values_DUMMY = ff.softmax( topk_values, -1, @@ -360,7 +337,7 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, .c_str()); Tensor gate_DUMMY = ff.softmax( - gate, // (num_experts, 1, 128) + gate, // (num_experts, batch, sequence) -1, DT_NONE, std::string("layers." + std::to_string(i) + ".dummy") @@ -370,7 +347,7 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, aggregate_inputs[1] = topk_indices; aggregate_inputs[2] = topk_values_DUMMY; aggregate_inputs[3] = gate_DUMMY; -// + mlp_out = ff.aggregate(aggregate_inputs, mixtral_config.num_local_experts, 0.0f, @@ -378,9 +355,7 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, ".block_sparse_moe_experts_aggregate") .c_str()); -// printf("mlp_out in layer %d dims are %d %d %d %d\n",i, mlp_out->dims[0], mlp_out->dims[1], mlp_out->dims[2], mlp_out->dims[3]); - assert(mlp_out->dims[0] == mixtral_config.hidden_size && "mlp_out dims[0] != hidden_size"); -// printf("seq length is now %d\n", mlp_out->dims[2]); + assert(mlp_out->dims[0] == mixtral_config.hidden_size && "mlp_out dims[0] != hidden_size"); } @@ -414,7 +389,7 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, Tensor softmax = ff.softmax(dense, -1); output = ff.sampling(softmax, generation_config.topp); } else { - Tensor softmax = ff.softmax(dense, -1); // TODO added that to copy llama, see if needed in HF transformers impl. + Tensor softmax = ff.softmax(dense, -1); output = ff.argmax(softmax, /*beam_Search*/ false); }
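
Note (not part of the patch): the routing that these block_sparse_moe operators assemble (gate dense + softmax, top_k, group_by into per-expert batches, expert MLPs, weighted aggregate) boils down to a small amount of per-token math. The standalone C++ sketch below is illustrative only: the Vec/Mat types and toy sizes are invented for the example, the w2(SiLU(w1(x)) * w3(x)) expert MLP follows the Hugging Face Mixtral reference rather than this file (the hunk above only shows the w1 dense), and it renormalizes the top-k gate weights, a step the graph above still leaves commented out.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <random>
#include <vector>

using Vec = std::vector<float>;
using Mat = std::vector<Vec>; // row-major: Mat[out_row][in_col]

static Vec matvec(Mat const &w, Vec const &x) {
  Vec y(w.size(), 0.0f);
  for (size_t o = 0; o < w.size(); o++)
    for (size_t i = 0; i < x.size(); i++)
      y[o] += w[o][i] * x[i];
  return y;
}

static Vec softmax(Vec const &x) {
  float m = *std::max_element(x.begin(), x.end()), s = 0.0f;
  Vec e(x.size());
  for (size_t i = 0; i < x.size(); i++) { e[i] = std::exp(x[i] - m); s += e[i]; }
  for (float &v : e) v /= s;
  return e;
}

// One expert MLP in the Mixtral style: w2(SiLU(w1(x)) * w3(x)).
static Vec expert_mlp(Mat const &w1, Mat const &w2, Mat const &w3, Vec const &x) {
  Vec a = matvec(w1, x), b = matvec(w3, x);
  for (size_t i = 0; i < a.size(); i++)
    a[i] = (a[i] / (1.0f + std::exp(-a[i]))) * b[i]; // SiLU(a) * b
  return matvec(w2, a);
}

// Route one token: gate softmax -> top_k experts -> weighted sum of expert outputs.
static Vec moe_forward(Vec const &token, Mat const &gate_w,
                       std::vector<Mat> const &w1, std::vector<Mat> const &w2,
                       std::vector<Mat> const &w3, int top_k) {
  Vec probs = softmax(matvec(gate_w, token)); // (num_experts)
  std::vector<int> order(probs.size());
  std::iota(order.begin(), order.end(), 0);
  std::partial_sort(order.begin(), order.begin() + top_k, order.end(),
                    [&](int a, int b) { return probs[a] > probs[b]; });
  // Renormalize the selected gate weights (the HF reference does this; the
  // graph above still has the reduce_sum/divide step commented out).
  float norm = 0.0f;
  for (int k = 0; k < top_k; k++) norm += probs[order[k]];
  Vec out(token.size(), 0.0f);
  for (int k = 0; k < top_k; k++) {
    int e = order[k];
    Vec y = expert_mlp(w1[e], w2[e], w3[e], token);
    for (size_t i = 0; i < out.size(); i++) out[i] += (probs[e] / norm) * y[i];
  }
  return out;
}

int main() {
  int const hidden = 8, ffn = 16, num_experts = 4, top_k = 2; // toy sizes
  std::mt19937 rng(0);
  std::uniform_real_distribution<float> dist(-0.1f, 0.1f);
  auto rand_mat = [&](int rows, int cols) {
    Mat m(rows, Vec(cols));
    for (auto &row : m)
      for (float &v : row) v = dist(rng);
    return m;
  };
  Mat gate_w = rand_mat(num_experts, hidden);
  std::vector<Mat> w1, w2, w3;
  for (int e = 0; e < num_experts; e++) {
    w1.push_back(rand_mat(ffn, hidden));
    w2.push_back(rand_mat(hidden, ffn));
    w3.push_back(rand_mat(ffn, hidden));
  }
  Vec token(hidden, 1.0f);
  Vec out = moe_forward(token, gate_w, w1, w2, w3, top_k);
  std::printf("moe output[0] = %f\n", out[0]);
  return 0;
}

Unlike the batched graph in the patch, which uses group_by to gather each expert's tokens before the expert dense layers and aggregate to scatter the results back, this sketch loops over the selected experts for a single token; the arithmetic is the same.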