Skip to content

Commit

Permalink
Merge branch 'peft' into fix_cublas_default
Browse files Browse the repository at this point in the history
  • Loading branch information
goliaro authored Nov 2, 2023
2 parents 65f497a + bf78ea4 commit 0a9b983
Show file tree
Hide file tree
Showing 194 changed files with 10,521 additions and 1,501 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -186,4 +186,6 @@ gpt_tokenizer
# pip version
python/flexflow/version.txt

inference_tensors
inference_tensors

Untitled-1.ipynb
6 changes: 6 additions & 0 deletions conda/flexflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,9 @@ dependencies:
- sentencepiece
- einops
- requests
- scipy
- bitsandbytes
- datasets
- accelerate
- loralib
- peft
2 changes: 2 additions & 0 deletions docker/flexflow-environment/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind1
RUN conda install pytorch torchvision torchaudio -c pytorch
RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops
RUN pip3 install tensorflow notebook
# PEFT-related
RUN pip3 install scipy bitsandbytes datasets accelerate loralib peft

# Install Rust
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
Expand Down
15 changes: 15 additions & 0 deletions include/flexflow/batch_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include "flexflow/ffconst.h"
#include "flexflow/fftype.h"
#include "legion.h"
#include <cstddef>
#include <cstdlib>
Expand Down Expand Up @@ -43,6 +44,8 @@ class BatchConfig {
BatchConfig();
int num_active_requests() const;
int num_active_tokens() const;
int num_active_infr_tokens() const;
int num_active_peft_tokens() const;
static int max_requests_per_batch();
static int max_tokens_per_batch();
static int max_sequence_length();
Expand All @@ -61,11 +64,23 @@ class BatchConfig {
int num_tokens;

struct PerRequestInfo {
PerRequestInfo() {
first_token_depth_in_request = 0;
first_token_offset_in_batch = 0;
num_tokens_in_batch = 0;
max_sequence_length = 0;
request_guid = 0;
peft_model_id = PEFTModelID::NO_ID;
peft_bwd = false;
}
int first_token_depth_in_request;
int first_token_offset_in_batch;
int num_tokens_in_batch;
int max_sequence_length;
RequestGuid request_guid;
// PEFT fields
PEFTModelID peft_model_id;
bool peft_bwd;
};
struct PerTokenInfo {
int abs_depth_in_request;
Expand Down
15 changes: 14 additions & 1 deletion include/flexflow/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

#ifndef _FLEXFLOW_CONFIG_H_
#define _FLEXFLOW_CONFIG_H_
#include "ffconst.h"
#include "flexflow/ffconst.h"
#include "legion.h"
#include <cstring>
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
Expand Down Expand Up @@ -64,6 +64,8 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS;
#endif

class FFConfig;
class MemoryAllocator;
class PEFTWeightAllocator;

struct FFHandler {
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
Expand All @@ -77,6 +79,12 @@ struct FFHandler {
size_t workSpaceSize;
void *offload_reserve_space;
size_t offload_reserve_space_size;
// PEFT related fields
MemoryAllocator *peft_activation_allocator;
size_t peft_activation_reserve_space_size;
PEFTWeightAllocator *peft_weight_allocator;
size_t peft_weight_reserve_space_size;
// Quantization fields
DataType quantization_type;
bool allowTensorOpMathConversion;
#ifdef FF_USE_NCCL
Expand All @@ -87,6 +95,8 @@ struct FFHandler {
struct FFInitInfo {
size_t workSpaceSize;
size_t offload_reserve_space_size;
size_t peft_activation_reserve_space_size;
size_t peft_weight_reserve_space_size;
DataType quantization_type;
bool allowTensorOpMathConversion;
// int myRank, allRanks;
Expand Down Expand Up @@ -143,6 +153,9 @@ class FFConfig {
bool cpu_offload;
size_t offload_reserve_space_size;
DataType quantization_type;
// PEFT related fields
size_t peft_activation_reserve_space_size;
size_t peft_weight_reserve_space_size;
// Control parallelizable dimensions
bool only_data_parallel;
bool enable_sample_parallel;
Expand Down
11 changes: 11 additions & 0 deletions include/flexflow/ffconst.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ enum LossType {
LOSS_IDENTITY = 54,
};

// Optimizer selection (introduced for PEFT training support).
// Explicit values keep this enum's range disjoint from its neighbors in
// this header (LossType ends at 54, CompMode starts at 70) — presumably
// for stable serialization of the constants; confirm before renumbering.
enum OptimizerType {
  OPTIMIZER_TYPE_NONE = 60,
  OPTIMIZER_TYPE_SGD = 61,
  OPTIMIZER_TYPE_ADAM = 62,
};

enum CompMode {
COMP_MODE_TRAINING = 70,
COMP_MODE_INFERENCE = 71,
Expand Down Expand Up @@ -172,6 +178,9 @@ enum OperatorType {
OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION,
OP_TREE_INC_MULTIHEAD_SELF_ATTENTION,
OP_SAMPLING,
// PEFT Ops
OP_LORA_MLP_FIRST,
OP_LORA_MLP_SECOND,
// Parallel Ops
OP_REPARTITION,
OP_COMBINE,
Expand Down Expand Up @@ -269,5 +278,7 @@ enum {
TENSOR_GUID_LAST_VALID = 3999999,
PARALLEL_TENSOR_GUID_FIRST_VALID = 4000000,
NODE_GUID_FIRST_VALID = 5000000,
PEFT_MODEL_ID_FIRST_VALID = 6000000,
PEFT_MODEL_ID_LAST_VALID = 6999999
};
#endif // _FLEXFLOW_CONST_H_
24 changes: 24 additions & 0 deletions include/flexflow/fftype.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "flexflow/ffconst.h"
#include <cstddef>
#include <functional>

namespace FlexFlow {

Expand All @@ -18,6 +19,29 @@ class LayerID {
size_t id, transformer_layer_id, model_id;
};

// Identifier for a registered PEFT (parameter-efficient fine-tuning) model.
// NO_ID is the sentinel meaning "no adapter attached". Ids presumably come
// from the PEFT_MODEL_ID_* guid range declared in ffconst.h — TODO confirm.
class PEFTModelID {
public:
  static const PEFTModelID NO_ID; // sentinel: no PEFT model
  PEFTModelID();
  // NOTE(review): non-explicit single-arg ctor allows implicit conversion
  // from size_t — consider marking it explicit if no caller relies on it.
  PEFTModelID(size_t id);
  bool is_valid_id() const;
  friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs);
  friend std::ostream &operator<<(std::ostream &os,
                                  PEFTModelID const &peft_model_id);

public:
  size_t id;
};

}; // namespace FlexFlow

namespace std {
// Hash support so PEFTModelID can be used as a key in unordered containers.
// The wrapped id is already a size_t, so the identity mapping suffices.
template <>
struct hash<FlexFlow::PEFTModelID> {
  size_t operator()(FlexFlow::PEFTModelID const &model_id) const {
    return static_cast<size_t>(model_id.id);
  }
};
} // namespace std

#endif // _FF_TYPE_H
2 changes: 1 addition & 1 deletion include/flexflow/layer.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class Layer {
Tensor outputs[MAX_NUM_OUTPUTS];
Tensor inputs[MAX_NUM_INPUTS];
Tensor weights[MAX_NUM_WEIGHTS];
bool trainableInputs[MAX_NUM_INPUTS];
// bool trainable_inputs[MAX_NUM_INPUTS];
int numInputs, numWeights, numOutputs;
bool profiling;
bool inference_debugging;
Expand Down
49 changes: 45 additions & 4 deletions include/flexflow/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,26 +106,39 @@ enum TaskIDs {
LAYERNORM_FWD_TASK_ID,
LAYERNORM_INF_TASK_ID,
LAYERNORM_BWD_TASK_ID,
LAYERNORM_PEFT_BWD_TASK_ID,
RESIDUAL_LAYERNORM_INIT_TASK_ID,
RESIDUAL_LAYERNORM_INF_TASK_ID,
RESIDUAL_LAYERNORM_BWD_TASK_ID,
RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID,
ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID,
ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID,
ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID,
ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID,
SIGMOID_SILU_MULTI_INIT_TASK_ID,
SIGMOID_SILU_MULTI_INF_TASK_ID,
SIGMOID_SILU_MULTI_BWD_TASK_ID,
SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID,
LINEAR_INIT_TASK_ID,
LINEAR_INIT_PARA_TASK_ID,
LINEAR_INF_TASK_ID,
LINEAR_PEFT_BWD_TASK_ID,
LINEAR_FWD_TASK_ID,
LINEAR_BWD_TASK_ID,
LINEAR_BWD2_TASK_ID,
LINEAR_UPD_TASK_ID,
LORA_LINEAR_INIT_TASK_ID,
LORA_LINEAR_REG_TASK_ID,
LORA_LINEAR_INF_TASK_ID,
LORA_LINEAR_PEFT_BWD_TASK_ID,
FLAT_INIT_TASK_ID,
FLAT_FWD_TASK_ID,
FLAT_BWD_TASK_ID,
SOFTMAX_INIT_TASK_ID,
SOFTMAX_FWD_TASK_ID,
SOFTMAX_BWD_TASK_ID,
SOFTMAX_INF_TASK_ID,
SOFTMAX_PEFT_BWD_TASK_ID,
CONCAT_INIT_TASK_ID,
CONCAT_FWD_TASK_ID,
CONCAT_BWD_TASK_ID,
Expand Down Expand Up @@ -160,20 +173,26 @@ enum TaskIDs {
RMSNORM_INIT_TASK_ID,
RMSNORM_FWD_TASK_ID,
RMSNORM_INF_TASK_ID,
RMSNORM_BWD_TASK_ID,
RMSNORM_PEFT_BWD_TASK_ID,
RESIDUAL_RMSNORM_INIT_TASK_ID,
RESIDUAL_RMSNORM_INF_TASK_ID,
RESIDUAL_RMSNORM_BWD_TASK_ID,
RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID,
BEAM_TOPK_INIT_TASK_ID,
BEAM_TOPK_INF_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID,
SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
MSELOSS_BWD_TASK_ID,
FUSEDOP_INIT_TASK_ID,
FUSEDOP_PEFT_BWD_TASK_ID,
FUSEDOP_FWD_TASK_ID,
FUSEDOP_BWD_TASK_ID,
FUSEDOP_INF_TASK_ID,
Expand Down Expand Up @@ -231,9 +250,10 @@ enum TaskIDs {
PIPELINE_FWD_TASK_ID,
PIPELINE_BWD_TASK_ID,
ALLREDUCE_INIT_TASK_ID,
ALLREDUCE_INF_TASK_ID,
ALLREDUCE_FWD_TASK_ID,
ALLREDUCE_BWD_TASK_ID,
ALLREDUCE_INF_TASK_ID,
ALLREDUCE_PEFT_BWD_TASK_ID,
FUSED_PARALLELOP_INIT_TASK_ID,
FUSED_PARALLELOP_FWD_TASK_ID,
FUSED_PARALLELOP_BWD_TASK_ID,
Expand Down Expand Up @@ -321,6 +341,7 @@ class ResidualLayerNorm;
class AddBiasResidualLayerNorm;
class SigmoidSiluMulti;
class Linear;
class LoraLinear;
class MultiHeadAttention;
class IncMultiHeadSelfAttention;
class TreeIncMultiHeadSelfAttention;
Expand Down Expand Up @@ -800,10 +821,26 @@ class FFModel {
bool position_bias = false,
char const *name = NULL);
// ========================================
// PEFT Layers
// ========================================
void lora_linear(Tensor const input,
Tensor const output,
OperatorType _type,
char const *name = nullptr);
// ========================================
// Inference APIs
// ========================================
GenerationResult generate(std::vector<std::string> &prompts,
int max_seq_length);
GenerationResult generate(std::string const &prompts,
int max_seq_length,
PEFTModelID peft_model_id = PEFTModelID::NO_ID);

GenerationResult generate(std::vector<std::string> const &prompts,
int max_seq_length,
PEFTModelID peft_model_id = PEFTModelID::NO_ID);

PEFTModelID register_peft_model(
LoraLinearConfig const mlp_first = LoraLinearConfig::DefaultConfig,
LoraLinearConfig const mlp_second = LoraLinearConfig::DefaultConfig);

Tensor create_tensor_legion_ordering(int num_dim,
int const dims[],
Expand Down Expand Up @@ -1100,7 +1137,7 @@ class FFModel {
void clear_graph_search_cache();

public:
size_t op_global_guid, layer_global_guid;
size_t op_global_guid, layer_global_guid, peft_model_global_guid;
size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid;
size_t current_transformer_layer_id;
// positional embedding start offset
Expand Down Expand Up @@ -1178,6 +1215,10 @@ class FFModel {
SigmoidSiluMulti *>,
std::unordered_map<std::pair<ParallelTensorShape, LinearParams>,
Linear *>,
std::unordered_map<
std::pair<std::pair<ParallelTensorShape, ParallelTensorShape>,
LoraLinearParams>,
LoraLinear *>,
std::unordered_map<std::pair<ParallelTensorShape, Pool2DParams>,
Pool2D *>,
std::unordered_map<std::pair<std::tuple<ParallelTensorShape,
Expand Down
5 changes: 3 additions & 2 deletions include/flexflow/op_meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class Op;

class OpMeta {
public:
OpMeta(FFHandler _handle);
// OpMeta(FFHandler _handle);
OpMeta(FFHandler _handle, Op const *op);

public:
Expand All @@ -19,7 +19,8 @@ class OpMeta {
int decoding_step;
char op_name[MAX_OPNAME];
LayerID layer_guid;
bool trainableInputs[MAX_NUM_INPUTS];
bool trainable_inputs[MAX_NUM_INPUTS];
bool reset_input_grads[MAX_NUM_INPUTS];
DataType input_type[MAX_NUM_INPUTS];
DataType weight_type[MAX_NUM_WEIGHTS];
DataType output_type[MAX_NUM_OUTPUTS];
Expand Down
10 changes: 9 additions & 1 deletion include/flexflow/operator.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,13 @@ class Op {
MachineView const *mv = nullptr) {
assert(false);
};
// Default PEFT backward pass. Operators that support PEFT fine-tuning
// override this; the base implementation aborts via assert(false), so
// reaching it at runtime indicates the operator lacks PEFT support.
virtual Legion::FutureMap peft_bwd(FFModel const &,
                                   BatchConfigFuture const &,
                                   std::vector<ParallelTensor> const &,
                                   std::vector<ParallelTensor> const &,
                                   MachineView const *mv = nullptr) {
  assert(false);
}
virtual void print_layer(FFModel const &model) = 0;
static void save_inference_tensors_to_file(
OpMeta *m,
Expand Down Expand Up @@ -320,7 +327,8 @@ class Op {
ParallelTensor outputs[MAX_NUM_OUTPUTS];
ParallelTensor inputs[MAX_NUM_INPUTS];
ParallelParameter weights[MAX_NUM_WEIGHTS];
bool trainableInputs[MAX_NUM_INPUTS];
bool trainable_inputs[MAX_NUM_INPUTS];
bool reset_input_grads[MAX_NUM_INPUTS];
OpMeta *meta[MAX_NUM_WORKERS];
std::map<ParallelTensor, OpMeta *[MAX_NUM_WORKERS]> inference_meta;
int numInputs, numWeights, numOutputs;
Expand Down
2 changes: 2 additions & 0 deletions include/flexflow/operator_params.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "flexflow/ops/inc_multihead_self_attention_params.h"
#include "flexflow/ops/layer_norm_params.h"
#include "flexflow/ops/linear_params.h"
#include "flexflow/ops/lora_linear_params.h"
#include "flexflow/ops/pool_2d_params.h"
#include "flexflow/ops/reduce_params.h"
#include "flexflow/ops/reshape_params.h"
Expand Down Expand Up @@ -67,6 +68,7 @@ using OperatorParameters = mp::variant<AggregateParams,
AddBiasResidualLayerNormParams,
SigmoidSiluMultiParams,
LinearParams,
LoraLinearParams,
MultiHeadAttentionParams,
IncMultiHeadSelfAttentionParams,
BeamTopKParams,
Expand Down
Loading

0 comments on commit 0a9b983

Please sign in to comment.