Commit
Merge branch 'peft' into peft_xinhao
xinhaoc authored Feb 6, 2024
2 parents 0444a9d + 32a0716 commit 3a6b2ab
Showing 49 changed files with 3,666 additions and 1,992 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -189,4 +189,5 @@ python/flexflow/version.txt
inference_tensors
hf_peft_tensors

Untitled-1.ipynb
Untitled-1.ipynb
Untitled-2.ipynb
1 change: 1 addition & 0 deletions include/flexflow/config.h
@@ -154,6 +154,7 @@ class FFConfig {
size_t offload_reserve_space_size;
DataType quantization_type;
// PEFT related fields
bool enable_peft;
size_t peft_activation_reserve_space_size;
size_t peft_weight_reserve_space_size;
// Control parallelizable dimensions
3 changes: 3 additions & 0 deletions include/flexflow/model.h
@@ -240,9 +240,11 @@ enum TaskIDs {
COMBINE_INIT_TASK_ID,
COMBINE_FWD_TASK_ID,
COMBINE_BWD_TASK_ID,
COMBINE_PEFT_BWD_TASK_ID,
REPLICATE_INIT_TASK_ID,
REPLICATE_FWD_TASK_ID,
REPLICATE_BWD_TASK_ID,
REPLICATE_PEFT_BWD_TASK_ID,
REDUCTION_INIT_TASK_ID,
REDUCTION_FWD_TASK_ID,
REDUCTION_BWD_TASK_ID,
@@ -1122,6 +1124,7 @@ class FFModel {
Legion::IndexSpace get_task_is(Legion::Domain const &domain) const;
Legion::IndexSpace get_task_is(ParallelConfig const &pc) const;
Legion::IndexSpace get_task_is(MachineView const &view) const;
bool need_to_add_combine(int layer_idx) const;
bool is_mlp_block(int layer_idx) const;
void create_operators_from_layers();
Op *create_operator_from_layer(Layer *layer,
26 changes: 16 additions & 10 deletions include/flexflow/operator.h
@@ -267,28 +267,34 @@ class Op {
bool fwd_pass = true,
bool before_kernel = false) {
// Check if output directory exists, and create it if it does not
char const *folder_path = "./inference_tensors";
char const *folder_path = "./inference_tensors/";
struct stat st = {0};
if (stat(folder_path, &st) == -1) {
// Directory does not exist, create it
mkdir(folder_path, 0700);
}
// output base filepath, shared by all tensors from the same operator
std::string op_name_without_uid = get_op_name_without_uid(m);
std::string base_filepath =
"./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) +
(fwd_pass ? "_decoding-step_" : "_bwd-step_") +
(fwd_pass ? std::to_string(m->decoding_step)
: std::to_string(m->bwd_step)) +
"_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) +
"_layer-name_" + op_name_without_uid + "_shard-id_" +
std::to_string(shard_id);
std::cout << (fwd_pass ? "INF " : "BWD ") << op_name_without_uid
<< std::endl;
std::string base_filepath = std::string(folder_path);
if (m->layer_guid.model_id > 0) {
base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_";
}
if (fwd_pass) {
base_filepath += "fwd_step_" + std::to_string(m->decoding_step);
} else {
base_filepath += "bwd_step_" + std::to_string(m->bwd_step);
}
base_filepath += "_layers_" +
std::to_string(m->layer_guid.transformer_layer_id) + "_" +
op_name_without_uid + "_shard_" + std::to_string(shard_id);
if (before_kernel) {
base_filepath += "_pre";
}
// save batch config, if passed
if (bc != nullptr) {
bc->save_to_file(base_filepath + "_batch-config");
bc->save_to_file(base_filepath + "_batch_config");
}
// save all inputs
for (int i = 0; i < input_tensors.size(); i++) {
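
Note that this hunk changes the debug-tensor naming scheme: the old "_decoding-step_"/"_bwd-step_" infix becomes "fwd_step_"/"bwd_step_", and the "model_" prefix is now emitted only when model_id > 0. A minimal Python sketch of the resulting path format (an illustration mirroring the C++ above, with made-up example values):

def debug_tensor_base_path(model_id=0, fwd_pass=True, step=0,
                           layer_id=0, op_name="attention",
                           shard_id=0, before_kernel=False):
    # Python mirror of the base_filepath logic in Op::save_inference_tensors_to_file.
    path = "./inference_tensors/"
    if model_id > 0:  # "model_" prefix only for non-default models
        path += "model_" + str(model_id) + "_"
    path += ("fwd_step_" if fwd_pass else "bwd_step_") + str(step)
    path += "_layers_" + str(layer_id) + "_" + op_name + "_shard_" + str(shard_id)
    if before_kernel:
        path += "_pre"
    return path

# e.g. ./inference_tensors/fwd_step_0_layers_0_attention_shard_0
print(debug_tensor_base_path())
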
1 change: 1 addition & 0 deletions include/flexflow/ops/add_bias_residual_layer_norm.h
@@ -26,6 +26,7 @@ class AddBiasResidualLayerNorm : public Op {
float _eps,
bool allocate_weights,
char const *name);
void map_output_tensors(FFModel &ff) override;
void init(FFModel const &) override;
void init_inference(FFModel const &,
std::vector<ParallelTensor> const &,
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/lora_linear_kernels.h
@@ -19,7 +19,6 @@ class LoraLinearMeta : public OpMeta {
public:
LoraLinearMeta(FFHandler handle, LoraLinear const *li);
~LoraLinearMeta(void);
char op_name[MAX_OPNAME];
// PEFT related fields
void *low_rank_activation;
void *input_activation;
1 change: 1 addition & 0 deletions include/flexflow/ops/kernels/softmax_kernels.h
@@ -46,6 +46,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m,

void inference_kernel_wrapper(SoftmaxMeta const *m,
BatchConfig const *bc,
bool is_last_op,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output,
GenericTensorAccessorW const &output_grad);
9 changes: 9 additions & 0 deletions include/flexflow/parallel_ops/combine.h
@@ -40,6 +40,11 @@ class Combine : public ParallelOp {
std::vector<ParallelTensor> const &,
std::vector<ParallelTensor> const &,
MachineView const *mv = nullptr) override;
Legion::FutureMap peft_bwd(FFModel const &,
BatchConfigFuture const &bc,
std::vector<ParallelTensor> const &,
std::vector<ParallelTensor> const &,
MachineView const *mv = nullptr) override;
void backward(FFModel const &) override;
bool get_int_parameter(PMParameter, int *) const override;
bool append_parallel_op_info(
@@ -56,6 +61,10 @@ class Combine : public ParallelOp {
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void peft_bwd_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
template <typename T>
static void
forward_task_with_type(Legion::Task const *task,
2 changes: 1 addition & 1 deletion include/flexflow/parallel_ops/parallel_op.h
@@ -41,7 +41,7 @@ class ParallelOp : public Op {
public:
Legion::LogicalPartition input_lp, output_grad_lp;
std::unordered_map<ParallelTensor, Legion::LogicalPartition>
inference_input_lps;
inference_input_lps, inference_output_grad_lps;
};

}; // namespace FlexFlow
9 changes: 9 additions & 0 deletions include/flexflow/parallel_ops/replicate.h
@@ -54,10 +54,19 @@ class Replicate : public ParallelOp {
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
Legion::FutureMap peft_bwd(FFModel const &,
BatchConfigFuture const &bc,
std::vector<ParallelTensor> const &,
std::vector<ParallelTensor> const &,
MachineView const *mv = nullptr) override;
static void backward_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void peft_bwd_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void forward_kernel_wrapper(ReplicateMeta const *m,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output,
5 changes: 4 additions & 1 deletion inference/python/incr_decoding.py
@@ -51,9 +51,12 @@ def get_configs():
"tensor_parallelism_degree": 1,
"pipeline_parallelism_degree": 4,
"offload": False,
"offload_reserve_space_size": 1024**2,
"offload_reserve_space_size": 8 * 1024, # 8GB
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"inference_debugging": False,
"fusion": True,
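
A note on units: these reserve sizes are interpreted in MB (per the serve.init docstring later in this diff), so the new default of 8 * 1024 matches its "# 8GB" comment, while the old 1024**2 would have meant roughly a terabyte, which appears to have been unintended. A quick sanity check of the arithmetic, as an illustration:

OLD_DEFAULT_MB = 1024**2   # previous value: 1,048,576 MB, i.e. about 1 TB
NEW_DEFAULT_MB = 8 * 1024  # new value: 8,192 MB, i.e. 8 GB
assert NEW_DEFAULT_MB * 1024**2 == 8 * 1024**3  # 8 GB expressed in bytes
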
5 changes: 4 additions & 1 deletion inference/python/spec_infer.py
@@ -51,9 +51,12 @@ def get_configs():
"tensor_parallelism_degree": 2,
"pipeline_parallelism_degree": 2,
"offload": False,
"offload_reserve_space_size": 1024**2,
"offload_reserve_space_size": 8 * 1024, # 8GB
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"inference_debugging": False,
"fusion": True,
5 changes: 4 additions & 1 deletion python/flexflow/core/__init__.py
@@ -87,7 +87,10 @@
"offload": "-offload",
"offload_reserve_space_size": "-offload-reserve-space-size",
"use_4bit_quantization": "--4bit-quantization",
"use_8bit_quantization": "--8bit-quantization"
"use_8bit_quantization": "--8bit-quantization",
"enable_peft": "",
"peft_activation_reserve_space_size": "-peft-activation-reserve-space-size",
"peft_weight_reserve_space_size": "-peft-weight-reserve-space-size",
}


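
The table above maps flexflow config keys to runtime command-line flags; enable_peft maps to an empty string, so it is presumably consumed directly rather than forwarded as a flag. A hypothetical sketch of how such a table could drive argument construction (the real conversion logic lives elsewhere in flexflow/core/__init__.py and is not part of this diff; the function and variable names below are invented for illustration):

ff_arg_to_sysarg = {
    "offload": "-offload",
    "offload_reserve_space_size": "-offload-reserve-space-size",
    "enable_peft": "",  # no CLI flag registered in the table above
    "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size",
}

def to_cli_args(configs):
    args = []
    for key, value in configs.items():
        flag = ff_arg_to_sysarg.get(key)
        if not flag:                 # keys with no flag (e.g. enable_peft) are skipped
            continue
        if isinstance(value, bool):
            if value:
                args.append(flag)    # booleans become bare switches
        else:
            args += [flag, str(value)]
    return args

print(to_cli_args({"offload": True, "peft_weight_reserve_space_size": 1024}))
# ['-offload', '-peft-weight-reserve-space-size', '1024']
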
32 changes: 29 additions & 3 deletions python/flexflow/serve/__init__.py
@@ -44,6 +44,9 @@ def init(
offload_reserve_space_size: Optional[int] = None,
use_4bit_quantization: Optional[bool] = None,
use_8bit_quantization: Optional[bool] = None,
enable_peft: Optional[bool] = None,
peft_activation_reserve_space_size: Optional[int] = None,
peft_weight_reserve_space_size: Optional[int] = None,
profiling: Optional[bool] = None,
inference_debugging: Optional[bool] = None,
fusion: Optional[bool] = None,
@@ -68,9 +71,12 @@ def init(
- tensor_parallelism_degree: the degree of parallelization in the tensor parallel dimension (using the Megatron technique), defaults to 1
- pipeline_parallelism_degree: the degree of parallelization in the pipeline parallel dimension, defaults to 1
- offload: whether to enable offloading of the weights to CPU, defaults to False
- offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2
- offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB
- use_4bit_quantization: whether to use 4-bit quantization, defaults to False
- use_8bit_quantization: whether to use 8-bit quantization, defaults to False
- enable_peft: whether to enable the use of PEFT, defaults to False
- peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB
- peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB
- profiling: whether to enable the FlexFlow profiling mode, defaults to False
- inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
- fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True
@@ -98,12 +104,18 @@ def init(
:type pipeline_parallelism_degree: Optional[int], optional
:param offload: whether to enable offloading of the weights to CPU, defaults to False
:type offload: Optional[bool], optional
:param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2
:param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB
:type offload_reserve_space_size: Optional[int], optional
:param use_4bit_quantization: whether to use 4-bit quantization, defaults to False
:type use_4bit_quantization: Optional[bool], optional
:param use_8bit_quantization: whether to use 8-bit quantization, defaults to False
:type use_8bit_quantization: Optional[bool], optional
:param enable_peft: whether to enable the use of PEFT, defaults to False
:type enable_peft: Optional[bool], optional
:param peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB
:type peft_activation_reserve_space_size: Optional[int], optional
:param peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB
:type peft_weight_reserve_space_size: Optional[int], optional
:param profiling: whether to enable the FlexFlow profiling mode, defaults to False
:type profiling: Optional[bool], optional
:param inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
@@ -131,6 +143,9 @@ def init(
offload_reserve_space_size is not None,
use_4bit_quantization is not None,
use_8bit_quantization is not None,
enable_peft is not None,
peft_activation_reserve_space_size is not None,
peft_weight_reserve_space_size is not None,
profiling is not None,
inference_debugging is not None,
fusion is not None,
@@ -156,6 +171,9 @@ def init(
"offload_reserve_space_size": offload_reserve_space_size,
"use_4bit_quantization": use_4bit_quantization,
"use_8bit_quantization": use_8bit_quantization,
"enable_peft": enable_peft,
"peft_activation_reserve_space_size": peft_activation_reserve_space_size,
"peft_weight_reserve_space_size": peft_weight_reserve_space_size,
"profiling": profiling,
"inference_debugging": inference_debugging,
"fusion": fusion,
@@ -176,6 +194,8 @@ def init(
"tensor_parallelism_degree",
"pipeline_parallelism_degree",
"offload_reserve_space_size",
"peft_activation_reserve_space_size",
"peft_weight_reserve_space_size",
]
for param in positive_int_params:
__check_positive_int(configs_dict, param)
@@ -194,11 +214,17 @@ def init(
if configs_dict.get("offload", None) is None:
configs_dict["offload"] = False
if configs_dict.get("offload_reserve_space_size", None) is None:
configs_dict["offload_reserve_space_size"] = 1024**2
configs_dict["offload_reserve_space_size"] = 8*1024**3
if configs_dict.get("use_4bit_quantization", None) is None:
configs_dict["use_4bit_quantization"] = False
if configs_dict.get("use_8bit_quantization", None) is None:
configs_dict["use_8bit_quantization"] = False
if configs_dict.get("enable_peft", None) is None:
configs_dict["enable_peft"] = False
if configs_dict.get("peft_activation_reserve_space_size", None) is None:
configs_dict["peft_activation_reserve_space_size"] = 8*1024**3
if configs_dict.get("peft_weight_reserve_space_size", None) is None:
configs_dict["peft_weight_reserve_space_size"] = 1024**3
if configs_dict.get("profiling", None) is None:
configs_dict["profiling"] = False
if configs_dict.get("inference_debugging", None) is None:
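
Taken together, the new PEFT knobs added to serve.init can be exercised as follows; a minimal sketch assuming the keyword-argument form shown in the signature above (GPU and memory settings are omitted for brevity, the import alias is assumed, and the values are examples only, in MB per the docstring):

import flexflow.serve as ff  # import alias assumed

ff.init(
    tensor_parallelism_degree=1,
    pipeline_parallelism_degree=4,
    offload=False,
    offload_reserve_space_size=8 * 1024,      # MB -> 8 GB
    enable_peft=True,
    peft_activation_reserve_space_size=1024,  # MB -> 1 GB
    peft_weight_reserve_space_size=1024,      # MB -> 1 GB
    fusion=True,
)
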
(The remaining 35 changed files are not shown.)
