Commit
Merge branch 'peft' into peft_xinhao
xinhaoc authored Feb 6, 2024
2 parents 0444a9d + 32a0716 commit 3a6b2ab
Showing 49 changed files with 3,666 additions and 1,992 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -189,4 +189,5 @@ python/flexflow/version.txt
inference_tensors
hf_peft_tensors

Untitled-1.ipynb
Untitled-1.ipynb
Untitled-2.ipynb
1 change: 1 addition & 0 deletions include/flexflow/config.h
@@ -154,6 +154,7 @@ class FFConfig {
size_t offload_reserve_space_size;
DataType quantization_type;
// PEFT related fields
bool enable_peft;
size_t peft_activation_reserve_space_size;
size_t peft_weight_reserve_space_size;
// Control parallelizable dimensions
3 changes: 3 additions & 0 deletions include/flexflow/model.h
@@ -240,9 +240,11 @@ enum TaskIDs {
COMBINE_INIT_TASK_ID,
COMBINE_FWD_TASK_ID,
COMBINE_BWD_TASK_ID,
COMBINE_PEFT_BWD_TASK_ID,
REPLICATE_INIT_TASK_ID,
REPLICATE_FWD_TASK_ID,
REPLICATE_BWD_TASK_ID,
REPLICATE_PEFT_BWD_TASK_ID,
REDUCTION_INIT_TASK_ID,
REDUCTION_FWD_TASK_ID,
REDUCTION_BWD_TASK_ID,
@@ -1122,6 +1124,7 @@ class FFModel {
Legion::IndexSpace get_task_is(Legion::Domain const &domain) const;
Legion::IndexSpace get_task_is(ParallelConfig const &pc) const;
Legion::IndexSpace get_task_is(MachineView const &view) const;
bool need_to_add_combine(int layer_idx) const;
bool is_mlp_block(int layer_idx) const;
void create_operators_from_layers();
Op *create_operator_from_layer(Layer *layer,
26 changes: 16 additions & 10 deletions include/flexflow/operator.h
@@ -267,28 +267,34 @@ class Op {
bool fwd_pass = true,
bool before_kernel = false) {
// Check if output directory exists, and create it if it does not
char const *folder_path = "./inference_tensors";
char const *folder_path = "./inference_tensors/";
struct stat st = {0};
if (stat(folder_path, &st) == -1) {
// Directory does not exist, create it
mkdir(folder_path, 0700);
}
// output base filepath, shared by all tensors from the same operator
std::string op_name_without_uid = get_op_name_without_uid(m);
std::string base_filepath =
"./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) +
(fwd_pass ? "_decoding-step_" : "_bwd-step_") +
(fwd_pass ? std::to_string(m->decoding_step)
: std::to_string(m->bwd_step)) +
"_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) +
"_layer-name_" + op_name_without_uid + "_shard-id_" +
std::to_string(shard_id);
std::cout << (fwd_pass ? "INF " : "BWD ") << op_name_without_uid
<< std::endl;
std::string base_filepath = std::string(folder_path);
if (m->layer_guid.model_id > 0) {
base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_";
}
if (fwd_pass) {
base_filepath += "fwd_step_" + std::to_string(m->decoding_step);
} else {
base_filepath += "bwd_step_" + std::to_string(m->bwd_step);
}
base_filepath += "_layers_" +
std::to_string(m->layer_guid.transformer_layer_id) + "_" +
op_name_without_uid + "_shard_" + std::to_string(shard_id);
if (before_kernel) {
base_filepath += "_pre";
}
// save batch config, if passed
if (bc != nullptr) {
bc->save_to_file(base_filepath + "_batch-config");
bc->save_to_file(base_filepath + "_batch_config");
}
// save all inputs
for (int i = 0; i < input_tensors.size(); i++) {
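
Note that this hunk changes the debug-tensor naming scheme: the old "_decoding-step_"/"_bwd-step_" infix becomes "fwd_step_"/"bwd_step_", and the "model_" prefix is now emitted only when model_id > 0. A minimal Python sketch of the resulting path format (an illustration mirroring the C++ above, with made-up example values):

def debug_tensor_base_path(model_id=0, fwd_pass=True, step=0,
                           layer_id=0, op_name="attention",
                           shard_id=0, before_kernel=False):
    # Python mirror of the base_filepath logic in Op::save_inference_tensors_to_file.
    path = "./inference_tensors/"
    if model_id > 0:  # "model_" prefix only for non-default models
        path += "model_" + str(model_id) + "_"
    path += ("fwd_step_" if fwd_pass else "bwd_step_") + str(step)
    path += "_layers_" + str(layer_id) + "_" + op_name + "_shard_" + str(shard_id)
    if before_kernel:
        path += "_pre"
    return path

# e.g. ./inference_tensors/fwd_step_0_layers_0_attention_shard_0
print(debug_tensor_base_path())
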
1 change: 1 addition & 0 deletions include/flexflow/ops/add_bias_residual_layer_norm.h
@@ -26,6 +26,7 @@ class AddBiasResidualLayerNorm : public Op {
float _eps,
bool allocate_weights,
char const *name);
void map_output_tensors(FFModel &ff) override;
void init(FFModel const &) override;
void init_inference(FFModel const &,
std::vector<ParallelTensor> const &,
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/lora_linear_kernels.h
@@ -19,7 +19,6 @@ class LoraLinearMeta : public OpMeta {
public:
LoraLinearMeta(FFHandler handle, LoraLinear const *li);
~LoraLinearMeta(void);
char op_name[MAX_OPNAME];
// PEFT related fields
void *low_rank_activation;
void *input_activation;
1 change: 1 addition & 0 deletions include/flexflow/ops/kernels/softmax_kernels.h
@@ -46,6 +46,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m,

void inference_kernel_wrapper(SoftmaxMeta const *m,
BatchConfig const *bc,
bool is_last_op,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output,
GenericTensorAccessorW const &output_grad);
9 changes: 9 additions & 0 deletions include/flexflow/parallel_ops/combine.h
@@ -40,6 +40,11 @@ class Combine : public ParallelOp {
std::vector<ParallelTensor> const &,
std::vector<ParallelTensor> const &,
MachineView const *mv = nullptr) override;
Legion::FutureMap peft_bwd(FFModel const &,
BatchConfigFuture const &bc,
std::vector<ParallelTensor> const &,
std::vector<ParallelTensor> const &,
MachineView const *mv = nullptr) override;
void backward(FFModel const &) override;
bool get_int_parameter(PMParameter, int *) const override;
bool append_parallel_op_info(
@@ -56,6 +61,10 @@ class Combine : public ParallelOp {
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void peft_bwd_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
template <typename T>
static void
forward_task_with_type(Legion::Task const *task,
2 changes: 1 addition & 1 deletion include/flexflow/parallel_ops/parallel_op.h
@@ -41,7 +41,7 @@ class ParallelOp : public Op {
public:
Legion::LogicalPartition input_lp, output_grad_lp;
std::unordered_map<ParallelTensor, Legion::LogicalPartition>
inference_input_lps;
inference_input_lps, inference_output_grad_lps;
};

}; // namespace FlexFlow
9 changes: 9 additions & 0 deletions include/flexflow/parallel_ops/replicate.h
@@ -54,10 +54,19 @@ class Replicate : public ParallelOp {
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
Legion::FutureMap peft_bwd(FFModel const &,
BatchConfigFuture const &bc,
std::vector<ParallelTensor> const &,
std::vector<ParallelTensor> const &,
MachineView const *mv = nullptr) override;
static void backward_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void peft_bwd_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void forward_kernel_wrapper(ReplicateMeta const *m,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output,
5 changes: 4 additions & 1 deletion inference/python/incr_decoding.py
@@ -51,9 +51,12 @@ def get_configs():
"tensor_parallelism_degree": 1,
"pipeline_parallelism_degree": 4,
"offload": False,
"offload_reserve_space_size": 1024**2,
"offload_reserve_space_size": 8 * 1024, # 8GB
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"inference_debugging": False,
"fusion": True,
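
A note on units: these reserve sizes are interpreted in MB (per the serve.init docstring later in this diff), so the new default of 8 * 1024 matches its "# 8GB" comment, while the old 1024**2 would have meant roughly a terabyte, which appears to have been unintended. A quick sanity check of the arithmetic, as an illustration:

OLD_DEFAULT_MB = 1024**2   # previous value: 1,048,576 MB, i.e. about 1 TB
NEW_DEFAULT_MB = 8 * 1024  # new value: 8,192 MB, i.e. 8 GB
assert NEW_DEFAULT_MB * 1024**2 == 8 * 1024**3  # 8 GB expressed in bytes
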
5 changes: 4 additions & 1 deletion inference/python/spec_infer.py
@@ -51,9 +51,12 @@ def get_configs():
"tensor_parallelism_degree": 2,
"pipeline_parallelism_degree": 2,
"offload": False,
"offload_reserve_space_size": 1024**2,
"offload_reserve_space_size": 8 * 1024, # 8GB
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"inference_debugging": False,
"fusion": True,
5 changes: 4 additions & 1 deletion python/flexflow/core/__init__.py
@@ -87,7 +87,10 @@
"offload": "-offload",
"offload_reserve_space_size": "-offload-reserve-space-size",
"use_4bit_quantization": "--4bit-quantization",
"use_8bit_quantization": "--8bit-quantization"
"use_8bit_quantization": "--8bit-quantization",
"enable_peft": "",
"peft_activation_reserve_space_size": "-peft-activation-reserve-space-size",
"peft_weight_reserve_space_size": "-peft-weight-reserve-space-size",
}


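
The table above maps flexflow config keys to runtime command-line flags; enable_peft maps to an empty string, so it is presumably consumed directly rather than forwarded as a flag. A hypothetical sketch of how such a table could drive argument construction (the real conversion logic lives elsewhere in flexflow/core/__init__.py and is not part of this diff; the function and variable names below are invented for illustration):

ff_arg_to_sysarg = {
    "offload": "-offload",
    "offload_reserve_space_size": "-offload-reserve-space-size",
    "enable_peft": "",  # no CLI flag registered in the table above
    "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size",
}

def to_cli_args(configs):
    args = []
    for key, value in configs.items():
        flag = ff_arg_to_sysarg.get(key)
        if not flag:                 # keys with no flag (e.g. enable_peft) are skipped
            continue
        if isinstance(value, bool):
            if value:
                args.append(flag)    # booleans become bare switches
        else:
            args += [flag, str(value)]
    return args

print(to_cli_args({"offload": True, "peft_weight_reserve_space_size": 1024}))
# ['-offload', '-peft-weight-reserve-space-size', '1024']
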
32 changes: 29 additions & 3 deletions python/flexflow/serve/__init__.py
@@ -44,6 +44,9 @@ def init(
offload_reserve_space_size: Optional[int] = None,
use_4bit_quantization: Optional[bool] = None,
use_8bit_quantization: Optional[bool] = None,
enable_peft: Optional[bool] = None,
peft_activation_reserve_space_size: Optional[int] = None,
peft_weight_reserve_space_size: Optional[int] = None,
profiling: Optional[bool] = None,
inference_debugging: Optional[bool] = None,
fusion: Optional[bool] = None,
@@ -68,9 +71,12 @@ def init(
- tensor_parallelism_degree: the degree of parallelization in the tensor parallel dimension (using the Megatron technique), defaults to 1
- pipeline_parallelism_degree: the degree of parallelization in the pipeline parallel dimension, defaults to 1
- offload: whether to enable offloading of the weights to CPU, defaults to False
- offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2
- offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB
- use_4bit_quantization: whether to use 4-bit quantization, defaults to False
- use_8bit_quantization: whether to use 8-bit quantization, defaults to False
- enable_peft: whether to enable the use of PEFT, defaults to False
- peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB
- peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB
- profiling: whether to enable the FlexFlow profiling mode, defaults to False
- inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
- fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True
@@ -98,12 +104,18 @@ def init(
:type pipeline_parallelism_degree: Optional[int], optional
:param offload: whether to enable offloading of the weights to CPU, defaults to False
:type offload: Optional[bool], optional
:param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2
:param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB
:type offload_reserve_space_size: Optional[int], optional
:param use_4bit_quantization: whether to use 4-bit quantization, defaults to False
:type use_4bit_quantization: Optional[bool], optional
:param use_8bit_quantization: whether to use 8-bit quantization, defaults to False
:type use_8bit_quantization: Optional[bool], optional
:param enable_peft: whether to enable the use of PEFT, defaults to False
:type enable_peft: Optional[bool], optional
:param peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB
:type peft_activation_reserve_space_size: Optional[int], optional
:param peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB
:type peft_weight_reserve_space_size: Optional[int], optional
:param profiling: whether to enable the FlexFlow profiling mode, defaults to False
:type profiling: Optional[bool], optional
:param inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
@@ -131,6 +143,9 @@ def init(
offload_reserve_space_size is not None,
use_4bit_quantization is not None,
use_8bit_quantization is not None,
enable_peft is not None,
peft_activation_reserve_space_size is not None,
peft_weight_reserve_space_size is not None,
profiling is not None,
inference_debugging is not None,
fusion is not None,
@@ -156,6 +171,9 @@ def init(
"offload_reserve_space_size": offload_reserve_space_size,
"use_4bit_quantization": use_4bit_quantization,
"use_8bit_quantization": use_8bit_quantization,
"enable_peft": enable_peft,
"peft_activation_reserve_space_size": peft_activation_reserve_space_size,
"peft_weight_reserve_space_size": peft_weight_reserve_space_size,
"profiling": profiling,
"inference_debugging": inference_debugging,
"fusion": fusion,
@@ -176,6 +194,8 @@ def init(
"tensor_parallelism_degree",
"pipeline_parallelism_degree",
"offload_reserve_space_size",
"peft_activation_reserve_space_size",
"peft_weight_reserve_space_size",
]
for param in positive_int_params:
__check_positive_int(configs_dict, param)
@@ -194,11 +214,17 @@ def init(
if configs_dict.get("offload", None) is None:
configs_dict["offload"] = False
if configs_dict.get("offload_reserve_space_size", None) is None:
configs_dict["offload_reserve_space_size"] = 1024**2
configs_dict["offload_reserve_space_size"] = 8*1024**3
if configs_dict.get("use_4bit_quantization", None) is None:
configs_dict["use_4bit_quantization"] = False
if configs_dict.get("use_8bit_quantization", None) is None:
configs_dict["use_8bit_quantization"] = False
if configs_dict.get("enable_peft", None) is None:
configs_dict["enable_peft"] = False
if configs_dict.get("peft_activation_reserve_space_size", None) is None:
configs_dict["peft_activation_reserve_space_size"] = 8*1024**3
if configs_dict.get("peft_weight_reserve_space_size", None) is None:
configs_dict["peft_weight_reserve_space_size"] = 1024**3
if configs_dict.get("profiling", None) is None:
configs_dict["profiling"] = False
if configs_dict.get("inference_debugging", None) is None:
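
Taken together, the new PEFT knobs added to serve.init can be exercised as follows; a minimal sketch assuming the keyword-argument form shown in the signature above (GPU and memory settings are omitted for brevity, the import alias is assumed, and the values are examples only, in MB per the docstring):

import flexflow.serve as ff  # import alias assumed

ff.init(
    tensor_parallelism_degree=1,
    pipeline_parallelism_degree=4,
    offload=False,
    offload_reserve_space_size=8 * 1024,      # MB -> 8 GB
    enable_peft=True,
    peft_activation_reserve_space_size=1024,  # MB -> 1 GB
    peft_weight_reserve_space_size=1024,      # MB -> 1 GB
    fusion=True,
)
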
(The remaining 35 changed files are not shown.)
