Skip to content

Commit

Permalink
Merge branch 'peft' into fix_cublas_default
Browse files Browse the repository at this point in the history
  • Loading branch information
goliaro authored Nov 2, 2023
2 parents 65f497a + bf78ea4 commit 0a9b983
Show file tree
Hide file tree
Showing 194 changed files with 10,521 additions and 1,501 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -186,4 +186,6 @@ gpt_tokenizer
# pip version
python/flexflow/version.txt

inference_tensors
inference_tensors

Untitled-1.ipynb
6 changes: 6 additions & 0 deletions conda/flexflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,9 @@ dependencies:
- sentencepiece
- einops
- requests
- scipy
- bitsandbytes
- datasets
- accelerate
- loralib
- peft
2 changes: 2 additions & 0 deletions docker/flexflow-environment/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind1
RUN conda install pytorch torchvision torchaudio -c pytorch
RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops
RUN pip3 install tensorflow notebook
# PEFT-related
RUN pip3 install scipy bitsandbytes datasets accelerate loralib peft

# Install Rust
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
Expand Down
15 changes: 15 additions & 0 deletions include/flexflow/batch_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include "flexflow/ffconst.h"
#include "flexflow/fftype.h"
#include "legion.h"
#include <cstddef>
#include <cstdlib>
Expand Down Expand Up @@ -43,6 +44,8 @@ class BatchConfig {
BatchConfig();
int num_active_requests() const;
int num_active_tokens() const;
int num_active_infr_tokens() const;
int num_active_peft_tokens() const;
static int max_requests_per_batch();
static int max_tokens_per_batch();
static int max_sequence_length();
Expand All @@ -61,11 +64,23 @@ class BatchConfig {
int num_tokens;

struct PerRequestInfo {
PerRequestInfo() {
first_token_depth_in_request = 0;
first_token_offset_in_batch = 0;
num_tokens_in_batch = 0;
max_sequence_length = 0;
request_guid = 0;
peft_model_id = PEFTModelID::NO_ID;
peft_bwd = false;
}
int first_token_depth_in_request;
int first_token_offset_in_batch;
int num_tokens_in_batch;
int max_sequence_length;
RequestGuid request_guid;
// PEFT fields
PEFTModelID peft_model_id;
bool peft_bwd;
};
struct PerTokenInfo {
int abs_depth_in_request;
Expand Down
15 changes: 14 additions & 1 deletion include/flexflow/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

#ifndef _FLEXFLOW_CONFIG_H_
#define _FLEXFLOW_CONFIG_H_
#include "ffconst.h"
#include "flexflow/ffconst.h"
#include "legion.h"
#include <cstring>
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
Expand Down Expand Up @@ -64,6 +64,8 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS;
#endif

class FFConfig;
class MemoryAllocator;
class PEFTWeightAllocator;

struct FFHandler {
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
Expand All @@ -77,6 +79,12 @@ struct FFHandler {
size_t workSpaceSize;
void *offload_reserve_space;
size_t offload_reserve_space_size;
// PEFT related fields
MemoryAllocator *peft_activation_allocator;
size_t peft_activation_reserve_space_size;
PEFTWeightAllocator *peft_weight_allocator;
size_t peft_weight_reserve_space_size;
// Quantization fields
DataType quantization_type;
bool allowTensorOpMathConversion;
#ifdef FF_USE_NCCL
Expand All @@ -87,6 +95,8 @@ struct FFHandler {
struct FFInitInfo {
size_t workSpaceSize;
size_t offload_reserve_space_size;
size_t peft_activation_reserve_space_size;
size_t peft_weight_reserve_space_size;
DataType quantization_type;
bool allowTensorOpMathConversion;
// int myRank, allRanks;
Expand Down Expand Up @@ -143,6 +153,9 @@ class FFConfig {
bool cpu_offload;
size_t offload_reserve_space_size;
DataType quantization_type;
// PEFT related fields
size_t peft_activation_reserve_space_size;
size_t peft_weight_reserve_space_size;
// Control parallelizable dimensions
bool only_data_parallel;
bool enable_sample_parallel;
Expand Down
11 changes: 11 additions & 0 deletions include/flexflow/ffconst.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ enum LossType {
LOSS_IDENTITY = 54,
};

// Optimizer selection (introduced for PEFT training support).
// Explicit values keep this enum's range disjoint from its neighbors in
// this header (LossType ends at 54, CompMode starts at 70) — presumably
// for stable serialization of the constants; confirm before renumbering.
enum OptimizerType {
  OPTIMIZER_TYPE_NONE = 60,
  OPTIMIZER_TYPE_SGD = 61,
  OPTIMIZER_TYPE_ADAM = 62,
};

enum CompMode {
COMP_MODE_TRAINING = 70,
COMP_MODE_INFERENCE = 71,
Expand Down Expand Up @@ -172,6 +178,9 @@ enum OperatorType {
OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION,
OP_TREE_INC_MULTIHEAD_SELF_ATTENTION,
OP_SAMPLING,
// PEFT Ops
OP_LORA_MLP_FIRST,
OP_LORA_MLP_SECOND,
// Parallel Ops
OP_REPARTITION,
OP_COMBINE,
Expand Down Expand Up @@ -269,5 +278,7 @@ enum {
TENSOR_GUID_LAST_VALID = 3999999,
PARALLEL_TENSOR_GUID_FIRST_VALID = 4000000,
NODE_GUID_FIRST_VALID = 5000000,
PEFT_MODEL_ID_FIRST_VALID = 6000000,
PEFT_MODEL_ID_LAST_VALID = 6999999
};
#endif // _FLEXFLOW_CONST_H_
24 changes: 24 additions & 0 deletions include/flexflow/fftype.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "flexflow/ffconst.h"
#include <cstddef>
#include <functional>

namespace FlexFlow {

Expand All @@ -18,6 +19,29 @@ class LayerID {
size_t id, transformer_layer_id, model_id;
};

// Identifier for a registered PEFT (parameter-efficient fine-tuning) model.
// NO_ID is the sentinel meaning "no adapter attached". Ids presumably come
// from the PEFT_MODEL_ID_* guid range declared in ffconst.h — TODO confirm.
class PEFTModelID {
public:
  static const PEFTModelID NO_ID; // sentinel: no PEFT model
  PEFTModelID();
  // NOTE(review): non-explicit single-arg ctor allows implicit conversion
  // from size_t — consider marking it explicit if no caller relies on it.
  PEFTModelID(size_t id);
  bool is_valid_id() const;
  friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs);
  friend std::ostream &operator<<(std::ostream &os,
                                  PEFTModelID const &peft_model_id);

public:
  size_t id;
};

}; // namespace FlexFlow

namespace std {
// Hash support so PEFTModelID can be used as a key in unordered containers.
// The wrapped id is already a size_t, so the identity mapping suffices.
template <>
struct hash<FlexFlow::PEFTModelID> {
  size_t operator()(FlexFlow::PEFTModelID const &model_id) const {
    return static_cast<size_t>(model_id.id);
  }
};
} // namespace std

#endif // _FF_TYPE_H
2 changes: 1 addition & 1 deletion include/flexflow/layer.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class Layer {
Tensor outputs[MAX_NUM_OUTPUTS];
Tensor inputs[MAX_NUM_INPUTS];
Tensor weights[MAX_NUM_WEIGHTS];
bool trainableInputs[MAX_NUM_INPUTS];
// bool trainable_inputs[MAX_NUM_INPUTS];
int numInputs, numWeights, numOutputs;
bool profiling;
bool inference_debugging;
Expand Down
49 changes: 45 additions & 4 deletions include/flexflow/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,26 +106,39 @@ enum TaskIDs {
LAYERNORM_FWD_TASK_ID,
LAYERNORM_INF_TASK_ID,
LAYERNORM_BWD_TASK_ID,
LAYERNORM_PEFT_BWD_TASK_ID,
RESIDUAL_LAYERNORM_INIT_TASK_ID,
RESIDUAL_LAYERNORM_INF_TASK_ID,
RESIDUAL_LAYERNORM_BWD_TASK_ID,
RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID,
ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID,
ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID,
ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID,
ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID,
SIGMOID_SILU_MULTI_INIT_TASK_ID,
SIGMOID_SILU_MULTI_INF_TASK_ID,
SIGMOID_SILU_MULTI_BWD_TASK_ID,
SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID,
LINEAR_INIT_TASK_ID,
LINEAR_INIT_PARA_TASK_ID,
LINEAR_INF_TASK_ID,
LINEAR_PEFT_BWD_TASK_ID,
LINEAR_FWD_TASK_ID,
LINEAR_BWD_TASK_ID,
LINEAR_BWD2_TASK_ID,
LINEAR_UPD_TASK_ID,
LORA_LINEAR_INIT_TASK_ID,
LORA_LINEAR_REG_TASK_ID,
LORA_LINEAR_INF_TASK_ID,
LORA_LINEAR_PEFT_BWD_TASK_ID,
FLAT_INIT_TASK_ID,
FLAT_FWD_TASK_ID,
FLAT_BWD_TASK_ID,
SOFTMAX_INIT_TASK_ID,
SOFTMAX_FWD_TASK_ID,
SOFTMAX_BWD_TASK_ID,
SOFTMAX_INF_TASK_ID,
SOFTMAX_PEFT_BWD_TASK_ID,
CONCAT_INIT_TASK_ID,
CONCAT_FWD_TASK_ID,
CONCAT_BWD_TASK_ID,
Expand Down Expand Up @@ -160,20 +173,26 @@ enum TaskIDs {
RMSNORM_INIT_TASK_ID,
RMSNORM_FWD_TASK_ID,
RMSNORM_INF_TASK_ID,
RMSNORM_BWD_TASK_ID,
RMSNORM_PEFT_BWD_TASK_ID,
RESIDUAL_RMSNORM_INIT_TASK_ID,
RESIDUAL_RMSNORM_INF_TASK_ID,
RESIDUAL_RMSNORM_BWD_TASK_ID,
RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID,
BEAM_TOPK_INIT_TASK_ID,
BEAM_TOPK_INF_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID,
SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
MSELOSS_BWD_TASK_ID,
FUSEDOP_INIT_TASK_ID,
FUSEDOP_PEFT_BWD_TASK_ID,
FUSEDOP_FWD_TASK_ID,
FUSEDOP_BWD_TASK_ID,
FUSEDOP_INF_TASK_ID,
Expand Down Expand Up @@ -231,9 +250,10 @@ enum TaskIDs {
PIPELINE_FWD_TASK_ID,
PIPELINE_BWD_TASK_ID,
ALLREDUCE_INIT_TASK_ID,
ALLREDUCE_INF_TASK_ID,
ALLREDUCE_FWD_TASK_ID,
ALLREDUCE_BWD_TASK_ID,
ALLREDUCE_INF_TASK_ID,
ALLREDUCE_PEFT_BWD_TASK_ID,
FUSED_PARALLELOP_INIT_TASK_ID,
FUSED_PARALLELOP_FWD_TASK_ID,
FUSED_PARALLELOP_BWD_TASK_ID,
Expand Down Expand Up @@ -321,6 +341,7 @@ class ResidualLayerNorm;
class AddBiasResidualLayerNorm;
class SigmoidSiluMulti;
class Linear;
class LoraLinear;
class MultiHeadAttention;
class IncMultiHeadSelfAttention;
class TreeIncMultiHeadSelfAttention;
Expand Down Expand Up @@ -800,10 +821,26 @@ class FFModel {
bool position_bias = false,
char const *name = NULL);
// ========================================
// PEFT Layers
// ========================================
void lora_linear(Tensor const input,
Tensor const output,
OperatorType _type,
char const *name = nullptr);
// ========================================
// Inference APIs
// ========================================
GenerationResult generate(std::vector<std::string> &prompts,
int max_seq_length);
GenerationResult generate(std::string const &prompts,
int max_seq_length,
PEFTModelID peft_model_id = PEFTModelID::NO_ID);

GenerationResult generate(std::vector<std::string> const &prompts,
int max_seq_length,
PEFTModelID peft_model_id = PEFTModelID::NO_ID);

PEFTModelID register_peft_model(
LoraLinearConfig const mlp_first = LoraLinearConfig::DefaultConfig,
LoraLinearConfig const mlp_second = LoraLinearConfig::DefaultConfig);

Tensor create_tensor_legion_ordering(int num_dim,
int const dims[],
Expand Down Expand Up @@ -1100,7 +1137,7 @@ class FFModel {
void clear_graph_search_cache();

public:
size_t op_global_guid, layer_global_guid;
size_t op_global_guid, layer_global_guid, peft_model_global_guid;
size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid;
size_t current_transformer_layer_id;
// positional embedding start offset
Expand Down Expand Up @@ -1178,6 +1215,10 @@ class FFModel {
SigmoidSiluMulti *>,
std::unordered_map<std::pair<ParallelTensorShape, LinearParams>,
Linear *>,
std::unordered_map<
std::pair<std::pair<ParallelTensorShape, ParallelTensorShape>,
LoraLinearParams>,
LoraLinear *>,
std::unordered_map<std::pair<ParallelTensorShape, Pool2DParams>,
Pool2D *>,
std::unordered_map<std::pair<std::tuple<ParallelTensorShape,
Expand Down
5 changes: 3 additions & 2 deletions include/flexflow/op_meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class Op;

class OpMeta {
public:
OpMeta(FFHandler _handle);
// OpMeta(FFHandler _handle);
OpMeta(FFHandler _handle, Op const *op);

public:
Expand All @@ -19,7 +19,8 @@ class OpMeta {
int decoding_step;
char op_name[MAX_OPNAME];
LayerID layer_guid;
bool trainableInputs[MAX_NUM_INPUTS];
bool trainable_inputs[MAX_NUM_INPUTS];
bool reset_input_grads[MAX_NUM_INPUTS];
DataType input_type[MAX_NUM_INPUTS];
DataType weight_type[MAX_NUM_WEIGHTS];
DataType output_type[MAX_NUM_OUTPUTS];
Expand Down
10 changes: 9 additions & 1 deletion include/flexflow/operator.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,13 @@ class Op {
MachineView const *mv = nullptr) {
assert(false);
};
// Default PEFT backward pass. Operators that support PEFT fine-tuning
// override this; the base implementation aborts via assert(false), so
// reaching it at runtime indicates the operator lacks PEFT support.
virtual Legion::FutureMap peft_bwd(FFModel const &,
                                   BatchConfigFuture const &,
                                   std::vector<ParallelTensor> const &,
                                   std::vector<ParallelTensor> const &,
                                   MachineView const *mv = nullptr) {
  assert(false);
}
virtual void print_layer(FFModel const &model) = 0;
static void save_inference_tensors_to_file(
OpMeta *m,
Expand Down Expand Up @@ -320,7 +327,8 @@ class Op {
ParallelTensor outputs[MAX_NUM_OUTPUTS];
ParallelTensor inputs[MAX_NUM_INPUTS];
ParallelParameter weights[MAX_NUM_WEIGHTS];
bool trainableInputs[MAX_NUM_INPUTS];
bool trainable_inputs[MAX_NUM_INPUTS];
bool reset_input_grads[MAX_NUM_INPUTS];
OpMeta *meta[MAX_NUM_WORKERS];
std::map<ParallelTensor, OpMeta *[MAX_NUM_WORKERS]> inference_meta;
int numInputs, numWeights, numOutputs;
Expand Down
2 changes: 2 additions & 0 deletions include/flexflow/operator_params.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "flexflow/ops/inc_multihead_self_attention_params.h"
#include "flexflow/ops/layer_norm_params.h"
#include "flexflow/ops/linear_params.h"
#include "flexflow/ops/lora_linear_params.h"
#include "flexflow/ops/pool_2d_params.h"
#include "flexflow/ops/reduce_params.h"
#include "flexflow/ops/reshape_params.h"
Expand Down Expand Up @@ -67,6 +68,7 @@ using OperatorParameters = mp::variant<AggregateParams,
AddBiasResidualLayerNormParams,
SigmoidSiluMultiParams,
LinearParams,
LoraLinearParams,
MultiHeadAttentionParams,
IncMultiHeadSelfAttentionParams,
BeamTopKParams,
Expand Down
Loading

0 comments on commit 0a9b983

Please sign in to comment.