Skip to content

Commit

Permalink
Merge branch 'inference' into incr_fix
Browse files Browse the repository at this point in the history
  • Loading branch information
zwang86 authored Oct 9, 2023
2 parents 2bbb987 + 50ff264 commit da2ab09
Show file tree
Hide file tree
Showing 94 changed files with 1,109 additions and 1,270 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -185,3 +185,5 @@ gpt_tokenizer

# pip version
python/flexflow/version.txt

inference_tensors
2 changes: 1 addition & 1 deletion conda/pytorch-gpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ channels:
- defaults
- conda-forge
dependencies:
- python>=3.6
- python>=3.6,<3.12
- pip
- pip:
- numpy>=1.16.0
Expand Down
13 changes: 10 additions & 3 deletions include/flexflow/batch_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ class BatchConfig {
static int max_requests_per_batch();
static int max_tokens_per_batch();
static int max_sequence_length();
friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
void print() const;
void save_to_file(std::string const &filename) const;
virtual InferenceMode get_mode() const;
static BatchConfig const *from_future(BatchConfigFuture const &future);
// Maximum possible values for different parameters
Expand All @@ -55,9 +57,8 @@ class BatchConfig {
static int const MAX_NUM_REQUESTS = 64;
static int const MAX_NUM_TOKENS = 1024;

// These are set by update
// Set by update
int num_tokens;
bool loading_prompt = false;

struct PerRequestInfo {
int token_start_offset;
Expand All @@ -74,15 +75,18 @@ class BatchConfig {
PerTokenInfo tokensInfo[MAX_NUM_TOKENS];

bool request_completed[MAX_NUM_REQUESTS];
bool request_running[MAX_NUM_TOKENS];
bool request_running[MAX_NUM_REQUESTS];
};

class TreeVerifyBatchConfig : public BatchConfig {
public:
TreeVerifyBatchConfig();
~TreeVerifyBatchConfig();
InferenceMode get_mode() const;
friend std::ostream &operator<<(std::ostream &os,
TreeVerifyBatchConfig const &bc);
void print() const;
void save_to_file(std::string const &filename) const;
struct CommittedTokensInfo {
int token_index; // the index of the token in the previous batch
int request_index; // request index in the batch
Expand All @@ -108,7 +112,10 @@ class BeamSearchBatchConfig : public BatchConfig {

~BeamSearchBatchConfig();

friend std::ostream &operator<<(std::ostream &os,
BeamSearchBatchConfig const &bc);
void print() const;
void save_to_file(std::string const &filename) const;
bool done() const;
int max_beam_depth_all_requests() const;
int current_depth_all_requests() const;
Expand Down
1 change: 1 addition & 0 deletions include/flexflow/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ class FFConfig {
Legion::Runtime *lg_hlr;
// Legion::FieldSpace field_space;
bool syntheticInput, profiling, perform_fusion;
bool inference_debugging;
size_t simulator_work_space_size;
size_t search_budget;
float search_alpha;
Expand Down
4 changes: 2 additions & 2 deletions include/flexflow/fftype.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ class LayerID {
public:
static const LayerID NO_ID;
LayerID();
LayerID(size_t id, size_t transformer_layer_id);
LayerID(size_t id, size_t transformer_layer_id, size_t model_id);
bool is_valid_id() const;
friend bool operator==(LayerID const &lhs, LayerID const &rhs);

public:
size_t id, transformer_layer_id;
size_t id, transformer_layer_id, model_id;
};

}; // namespace FlexFlow
Expand Down
1 change: 1 addition & 0 deletions include/flexflow/layer.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class Layer {
bool trainableInputs[MAX_NUM_INPUTS];
int numInputs, numWeights, numOutputs;
bool profiling;
bool inference_debugging;

private:
std::unordered_map<std::string, long long> int_properties;
Expand Down
5 changes: 5 additions & 0 deletions include/flexflow/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -1234,6 +1234,8 @@ class FFModel {
std::unordered_map<size_t, NoOp *> cached_noop_ops;
std::unordered_map<size_t, NoOp *> cached_input_ops;
std::vector<MachineView> all_valid_views;
int model_id; // unique incremental id assigned to each model. Used in the
// inference_debugging mode.
#ifdef FF_USE_NCCL
std::unordered_map<size_t, ncclComm_t *> view_hash_to_nccl_comms;
#endif
Expand Down Expand Up @@ -1262,6 +1264,9 @@ class FFModel {
ElementUnary *
unary(OperatorType op, char const *name = NULL, float scalar = 0.0);
PCG::Node new_node(Op *);
static int model_counter; // number of instantiated FFModel objects. Used to
// assign a unique incremental id to each model.
// Used in the inference_debugging mode.
};

class UtilityTasks {
Expand Down
4 changes: 4 additions & 0 deletions include/flexflow/op_meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ class OpMeta {
public:
FFHandler handle;
bool profiling; // Measure the run time of the task
bool inference_debugging;
int decoding_step;
char op_name[MAX_OPNAME];
LayerID layer_guid;
bool trainableInputs[MAX_NUM_INPUTS];
DataType input_type[MAX_NUM_INPUTS];
DataType weight_type[MAX_NUM_WEIGHTS];
Expand Down
10 changes: 10 additions & 0 deletions include/flexflow/operator.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef _OPERATOR_H
#define _OPERATOR_H

#include "flexflow/accessor.h"
#include "flexflow/batch_config.h"
#include "flexflow/fftype.h"
#include "flexflow/machine_view.h"
Expand Down Expand Up @@ -183,6 +184,7 @@ class Op {
const ParallelTensor input4 = NULL);
Op(int guid,
bool profiling,
bool inference_debugging,
OperatorType otype,
DataType dtype,
char const *name,
Expand Down Expand Up @@ -225,6 +227,13 @@ class Op {
assert(false);
};
virtual void print_layer(FFModel const &model) = 0;
static void save_inference_tensors_to_file(
OpMeta *m,
int shard_id,
BatchConfig const *bc,
std::vector<GenericTensorAccessorR> input_tensors,
std::vector<GenericTensorAccessorR> weight_tensors,
std::vector<GenericTensorAccessorW> output_tensors);
virtual bool measure_operator_cost(Simulator *sim,
MachineView const &mv,
CostMetrics &cost_metrics) const = 0;
Expand Down Expand Up @@ -316,6 +325,7 @@ class Op {
std::map<ParallelTensor, OpMeta *[MAX_NUM_WORKERS]> inference_meta;
int numInputs, numWeights, numOutputs;
bool profiling;
bool inference_debugging;
bool add_bias_only_once;
#ifdef FF_USE_NCCL
ncclUniqueId ncclId;
Expand Down
1 change: 0 additions & 1 deletion include/flexflow/ops/add_bias_residual_layer_norm.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ class AddBiasResidualLayerNormMeta : public OpMeta {
int64_t effective_batch_size, effective_num_elements;
float eps;
void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

Expand Down
1 change: 0 additions & 1 deletion include/flexflow/ops/element_unary.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ class ElementUnaryMeta : public OpMeta {
DataType data_type;
bool inplace;
float scalar;
char op_name[MAX_OPNAME];
};

class ElementUnary : public Op {
Expand Down
3 changes: 0 additions & 3 deletions include/flexflow/ops/inc_multihead_self_attention.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,9 +185,6 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
bool *qk_prod_scaling;
bool *position_bias;
float scaling_factor;
#ifdef INFERENCE_TESTS
float *kcache, *vcache;
#endif
void *weight_ptr, *bias_ptr; // for weight offload
void *devQKVProjArray, *keyCache, *valueCache;
void *qk_prods, *qk_prods_softmax;
Expand Down
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/concat_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ class ConcatMeta : public OpMeta {
public:
ConcatMeta(FFHandler handle) : OpMeta(handle){};
int legion_axis;
char op_name[MAX_OPNAME];
};

namespace Kernels {
Expand Down
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/conv_2d_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ class Conv2DMeta : public OpMeta {
miopenConvBwdDataAlgorithm_t bwdDataAlgo;
#endif
bool relu, use_bias;
char op_name[MAX_OPNAME];
};

namespace Kernels {
Expand Down
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/element_binary_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ class ElementBinaryMeta : public OpMeta {
OperatorType op_type;
bool inplace_a, has_same_operands;
bool broadcast_input1, broadcast_input2;
char op_name[MAX_OPNAME];
};

namespace Kernels {
Expand Down
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/linear_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ class LinearMeta : public OpMeta {
RegularizerMode kernel_reg_type;
float kernel_reg_lambda;
bool use_bias, add_bias_only_once;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

Expand Down
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/pool_2d_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ class Pool2DMeta : public OpMeta {
ffActivationDescriptor_t actiDesc;
ffPoolingDescriptor_t poolDesc;
bool relu;
char op_name[MAX_OPNAME];
};

namespace Kernels {
Expand Down
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/residual_rms_norm_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ class ResidualRMSNormMeta : public OpMeta {
int in_dim;
int batch_size;
int num_elements;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

Expand Down
1 change: 0 additions & 1 deletion include/flexflow/ops/kernels/rms_norm_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ class RMSNormMeta : public OpMeta {
int in_dim;
int batch_size;
int num_elements;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

Expand Down
2 changes: 1 addition & 1 deletion include/flexflow/ops/kernels/softmax_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ class SoftmaxMeta : public OpMeta {
miopenTensorDescriptor_t outputTensor;
#endif
bool profiling;
bool inference_debugging;
int dim;
char op_name[MAX_OPNAME];
DataType input_type, output_type;
};

Expand Down
1 change: 0 additions & 1 deletion include/flexflow/ops/layer_norm.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ class LayerNormMeta : public OpMeta {
int64_t effective_batch_size, effective_num_elements;
float eps;
void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

Expand Down
1 change: 1 addition & 0 deletions include/flexflow/ops/linear.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ class Linear : public Op {
private:
Linear(int guid,
bool profiling,
bool inference_debugging,
const ParallelTensor input,
int out_dim,
ActiMode activation,
Expand Down
1 change: 0 additions & 1 deletion include/flexflow/ops/residual_layer_norm.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ class ResidualLayerNormMeta : public OpMeta {
int64_t effective_batch_size, effective_num_elements;
float eps;
void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

Expand Down
1 change: 0 additions & 1 deletion include/flexflow/ops/residual_rms_norm.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ class ResidualRMSNorm : public Op {

public:
float eps;
char op_name[MAX_OPNAME];
int effective_batch_size;
int dim, data_dim;
};
Expand Down
1 change: 0 additions & 1 deletion include/flexflow/ops/rms_norm.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ class RMSNorm : public Op {

public:
float eps;
char op_name[MAX_OPNAME];
int effective_batch_size;
int dim, data_dim;
};
Expand Down
1 change: 0 additions & 1 deletion include/flexflow/ops/sigmoid_silu_multi.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ class SigmoidSiluMultiMeta : public OpMeta {
~SigmoidSiluMultiMeta(void);

public:
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

Expand Down
3 changes: 3 additions & 0 deletions include/flexflow/utils/hip_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ __host__ void updateGAS(float *para_ptr,
template <typename T>
void print_tensor(T const *ptr, size_t num_elements, char const *prefix);

template <typename T>
void save_tensor(T const *ptr, size_t num_elements, char const *file_name);

template <typename T>
T *download_tensor(T const *ptr, size_t num_elements);

Expand Down
1 change: 1 addition & 0 deletions inference/python/incr_decoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def get_configs():
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"profiling": False,
"inference_debugging": False,
"fusion": True,
}
llm_configs = {
Expand Down
1 change: 1 addition & 0 deletions inference/python/spec_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def get_configs():
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"profiling": False,
"inference_debugging": False,
"fusion": True,
}
llm_configs = {
Expand Down
1 change: 1 addition & 0 deletions python/flexflow/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
"num_cpus": "-ll:cpu",
"legion_utility_processors": "-ll:util",
"profiling": "--profiling",
"inference_debugging": "--inference-debugging",
"fusion": "--fusion",
"disable_control_replication": "--disable-control-replication",
# Training args
Expand Down
8 changes: 8 additions & 0 deletions python/flexflow/serve/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def init(
use_4bit_quantization: Optional[bool] = None,
use_8bit_quantization: Optional[bool] = None,
profiling: Optional[bool] = None,
inference_debugging: Optional[bool] = None,
fusion: Optional[bool] = None,
):
"""
Expand All @@ -71,6 +72,7 @@ def init(
- use_4bit_quantization: whether to use 4-bit quantization, defaults to False
- use_8bit_quantization: whether to use 8-bit quantization, defaults to False
- profiling: whether to enable the FlexFlow profiling mode, defaults to False
- inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
- fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True
The configurations are passed down to the FlexFlow runtime (implemented in C++) via command line arguments.
Expand Down Expand Up @@ -104,6 +106,8 @@ def init(
:type use_8bit_quantization: Optional[bool], optional
:param profiling: whether to enable the FlexFlow profiling mode, defaults to False
:type profiling: Optional[bool], optional
:param inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
:type inference_debugging: Optional[bool], optional
:param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True
:type fusion: Optional[bool], optional
Expand All @@ -128,6 +132,7 @@ def init(
use_4bit_quantization is not None,
use_8bit_quantization is not None,
profiling is not None,
inference_debugging is not None,
fusion is not None,
]
):
Expand All @@ -152,6 +157,7 @@ def init(
"use_4bit_quantization": use_4bit_quantization,
"use_8bit_quantization": use_8bit_quantization,
"profiling": profiling,
"inference_debugging": inference_debugging,
"fusion": fusion,
}

Expand Down Expand Up @@ -195,6 +201,8 @@ def init(
configs_dict["use_8bit_quantization"] = False
if configs_dict.get("profiling", None) is None:
configs_dict["profiling"] = False
if configs_dict.get("inference_debugging", None) is None:
configs_dict["inference_debugging"] = False
if configs_dict.get("fusion", None) is None:
configs_dict["fusion"] = True

Expand Down
Loading

0 comments on commit da2ab09

Please sign in to comment.