Merge branch 'inference' into optimize_attn
goliaro authored Sep 22, 2023
2 parents 238b6bd + a4f2588 commit 1f22002
Showing 73 changed files with 3,222 additions and 1,539 deletions.
1 change: 1 addition & 0 deletions include/flexflow/ffconst.h
@@ -159,6 +159,7 @@ enum OperatorType {
OP_POW, // https://pytorch.org/docs/stable/generated/torch.pow.html
OP_MEAN, // https://pytorch.org/docs/stable/generated/torch.mean.html
OP_LAYERNORM,
OP_ADD_BIAS_RESIDUAL_LAYERNORM,
OP_EXPERTS,
OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html
OP_RMS_NORM,
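
The new enum value will be picked up wherever FlexFlow switches over OperatorType. As a hypothetical illustration of such a consumer (the helper below is not an existing FlexFlow function):

#include "flexflow/ffconst.h"

// Hypothetical pretty-printer showing how the new operator type is dispatched on.
inline char const *op_type_label(OperatorType t) {
  switch (t) {
    case OP_LAYERNORM:
      return "LayerNorm";
    case OP_ADD_BIAS_RESIDUAL_LAYERNORM:
      return "AddBiasResidualLayerNorm";
    case OP_RMS_NORM:
      return "RMSNorm";
    default:
      return "Other";
  }
}
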
14 changes: 11 additions & 3 deletions include/flexflow/flexflow_c.h
@@ -259,6 +259,17 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle,
bool use_bias,
char const *name);

flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm(
flexflow_model_t handle,
const flexflow_tensor_t input,
const flexflow_tensor_t residual,
int n,
int *axes,
bool elementwise_affine,
float eps,
bool use_bias,
char const *name);

flexflow_tensor_t
flexflow_model_add_batch_matmul(flexflow_model_t handle,
const flexflow_tensor_t a,
@@ -972,9 +983,6 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_);

void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_,
flexflow_model_t model_handle_,
int num_layers,
char const **layer_names,
flexflow_op_t *layers,
bool use_full_precision);

#ifdef __cplusplus
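
For orientation, a minimal call sketch for the new C binding follows. Only the signature above comes from this commit; the handle names, the axis choice, and the assumption that the returned array holds the residual-added activation followed by the normalized tensor are illustrative.

#include "flexflow/flexflow_c.h"

// Hedged usage sketch: `model`, `attn_output`, and `residual` are handles
// assumed to have been created elsewhere through the existing C API.
flexflow_tensor_t *add_fused_post_attn_norm(flexflow_model_t model,
                                            flexflow_tensor_t attn_output,
                                            flexflow_tensor_t residual) {
  int axes[1] = {0}; // assumption: normalize over the innermost dimension
  flexflow_tensor_t *outs = flexflow_model_add_add_bias_residual_layer_norm(
      model, attn_output, residual,
      /*n=*/1, axes,
      /*elementwise_affine=*/true,
      /*eps=*/1e-5f,
      /*use_bias=*/true,
      /*name=*/"post_attn_norm");
  // outs is assumed to hold two tensors: the residual-added activation and the
  // layer-normalized output (the ordering is not confirmed by this header).
  return outs;
}
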
21 changes: 21 additions & 0 deletions include/flexflow/model.h
@@ -52,10 +52,12 @@ enum TaskIDs {
LOAD_IMAGES_TASK_ID,
NORMALIZE_IMAGES_TASK_ID,
ELEMENTBINARY_INIT_TASK_ID,
ELEMENTBINARY_INF_TASK_ID,
ELEMENTBINARY_FWD_TASK_ID,
ELEMENTBINARY_BWD_TASK_ID,
ELEMENTUNARY_INIT_TASK_ID,
ELEMENTUNARY_FWD_TASK_ID,
ELEMENTUNARY_INF_TASK_ID,
ELEMENTUNARY_BWD_TASK_ID,
EXPERTS_INIT_TASK_ID,
EXPERTS_FWD_TASK_ID,
@@ -102,7 +104,10 @@ enum TaskIDs {
BATCHMATMUL_BWD_TASK_ID,
LAYERNORM_INIT_TASK_ID,
LAYERNORM_FWD_TASK_ID,
LAYERNORM_INF_TASK_ID,
LAYERNORM_BWD_TASK_ID,
ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID,
ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID,
LINEAR_INIT_TASK_ID,
LINEAR_INIT_PARA_TASK_ID,
LINEAR_INF_TASK_ID,
@@ -150,6 +155,7 @@ enum TaskIDs {
ATTENTION_BWD_TASK_ID,
RMSNROM_INIT_TASK_ID,
RMSNROM_FWD_TASK_ID,
RMSNROM_INF_TASK_ID,
BEAM_TOPK_INIT_TASK_ID,
BEAM_TOPK_INF_TASK_ID,
INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
@@ -305,6 +311,7 @@ class Flat;
class Gather;
class Group_by;
class LayerNorm;
class AddBiasResidualLayerNorm;
class Linear;
class MultiHeadAttention;
class IncMultiHeadSelfAttention;
@@ -528,6 +535,16 @@ class FFModel {
bool use_bias = true,
DataType data_type = DT_NONE,
char const *name = NULL);
// Add a add_bias_residual_layer_norm layer
void add_bias_residual_layer_norm(const Tensor input,
const Tensor residual,
Tensor *outputs,
std::vector<int> const &axes,
bool elementwise_affine,
float eps,
bool use_bias = true,
DataType data_type = DT_NONE,
char const *name = NULL);
// Add a batch_norm layer
Tensor
batch_norm(const Tensor input, bool relu = true, char const *name = NULL);
@@ -1111,6 +1128,10 @@ class FFModel {
Group_by *>,
std::unordered_map<std::pair<ParallelTensorShape, LayerNormParams>,
LayerNorm *>,
std::unordered_map<
std::pair<std::pair<ParallelTensorShape, ParallelTensorShape>,
AddBiasResidualLayerNormParams>,
AddBiasResidualLayerNorm *>,
std::unordered_map<std::pair<ParallelTensorShape, LinearParams>,
Linear *>,
std::unordered_map<std::pair<ParallelTensorShape, Pool2DParams>,
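
A usage sketch for the new FFModel builder method: the declaration above is from the diff, while the two-element output array, the meaning of each output, and the axis choice are assumptions made for illustration.

#include <vector>

#include "flexflow/model.h"

using namespace FlexFlow;

// Hedged sketch of wiring the fused op into a transformer block: `attn_out` is
// the attention output projection (bias not yet applied) and `block_input` is
// the residual branch.
Tensor fused_post_attention_norm(FFModel &ff, Tensor attn_out, Tensor block_input) {
  Tensor outs[2];              // assumption: the op produces two outputs
  std::vector<int> axes = {0}; // assumption: normalize over the hidden dimension
  ff.add_bias_residual_layer_norm(attn_out,
                                  block_input,
                                  outs,
                                  axes,
                                  /*elementwise_affine=*/true,
                                  /*eps=*/1e-6f,
                                  /*use_bias=*/true);
  // outs[0] is assumed to be attn_out + attn_bias + block_input (it feeds the
  // next residual connection); outs[1] is assumed to be the normalized tensor.
  return outs[1];
}
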
2 changes: 2 additions & 0 deletions include/flexflow/operator_params.h
@@ -1,6 +1,7 @@
#ifndef _OPERATOR_PARAMS_H
#define _OPERATOR_PARAMS_H

#include "flexflow/ops/add_bias_residual_layer_norm_params.h"
#include "flexflow/ops/aggregate_params.h"
#include "flexflow/ops/aggregate_spec_params.h"
#include "flexflow/ops/arg_topk_params.h"
@@ -59,6 +60,7 @@ using OperatorParameters = mp::variant<AggregateParams,
GatherParams,
Group_byParams,
LayerNormParams,
AddBiasResidualLayerNormParams,
LinearParams,
MultiHeadAttentionParams,
IncMultiHeadSelfAttentionParams,
113 changes: 113 additions & 0 deletions include/flexflow/ops/add_bias_residual_layer_norm.h
@@ -0,0 +1,113 @@
#pragma once

#include "flexflow/inference.h"
#include "flexflow/model.h"
#include "flexflow/utils/memory_allocator.h"
namespace FlexFlow {

class AddBiasResidualLayerNormMeta;

class AddBiasResidualLayerNorm : public Op {
public:
using Params = AddBiasResidualLayerNormParams;
using Input = std::pair<ParallelTensor, ParallelTensor>;
AddBiasResidualLayerNorm(FFModel &model,
Params const &params,
Input const &inputs,
char const *name = nullptr,
bool allocate_weights = false);
AddBiasResidualLayerNorm(FFModel &model,
LayerID const &_layer_guid,
const ParallelTensor _input,
const ParallelTensor _residual,
std::vector<int> const &axes,
bool _elementwise_affine,
bool _use_bias,
float _eps,
bool allocate_weights,
char const *name);
void init(FFModel const &) override;
void init_inference(FFModel const &,
std::vector<ParallelTensor> const &,
std::vector<ParallelTensor> const &,
MachineView const *mv = nullptr) override;
void forward(FFModel const &) override;
void backward(FFModel const &) override;
Legion::FutureMap inference(FFModel const &,
BatchConfigFuture const &,
std::vector<ParallelTensor> const &,
std::vector<ParallelTensor> const &,
MachineView const *mv = nullptr) override;
void print_layer(FFModel const &model) override {
assert(0);
}
static Op *
create_operator_from_layer(FFModel &model,
Layer const *layer,
std::vector<ParallelTensor> const &inputs);
void serialize(Legion::Serializer &) const override;
static PCG::Node deserialize(FFModel &ff,
Legion::Deserializer &d,
ParallelTensor inputs[],
int num_inputs);

AddBiasResidualLayerNormParams get_params() const;

static OpMeta *init_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void inference_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
bool measure_operator_cost(Simulator *sim,
MachineView const &pc,
CostMetrics &cost_metrics) const override;
template <typename T>
static void inference_kernel(AddBiasResidualLayerNormMeta const *m,
int attn_bias_dim,
int residual_volume,
T const *input_ptr,
T const *attn_bias_ptr,
T const *residual_ptr,
T *added_output_ptr,
T *output_ptr,
T const *gamma_ptr,
T const *beta_ptr,
ffStream_t stream);
static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta const *m,
int attn_bias_dim,
int residual_volume,
GenericTensorAccessorR const &input,
GenericTensorAccessorW &added_output,
GenericTensorAccessorW &output,
GenericTensorAccessorR const &residual,
GenericTensorAccessorR const &attn_bias,
GenericTensorAccessorR const &gamma,
GenericTensorAccessorR const &beta);

public:
bool elementwise_affine, use_bias;
int64_t effective_batch_size, effective_num_elements;
float eps;
std::vector<int> axes;
};

class AddBiasResidualLayerNormMeta : public OpMeta {
public:
AddBiasResidualLayerNormMeta(FFHandler handle,
AddBiasResidualLayerNorm const *ln,
MemoryAllocator &gpu_mem_allocator);
~AddBiasResidualLayerNormMeta(void);

public:
bool elementwise_affine, use_bias;
int64_t effective_batch_size, effective_num_elements;
float eps;
void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr;
char op_name[MAX_OPNAME];
Realm::RegionInstance reserveInst;
};

}; // namespace FlexFlow
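
From the inference_kernel signature (input, attn_bias, and residual in; added_output and output out, plus gamma and beta), the fused operator appears to add the attention-projection bias and the residual to the input and then apply a standard layer norm; the separate attn_bias_dim argument suggests the bias is broadcast across rows. A CPU reference sketch of that interpretation, not the CUDA kernel shipped in this commit:

#include <cmath>
#include <vector>

// Reference semantics for a single row of size hidden_dim:
//   added  = input + attn_bias + residual
//   output = gamma * (added - mean(added)) / sqrt(var(added) + eps) + beta
void add_bias_residual_layer_norm_ref(std::vector<float> const &input,
                                      std::vector<float> const &attn_bias,
                                      std::vector<float> const &residual,
                                      std::vector<float> const &gamma,
                                      std::vector<float> const &beta,
                                      float eps,
                                      std::vector<float> &added,
                                      std::vector<float> &output) {
  size_t n = input.size();
  added.resize(n);
  output.resize(n);
  float mean = 0.f, var = 0.f;
  for (size_t i = 0; i < n; i++) {
    added[i] = input[i] + attn_bias[i] + residual[i];
    mean += added[i];
  }
  mean /= n;
  for (size_t i = 0; i < n; i++) {
    float d = added[i] - mean;
    var += d * d;
  }
  var /= n;
  float rstd = 1.f / std::sqrt(var + eps);
  for (size_t i = 0; i < n; i++) {
    output[i] = (added[i] - mean) * rstd * gamma[i] + beta[i];
  }
}
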
29 changes: 29 additions & 0 deletions include/flexflow/ops/add_bias_residual_layer_norm_params.h
@@ -0,0 +1,29 @@
#pragma once

#include "flexflow/ffconst.h"
#include "flexflow/fftype.h"
#include "flexflow/parallel_tensor.h"

namespace FlexFlow {

struct AddBiasResidualLayerNormParams {
LayerID layer_guid;
std::vector<int> axes;
bool elementwise_affine;
float eps;
bool use_bias;
bool is_valid(
std::pair<ParallelTensorShape, ParallelTensorShape> const &) const;
};

bool operator==(AddBiasResidualLayerNormParams const &,
AddBiasResidualLayerNormParams const &);

} // namespace FlexFlow

namespace std {
template <>
struct hash<FlexFlow::AddBiasResidualLayerNormParams> {
size_t operator()(FlexFlow::AddBiasResidualLayerNormParams const &) const;
};
} // namespace std
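
The std::hash specialization is only declared here; its definition lives in the corresponding .cc file, which is not part of this excerpt. The usual pattern is to combine every field that distinguishes one operator instance from another. The sketch below illustrates that pattern generically and is not the FlexFlow implementation:

#include <cstddef>
#include <functional>
#include <vector>

// Boost-style hash combine; illustrative only.
inline void hash_combine(std::size_t &seed, std::size_t v) {
  seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

// Sketch of hashing fields shaped like AddBiasResidualLayerNormParams
// (layer id, axes, flags, eps); the real definition may differ.
inline std::size_t hash_layer_norm_like_params(std::size_t layer_id,
                                               std::vector<int> const &axes,
                                               bool elementwise_affine,
                                               float eps,
                                               bool use_bias) {
  std::size_t seed = std::hash<std::size_t>{}(layer_id);
  for (int a : axes) {
    hash_combine(seed, std::hash<int>{}(a));
  }
  hash_combine(seed, std::hash<bool>{}(elementwise_affine));
  hash_combine(seed, std::hash<float>{}(eps));
  hash_combine(seed, std::hash<bool>{}(use_bias));
  return seed;
}
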
1 change: 1 addition & 0 deletions include/flexflow/ops/beam_topk_params.h
@@ -2,6 +2,7 @@
#define _FLEXFLOW_BEAM_TOPK_PARAMS_H

#include "flexflow/ffconst.h"
#include "flexflow/fftype.h"
#include "flexflow/parallel_tensor.h"

namespace FlexFlow {
4 changes: 4 additions & 0 deletions include/flexflow/ops/element_binary.h
@@ -56,6 +56,10 @@ class ElementBinary : public Op {
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void inference_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void backward_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
1 change: 1 addition & 0 deletions include/flexflow/ops/element_binary_params.h
@@ -2,6 +2,7 @@
#define _FLEXFLOW_ELEMENT_BINARY_PARAMS_H

#include "flexflow/ffconst.h"
#include "flexflow/fftype.h"
#include "flexflow/parallel_tensor.h"

namespace FlexFlow {
4 changes: 4 additions & 0 deletions include/flexflow/ops/element_unary.h
@@ -77,6 +77,10 @@ class ElementUnary : public Op {
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void inference_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void backward_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
1 change: 1 addition & 0 deletions include/flexflow/ops/element_unary_params.h
@@ -2,6 +2,7 @@
#define _FLEXFLOW_ELEMENTARY_UNARY_PARAMS_H

#include "flexflow/ffconst.h"
#include "flexflow/fftype.h"
#include "flexflow/parallel_tensor.h"

namespace FlexFlow {
2 changes: 2 additions & 0 deletions include/flexflow/ops/experts_params.h
@@ -1,5 +1,7 @@
#pragma once

#include "flexflow/ffconst.h"
#include "flexflow/fftype.h"
#include "flexflow/operator.h"
#include "flexflow/parallel_tensor.h"

2 changes: 2 additions & 0 deletions include/flexflow/ops/gather_params.h
@@ -1,6 +1,8 @@
#ifndef _FLEXFLOW_GATHER_PARAMS_H
#define _FLEXFLOW_GATHER_PARAMS_H

#include "flexflow/ffconst.h"
#include "flexflow/fftype.h"
#include "flexflow/parallel_tensor.h"

namespace FlexFlow {
19 changes: 10 additions & 9 deletions include/flexflow/ops/inc_multihead_self_attention.h
@@ -36,8 +36,8 @@ class IncMultiHeadSelfAttention : public Op {
int _kdim,
int _vdim,
float _dropout,
bool _bias,
bool _add_bias_kv,
bool _qkv_bias,
bool _final_bias,
bool _add_zero_attn,
bool _apply_rotary_embedding,
bool _scaling_query,
@@ -58,8 +58,8 @@ class IncMultiHeadSelfAttention : public Op {
int _kdim,
int _vdim,
float _dropout,
bool _bias,
bool _add_bias_kv,
bool _qkv_bias,
bool _final_bias,
bool _add_zero_attn,
bool _apply_rotary_embedding,
bool _scaling_query,
@@ -125,8 +125,8 @@ class IncMultiHeadSelfAttention : public Op {
public:
int num_q_heads, num_kv_heads, tensor_parallelism_degree;
float dropout, scaling_factor;
bool bias;
bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query,
bool qkv_bias;
bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
qk_prod_scaling, position_bias;
int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize;
int qoSeqLength, kvSeqLength;
@@ -154,11 +154,11 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
int _vProjSize,
int _oProjSize,
bool _apply_rotary_embedding,
bool _bias,
bool _qkv_bias,
bool _scaling_query,
bool _qk_prod_scaling,
bool _position_bias,
bool _add_bias_kv,
bool _final_bias,
float _scaling_factor,
GenericTensorAccessorR const &weight,
MemoryAllocator &gpu_mem_allocator,
@@ -179,7 +179,8 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads;
bool *has_load_weights;
bool *apply_rotary_embedding;
bool *bias;
bool *qkv_bias;
bool *final_bias;
bool *scaling_query;
bool *qk_prod_scaling;
bool *position_bias;
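
The renames above split the old bias / add_bias_kv pair into qkv_bias and final_bias. Judging purely from the identifiers, qkv_bias gates the bias on the fused query/key/value projection and final_bias gates the bias on the output projection; that reading is an inference from the names, not something the diff states. A schematic sketch of the two application points:

#include <cstddef>
#include <vector>

// Schematic only: applies a broadcast bias when the corresponding flag is set.
std::vector<float> apply_bias_if(std::vector<float> x,
                                 std::vector<float> const &bias,
                                 bool enabled) {
  if (enabled) {
    for (std::size_t i = 0; i < x.size(); i++) {
      x[i] += bias[i % bias.size()];
    }
  }
  return x;
}

// Intended placement (projections abbreviated; the real path is fused GPU GEMMs):
//   qkv    = apply_bias_if(x    * W_qkv, b_qkv, qkv_bias);   // query/key/value bias
//   output = apply_bias_if(attn * W_o,   b_o,   final_bias); // output-projection bias
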
(Diffs for the remaining changed files are not shown in this view.)