Merge branch 'bert_fix1' into xinhao_inference
xinhaoc authored May 17, 2024
2 parents 024d188 + 355d4b4 commit 0f8b5f2
Showing 38 changed files with 2,374 additions and 134 deletions.
1 change: 1 addition & 0 deletions examples/python/pytorch/mt5/mt5_ff.py
@@ -122,6 +122,7 @@ def top_level_task():
input_names = ["input_ids", "attention_mask"]

print("Tracing the model...")
print(batch_size)
hf_model = PyTorchModel(
model, is_hf_model=True, input_names=input_names,
batch_size=batch_size, seq_length=seq_length,
3 changes: 3 additions & 0 deletions include/flexflow/config.h
@@ -147,6 +147,8 @@ class FFConfig {
// Legion::FieldSpace field_space;
bool benchmarking, profiling, perform_fusion;
bool inference_debugging;
Legion::FieldSpace field_space;
bool syntheticInput;
size_t simulator_work_space_size;
size_t search_budget;
float search_alpha;
@@ -165,6 +167,7 @@
int data_parallelism_degree;
int tensor_parallelism_degree;
int pipeline_parallelism_degree;

// Control Tensor Op Math Conversion
bool allow_tensor_op_math_conversion;
std::string dataset_path;
2 changes: 2 additions & 0 deletions include/flexflow/flexflow_c.h
@@ -122,6 +122,8 @@ void flexflow_model_compute_metrics(flexflow_model_t handle);

void flexflow_model_update(flexflow_model_t handle);

void flexflow_model_unified_update(flexflow_model_t handle);

void flexflow_model_compile(flexflow_model_t handle,
enum LossType loss_type,
int *metrics,
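Note (not part of this commit): a minimal sketch of how the new `flexflow_model_unified_update` entry point above might be used in place of the existing per-parameter update. The wrapper function and its `use_unified_update` flag are illustrative assumptions.

```cpp
#include "flexflow/flexflow_c.h"

// Illustrative helper (assumed, not from this commit): choose between the
// existing per-parameter update and the new unified update path.
void apply_optimizer_step(flexflow_model_t model, bool use_unified_update) {
  if (use_unified_update) {
    // Added by this commit: one fused optimizer pass over all trainable weights.
    flexflow_model_unified_update(model);
  } else {
    // Existing path: per-parameter updates.
    flexflow_model_update(model);
  }
}
```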
11 changes: 11 additions & 0 deletions include/flexflow/model.h
@@ -193,6 +193,7 @@ enum TaskIDs {
// Optimizer with NCCL
SGD_UPD_NCCL_TASK_ID,
ADAM_UPD_NCCL_TASK_ID,
ADAM_UNIFY_UPD_NCCL_TASK_ID,
// Initializer
GLOROT_INIT_TASK_ID,
ZERO_INIT_TASK_ID,
@@ -344,6 +345,7 @@ class SpecIncMultiHeadSelfAttention;
class Sampling;
class ArgMax;
class Combine;
class AllReduce;
class Repartition;
class Reduction;
class Replicate;
@@ -1050,6 +1052,10 @@ class FFModel {
std::vector<Op *> const &old_operators,
std::unordered_map<ParallelTensor, std::vector<ParallelTensor>>
*pt_mapping = nullptr);
void unified_update();
bool apply_fusion(std::vector<Op *> const &operators,
std::vector<Op *> &new_operators);

Op *get_final_operator() const;
void compile(LossType loss_type,
std::vector<MetricsType> const &metrics,
@@ -1102,6 +1108,8 @@
Legion::IndexSpace get_task_is(Legion::Domain const &domain) const;
Legion::IndexSpace get_task_is(ParallelConfig const &pc) const;
Legion::IndexSpace get_task_is(MachineView const &view) const;

bool is_transformer_block(int layer_idx) const;
bool is_mlp_block(int layer_idx) const;
void create_operators_from_layers();
Op *create_operator_from_layer(Layer *layer,
@@ -1132,6 +1140,7 @@
int metrics_input;
ParallelTensor parallel_label_tensor;
Tensor label_tensor;
int num_inputs = 0;

std::vector<Layer *> layers;
std::vector<Op *> operators;
@@ -1240,6 +1249,8 @@
Replicate *>,
std::unordered_map<std::pair<ParallelTensorShape, ReductionParams>,
Reduction *>,
std::unordered_map<std::pair<ParallelTensorShape, AllReduceParams>,
AllReduce *>,
std::unordered_map<std::pair<ParallelTensorShape, CombineParams>,
Combine *>,
std::unordered_map<std::pair<ParallelTensorShape, AllReduceParams>,
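Note (illustrative only): one plausible shape for the new `FFModel::unified_update()` declared above, assuming it forwards the full parameter list to `Optimizer::unified_update`; the actual definition in this commit's source files may differ.

```cpp
#include "flexflow/model.h"

namespace FlexFlow {

// Sketch assuming FFModel keeps its trainable weights in `parameters` and its
// optimizer in `optimizer`; the body is a guess at the intent, not the
// committed code.
void FFModel::unified_update() {
  std::vector<ParallelTensor> trainable(parameters.begin(), parameters.end());
  // One call covering every weight, instead of one Optimizer::update per tensor.
  optimizer->unified_update(trainable);
}

} // namespace FlexFlow
```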
1 change: 1 addition & 0 deletions include/flexflow/operator_params.h
@@ -11,6 +11,7 @@
#include "flexflow/ops/beam_topk_params.h"
#include "flexflow/ops/cast_params.h"
#include "flexflow/ops/concat_params.h"
#include "flexflow/parallel_ops/allreduce_params.h"
#include "flexflow/ops/conv_2d_params.h"
#include "flexflow/ops/dropout_params.h"
#include "flexflow/ops/element_binary_params.h"
7 changes: 7 additions & 0 deletions include/flexflow/ops/dropout.h
@@ -5,6 +5,13 @@
#include "flexflow/node.h"
#include "flexflow/operator.h"
#include "flexflow/ops/dropout_params.h"
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
#include <curand.h>
#include <curand_kernel.h>
#elif defined(FF_USE_HIP_ROCM)
#include <hiprand/hiprand.h>
#include <hiprand/hiprand_kernel.h>
#endif

namespace FlexFlow {

1 change: 1 addition & 0 deletions include/flexflow/ops/element_binary_params.h
@@ -13,6 +13,7 @@ struct ElementBinaryParams {
bool inplace_a;
char name[MAX_OPNAME];


bool is_valid(
std::pair<ParallelTensorShape, ParallelTensorShape> const &) const;
};
16 changes: 12 additions & 4 deletions include/flexflow/ops/kernels/dropout_kernels.h
@@ -5,6 +5,7 @@
#include "flexflow/fftype.h"
#include "flexflow/op_meta.h"
#include "flexflow/ops/dropout.h"
#include "flexflow/accessor.h"

namespace FlexFlow {

@@ -17,33 +18,40 @@ class DropoutMeta : public OpMeta {
~DropoutMeta(void);
Realm::RegionInstance reserveInst;
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
curandState *state;
cudnnTensorDescriptor_t inputTensor, outputTensor;
cudnnDropoutDescriptor_t dropoutDesc;
#else
miopenTensorDescriptor_t inputTensor, outputTensor;
miopenDropoutDescriptor_t dropoutDesc;
hiprandState *state;
#endif
void *reserveSpace, *dropoutStates;
size_t reserveSpaceSize, dropoutStateSize;
size_t num_elements;
long long seed;
float rate;
};

namespace Kernels {
namespace Dropout {
void forward_kernel_wrapper(DropoutMeta *m,
float const *input_ptr,
float *output_ptr);
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output);
void backward_kernel_wrapper(DropoutMeta *m,
float const *output_grad_ptr,
float *input_grad_ptr);
GenericTensorAccessorR const &output_grad,
GenericTensorAccessorW const &input_grad);

namespace Internal {
void forward_kernel(DropoutMeta *m,
float const *input_ptr,
float *output_ptr,
size_t num_elements,
ffStream_t stream);
void backward_kernel(DropoutMeta *m,
float const *output_grad_ptr,
float *input_grad_ptr,
size_t num_elements,
ffStream_t stream);
} // namespace Internal
} // namespace Dropout
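Note (illustrative, not committed code): with the wrapper signatures above now taking `GenericTensorAccessor` objects instead of raw float pointers, a task body obtains accessors and passes them through. The `helperGetGenericTensorAccessor*` helpers and the `m->input_type` / `m->output_type` fields are assumed to match FlexFlow's existing accessor and OpMeta utilities; argument order may differ.

```cpp
#include "flexflow/accessor.h"
#include "flexflow/ops/kernels/dropout_kernels.h"

namespace FlexFlow {

// Sketch of a forward task body calling the updated wrapper.
void dropout_forward_task_sketch(DropoutMeta *m,
                                 Legion::Task const *task,
                                 std::vector<Legion::PhysicalRegion> const &regions,
                                 Legion::Context ctx,
                                 Legion::Runtime *runtime) {
  GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
      m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
  GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
      m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);
  // Previously: forward_kernel_wrapper(m, input_ptr, output_ptr);
  Kernels::Dropout::forward_kernel_wrapper(m, input, output);
}

} // namespace FlexFlow
```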
23 changes: 23 additions & 0 deletions include/flexflow/optimizer.h
@@ -18,6 +18,7 @@

#include "flexflow/parallel_tensor.h"
#include "legion.h"
#include "accessor.h"

namespace FlexFlow {

@@ -30,6 +31,7 @@ class Optimizer {
virtual void init(void) = 0;
virtual void next(void) = 0;
virtual void update(const ParallelTensor p) = 0;
virtual void unified_update(std::vector<ParallelTensor> const parameters) = 0;
FFModel const *model;
};

@@ -43,6 +45,7 @@ class SGDOptimizer : public Optimizer {
void init(void);
void next(void);
void update(const ParallelTensor p);
void unified_update(std::vector<ParallelTensor> const parameters);
void set_weight_decay(double _weight_decay);
static void ps_update_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
@@ -60,6 +63,11 @@
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void
nccl_unified_update_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void nccl_update_task_gpu(SGDOptimizer const *op,
OpMeta const *meta,
float const *w_grad_ptr,
@@ -85,6 +93,7 @@ class AdamOptimizer : public Optimizer {
void init(void);
void next(void);
void update(const ParallelTensor p);
void unified_update(std::vector<ParallelTensor> const parameters);
void set_weight_decay(double _weight_decay);
static void ps_update_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
@@ -103,17 +112,31 @@
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void
nccl_unified_update_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void nccl_update_task_gpu(AdamOptimizer const *op,
OpMeta const *meta,
float const *w_grad_ptr,
size_t size,
float *w_ptr,
float *v_ptr,
float *m_ptr);
static void nccl_unified_update_task_gpu(AdamOptimizer const *op,
OpMeta const *meta,
GenericTensorAccessorR *accWGrads,
size_t *size,
GenericTensorAccessorW *accWs,
GenericTensorAccessorW *accVs,
GenericTensorAccessorW *accMs);
#endif
double alpha, beta1, beta2, weight_decay, epsilon;
double alpha_t, beta1_t, beta2_t;
std::map<Legion::LogicalRegion, ParallelTensor> v_values, m_values;
size_t reservedWorkSpaceSize = 0;
int parameters_num = 0;
};

}; // namespace FlexFlow
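Note (reference math, not the GPU kernel): the unified NCCL update task declared above batches every parameter into a single task launch, but the per-element Adam step it applies should match the existing per-tensor path. A scalar C++ sketch of that step, assuming FlexFlow's usual L2-style weight decay folded into the gradient:

```cpp
#include <cmath>

// One Adam step for a single weight element; m and v are the first- and
// second-moment estimates, alpha_t is the bias-corrected learning rate.
inline void adam_step_sketch(float grad, float &m, float &v, float &w,
                             float alpha_t, float beta1, float beta2,
                             float weight_decay, float epsilon) {
  float g = grad + weight_decay * w;            // L2 weight decay folded into the gradient (assumed)
  m = beta1 * m + (1.0f - beta1) * g;           // first-moment update
  v = beta2 * v + (1.0f - beta2) * g * g;       // second-moment update
  w -= alpha_t * m / (std::sqrt(v) + epsilon);  // parameter update
}
```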
1 change: 1 addition & 0 deletions include/flexflow/parallel_ops/allreduce.h
@@ -51,6 +51,7 @@ class AllReduce : public ParallelOp {
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

static void forward_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
1 change: 1 addition & 0 deletions include/flexflow/parallel_ops/kernels/allreduce_kernels.h
@@ -22,6 +22,7 @@ void inference_kernel_wrapper(AllReduceMeta const *m,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output);


void forward_kernel_wrapper(AllReduceMeta const *m,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output);
3 changes: 3 additions & 0 deletions include/flexflow/utils/cuda_helper.h
@@ -87,10 +87,13 @@ template <typename DT>
__global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size);

template <typename DT>

__global__ void copy_kernel_discrete(DT *dst,
const DT *src,
Legion::coord_t size,
size_t *index);
__global__ void copy_kernel_with_replicate(DT *dst, const DT *src, Legion::coord_t origin_size, Legion::coord_t size);


template <typename T>
__global__ void add_kernel(T *data_ptr, T const *grad_ptr, size_t size);
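Note (a guess at the semantics, since only the declaration appears in this diff): `copy_kernel_with_replicate` plausibly fills a destination of `size` elements by cycling through a source of `origin_size` elements. A CUDA-style sketch, illustrative only:

```cpp
#include "legion.h"

// Illustrative kernel body; the committed implementation may differ.
template <typename DT>
__global__ void copy_kernel_with_replicate_sketch(DT *dst,
                                                  DT const *src,
                                                  Legion::coord_t origin_size,
                                                  Legion::coord_t size) {
  for (Legion::coord_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
       i += (Legion::coord_t)gridDim.x * blockDim.x) {
    dst[i] = src[i % origin_size];  // wrap around the smaller source buffer
  }
}
```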