diff --git a/cmake/json.cmake b/cmake/json.cmake index 63ac50b203..3cf57a7864 100644 --- a/cmake/json.cmake +++ b/cmake/json.cmake @@ -1,4 +1 @@ -include(FetchContent) - -FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz) -FetchContent_MakeAvailable(json) \ No newline at end of file +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/json) diff --git a/config/config.linux b/config/config.linux index acffc210f5..a4b903ef15 100755 --- a/config/config.linux +++ b/config/config.linux @@ -59,7 +59,7 @@ FF_USE_PYTHON=${FF_USE_PYTHON:-ON} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-} # select GASNET conduit -FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} +FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ofi} # set UCX dir if Legion networks is set to ucx UCX_DIR=${UCX_DIR:-""} @@ -102,8 +102,6 @@ if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm" ]]; then # enable NCCL FF_USE_NCCL=${FF_USE_NCCL:-ON} -else - FF_USE_NCCL=OFF fi function get_build_configs() { diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index 41b84a269e..b1dc442dd1 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -3,16 +3,18 @@ import sys import numpy as np +import torch from flexflow.core import * +import flexflow.core as ff from flexflow.torch.model import PyTorchModel -from transformers import MT5ForConditionalGeneration, T5Tokenizer - +#from transformers import MT5ForConditionalGeneration, T5Tokenizer +from transformers import BertForMaskedLM, BertTokenizer, BertConfig sys.path.append("./examples/python/pytorch/mt5") from mt5_torch import DataPreparer, get_dataloaders, set_seed BASE_DIR = "examples/python/pytorch/mt5" DATA_DIR = os.path.join(BASE_DIR, "data") -NUMPY_DIR = os.path.join(DATA_DIR, "numpy") +NUMPY_DIR = os.path.join(DATA_DIR, "numpy_candle") def data_to_numpy() -> None: @@ -28,7 +30,8 @@ def data_to_numpy() -> None: """ model_params = { "SEED": 42, - "MODEL": "google/mt5-small", + #"MODEL": "google/mt5-small", + "MODEL": "bert-base-uncased", "TRAIN_BATCH_SIZE": None, # use the full dataset as one batch "EVAL_BATCH_SIZE": None, # use the full dataset as one batch "TRAIN_EPOCHS": 1, # unused @@ -36,7 +39,8 @@ def data_to_numpy() -> None: "MAX_TARGET_TEXT_LENGTH": 48, } set_seed(model_params) - tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"]) + #tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"]) + tokenizer = BertTokenizer.from_pretrained(model_params["MODEL"]) print("Getting dataloaders...") train_loader, eval_loader = get_dataloaders(tokenizer, model_params) assert len(train_loader) == 1 @@ -61,8 +65,8 @@ def preprocess_train() -> None: y_shape = y.shape assert len(y.shape) == 2, \ "`y` should have shape (num examples, sequence length)" - y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) - lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) + y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) + lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) y_ids[:, :] = y[:, :-1] lm_labels[:, :] = y[:, 1:] @@ -81,36 +85,60 @@ def preprocess_train() -> None: def top_level_task(): ffconfig = FFConfig() ffmodel = FFModel(ffconfig) - model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") - + #model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") + # config = 
BertConfig.from_pretrained('bert-base-uncased') + + # # Modify the configuration to set a different number of layers + # config.num_hidden_layers = 1 # Set the number of layers you want + # model = BertForMaskedLM.from_pretrained("bert-base-uncased", config=config) + # model.num_layers = 1 + model = BertForMaskedLM.from_pretrained("bert-base-uncased") + #model = BertModel.from_pretrained("bert-base-uncased") # Load train data as numpy arrays print("Loading data...") - ids = np.load(os.path.join(NUMPY_DIR, "train_source_ids.npy")) - mask = np.load(os.path.join(NUMPY_DIR, "train_source_mask.npy")) - y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) - lm_labels = np.load(os.path.join(NUMPY_DIR, "train_lm_labels.npy")) + ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")).astype('int32') + ids = np.pad(ids, ((0,0), (0,17)), 'constant') + #ids = np.random.randint(0, 5, (1000, 512)) + #print('ids_shape', ids.shape) + #print('ids', ids) + mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")).astype('int32') + mask = np.pad(mask, ((0,0), (0,17)), 'constant') + #mask = np.random.randint(0, 2, (1000, 512)) + #y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) + lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")).astype('int32') + lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant') + #lm_labels = np.random.randint(-1, 5, (1000, 512)) + position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() + token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() + batch_size = ffconfig.batch_size input_ids_shape = (batch_size, ids.shape[1]) attention_mask_shape = (batch_size, mask.shape[1]) - decoder_input_ids_shape = (batch_size, y_ids.shape[1]) + #decoder_input_ids_shape = (batch_size, y_ids.shape[1]) input_tensors = [ - ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids - ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask - ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids + ffmodel.create_tensor(input_ids_shape, DataType.DT_INT32), # input_ids + ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT32), # attention_mask + #ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids ] encoder_seq_length = ids.shape[1] - decoder_seq_length = y_ids.shape[1] - seq_length = (encoder_seq_length, decoder_seq_length) - input_names = ["input_ids", "attention_mask", "decoder_input_ids"] + #decoder_seq_length = y_ids.shape[1] + #seq_length = (encoder_seq_length, decoder_seq_length) + seq_length = encoder_seq_length + #input_names = ["input_ids", "attention_mask", "decoder_input_ids"] + input_names = ["input_ids", "attention_mask"] print("Tracing the model...") + print(batch_size) hf_model = PyTorchModel( model, is_hf_model=True, input_names=input_names, batch_size=batch_size, seq_length=seq_length, ) output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True) - ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) + #from flexflow.torch.model import file_to_ff + #file_to_ff("mt5.ff", ffmodel, input_tensors) + ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, beta1=0.9, beta2=0.98, weight_decay=0.0, epsilon=2e-8) + # ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) print("Compiling the model...") ffmodel.compile( @@ -121,13 +149,21 @@ def top_level_task(): MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, ], ) + + # load weights here + ffmodel.load_bert_pretrained(checkpoint=model) 
print("Creating data loaders...") + print('id_dtype', ids.dtype) + print('mask_dtype', mask.dtype) + print('labels_dtype', lm_labels.dtype) input_ids_dl = ffmodel.create_data_loader(input_tensors[0], ids) attention_mask_dl = ffmodel.create_data_loader(input_tensors[1], mask) - decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) + #decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) # NOTE: We cast down the label tensor data to 32-bit to accommodate the # label tensor's required dtype + token_type_ids_dl = ffmodel.create_data_loader(input_tensors[2], token_type_ids) + position_id_dl = ffmodel.create_data_loader(input_tensors[3], position_id) labels_dl = ffmodel.create_data_loader( ffmodel.label_tensor, lm_labels.astype("int32") ) @@ -138,31 +174,34 @@ def top_level_task(): print("Training...") epochs = ffconfig.epochs ffmodel.fit( - x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], + #x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], + x=[input_ids_dl, attention_mask_dl, position_id_dl, token_type_ids_dl], y=labels_dl, batch_size=batch_size, epochs=epochs, ) if __name__ == "__main__": - # Generate the .tsv files if needed - if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \ - not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")): - DataPreparer.data_to_tsv() - # Convert the .tsv files to .npy if needed - if not os.path.exists(NUMPY_DIR): - os.mkdir(NUMPY_DIR) - prefixes = ["train_", "eval_"] - suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"] - npy_filenames = [ - pre + suf for pre, suf in itertools.product(prefixes, suffixes) - ] - if any( - not os.path.exists(os.path.join(NUMPY_DIR, filename)) - for filename in npy_filenames - ): - data_to_numpy() - # Preprocess the training data if needed - if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \ - not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")): - preprocess_train() + ## Generate the .tsv files if needed + #if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \ + # not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")): + # DataPreparer.data_to_tsv() + ## Convert the .tsv files to .npy if needed + #if not os.path.exists(NUMPY_DIR): + # os.mkdir(NUMPY_DIR) + #prefixes = ["train_", "eval_"] + #suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"] + #npy_filenames = [ + # pre + suf for pre, suf in itertools.product(prefixes, suffixes) + #] + #if any( + # not os.path.exists(os.path.join(NUMPY_DIR, filename)) + # for filename in npy_filenames + #): + # data_to_numpy() + ## Preprocess the training data if needed + #if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \ + # not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")): + # preprocess_train() + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() diff --git a/examples/python/pytorch/mt5/mt5_torch.py b/examples/python/pytorch/mt5/mt5_torch.py index 78886eed6c..4d741c44a5 100644 --- a/examples/python/pytorch/mt5/mt5_torch.py +++ b/examples/python/pytorch/mt5/mt5_torch.py @@ -7,7 +7,7 @@ import os import numpy as np -import pandas as pd +#import pandas as pd import torch from torch.utils.data import DataLoader, Dataset from transformers import MT5ForConditionalGeneration, T5Tokenizer @@ -311,5 +311,5 @@ def TorchMT5Trainer( "MAX_TARGET_TEXT_LENGTH": 48, "LEARNING_RATE": 1e-4, } - device = torch.device(0) + device = torch.device('cpu') TorchMT5Trainer(model_params, device) diff --git 
a/gdb/pretty_print.py b/gdb/pretty_print.py index 4cccc9b76b..e6fbe298ce 100644 --- a/gdb/pretty_print.py +++ b/gdb/pretty_print.py @@ -61,7 +61,11 @@ def to_string(self): size = dim['size'] degree = dim['degree'] parallel_idx = dim['parallel_idx'] - toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]') + if dim['is_replica_dim']: + is_replica = 'r=t' + else: + is_replica = 'r=f' + toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx} {is_replica}]') return f'TensorShape<{" ".join(toks)}>' class ParallelTensorBasePrinter: @@ -77,9 +81,31 @@ def to_string(self): size = dim['size'] degree = dim['degree'] parallel_idx = dim['parallel_idx'] - toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]') + tok = f'{i}=[s={size} d={degree} pi={parallel_idx} ' + if dim['is_replica_dim']: + tok += 'r=t' + else: + tok += 'r=f' + tok += ']' + toks.append(tok) return f'ParallelTensorBase<{" ".join(toks)}>' +class ParallelDimPrinter: + def __init__(self, val): + self.val = val + + def to_string(self): + size = self.val['size'] + degree = self.val['degree'] + parallel_idx = self.val['parallel_idx'] + tok = f's={size} d={degree} pi={parallel_idx} ' + if self.val['is_replica_dim']: + tok += 'r=t' + else: + tok += 'r=f' + return f'ParallelDim<{tok}>' + + def build_pretty_printer(): pp = gdb.printing.RegexpCollectionPrettyPrinter( "flexflow") @@ -89,6 +115,7 @@ def build_pretty_printer(): pp.add_printer('Domain', '^Legion::Domain$', DomainPrinter) pp.add_printer('ParallelTensorShape', '^FlexFlow::ParallelTensorShape$', TensorShapePrinter) pp.add_printer('ParallelTensorBase', '^FlexFlow::ParallelTensorBase$', ParallelTensorBasePrinter) + pp.add_printer('ParallelDim', '^FlexFlow::ParallelDim$', ParallelDimPrinter) return pp gdb.printing.register_pretty_printer( diff --git a/ichanges.txt b/ichanges.txt new file mode 100644 index 0000000000..aa0912640b --- /dev/null +++ b/ichanges.txt @@ -0,0 +1,5 @@ +changes: +cudnnSetTensorDescriptorFromDomain4SoftMax +try_one_lambda in graph.cc + +field_space = runtime->create_field_space(lg_ctx) in model.cc \ No newline at end of file diff --git a/include/flexflow/config.h b/include/flexflow/config.h index dd9d657117..2f6d22dd6f 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -165,8 +165,10 @@ class FFConfig { Legion::Context lg_ctx; Legion::Runtime *lg_hlr; Legion::IndexSpaceT<1> all_gpu_task_is; + Legion::FieldSpace field_space; + bool syntheticInput, profiling, perform_fusion; // Legion::FieldSpace field_space; - bool benchmarking, profiling, perform_fusion; + bool benchmarking; bool inference_debugging; size_t simulator_work_space_size; size_t search_budget; @@ -227,4 +229,4 @@ enum FieldIDs { }; // namespace FlexFlow -#endif //_FLEXFLOW_CONFIG_H_ +#endif //_FLEXFLOW_CONFIG_H_ \ No newline at end of file diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 52b4b3d362..fbb98d090e 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -127,6 +127,8 @@ void flexflow_model_compute_metrics(flexflow_model_t handle); void flexflow_model_update(flexflow_model_t handle); +void flexflow_model_unified_update(flexflow_model_t handle); + void flexflow_model_compile(flexflow_model_t handle, enum LossType loss_type, int *metrics, @@ -344,6 +346,7 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle, const flexflow_tensor_t input, int dim, + bool last_layer, char const *name); flexflow_tensor_t 
flexflow_model_add_transpose(flexflow_model_t handle, diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h index 9dc6572593..d441adef17 100644 --- a/include/flexflow/graph.h +++ b/include/flexflow/graph.h @@ -91,9 +91,9 @@ struct NodeCompare { struct GraphOptimalViewSerialized { #ifdef LEGION_MAX_RETURN_SIZE - static const size_t buffer_size = LEGION_MAX_RETURN_SIZE - 8; + static size_t const buffer_size = 4 * LEGION_MAX_RETURN_SIZE - 8; #else - static const size_t buffer_size = 1024 * 1024 - 8; + static size_t const buffer_size = 1024 * 1024 - 8; #endif size_t total_bytes; char data[buffer_size]; @@ -279,7 +279,7 @@ class SearchHelper { mutable std::unordered_map cached_graph_costs; mutable std::unordered_map>> + std::unique_ptr const>> cached_operator_valid_views; }; @@ -332,6 +332,8 @@ class Graph { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + // static GraphOptimalViewSerialized + // graph_optimize_wrapper(FFModel * model); Node find_bottleneck_node(Node const &sink_node, Node const &source_node) const; void print_strategy_computation_graph( diff --git a/include/flexflow/initializer.h b/include/flexflow/initializer.h index 062530a655..3c44d1184a 100644 --- a/include/flexflow/initializer.h +++ b/include/flexflow/initializer.h @@ -46,7 +46,7 @@ class GlorotUniform : public Initializer { class Op; struct ZeroInitMeta { - static int const MAX_NUM_REGIONS = 64; + static int const MAX_NUM_REGIONS = 128; int num_regions; Op *op_ptr; DataType data_types[MAX_NUM_REGIONS]; diff --git a/include/flexflow/machine_view.h b/include/flexflow/machine_view.h index 807b0c9c0d..76cc05d8f5 100644 --- a/include/flexflow/machine_view.h +++ b/include/flexflow/machine_view.h @@ -16,7 +16,7 @@ namespace FlexFlow { class FFConfig; struct MachineView { - static const MachineView NO_VIEW; + static MachineView const NO_VIEW; MachineView(); int get_device_id(Legion::DomainPoint const &p) const; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 4ad735ef7d..46c6282a65 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -213,6 +213,7 @@ enum TaskIDs { // Optimizer with NCCL SGD_UPD_NCCL_TASK_ID, ADAM_UPD_NCCL_TASK_ID, + ADAM_UNIFY_UPD_NCCL_TASK_ID, // Initializer GLOROT_INIT_TASK_ID, ZERO_INIT_TASK_ID, @@ -374,6 +375,7 @@ class SpecIncMultiHeadSelfAttention; class Sampling; class ArgMax; class Combine; +class AllReduce; class Repartition; class Reduction; class Replicate; @@ -440,74 +442,74 @@ class FFModel { bool cpu_offload; // C++ APIs for constructing models // Add an exp layer - Tensor exp(const Tensor x, char const *name = NULL); + Tensor exp(Tensor const x, char const *name = NULL); // Add an add layer - Tensor add(const Tensor x, - const Tensor y, + Tensor add(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a subtract layer - Tensor subtract(const Tensor x, - const Tensor y, + Tensor subtract(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a multiply layer - Tensor multiply(const Tensor x, - const Tensor y, + Tensor multiply(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a divide layer - Tensor divide(const Tensor x, - const Tensor y, + Tensor divide(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a max layer - Tensor max(const Tensor x, - const Tensor y, + Tensor max(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a 
min layer - Tensor min(const Tensor x, - const Tensor y, + Tensor min(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a rsqrt layer - Tensor rsqrt(const Tensor x, bool inplace = true, char const *name = NULL); + Tensor rsqrt(Tensor const x, bool inplace = true, char const *name = NULL); // Add a pow layer - Tensor pow(const Tensor x, + Tensor pow(Tensor const x, float const exponent, bool inplace = true, char const *name = NULL); // Add a scalar multiply layer - Tensor scalar_multiply(const Tensor x, + Tensor scalar_multiply(Tensor const x, float const scalar, bool inplace = true, char const *name = NULL); - Tensor scalar_add(const Tensor x, + Tensor scalar_add(Tensor const x, float const scalar, bool inplace = true, char const *name = NULL); - Tensor scalar_sub(const Tensor x, + Tensor scalar_sub(Tensor const x, float const scalar, bool inplace = true, char const *name = NULL); - Tensor scalar_truediv(const Tensor x, + Tensor scalar_truediv(Tensor const x, float const scalar, bool inplace = true, char const *name = NULL); // Add a sin layer - Tensor sin(const Tensor x, char const *name = NULL); + Tensor sin(Tensor const x, char const *name = NULL); // Add a cos layer - Tensor cos(const Tensor x, char const *name = NULL); + Tensor cos(Tensor const x, char const *name = NULL); // Add an activation layer - Tensor relu(const Tensor x, bool inplace = true, char const *name = NULL); - Tensor identity(const Tensor x, char const *name = NULL); - Tensor gelu(const Tensor x, char const *name = NULL); - Tensor sigmoid(const Tensor x, char const *name = NULL); - Tensor tanh(const Tensor x, char const *name = NULL); - Tensor elu(const Tensor x, bool inplace = true, char const *name = NULL); + Tensor relu(Tensor const x, bool inplace = true, char const *name = NULL); + Tensor identity(Tensor const x, char const *name = NULL); + Tensor gelu(Tensor const x, char const *name = NULL); + Tensor sigmoid(Tensor const x, char const *name = NULL); + Tensor tanh(Tensor const x, char const *name = NULL); + Tensor elu(Tensor const x, bool inplace = true, char const *name = NULL); // Add a 2D convolutional layer - Tensor conv2d(const Tensor input, + Tensor conv2d(Tensor const input, int outChannels, int kernelH, int kernelW, @@ -523,12 +525,12 @@ class FFModel { Initializer *bias_initializer = NULL, char const *name = NULL); // Add a dropout layer - Tensor dropout(const Tensor input, + Tensor dropout(Tensor const input, float rate, unsigned long long seed = 0, char const *name = NULL); // Add an embedding layer - Tensor embedding(const Tensor input, + Tensor embedding(Tensor const input, int num_entries, int outDim, AggrMode aggr, @@ -537,13 +539,13 @@ class FFModel { Initializer *kernel_initializer = NULL, char const *name = NULL); // Add a gather layer - Tensor gather(const Tensor input, - const Tensor index, + Tensor gather(Tensor const input, + Tensor const index, int dim, char const *name = NULL); // Add a group_by layer - void group_by(const Tensor data, - const Tensor assign, + void group_by(Tensor const data, + Tensor const assign, Tensor *outputs, int n, float alpha, @@ -565,7 +567,7 @@ class FFModel { float lambda_bal, char const *name = NULL); // Add a 2D pooling layer - Tensor pool2d(const Tensor input, + Tensor pool2d(Tensor const input, int kernelH, int kernelW, int strideH, @@ -576,7 +578,7 @@ class FFModel { ActiMode activation = AC_MODE_NONE, char const *name = NULL); // Add a layer_norm layer - Tensor layer_norm(const Tensor input, + Tensor 
layer_norm(Tensor const input, std::vector const &axes, bool elementwise_affine, float eps, @@ -584,9 +586,9 @@ class FFModel { DataType data_type = DT_NONE, char const *name = NULL); // Add a layer_norm layer with residual(s) - void residual_layer_norm(const Tensor input, - const Tensor residual1, - const Tensor residual2, + void residual_layer_norm(Tensor const input, + Tensor const residual1, + Tensor const residual2, Tensor *outputs, bool use_two_residuals, std::vector const &axes, @@ -597,8 +599,8 @@ class FFModel { DataType data_type = DT_NONE, char const *name = NULL); // Add a add_bias_residual_layer_norm layer - void add_bias_residual_layer_norm(const Tensor input, - const Tensor residual, + void add_bias_residual_layer_norm(Tensor const input, + Tensor const residual, Tensor *outputs, std::vector const &axes, bool elementwise_affine, @@ -608,28 +610,28 @@ class FFModel { DataType data_type = DT_NONE, char const *name = NULL); // Add a sigmoid_silu_multi layer - Tensor sigmoid_silu_multi(const Tensor input1, - const Tensor input2, + Tensor sigmoid_silu_multi(Tensor const input1, + Tensor const input2, DataType data_type = DT_NONE, char const *name = NULL); // Add a batch_norm layer Tensor - batch_norm(const Tensor input, bool relu = true, char const *name = NULL); + batch_norm(Tensor const input, bool relu = true, char const *name = NULL); // Add a batch_matmul layer - Tensor batch_matmul(const Tensor A, - const Tensor B, + Tensor batch_matmul(Tensor const A, + Tensor const B, int a_seq_length_dim = -1, int b_seq_length_dim = -1, char const *name = nullptr); // Add a root mean square layer - Tensor rms_norm(const Tensor input, + Tensor rms_norm(Tensor const input, float eps, int dim, DataType data_type = DT_NONE, char const *name = NULL); // Add a residual root mean square layer - void residual_rms_norm(const Tensor input1, - const Tensor input2, + void residual_rms_norm(Tensor const input1, + Tensor const input2, Tensor *outputs, float eps, int dim, @@ -637,13 +639,13 @@ class FFModel { DataType data_type = DT_NONE, char const *name = NULL); // Add a beam search top k layer - Tensor beam_top_k(const Tensor input, + Tensor beam_top_k(Tensor const input, int max_beam_size, bool sorted, char const *name = NULL); // Add a dense layer - Tensor dense(const Tensor input, + Tensor dense(Tensor const input, int outDim, ActiMode activation = AC_MODE_NONE, bool use_bias = true, @@ -655,7 +657,7 @@ class FFModel { float regularizer_lambda = 0.0, char const *name = NULL); // Add a cast layer - Tensor cast(const Tensor input, DataType dtype, char const *name = nullptr); + Tensor cast(Tensor const input, DataType dtype, char const *name = nullptr); // Add a concat layer Tensor concat(int n, Tensor const *tensors, int axis, char const *name = NULL); @@ -670,58 +672,59 @@ class FFModel { int experts_internal_dim_size = 0, // hidden dimension for internal layers char const *name = NULL); // Add a mean layer - Tensor mean(const Tensor input, + Tensor mean(Tensor const input, std::vector const &dims, bool keepdims, char const *name); // Add a moe layer (wrapping topk, group_by and aggregate operators) - Tensor moe(const Tensor input, + Tensor moe(Tensor const input, int num_exp, int num_select, int expert_hidden_size, float alpha, float lambda); // Add a split layer - void split(const Tensor input, + void split(Tensor const input, Tensor *outputs, std::vector const &split, int axis, char const *name = NULL); // Add a flat layer - Tensor flat(const Tensor input, char const *name = NULL); + Tensor 
flat(Tensor const input, char const *name = NULL); // Add a softmax layer - Tensor softmax(const Tensor input, + Tensor softmax(Tensor const input, int dim = -1, + bool last_layer = false, DataType data_type = DT_NONE, char const *name = NULL); // Create input tensors and constants - Tensor transpose(const Tensor input, + Tensor transpose(Tensor const input, std::vector const &perm, char const *name = NULL); - Tensor reduce_sum(const Tensor input, + Tensor reduce_sum(Tensor const input, std::vector const &axes, bool keepdims = false, char const *name = nullptr); - Tensor reshape(const Tensor input, + Tensor reshape(Tensor const input, std::vector const &shape, char const *name = NULL); - Tensor reverse(const Tensor input, int axis, char const *name = NULL); - void top_k(const Tensor input, + Tensor reverse(Tensor const input, int axis, char const *name = NULL); + void top_k(Tensor const input, Tensor *outputs, int k, bool sorted, char const *name = NULL); - Tensor arg_top_k(const Tensor input, + Tensor arg_top_k(Tensor const input, // Tensor *outputs, int k, bool sorted, bool speculative_decoding, char const *name = NULL); - Tensor argmax(const Tensor input, bool beam_search, char const *name = NULL); - Tensor sampling(const Tensor input, float top_p, char const *name = NULL); - Tensor multihead_attention(const Tensor query, - const Tensor key, - const Tensor value, + Tensor argmax(Tensor const input, bool beam_search, char const *name = NULL); + Tensor sampling(Tensor const input, float top_p, char const *name = NULL); + Tensor multihead_attention(Tensor const query, + Tensor const key, + Tensor const value, int embed_dim, int num_heads, int kdim = 0, @@ -733,7 +736,7 @@ class FFModel { DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, char const *name = NULL); - Tensor inc_multihead_self_attention(const Tensor input, + Tensor inc_multihead_self_attention(Tensor const input, int embed_dim, int num_heads, int kdim = 0, @@ -751,7 +754,7 @@ class FFModel { bool position_bias = false, char const *name = NULL); Tensor - spec_inc_multihead_self_attention(const Tensor input, + spec_inc_multihead_self_attention(Tensor const input, int embed_dim, int num_heads, int kdim = 0, @@ -769,7 +772,7 @@ class FFModel { bool position_bias = false, char const *name = NULL); Tensor inc_multihead_self_attention_verify( - const Tensor input, + Tensor const input, int embed_dim, int num_heads, int kdim = 0, @@ -786,7 +789,7 @@ class FFModel { bool qk_prod_scaling = true, bool position_bias = false, char const *name = NULL); - Tensor inc_multiquery_self_attention(const Tensor input, + Tensor inc_multiquery_self_attention(Tensor const input, int embed_dim, int num_q_heads, int num_kv_heads, @@ -805,7 +808,7 @@ class FFModel { bool position_bias = false, char const *name = NULL); Tensor - spec_inc_multiquery_self_attention(const Tensor input, + spec_inc_multiquery_self_attention(Tensor const input, int embed_dim, int num_q_heads, int num_kv_heads, @@ -824,7 +827,7 @@ class FFModel { bool position_bias = false, char const *name = NULL); Tensor inc_multiquery_self_attention_verify( - const Tensor input, + Tensor const input, int embed_dim, int num_q_heads, int num_kv_heads, @@ -859,7 +862,7 @@ class FFModel { bool create_grad = true); ParallelTensor create_parallel_tensor_legion_ordering(int num_dim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, int owner_idx = 0, @@ -872,7 +875,7 @@ class FFModel { int owner_idx = 0, bool create_grad = 
true); ParallelTensor create_parallel_tensor(int num_dim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, int owner_idx = 0, @@ -885,7 +888,7 @@ class FFModel { int owner_idx = 0, bool create_grad = true); template - ParallelTensor create_parallel_tensor(const ParallelDim dims[], + ParallelTensor create_parallel_tensor(ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, int owner_idx = 0, @@ -909,7 +912,7 @@ class FFModel { ParameterSyncType sync_type = ParameterSyncType::NONE); template ParallelParameter create_parallel_weight( - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, bool create_grad = true, @@ -917,7 +920,7 @@ class FFModel { ParameterSyncType sync_type = ParameterSyncType::NONE); ParallelParameter create_parallel_weight( int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, bool create_grad = true, @@ -925,7 +928,7 @@ class FFModel { ParameterSyncType sync_type = ParameterSyncType::NONE); ParallelParameter create_parallel_weight_legion_ordering( int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, bool create_grad = true, @@ -934,7 +937,7 @@ class FFModel { void map_tensor(ParallelTensor tensor, Op const *parallel_op); void map_weight(ParallelTensor tensor, Op const *parallel_op); - bool get_parallel_tensor_from_tensor(const Tensor tensor, + bool get_parallel_tensor_from_tensor(Tensor const tensor, ParallelTensor ¶llel_tensor) const; template @@ -975,7 +978,7 @@ class FFModel { // Internal PCG::Node creation APIs // ======================================== template - PCG::Node get_or_create_node(const typename T::Input &input, + PCG::Node get_or_create_node(typename T::Input const &input, typename T::Params const ¶ms) { using Params = typename T::Params; @@ -1005,50 +1008,50 @@ class FFModel { return this->new_node(op); } - PCG::Node get_or_create_noop_node(const ParallelTensor input); + PCG::Node get_or_create_noop_node(ParallelTensor const input); PCG::Node get_or_create_input_node(ParallelTensorShape const &); PCG::Node get_or_create_fused_parallel_node( - const ParallelTensor input, + ParallelTensor const input, std::vector const ¶llel_ops); - PCG::Node get_or_create_parallel_op_node(const ParallelTensor input, + PCG::Node get_or_create_parallel_op_node(ParallelTensor const input, ParallelOpInfo const &); // ======================================== // Internal APIs that should not be invoked from applications // ======================================== void create_disjoint_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], Legion::IndexSpace const &part_is, Legion::LogicalRegion const ®ion, Legion::LogicalPartition &part); template void create_disjoint_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], Legion::IndexSpaceT const &part_is, Legion::LogicalRegion const ®ion, Legion::LogicalPartition &part); void create_aliased_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, Legion::IndexSpace const &part_is, Legion::LogicalRegion const ®ion, Legion::LogicalPartition &part); template void create_aliased_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, Legion::IndexSpaceT const &part_is, Legion::LogicalRegion const ®ion, Legion::LogicalPartition &part); template - void 
create_disjoint_partition(const ParallelTensor tensor, + void create_disjoint_partition(ParallelTensor const tensor, Legion::IndexSpaceT const &part_is, Legion::LogicalPartition &part_fwd, Legion::LogicalPartition &part_bwd); template void create_data_parallel_partition_with_diff_dims( - const ParallelTensor tensor, + ParallelTensor const tensor, Legion::IndexSpaceT const &task_is, Legion::LogicalPartition &part_fwd, Legion::LogicalPartition &part_bwd); @@ -1079,6 +1082,9 @@ class FFModel { void get_metrics(); void backward(int seq_length = -1); void update(); + void unified_update(); + // bool apply_fusion(std::vector const &operators, + // std::vector &new_operators); bool apply_fusion( std::vector const &operators, std::vector &new_operators, @@ -1137,10 +1143,11 @@ class FFModel { Legion::IndexSpace get_or_create_task_is(ParallelConfig const &pc); Legion::IndexSpace get_or_create_task_is(MachineView const &view); Legion::IndexSpace get_or_create_task_is(Legion::Domain const &domain); - Legion::IndexSpace get_or_create_task_is(const ParallelTensor); + Legion::IndexSpace get_or_create_task_is(ParallelTensor const); Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; + bool is_transformer_block(int layer_idx) const; bool need_to_add_combine(int layer_idx) const; bool need_to_add_allreduce(int layer_idx) const; bool need_to_add_parallel_identity(int layer_idx) const; @@ -1174,6 +1181,7 @@ class FFModel { int metrics_input; ParallelTensor parallel_label_tensor; Tensor label_tensor; + int num_inputs = 0; std::vector layers; std::vector operators; @@ -1292,6 +1300,8 @@ class FFModel { Replicate *>, std::unordered_map, Reduction *>, + std::unordered_map, + AllReduce *>, std::unordered_map, Combine *>, std::unordered_map, diff --git a/include/flexflow/ops/dropout.h b/include/flexflow/ops/dropout.h index 37304bdada..b8033c98ba 100644 --- a/include/flexflow/ops/dropout.h +++ b/include/flexflow/ops/dropout.h @@ -5,6 +5,13 @@ #include "flexflow/node.h" #include "flexflow/operator.h" #include "flexflow/ops/dropout_params.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include +#include +#elif defined(FF_USE_HIP_ROCM) +#include +#include +#endif namespace FlexFlow { diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index 08747bb9a4..e5efa43bf8 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -17,8 +17,8 @@ class ElementBinary : public Op { ElementBinary(FFModel &model, LayerID const &layer_guid, OperatorType type, - const ParallelTensor x, - const ParallelTensor y, + ParallelTensor const x, + ParallelTensor const y, bool inplace_a, char const *name); ElementBinary(FFModel &model, @@ -78,6 +78,7 @@ class ElementBinary : public Op { public: bool inplace_a, has_same_operands; bool broadcast_input1, broadcast_input2; + int batch_size; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/dropout_kernels.h b/include/flexflow/ops/kernels/dropout_kernels.h index 421974fbaa..b2201dd34e 100644 --- a/include/flexflow/ops/kernels/dropout_kernels.h +++ b/include/flexflow/ops/kernels/dropout_kernels.h @@ -5,6 +5,7 @@ #include "flexflow/fftype.h" #include "flexflow/op_meta.h" #include "flexflow/ops/dropout.h" +#include "flexflow/accessor.h" namespace FlexFlow { @@ -17,33 +18,40 @@ class DropoutMeta : public OpMeta { ~DropoutMeta(void); Realm::RegionInstance 
reserveInst; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + curandState *state; cudnnTensorDescriptor_t inputTensor, outputTensor; cudnnDropoutDescriptor_t dropoutDesc; #else miopenTensorDescriptor_t inputTensor, outputTensor; miopenDropoutDescriptor_t dropoutDesc; + hiprandState *state; #endif void *reserveSpace, *dropoutStates; size_t reserveSpaceSize, dropoutStateSize; + size_t num_elements; + long long seed; + float rate; }; namespace Kernels { namespace Dropout { void forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr); + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); void backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); namespace Internal { void forward_kernel(DropoutMeta *m, float const *input_ptr, float *output_ptr, + size_t num_elements, ffStream_t stream); void backward_kernel(DropoutMeta *m, float const *output_grad_ptr, float *input_grad_ptr, + size_t num_elements, ffStream_t stream); } // namespace Internal } // namespace Dropout diff --git a/include/flexflow/ops/kernels/element_binary_kernels.h b/include/flexflow/ops/kernels/element_binary_kernels.h index 5a375fb661..111c5140ce 100644 --- a/include/flexflow/ops/kernels/element_binary_kernels.h +++ b/include/flexflow/ops/kernels/element_binary_kernels.h @@ -23,6 +23,9 @@ class ElementBinaryMeta : public OpMeta { OperatorType op_type; bool inplace_a, has_same_operands; bool broadcast_input1, broadcast_input2; + int batch_size; + size_t replicate_size; + char op_name[MAX_OPNAME]; }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 0b7f1090f6..342d1cd45e 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -23,6 +23,8 @@ class SoftmaxMeta : public OpMeta { bool profiling; bool inference_debugging; int dim; + bool last_layer; + char op_name[MAX_OPNAME]; }; namespace Kernels { @@ -34,7 +36,12 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &outputs, + size_t num_elements); +// float *input_grad_ptr, +// float const *output_grad_ptr, +// float const *output_ptr, void inference_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, @@ -54,11 +61,11 @@ void forward_kernel(SoftmaxMeta const *m, DT const *input_ptr, DT *output_ptr, ffStream_t stream); - template void backward_kernel(SoftmaxMeta const *m, DT *input_grad_ptr, DT const *output_grad_ptr, + DT const *output_ptr, size_t num_elements, ffStream_t stream); diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index b5e9538ea6..f63caad916 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -18,7 +18,7 @@ class LayerNorm : public Op { bool allocate_weights = false); LayerNorm(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, std::vector const &axes, bool _elementwise_affine, bool _use_bias, @@ -148,6 +148,7 @@ class LayerNormMeta : public OpMeta { int64_t effective_batch_size, effective_num_elements; float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; + char 
op_name[MAX_OPNAME]; Realm::RegionInstance reserveInst; // PEFT related fields void *input_activation; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 82aff53766..de871769fd 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -17,6 +17,7 @@ class Softmax : public Op { LayerID const &_layer_guid, const ParallelTensor logit, int dim, + bool _last_layer, char const *name); Softmax(FFModel &model, Params const ¶ms, @@ -93,6 +94,7 @@ class Softmax : public Op { public: int dim; + bool last_layer; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/softmax_params.h b/include/flexflow/ops/softmax_params.h index 63dc87641f..36141f8e28 100644 --- a/include/flexflow/ops/softmax_params.h +++ b/include/flexflow/ops/softmax_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct SoftmaxParams { LayerID layer_guid; int dim; + bool last_layer; char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/split.h b/include/flexflow/ops/split.h index cb9c6bdb57..95c569738d 100644 --- a/include/flexflow/ops/split.h +++ b/include/flexflow/ops/split.h @@ -59,6 +59,8 @@ class Split : public Op { Params get_params() const; + tl::optional as_dot() const override; + public: int legion_axis; std::vector splits; diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h index bab7e6e4ed..401fffb351 100644 --- a/include/flexflow/optimizer.h +++ b/include/flexflow/optimizer.h @@ -18,6 +18,7 @@ #include "flexflow/parallel_tensor.h" #include "legion.h" +#include "accessor.h" namespace FlexFlow { @@ -30,6 +31,7 @@ class Optimizer { virtual void init(void) = 0; virtual void next(void) = 0; virtual void update(const ParallelTensor p) = 0; + virtual void unified_update(std::vector const parameters) = 0; FFModel const *model; }; @@ -43,6 +45,7 @@ class SGDOptimizer : public Optimizer { void init(void); void next(void); void update(const ParallelTensor p); + void unified_update(std::vector const parameters); void set_weight_decay(double _weight_decay); static void ps_update_task(Legion::Task const *task, std::vector const ®ions, @@ -60,6 +63,11 @@ class SGDOptimizer : public Optimizer { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + nccl_unified_update_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void nccl_update_task_gpu(SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, @@ -85,6 +93,7 @@ class AdamOptimizer : public Optimizer { void init(void); void next(void); void update(const ParallelTensor p); + void unified_update(std::vector const parameters); void set_weight_decay(double _weight_decay); static void ps_update_task(Legion::Task const *task, std::vector const ®ions, @@ -103,6 +112,11 @@ class AdamOptimizer : public Optimizer { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + nccl_unified_update_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void nccl_update_task_gpu(AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, @@ -110,10 +124,19 @@ class AdamOptimizer : public Optimizer { float *w_ptr, float *v_ptr, float *m_ptr); + static void nccl_unified_update_task_gpu(AdamOptimizer const *op, + OpMeta const *meta, + GenericTensorAccessorR *accWGrads, + size_t *size, + GenericTensorAccessorW *accWs, + GenericTensorAccessorW *accVs, + 
GenericTensorAccessorW *accMs); #endif double alpha, beta1, beta2, weight_decay, epsilon; double alpha_t, beta1_t, beta2_t; std::map v_values, m_values; + size_t reservedWorkSpaceSize = 0; + int parameters_num = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/parallel_ops/allreduce.h b/include/flexflow/parallel_ops/allreduce.h index 7e0e4362e2..b5f57a0b53 100644 --- a/include/flexflow/parallel_ops/allreduce.h +++ b/include/flexflow/parallel_ops/allreduce.h @@ -16,7 +16,7 @@ class AllReduce : public ParallelOp { using Input = ParallelTensor; AllReduce(FFModel &model, - const ParallelTensor input, + ParallelTensor const input, int allreduce_legion_dim, char const *name = NULL); AllReduce(FFModel &model, @@ -60,6 +60,7 @@ class AllReduce : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index c27616634f..2ed85befc9 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -18,7 +18,7 @@ class Replicate : public ParallelOp { using Input = ParallelTensor; Replicate(FFModel &model, - const ParallelTensor input, + ParallelTensor const input, int replicate_legion_dim, int replicate_degree, char const *name = NULL); @@ -46,6 +46,12 @@ class Replicate : public ParallelOp { bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( std::vector ¶llel_ops) const override; + // <<<<<<< HEAD + // static void init_task(Legion::Task const *task, + // std::vector const ®ions, + // Legion::Context ctx, + // Legion::Runtime *runtime); + // ======= static OpMeta *init_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -63,6 +69,21 @@ class Replicate : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + + template + static void + forward_task_with_type(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + + template + static void backward_task_with_type( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/parallel_tensor.h b/include/flexflow/parallel_tensor.h index d06ecd7bac..a04c1afe86 100644 --- a/include/flexflow/parallel_tensor.h +++ b/include/flexflow/parallel_tensor.h @@ -101,6 +101,7 @@ struct ParallelTensorShape { RecordFormatter as_dot() const; size_t get_piece_size() const; + size_t get_piece_num_elements() const; bool is_valid() const; int get_num_replica_dims() const; diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 486a65eb3d..4b9d605646 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -86,6 +86,12 @@ __global__ void assign_kernel(DT *ptr, Legion::coord_t size, DT value); template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); +template +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + Legion::coord_t origin_size, + Legion::coord_t size); + template __global__ void copy_kernel_discrete(DT *dst, const DT *src, diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 805cc46b4c..820625cf85 
100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -23,7 +23,7 @@ do { \ std::stringstream _error; \ if (status != miopenStatusSuccess) { \ - _error << "CUDNN failure: " << status; \ + _error << "CUDNN failure: " << miopenGetErrorString(status); \ FatalError(_error.str()); \ } \ } while (0) @@ -86,6 +86,12 @@ __global__ void assign_kernel(DT *ptr, Legion::coord_t size, DT value); template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); +template +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + Legion::coord_t origin_size, + Legion::coord_t size); + template __global__ void copy_kernel_discrete(DT *dst, const DT *src, @@ -174,6 +180,10 @@ miopenStatus_t Legion::Domain domain, DataType data_type = DT_FLOAT); +miopenStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, + Legion::Domain domain); + hipblasDatatype_t ff_to_cuda_datatype(DataType type); miopenDataType_t ff_to_cudnn_datatype(DataType type); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 7692ccb88f..9b857210f2 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -39,7 +39,7 @@ from flexflow.config import * from .flexflowlib import ffi, flexflow_library from typing import Union, List -from peft import LoraConfig +# from peft import LoraConfig import json @@ -1193,7 +1193,6 @@ def __detach_raw_ptr(self, ffconfig): # Parameter # ----------------------------------------------------------------------- - class Parameter(Tensor): __slots__ = ["parameter_handle"] @@ -1880,14 +1879,14 @@ def from_jsonfile(self, jsonfile: str): config_dict["optimizer_type"] = OptimizerType.OPTIMIZER_TYPE_SGD return LoraLinearConfig(**config_dict) - def to_hf_config(self) -> LoraConfig: - return LoraConfig( - base_model_name_or_path=self.base_model_name_or_path, - r=self.rank, - target_modules=self.target_modules, - lora_alpha=self.lora_alpha, - lora_dropout=self.lora_dropout, - ) + # def to_hf_config(self) -> LoraConfig: + # return LoraConfig( + # base_model_name_or_path=self.base_model_name_or_path, + # r=self.rank, + # target_modules=self.target_modules, + # lora_alpha=self.lora_alpha, + # lora_dropout=self.lora_dropout, + # ) @property def cache_folder(self): @@ -3095,7 +3094,7 @@ def flat(self, input, name=None): self.add_layer(OpType.FLAT, name) return Tensor(handle, owner_op_type=OpType.FLAT) - def softmax(self, input, axis=-1, name=None): + def softmax(self, input, axis=-1, last_layer=False, name=None): """Softmax activation function. :param input: the input Tensor. @@ -3108,7 +3107,7 @@ def softmax(self, input, axis=-1, name=None): """ c_name = get_c_name(name) handle = ffc().flexflow_model_add_softmax( - self.handle, input.handle, axis, c_name + self.handle, input.handle, axis, last_layer, c_name ) self.add_layer(OpType.SOFTMAX, name) return Tensor(handle, owner_op_type=OpType.SOFTMAX) @@ -4344,6 +4343,13 @@ def update(self): :returns: None -- no returns. """ ffc().flexflow_model_update(self.handle) + + def unified_update(self): + """Update weights and biases of all layers. + + :returns: None -- no returns. + """ + ffc.flexflow_model_unified_update(self.handle) def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): """Configure the model for trainting. 
FlexFlow uses lazy initialization, @@ -4386,6 +4392,26 @@ def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): for ff_tensor, np_tensor in self.attr_tensors.items(): ff_tensor.set_tensor(self, np_tensor) print("Compiled ffmodel!") + + def load_bert_pretrained(self, checkpoint=None): + # store weights in dict + weights_dict = {} + for name, params in checkpoint.named_parameters(): + weights_dict[name.replace("LayerNorm", "layer_norm").replace(".", "_")] = params.detach().cpu().numpy() + print(name.replace("LayerNorm", "layer_norm").replace(".", "_")) + # some weights not in params + weights_dict['cls_predictions_decoder_weight'] = checkpoint.cls.predictions.decoder.weight.detach().cpu().numpy() + weights_dict['cls_predictions_decoder_bias'] = checkpoint.cls.predictions.decoder.bias.detach().cpu().numpy() + for i in range (self._nb_layers): + layer = self._layers[i] + if (layer.name + "_weight") in weights_dict: + print('weight: ' + layer.name) + weight = layer.get_parameter_by_id(0) + weight.set_tensor(self, weights_dict[layer.name + "_weight"]) + if (layer.name + "_bias") in weights_dict: + print('bias: ' + layer.name) + bias = layer.get_parameter_by_id(1) + bias.set_tensor(self, weights_dict[layer.name + "_bias"]) def fit(self, x=None, y=None, batch_size=None, epochs=1): """Trains the model for a fixed number of epochs (iterations on a dataset). @@ -4420,13 +4446,13 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1): for d in dataloaders: d.reset() self.reset_metrics() - iterations = num_samples / batch_size + iterations = 1 for iter in range(0, int(iterations)): self._ffconfig.begin_trace(self._tracing_id) for d in dataloaders: d.next_batch(self) self.forward() - self.zero_gradients() + # self.zero_gradients() self.backward() self.update() self._ffconfig.end_trace(self._tracing_id) diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index df4042748f..5d4f892ccc 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -1,4 +1,4 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# Copyright 2020 Stanford University, Los Alamos National Laboratory # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
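Note on the load_bert_pretrained helper added to flexflow_cffi.py above: it keys the checkpoint by a flattened form of the HuggingFace parameter names and then looks those keys up as <layer.name> + "_weight" / "_bias". A minimal sketch of that mapping, assuming the Python API from this patch; the example parameter name is illustrative, and the actual FlexFlow layer names depend on how the traced graph names its layers:

def hf_to_ff_key(param_name: str) -> str:
    # Same transformation used by load_bert_pretrained:
    # "LayerNorm" -> "layer_norm", "." -> "_"
    return param_name.replace("LayerNorm", "layer_norm").replace(".", "_")

# e.g. "bert.embeddings.LayerNorm.bias" -> "bert_embeddings_layer_norm_bias";
# load_bert_pretrained then matches this against a layer named
# "bert_embeddings_layer_norm" plus the "_bias" suffix.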
@@ -16,7 +16,6 @@ from collections import OrderedDict from enum import Enum from typing import List -import copy import numpy as np from flexflow.core.flexflow_cffi import Tensor, NormInitializer @@ -26,6 +25,7 @@ try: import torch + print(torch.__version__) from torch.fx.immutable_collections import immutable_dict except: pass @@ -653,14 +653,26 @@ def string_to_ff(string, ffmodel, node_to_output): data = Node.StringData(string) name = data.name input_tensor = node_to_output[data.innodes[0]] - return ffmodel.identity(input=input_tensor, name=name) - # TODO: Change to ffmodel.layernorm() once supported + axes = [len(input_tensor.dims) - 1] + return ffmodel.layer_norm( + input=input_tensor, + axes=axes, + elementwise_affine=True, + eps=1e-6, + name=name, + ) def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] - return ffmodel.identity(input=input_tensor, name=self.name) - # TODO: Change to ffmodel.layernorm() once supported - + axes = [0] + eps = self.module.eps + return ffmodel.layer_norm( + input=input_tensor, + axes=axes, + elementwise_affine=True, + eps=eps, + name=self.name, + ) class T5LayerNormNode(Node): """ @@ -931,7 +943,7 @@ def construct_node(node): elif name.find("contiguous") >= 0: return ContiguousNode(node) elif name.find("tanh") >= 0: return TanhFNode(node) elif name.find("gelu") >= 0: return GeluFNode(node) - assert 0, f"Unknown function or method: {name}" + assert 0, f"Unknown function or method: {name} {node}" @staticmethod def is_right_scalar_op(node): @@ -1186,16 +1198,24 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) - return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, name=self.name, - ) - + if self.scalar_pos == FunctionNode.ScalarPosition.RIGHT: + return ffmodel.scalar_sub( + input=input_tensor, scalar=scalar, inplace=False, name=self.name, + ) + else: + negative_input = ffmodel.scalar_multiply( + input=input_tensor, scalar=-1, inplace=False, name=self.name + '_negative', + ) + return ffmodel.scalar_sub( + input=negative_input, scalar=-scalar, inplace=False, name=self.name, + ) + class ScalarTrueDivNode(FunctionNode): def __init__(self, node): @@ -1220,15 +1240,16 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] scalar = self.innodes[1] assert type(scalar) is float + return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, name=self.name, + input=input_tensor, scalar=scalar, inplace=False, name=self.name, ) @@ -1409,6 +1430,10 @@ def to_ff(self, ffmodel, node_to_output): @staticmethod def slice_tensor(ffmodel, tensor, slices, name): + + print('slices', slices) + old_shape = tensor.dims + print('old_shape', tensor.dims) """Returns a reshaped tensor based on the given slices.""" def is_colon(slice_elem): """Returns if the slice is equivalent to `:`.""" @@ -1424,11 +1449,20 @@ def is_truncate(slice_elem, old_size): stop = old_size if 
slice_elem.stop == None else slice_elem.stop new_size = stop - start return new_size < old_size - + def is_single_element(slice_elem): return isinstance(slice_elem, int) + def is_exact(slice_elem, old_size): + if slice_elem is None: + return False + start = 0 if slice_elem.start == None else slice_elem.start + stop = old_size if slice_elem.stop == None else slice_elem.stop + new_size = stop - start + return new_size == old_size + shape = tensor.dims + print('input dims', tensor.dims) # Fewer slices than input dimensions diff = len(shape) - len(slices) @@ -1441,12 +1475,18 @@ def is_single_element(slice_elem): # Match dimensions from right to left new_shape = [] # append then reverse j = len(shape) - 1 + import copy curr_tensor = copy.copy(tensor) for slice_elem in reversed(slices): - if is_colon(slice_elem): + print('slice_elem', slice_elem) + if is_colon(slice_elem) or is_exact(slice_elem, shape[j]): + print('shape', shape) assert j >= 0 + print('j', j) + print('new_shape_bef', new_shape) new_shape.append(shape[j]) + print('new_shape_aft', new_shape) j -= 1 elif is_unsqueeze(slice_elem): new_shape.append(1) @@ -1456,6 +1496,8 @@ def is_single_element(slice_elem): curr_tensor = ffmodel.split(input=curr_tensor, sizes=splits, axis=j, name=name)[0] new_shape.append(1) j -= 1 + elif is_exact(slice_elem, shape[j]): + print('exact') elif is_truncate(slice_elem, shape[j]): assert j >= 0 start = 0 if slice_elem.start == None else slice_elem.start @@ -1481,8 +1523,45 @@ def is_single_element(slice_elem): assert 0, f"Unsupported slice element: {slice_elem}" new_shape.reverse() - return ffmodel.reshape(input=curr_tensor, shape=new_shape, name=name,) - + if len(new_shape) == 0: + return curr_tensor + else: + print('new_shape', new_shape) + if old_shape == new_shape: + return curr_tensor + return ffmodel.reshape(input=curr_tensor, shape=new_shape, name=name,) + + + +# """Returns a reshaped tensor based on the given slices.""" +# def is_colon(slice_elem): +# """Returns if the slice is equivalent to `:`.""" +# return slice_elem == slice(None, None, None) +# +# def is_unsqueeze(slice_elem): +# """Returns if the slice is equivalent to unsqueezing that +# dimension.""" +# return slice_elem is None +# shape = tensor.dims +# # Match dimensions from right to left +# new_shape = [] # append then reverse +# j = len(shape) - 1 +# for slice_elem in reversed(slices): +# if is_colon(slice_elem): +# assert j >= 0 +# new_shape.append(shape[j]) +# j -= 1 +# elif is_unsqueeze(slice_elem): +# new_shape.append(1) +# else: +# assert 0, f"Unsupported slice element: {slice_elem}" +# new_shape.reverse() +# return ffmodel.reshape( +# input=tensor, shape=new_shape, name=name, +# ) + + + @staticmethod def strings_to_slices(strings: List[str]): # Extract slice elements @@ -1583,14 +1662,14 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, name=self.name, + input=input_tensor, scalar=scalar, inplace=False, name=self.name, ) @@ -1751,7 +1830,7 @@ def __init__(self, node): def parse(self): s = [self.name] scalar = self.innodes[1] - if type(scalar) is not int or type(scalar) is not float: + if not isinstance(scalar, [int, 
float]): assert 0, "FlexFlow does not support tensor floor division" innodes = (self.innodes[0],) s.append(self.parse_inoutnodes(innodes)) @@ -2290,11 +2369,16 @@ def string_to_ff(string, ffmodel, node_to_output): "since attributes require access to the PyTorch model" ) - def to_ff(self, ffmodel, node_to_output): - return self.attr_to_ff_tensor(ffmodel) + def to_ff(self, ffmodel, node_to_output, input_tensors): + return self.attr_to_ff_tensor(ffmodel, input_tensors) + + def attr_to_ff_tensor(self, ffmodel, input_tensors): - def attr_to_ff_tensor(self, ffmodel): - torch_tensor = self.attr + + torch_tensor = self.attr + assert (torch_tensor.shape[0] == 1) + batch_size = ffmodel._ffconfig.batch_size + torch_tensor = np.repeat(torch_tensor, batch_size, axis=0) ff_dtype = Node.torch_to_ff_dtype(torch_tensor.dtype) requires_grad = torch_tensor.requires_grad @@ -2309,14 +2393,17 @@ def attr_to_ff_tensor(self, ffmodel): ff_dtype = DataType.DT_FLOAT np_tensor = np_tensor.astype(np.float32) + print('attr: ', torch_tensor.shape) + assert (torch_tensor.shape[0] == batch_size) ff_tensor = ffmodel.create_tensor( - torch_tensor.shape, ff_dtype, requires_grad, + torch_tensor.shape, ff_dtype, True, ) # delay set_tensor, add to ffmodel ffmodel.attr_tensors[ff_tensor] = np_tensor # ff_tensor.set_tensor( # ffmodel, np_tensor # ) + input_tensors.append(ff_tensor) return ff_tensor @@ -2398,7 +2485,7 @@ def to_ff(self, ffmodel, node_to_output, output_tensors): # `CrossEntropyLoss()` implementation logits = node_to_output[other["logits"].name] softmax_logits = ffmodel.softmax( - input=logits, name=self.name, + input=logits, last_layer=True, name=self.name, ) output_tensors[:] += [softmax_logits] else: @@ -2440,6 +2527,11 @@ def _trace_model(self): batch_size=self.batch_size, sequence_length=self.seq_length, ) + + #import pickle + #with open('symbolic_trace', 'rb') as f: + #traced = pickle.load(f) + else: traced = torch.fx.symbolic_trace(self.model) @@ -2527,6 +2619,8 @@ def torch_to_ff(self, ffmodel, input_tensors, verbose=False): elif isinstance(node, OutputNode): node.to_ff(ffmodel, node_to_output, output_tensors) node_output = None + elif isinstance(node, AttributeNode): + node_output = node.to_ff(ffmodel, node_to_output, input_tensors) else: node_output = node.to_ff(ffmodel, node_to_output) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index e39cb29037..3ad8eb555e 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -35,7 +35,9 @@ class FFCObjectWrapper { t_.impl = const_cast(static_cast(t)); \ return t_; \ } \ - static T unwrap(T_ t_) { return static_cast(t_.impl); } \ + static T unwrap(T_ t_) { \ + return static_cast(t_.impl); \ + } \ static const T unwrap_const(const T_ t_) { \ return static_cast(t_.impl); \ } @@ -235,6 +237,11 @@ void flexflow_model_update(flexflow_model_t handle_) { handle->update(); } +void flexflow_model_unified_update(flexflow_model_t handle_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + handle->unified_update(); +} + void flexflow_model_compile(flexflow_model_t handle_, enum LossType loss_type, int *metrics, @@ -260,56 +267,56 @@ void flexflow_model_zero_gradients(flexflow_model_t handle_) { } flexflow_tensor_t flexflow_model_add_exp(flexflow_model_t handle_, - const flexflow_tensor_t x_, + flexflow_tensor_t const x_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); Tensor tensor = handle->exp(x, name); 
DEBUG_PRINT("[Exp] new Tensor %p, x %p, name %s", tensor, x, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_sin(flexflow_model_t handle_, - const flexflow_tensor_t x_, + flexflow_tensor_t const x_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); Tensor tensor = handle->sin(x, name); DEBUG_PRINT("[Sin] new Tensor %p, x %p, name %s", tensor, x, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_cos(flexflow_model_t handle_, - const flexflow_tensor_t x_, + flexflow_tensor_t const x_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); Tensor tensor = handle->cos(x, name); DEBUG_PRINT("[Cos] new Tensor %p, x %p, name %s", tensor, x, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_add(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->add(x, y, inplace_a, name); DEBUG_PRINT("[Add] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_subtract(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->subtract(x, y, inplace_a, name); DEBUG_PRINT( "[Subtract] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); @@ -317,13 +324,13 @@ flexflow_tensor_t flexflow_model_add_subtract(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_multiply(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->multiply(x, y, inplace_a, name); DEBUG_PRINT( "[Multiply] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); @@ -331,13 +338,13 @@ flexflow_tensor_t flexflow_model_add_multiply(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_divide(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + 
Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->divide(x, y, inplace_a, name); DEBUG_PRINT( "[Divide] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); @@ -345,33 +352,33 @@ flexflow_tensor_t flexflow_model_add_divide(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_max(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->max(x, y, inplace_a, name); DEBUG_PRINT("[Max] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_min(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->min(x, y, inplace_a, name); DEBUG_PRINT("[Min] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_reduce_sum(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int *axes, int n, bool keepdims, @@ -392,21 +399,21 @@ flexflow_tensor_t flexflow_model_add_reduce_sum(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_rsqrt(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor const input = FFCObjectWrapper::unwrap(input_); Tensor tensor = handle->rsqrt(input, name); DEBUG_PRINT("[Rsqrt] new Tensor %p, input %p, name %s", tensor, input, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_pow(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const exponent, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor const input = FFCObjectWrapper::unwrap(input_); Tensor tensor = handle->pow(input, exponent, name); DEBUG_PRINT("[Pow] new Tensor %p, input %p, exponent %f, name %s", tensor, @@ -417,13 +424,13 @@ flexflow_tensor_t flexflow_model_add_pow(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_mean(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int *dims, int n, bool keepdims, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor const input = FFCObjectWrapper::unwrap(input_); std::vector dims_vec; char cbuffer[256]; char *cbuffer_ptr = cbuffer; @@ -448,7 +455,7 @@ flexflow_tensor_t flexflow_model_add_mean(flexflow_model_t handle_, flexflow_tensor_t 
flexflow_model_add_conv2d(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int out_channels, int kernel_h, int kernel_w, @@ -464,7 +471,7 @@ flexflow_tensor_t flexflow_initializer_t bias_initializer_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap_const(input_); + Tensor const input = FFCObjectWrapper::unwrap_const(input_); Layer *shared_op = FFCObjectWrapper::unwrap(shared_op_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); @@ -512,7 +519,7 @@ flexflow_tensor_t flexflow_tensor_t flexflow_model_add_embedding(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int num_entries, int out_dim, enum AggrMode aggr, @@ -521,7 +528,7 @@ flexflow_tensor_t flexflow_initializer_t kernel_initializer_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap_const(input_); + Tensor const input = FFCObjectWrapper::unwrap_const(input_); Layer *shared_op = FFCObjectWrapper::unwrap(shared_op_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); @@ -595,7 +602,7 @@ flexflow_tensor_t } flexflow_tensor_t flexflow_model_add_batch_norm(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, bool relu, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -615,7 +622,7 @@ flexflow_tensor_t flexflow_model_add_batch_norm(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int n, int *axes, bool elementwise_affine, @@ -623,7 +630,7 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, bool use_bias, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor const input = FFCObjectWrapper::unwrap(input_); std::vector axes_vec; for (int i = 0; i < n; i++) { axes_vec.push_back(axes[i]); @@ -647,9 +654,9 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, flexflow_tensor_t * flexflow_model_add_residual_layer_norm(flexflow_model_t handle_, - const flexflow_tensor_t input_, - const flexflow_tensor_t residual1_, - const flexflow_tensor_t residual2_, + flexflow_tensor_t const input_, + flexflow_tensor_t const residual1_, + flexflow_tensor_t const residual2_, bool use_two_residuals, int n, int *axes, @@ -659,9 +666,9 @@ flexflow_tensor_t * bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); - const Tensor residual1 = FFCObjectWrapper::unwrap(residual1_); - const Tensor residual2 = + Tensor const input = FFCObjectWrapper::unwrap(input_); + Tensor const residual1 = FFCObjectWrapper::unwrap(residual1_); + Tensor const residual2 = use_two_residuals ? 
FFCObjectWrapper::unwrap(residual2_) : nullptr; Tensor tensor_outputs[2]; std::vector axes_vec; @@ -709,8 +716,8 @@ flexflow_tensor_t * flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( flexflow_model_t handle_, - const flexflow_tensor_t input_, - const flexflow_tensor_t residual_, + flexflow_tensor_t const input_, + flexflow_tensor_t const residual_, int n, int *axes, bool elementwise_affine, @@ -719,8 +726,8 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); - const Tensor residual = FFCObjectWrapper::unwrap(residual_); + Tensor const input = FFCObjectWrapper::unwrap(input_); + Tensor const residual = FFCObjectWrapper::unwrap(residual_); Tensor tensor_outputs[2]; std::vector axes_vec; for (int i = 0; i < n; i++) { @@ -759,12 +766,12 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( flexflow_tensor_t flexflow_model_add_sigmoid_silu_multi(flexflow_model_t handle_, - const flexflow_tensor_t input1_, - const flexflow_tensor_t input2_, + flexflow_tensor_t const input1_, + flexflow_tensor_t const input2_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input1 = FFCObjectWrapper::unwrap(input1_); - const Tensor input2 = FFCObjectWrapper::unwrap(input2_); + Tensor const input1 = FFCObjectWrapper::unwrap(input1_); + Tensor const input2 = FFCObjectWrapper::unwrap(input2_); Tensor tensor = handle->sigmoid_silu_multi(input1, input2, input1->data_type, name); DEBUG_PRINT("[SigmoidSiluMulti] new Tensor %p, input1 %p, input2 %p, name %s", @@ -776,8 +783,8 @@ flexflow_tensor_t } flexflow_tensor_t flexflow_model_add_batch_matmul(flexflow_model_t handle_, - const flexflow_tensor_t a_, - const flexflow_tensor_t b_, + flexflow_tensor_t const a_, + flexflow_tensor_t const b_, int a_seq_length_dim, int b_seq_length_dim) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -791,7 +798,7 @@ flexflow_tensor_t flexflow_model_add_batch_matmul(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_dense( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int out_dim, enum ActiMode activation /* AC_MODE_NONE */, bool use_bias /* true */, @@ -803,7 +810,7 @@ flexflow_tensor_t flexflow_model_add_dense( float kernel_reg_lambda, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap_const(input_); + Tensor const input = FFCObjectWrapper::unwrap_const(input_); Layer *shared_op = FFCObjectWrapper::unwrap(shared_op_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); @@ -909,8 +916,8 @@ flexflow_tensor_t flexflow_model_add_flat(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_, - const flexflow_tensor_t input_, - const flexflow_tensor_t index_, + flexflow_tensor_t const input_, + flexflow_tensor_t const index_, int dim, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -927,19 +934,21 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int dim, + bool last_layer, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor 
tensor = handle->softmax(input, dim, input->data_type, name); + Tensor tensor = + handle->softmax(input, dim, last_layer, input->data_type, name); DEBUG_PRINT( "[Softmax] new Tensor %p, input %p, name %s", tensor, input, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int n, int *perm, char const *name) { @@ -959,7 +968,7 @@ flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_reshape(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int n, int *shape, char const *name) { @@ -979,7 +988,7 @@ flexflow_tensor_t flexflow_model_add_reshape(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_reverse(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int axis, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -995,7 +1004,7 @@ flexflow_tensor_t flexflow_model_add_reverse(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_scalar_multiply(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const scalar, bool inplace, char const *name) { @@ -1011,7 +1020,7 @@ flexflow_tensor_t } flexflow_tensor_t flexflow_model_add_scalar_add(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const scalar, bool inplace, char const *name) { @@ -1027,7 +1036,7 @@ flexflow_tensor_t flexflow_model_add_scalar_add(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_scalar_sub(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const scalar, bool inplace, char const *name) { @@ -1045,7 +1054,7 @@ flexflow_tensor_t flexflow_model_add_scalar_sub(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_scalar_truediv(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const scalar, bool inplace, char const *name) { @@ -1062,7 +1071,7 @@ flexflow_tensor_t } flexflow_tensor_t flexflow_model_add_gelu(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1072,7 +1081,7 @@ flexflow_tensor_t flexflow_model_add_gelu(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_identity(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1083,7 +1092,7 @@ flexflow_tensor_t flexflow_model_add_identity(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_relu(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, bool inplace, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1094,7 +1103,7 @@ flexflow_tensor_t flexflow_model_add_relu(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_sigmoid(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1105,7 +1114,7 @@ flexflow_tensor_t 
flexflow_model_add_sigmoid(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_tanh(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1115,7 +1124,7 @@ flexflow_tensor_t flexflow_model_add_tanh(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_elu(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, bool inplace, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1126,7 +1135,7 @@ flexflow_tensor_t flexflow_model_add_elu(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_dropout(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float rate, unsigned long long seed, char const *name) { @@ -1144,9 +1153,9 @@ flexflow_tensor_t flexflow_model_add_dropout(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_multihead_attention( flexflow_model_t handle_, - const flexflow_tensor_t query_, - const flexflow_tensor_t key_, - const flexflow_tensor_t value_, + flexflow_tensor_t const query_, + flexflow_tensor_t const key_, + flexflow_tensor_t const value_, int embed_dim, int num_heads, int kdim, @@ -1199,7 +1208,7 @@ flexflow_tensor_t flexflow_model_add_multihead_attention( flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -1242,7 +1251,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -1286,7 +1295,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -1330,7 +1339,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -1375,7 +1384,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -1421,7 +1430,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -1466,7 +1475,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( } flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float eps, int dim, char const *name) { @@ -1478,8 +1487,8 @@ flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, flexflow_tensor_t * 
flexflow_model_add_residual_rms_norm(flexflow_model_t handle_, - const flexflow_tensor_t input1_, - const flexflow_tensor_t input2_, + flexflow_tensor_t const input1_, + flexflow_tensor_t const input2_, float eps, int dim, bool inplace_residual, @@ -1506,7 +1515,7 @@ flexflow_tensor_t * } flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int k, bool sorted, bool speculative_decoding, @@ -1519,7 +1528,7 @@ flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int max_beam_size, bool sorted, char const *name) { @@ -1530,7 +1539,7 @@ flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float top_p, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1540,7 +1549,7 @@ flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, bool beam_search, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1551,7 +1560,7 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, flexflow_peft_model_id_t flexflow_model_add_lora_layer( flexflow_model_t handle_, - const flexflow_lora_linear_config_t peft_config_) { + flexflow_lora_linear_config_t const peft_config_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); PEFTModelID *peft_model_id = handle->add_lora_layer(*peft_config); diff --git a/src/dataloader/dataloader.cc b/src/dataloader/dataloader.cc index 441a088194..614482e8b1 100644 --- a/src/dataloader/dataloader.cc +++ b/src/dataloader/dataloader.cc @@ -97,7 +97,7 @@ SingleDataLoader::SingleDataLoader(FFModel &ff, datatype = datatype_; // Currently assume that the leading dim of input is a replica dim of degree 1 assert(input->dims[input->num_dims - 1].is_replica_dim); - assert(input->dims[input->num_dims - 1].size == 1); + // assert(input->dims[input->num_dims - 1].size == 1); batch_input = input; ParallelDim dims[MAX_TENSOR_DIM]; diff --git a/src/dataloader/dataloader.cpp b/src/dataloader/dataloader.cpp index 7d9ffc02b1..97668d705d 100644 --- a/src/dataloader/dataloader.cpp +++ b/src/dataloader/dataloader.cpp @@ -41,10 +41,12 @@ void SingleDataLoader::load_input(Task const *task, int num_dims = full_input_domain.get_dim(); assert(num_dims + 1 == batch_input_domain.get_dim()); // assert the leading replica dim has a degree of one - assert(batch_input_domain.hi()[num_dims] == - batch_input_domain.lo()[num_dims]); + // assert(batch_input_domain.hi()[num_dims] == + // batch_input_domain.lo()[num_dims]); coord_t batch_size = batch_input_domain.hi()[num_dims - 1] - batch_input_domain.lo()[num_dims - 1] + 1; + coord_t replicate_num = + batch_input_domain.hi()[num_dims] - batch_input_domain.lo()[num_dims] + 1; coord_t num_elements_per_batch = batch_input_domain.get_volume() / batch_size; // FIXME: currently assume continous indices assert(batch_size == meta->num_samples); @@ -61,13 +63,15 @@ void SingleDataLoader::load_input(Task const *task, // printf("ptr(%p, %p), idx0 %d nb_elements_per_batch %d, 
batch_size %d,
  // %d\n", acc_full_input.ptr, input_zc, start_idx, num_elements_per_batch,
  // batch_size, start_idx * num_elements_per_batch);
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel<DT>),
+  assert(batch_input_domain.get_volume() % replicate_num == 0);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel_with_replicate<DT>
), GET_BLOCKS(batch_input_domain.get_volume()), CUDA_NUM_THREADS, 0, stream, batch_input_ptr, input_zc, + batch_input_domain.get_volume() / replicate_num, batch_input_domain.get_volume()); checkCUDA(hipDeviceSynchronize()); } diff --git a/src/dataloader/dataloader.cu b/src/dataloader/dataloader.cu index c2994d00a2..5462532d76 100644 --- a/src/dataloader/dataloader.cu +++ b/src/dataloader/dataloader.cu @@ -40,10 +40,13 @@ void SingleDataLoader::load_input(Task const *task, int num_dims = full_input_domain.get_dim(); assert(num_dims + 1 == batch_input_domain.get_dim()); // assert the leading replica dim has a degree of one - assert(batch_input_domain.hi()[num_dims] == - batch_input_domain.lo()[num_dims]); + // assert(batch_input_domain.hi()[num_dims] == + // batch_input_domain.lo()[num_dims]); coord_t batch_size = batch_input_domain.hi()[num_dims - 1] - batch_input_domain.lo()[num_dims - 1] + 1; + + coord_t replicate_num = + batch_input_domain.hi()[num_dims] - batch_input_domain.lo()[num_dims] + 1; coord_t num_elements_per_batch = batch_input_domain.get_volume() / batch_size; // FIXME: currently assume continous indices assert(batch_size == meta->num_samples); @@ -60,11 +63,15 @@ void SingleDataLoader::load_input(Task const *task, // printf("ptr(%p, %p), idx0 %d nb_elements_per_batch %d, batch_size %d, // %d\n", acc_full_input.ptr, input_zc, start_idx, num_elements_per_batch, // batch_size, start_idx * num_elements_per_batch); - copy_kernel
+ assert(batch_input_domain.get_volume() % replicate_num == 0); + copy_kernel_with_replicate
<<>>(batch_input_ptr, input_zc, batch_input_domain.get_volume()); + stream>>>(batch_input_ptr, + input_zc, + batch_input_domain.get_volume() / replicate_num, + batch_input_domain.get_volume()); checkCUDA(cudaDeviceSynchronize()); } diff --git a/src/loss_functions/loss_functions.cc b/src/loss_functions/loss_functions.cc index ae89c3d469..d887ee9243 100644 --- a/src/loss_functions/loss_functions.cc +++ b/src/loss_functions/loss_functions.cc @@ -49,6 +49,8 @@ void Loss::backward(FFModel *model, if (loss_type == LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE) { assert(logit->get_volume() == label->get_volume()); scale_factor = 2.0f / logit->get_volume(); + } else if (loss_type == LOSS_SPARSE_CATEGORICAL_CROSSENTROPY) { + scale_factor = 1.0f; } else { scale_factor = 1.0f / model->config.batchSize; } @@ -131,9 +133,12 @@ void Loss::backward_task_with_dim(Task const *task, regions[2], task->regions[2], FID_DATA, ctx, runtime); // assertion the outter-most dim is replica dim and replica degree is 1 assert(acc_logit.rect.hi[NDIM - 1] == acc_logit.rect.lo[NDIM - 1]); - int num_samples = - acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; - int num_classes = acc_logit.rect.volume() / num_samples; + + int num_classes = acc_logit.rect.hi[0] - acc_logit.rect.lo[0] + 1; + int num_samples = acc_logit.rect.volume() / num_classes; + // int num_samples = + // acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; + // int num_classes = acc_logit.rect.volume() / num_samples; assert(acc_logit_grad.rect == acc_logit.rect); int k = 1; if (loss->repl_labels) { diff --git a/src/loss_functions/loss_functions.cpp b/src/loss_functions/loss_functions.cpp index 99c13f5a67..fda28cbb77 100644 --- a/src/loss_functions/loss_functions.cpp +++ b/src/loss_functions/loss_functions.cpp @@ -20,6 +20,7 @@ namespace FlexFlow { using namespace Legion; +int const MASK_TOKEN = -100; __global__ void sparse_categorical_crossentropy_loss_backward(float *logit_grad, @@ -33,6 +34,25 @@ __global__ void } } +__global__ void + sparse_categorical_crossentropy_loss_backward_with_mask(float *logit_grad, + int const *label, + coord_t num_samples, + coord_t num_classes, + int const k, + float *num) { + CUDA_KERNEL_LOOP(i, num_samples * num_classes) { + int sample_id = i / num_classes; + int label_idx = label[i / (k * num_classes)]; + if (label_idx != MASK_TOKEN && (i == sample_id * num_classes + label_idx)) { + logit_grad[i] -= 1.0f; + atomicAdd(&num[0], 1.0f); + } else if (label_idx == MASK_TOKEN) { + logit_grad[i] = 0.0f; + } + } +} + __global__ void categorical_crossentropy_loss_backward(float *logit_grad, float const *logit, float const *label, @@ -75,8 +95,14 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_ptr, logit_volume * sizeof(float), hipMemcpyDeviceToDevice)); - hipLaunchKernelGGL(sparse_categorical_crossentropy_loss_backward, - GET_BLOCKS(num_samples), + + assert(scale_factor == 1.0f); + float *num; + checkCUDA(hipMalloc(&num, sizeof(float))); + float effective_tokens; + int parallelism = num_samples * num_classes; + hipLaunchKernelGGL(sparse_categorical_crossentropy_loss_backward_with_mask, + GET_BLOCKS(parallelism), CUDA_NUM_THREADS, 0, stream, @@ -84,7 +110,10 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( label_ptr, num_samples, num_classes, - k); + k, + num); + + hipMemcpy(&effective_tokens, num, sizeof(float), hipMemcpyDeviceToHost); // Scale logit gradients by op->scale_factor hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), @@ -94,7 +123,7 
@@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, logit_grad_volume, 0, - scale_factor * k); + 1.0f / effective_tokens); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index 636ef9c4c3..3ae492e4a7 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -18,6 +18,7 @@ namespace FlexFlow { +int const MASK_TOKEN = -100; using namespace Legion; __global__ void @@ -32,6 +33,25 @@ __global__ void } } +__global__ void + sparse_categorical_crossentropy_loss_backward_with_mask(float *logit_grad, + int const *label, + coord_t num_samples, + coord_t num_classes, + int const k, + float *num) { + CUDA_KERNEL_LOOP(i, num_samples * num_classes) { + int sample_id = i / num_classes; + int label_idx = label[i / (k * num_classes)]; + if (label_idx != MASK_TOKEN && (i == sample_id * num_classes + label_idx)) { + logit_grad[i] -= 1.0f; + atomicAdd(&num[0], 1.0f); + } else if (label_idx == MASK_TOKEN) { + logit_grad[i] = 0.0f; + } + } +} + __global__ void categorical_crossentropy_loss_backward(float *logit_grad, float const *logit, float const *label, @@ -74,14 +94,25 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_ptr, logit_volume * sizeof(float), cudaMemcpyDeviceToDevice)); - sparse_categorical_crossentropy_loss_backward<<>>( - logit_grad_ptr, label_ptr, num_samples, num_classes, k); - // Scale logit gradients by op->scale_factor + // calculate the scale factor inside kernel; + assert(scale_factor == 1.0f); + float *num; + checkCUDA(cudaMalloc(&num, sizeof(float))); + float effective_tokens; + int parallelism = num_samples * num_classes; + // sparse_categorical_crossentropy_loss_backward<<>>( + // logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); + sparse_categorical_crossentropy_loss_backward_with_mask<<< + GET_BLOCKS(parallelism), + CUDA_NUM_THREADS, + 0, + stream>>>(logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); + cudaMemcpy(&effective_tokens, num, sizeof(float), cudaMemcpyDeviceToHost); scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor * k); + logit_grad_ptr, logit_grad_volume, 0.0f, 1.0f / effective_tokens); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( diff --git a/src/metrics_functions/metrics_functions.cc b/src/metrics_functions/metrics_functions.cc index e8ccbfe2e4..8c7e23ad8a 100644 --- a/src/metrics_functions/metrics_functions.cc +++ b/src/metrics_functions/metrics_functions.cc @@ -15,6 +15,7 @@ #include "flexflow/metrics_functions.h" #include "flexflow/model.h" +#include namespace FlexFlow { @@ -90,6 +91,8 @@ void Metrics::compute(FFModel *model, false /*must*/, 0 /*mapper_id*/, logit->machine_view.hash()); + // std::cout << "logit shape: " << logit->get_shape() << std::endl; + // std::cout << "label shape: " << label->get_shape() << std::endl; launcher.add_region_requirement(RegionRequirement( logit->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, logit->region)); launcher.add_field(0, FID_DATA); @@ -154,6 +157,7 @@ PerfMetrics assert(acc_label.rect.lo[0] == acc_label.rect.hi[0]); // Cannot measure categorical_crossentropy w/ sparse labels // Use measure_sparse_categorical_crossentropy instead + // std::cout << "num_classes: " << num_classes << std::endl; assert(!me->measure_categorical_crossentropy); Metrics::update_metrics_sparse_label_kernel_wrapper(acc_logit.ptr, acc_label.ptr, diff --git 
a/src/metrics_functions/metrics_functions.cpp b/src/metrics_functions/metrics_functions.cpp index 90d727b9b1..1c57bd6ba9 100644 --- a/src/metrics_functions/metrics_functions.cpp +++ b/src/metrics_functions/metrics_functions.cpp @@ -20,6 +20,7 @@ namespace FlexFlow { float const LOG_MIN_VALUE = 0.00000001f; +int const MASK_TOKEN = -100; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, @@ -30,7 +31,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = -1; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { @@ -39,14 +40,19 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, } } assert(my_label >= 0); - atomicAdd(&(perf->train_all), 1); - if (labels[b] == my_label) { - atomicAdd(&(perf->train_correct), 1); + if (labels[b] != MASK_TOKEN) { + atomicAdd(&(perf->train_all), 1); + if (labels[b] == my_label) { + atomicAdd(&(perf->train_correct), 1); + } } } if (metrics.measure_sparse_categorical_crossentropy) { - float my_logit = max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); - atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + if (labels[b] != MASK_TOKEN) { + float my_logit = + max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + } } if (metrics.measure_mean_squared_error || metrics.measure_root_mean_squared_error || diff --git a/src/metrics_functions/metrics_functions.cu b/src/metrics_functions/metrics_functions.cu index 2e037eb472..8c584c397c 100644 --- a/src/metrics_functions/metrics_functions.cu +++ b/src/metrics_functions/metrics_functions.cu @@ -19,6 +19,7 @@ namespace FlexFlow { float const LOG_MIN_VALUE = 0.00000001f; +int const MASK_TOKEN = -100; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, @@ -29,7 +30,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = -1; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { @@ -38,14 +39,19 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, } } assert(my_label >= 0); - atomicAdd(&(perf->train_all), 1); - if (labels[b] == my_label) { - atomicAdd(&(perf->train_correct), 1); + if (labels[b] != MASK_TOKEN) { + atomicAdd(&(perf->train_all), 1); + if (labels[b] == my_label) { + atomicAdd(&(perf->train_correct), 1); + } } } if (metrics.measure_sparse_categorical_crossentropy) { - float my_logit = max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); - atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + if (labels[b] != MASK_TOKEN) { + float my_logit = + max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + } } if (metrics.measure_mean_squared_error || metrics.measure_root_mean_squared_error || diff --git a/src/ops/cast.cc b/src/ops/cast.cc index 4a52bf874e..18e9045783 100644 --- a/src/ops/cast.cc +++ b/src/ops/cast.cc @@ -38,7 +38,7 @@ using Legion::Task; using Legion::TaskArgument; using Legion::TaskLauncher; -Tensor FFModel::cast(const Tensor input, DataType dtype, char const *name) { +Tensor FFModel::cast(Tensor const input, DataType dtype, char const *name) { Layer *cast 
= new Layer(this, OP_CAST, dtype, @@ -316,6 +316,7 @@ void Cast::forward_task(Task const *task, } void Cast::backward(FFModel const &ff) { + ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; diff --git a/src/ops/dropout.cc b/src/ops/dropout.cc index 190d6fd496..d060324de4 100644 --- a/src/ops/dropout.cc +++ b/src/ops/dropout.cc @@ -28,7 +28,7 @@ using PCG::Node; using namespace FlexFlow::Kernels::Dropout; -Tensor FFModel::dropout(const Tensor input, +Tensor FFModel::dropout(Tensor const input, float rate, unsigned long long seed, char const *name) { @@ -86,7 +86,7 @@ bool operator==(DropoutParams const &lhs, DropoutParams const &rhs) { } Dropout::Dropout(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, float _rate, unsigned long long _seed, char const *name) @@ -111,12 +111,12 @@ Dropout::Dropout(FFModel &model, Dropout::Dropout(FFModel &model, Dropout const &other, - const ParallelTensor input) + ParallelTensor const input) : Dropout(model, input, other.rate, other.seed, other.name) {} Dropout::Dropout(FFModel &model, DropoutParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) : Dropout(model, input, params.rate, params.seed, params.name) {} @@ -209,12 +209,12 @@ void Dropout::forward_task(Task const *task, assert(task->regions.size() == 2); // const Dropout* dropout = (const Dropout*) task->args; DropoutMeta *m = *((DropoutMeta **)task->local_args); - float const *input_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *output_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input_ptr, output_ptr); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + forward_kernel_wrapper(m, input, output); } void Dropout::backward(FFModel const &ff) { @@ -263,7 +263,12 @@ void Dropout::backward_task(Task const *task, float const *output_grad_ptr = helperGetTensorPointerRO( regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper(m, output_grad_ptr, input_grad_ptr); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + backward_kernel_wrapper(m, output_grad, input_grad); } void Dropout::serialize(Legion::Serializer &sez) const { @@ -310,30 +315,39 @@ bool Dropout::measure_operator_cost(Simulator *sim, sim->free_all(); float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); assert(input_ptr != NULL); + + GenericTensorAccessorR input_acc( + m->input_type[0], sub_input.get_domain(), input_ptr); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_ptr != NULL); + + GenericTensorAccessorW output_acc( + m->output_type[0], sub_input.get_domain(), output_ptr); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); assert(m->profiling == false); std::function forward, backward; - forward = [&] { forward_kernel_wrapper(m, input_ptr, 
output_ptr); }; + forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); }; if (sim->computationMode == COMP_MODE_TRAINING) { float *input_grad_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); assert(input_grad_ptr != NULL); + GenericTensorAccessorW input_grad_acc( + m->output_type[0], sub_input.get_domain(), input_grad_ptr); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_grad_ptr != NULL); + GenericTensorAccessorR output_grad_acc( + m->output_type[0], sub_input.get_domain(), output_grad_ptr); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - backward = [=] { - backward_kernel_wrapper(m, output_grad_ptr, input_grad_ptr); + backward = [&] { + backward_kernel_wrapper(m, output_grad_acc, input_grad_acc); }; } diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index cf8696182b..d14df410a1 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -24,7 +24,7 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::ElementBinary; -bool broadcastable(const Tensor t1, const Tensor t2) { +bool broadcastable(Tensor const t1, Tensor const t2) { int dim = std::min(t1->num_dims, t2->num_dims); for (int i = 0; i < dim; i++) { if ((t1->dims[i] != t2->dims[i]) && (t1->dims[i] > 1) && @@ -36,8 +36,8 @@ bool broadcastable(const Tensor t1, const Tensor t2) { } Tensor FFModel::binary(OperatorType op, - const Tensor in1, - const Tensor in2, + Tensor const in1, + Tensor const in2, bool inplace_a, char const *name) { Layer *ele = nullptr; @@ -125,43 +125,43 @@ Op *ElementBinary::create_operator_from_layer( layer->name); } -Tensor FFModel::add(const Tensor in1, - const Tensor in2, +Tensor FFModel::add(Tensor const in1, + Tensor const in2, bool inplace_a, char const *name) { return this->binary(OP_EW_ADD, in1, in2, inplace_a, name); } -Tensor FFModel::subtract(const Tensor in1, - const Tensor in2, +Tensor FFModel::subtract(Tensor const in1, + Tensor const in2, bool inplace_a, char const *name) { return this->binary(OP_EW_SUB, in1, in2, inplace_a, name); } -Tensor FFModel::multiply(const Tensor in1, - const Tensor in2, +Tensor FFModel::multiply(Tensor const in1, + Tensor const in2, bool inplace_a, char const *name) { return this->binary(OP_EW_MUL, in1, in2, inplace_a, name); } -Tensor FFModel::divide(const Tensor in1, - const Tensor in2, +Tensor FFModel::divide(Tensor const in1, + Tensor const in2, bool inplace_a, char const *name) { return this->binary(OP_EW_DIV, in1, in2, inplace_a, name); } -Tensor FFModel::max(const Tensor in1, - const Tensor in2, +Tensor FFModel::max(Tensor const in1, + Tensor const in2, bool inplace_a, char const *name) { return this->binary(OP_EW_MAX, in1, in2, inplace_a, name); } -Tensor FFModel::min(const Tensor in1, - const Tensor in2, +Tensor FFModel::min(Tensor const in1, + Tensor const in2, bool inplace_a, char const *name) { return this->binary(OP_EW_MIN, in1, in2, inplace_a, name); @@ -197,8 +197,8 @@ bool operator==(ElementBinaryParams const &lhs, ElementBinary::ElementBinary(FFModel &model, LayerID const &_layer_guid, OperatorType _op_type, - const ParallelTensor in1, - const ParallelTensor in2, + ParallelTensor const in1, + ParallelTensor const in2, bool _inplace_a, char const *name) : Op(model, @@ -239,6 +239,8 @@ ElementBinary::ElementBinary(FFModel &model, numdim, dims, in1->data_type, this); broadcast_input1 = (inputs[0]->get_volume() != 
outputs[0]->get_volume()); broadcast_input2 = (inputs[1]->get_volume() != outputs[0]->get_volume()); + + batch_size = dims[numdim - 2].size; } ElementBinary::ElementBinary( @@ -438,6 +440,8 @@ OpMeta *ElementBinary::init_task(Task const *task, m->has_same_operands = eb->has_same_operands; m->broadcast_input1 = eb->broadcast_input1; m->broadcast_input2 = eb->broadcast_input2; + m->batch_size = eb->batch_size; + std::strcpy(m->op_name, eb->name); m->layer_guid = eb->layer_guid; Domain input1_domain = runtime->get_index_space_domain( @@ -470,6 +474,10 @@ OpMeta *ElementBinary::init_task(Task const *task, } else { output_domain = input1_domain; } + m->replicate_size = m->broadcast_input1 + ? (input1_domain.get_volume() / m->batch_size) + : (input2_domain.get_volume() / m->batch_size); + assert(task->regions.size() == regions.size()); assert(regions.size() == num_regions); init_kernel(m, input1_domain, input2_domain, output_domain); @@ -483,7 +491,7 @@ void ElementBinary::forward(FFModel const &ff) { set_argumentmap_for_forward(ff, argmap); IndexLauncher launcher(ELEMENTBINARY_FWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(ElementBinary)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -740,7 +748,7 @@ __host__ void std::vector const ®ions, Context ctx, Runtime *runtime) { - // const ElementBinary* ele = (const ElementBinary*) task->args; + ElementBinary const *ele = (ElementBinary const *)task->args; ElementBinaryMeta const *m = *((ElementBinaryMeta **)task->local_args); GenericTensorAccessorR in1, in2; GenericTensorAccessorW out; @@ -1121,6 +1129,28 @@ bool ElementBinary::measure_operator_cost(Simulator *sim, delete m; return true; } +// void ElementBinary::serialize(Legion::Serializer &sez) const { +// sez.serialize(this->op_type); +// sez.serialize(this->inplace_a); +// } + +using PCG::Node; +/*static*/ +// Node ElementBinary::deserialize(FFModel &ff, +// Legion::Deserializer &dez, +// ParallelTensor inputs[], +// int num_inputs) { +// assert(num_inputs == 2); +// OperatorType op_type; +// bool inplace_a; +// dez.deserialize(op_type); +// dez.deserialize(inplace_a); +// ElementBinaryParams params; +// params.type = op_type; +// params.inplace_a = inplace_a; +// return ff.get_or_create_node({inputs[0], inputs[1]}, +// params); +// } void ElementBinary::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 09cf13c717..1b8ba3a657 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -131,7 +131,8 @@ Tensor FFModel::tanh(const Tensor x, char const *name) { } Tensor FFModel::identity(const Tensor x, char const *name) { - return this->unary(OP_IDENTITY, x, false /*inplace*/, name); + // return this->unary(OP_IDENTITY, x, false /*inplace*/, name); + return x; } Tensor FFModel::gelu(const Tensor x, char const *name) { diff --git a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index 435abdfe11..75f8e11580 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -54,7 +54,7 @@ void ElementUnary::init_kernel(ElementUnaryMeta *m, template __global__ void elewise_unary_forward_kernel( - coord_t volume, const T scalar, OperatorType type, T const *in, T *out) { + coord_t volume, T const scalar, OperatorType type, T const *in, T *out) { CUDA_KERNEL_LOOP(i, volume) { switch (type) { case OP_EXP: { @@ -156,7 +156,7 @@ void ElementUnary::forward_kernel_wrapper(ElementUnaryMeta const *m, template __global__ void 
elewise_unary_backward_kernel(coord_t volume, - const T scalar, + T const scalar, OperatorType type, T const *output, T const *output_grad, diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index 15e6852388..c978a55ddb 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -54,7 +54,7 @@ void ElementUnary::init_kernel(ElementUnaryMeta *m, template __global__ void elewise_unary_forward_kernel( - coord_t volume, const T scalar, OperatorType type, T const *in, T *out) { + coord_t volume, T const scalar, OperatorType type, T const *in, T *out) { CUDA_KERNEL_LOOP(i, volume) { switch (type) { case OP_EXP: { @@ -169,7 +169,7 @@ void ElementUnary::forward_kernel_wrapper(ElementUnaryMeta const *m, template __global__ void elewise_unary_backward_kernel(coord_t volume, - const T scalar, + T const scalar, OperatorType type, T const *output, T const *output_grad, diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 95b538bdb6..1063217260 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -39,7 +39,7 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Embedding; -Tensor FFModel::embedding(const Tensor input, +Tensor FFModel::embedding(Tensor const input, int num_entries, int out_dim, AggrMode aggr, @@ -247,7 +247,7 @@ Embedding::Embedding(FFModel &model, Embedding::Embedding(FFModel &model, Embedding const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights) : Embedding(model, other.layer_guid, @@ -261,7 +261,7 @@ Embedding::Embedding(FFModel &model, Embedding::Embedding(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _num_entries, int _out_channels, AggrMode _aggr, @@ -313,7 +313,6 @@ Embedding::Embedding(FFModel &model, outputs[0] = model.create_parallel_tensor_legion_ordering( output_ndim, output_dims, dtype, this); - assert(check_output_input_weight_parallel_dims(allocate_weights)); } @@ -940,7 +939,7 @@ void EmbeddingLookup_int64_t_float_float__avx2_fma(int const block_size, bool normalize_by_lengths, float *out) { #ifdef FF_USE_AVX2 - const int64_t prefdist_T0 = 16; + int64_t const prefdist_T0 = 16; if (block_size == 128) { // unrolling 16 times int64_t dataInd = 0; @@ -964,17 +963,17 @@ void EmbeddingLookup_int64_t_float_float__avx2_fma(int const block_size, __m256 vop120 = _mm256_setzero_ps(); for (int64_t start = dataInd; dataInd < start + lengths[rangeIndex]; ++dataInd) { - const int64_t idx = indices[dataInd]; + int64_t const idx = indices[dataInd]; float wgt = 1.f; if (weight) { wgt = weight[dataInd]; } __m256 vwgt = _mm256_set1_ps(wgt); float const *ip = &input[idx * block_size]; - const int64_t next_T0 = (dataInd < index_size - prefdist_T0) + int64_t const next_T0 = (dataInd < index_size - prefdist_T0) ? (dataInd + prefdist_T0) : dataInd; - const int64_t idx_pref_T0 = indices[next_T0]; + int64_t const idx_pref_T0 = indices[next_T0]; assert(idx >= 0 && idx_pref_T0 >= 0 && idx < data_size && idx_pref_T0 < data_size); float const *ip_next_T0 = &input[idx_pref_T0 * block_size]; @@ -1050,10 +1049,10 @@ void EmbeddingLookup_int64_t_float_float__avx2_fma(int const block_size, } __m256 vwgt = _mm256_set1_ps(wgt); float const *ip = &input[idx * block_size]; - const int64_t next_T0 = (dataInd < index_size - prefdist_T0) + int64_t const next_T0 = (dataInd < index_size - prefdist_T0) ? 
(dataInd + prefdist_T0) : dataInd; - const int64_t idx_pref_T0 = indices[next_T0]; + int64_t const idx_pref_T0 = indices[next_T0]; assert(idx >= 0 && idx_pref_T0 >= 0 && idx < data_size && idx_pref_T0 < data_size); float const *ip_next_T0 = &input[idx_pref_T0 * block_size]; @@ -1094,17 +1093,17 @@ else { } for (int64_t start = dataInd; dataInd < start + lengths[rangeIndex]; ++dataInd) { - const int64_t idx = indices[dataInd]; + int64_t const idx = indices[dataInd]; float wgt = 1.f; if (weight) { wgt = weight[dataInd]; } __m256 vwgt = _mm256_set1_ps(wgt); float const *ip = &input[idx * block_size]; - const int64_t next_T0 = (dataInd < index_size - prefdist_T0) + int64_t const next_T0 = (dataInd < index_size - prefdist_T0) ? (dataInd + prefdist_T0) : dataInd; - const int64_t idx_pref_T0 = indices[next_T0]; + int64_t const idx_pref_T0 = indices[next_T0]; assert(idx >= 0 && idx_pref_T0 >= 0 && idx < data_size && idx_pref_T0 < data_size); float const *ip_next_T0 = &input[idx_pref_T0 * block_size]; diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 121139beb1..de13854898 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -232,7 +232,9 @@ bool FusedOp::add_operator( assert(!op->is_parallel_op() || op->op_type == OP_ALLREDUCE || op->op_type == OP_PARALLEL_IDENTITY); // Currently don't consider nested fusion - assert(op->op_type != OP_FUSED); + if (op->op_type == OP_FUSED) { + return false; + } MachineView my_view = outputs[0]->machine_view; MachineView op_view = op->outputs[0]->machine_view; if (my_view == op_view) { diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 9f826cd611..540bda18b5 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -24,6 +24,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" +#include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/ops/kernels/dropout_kernels.h" @@ -39,6 +40,7 @@ #include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" #include "flexflow/ops/layer_norm.h" +#include "flexflow/ops/linear.h" #include "flexflow/ops/residual_layer_norm.h" #include "flexflow/ops/sigmoid_silu_multi.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" @@ -1344,9 +1346,7 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + m, my_input_accessor[0], my_output_accessor[0]); break; } case OP_LINEAR: { @@ -1427,8 +1427,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::forward_kernel_wrapper(m, my_input_accessor[0], @@ -1489,7 +1489,6 @@ __host__ void FusedOp::forward_task(Task const *task, assert(effective_batch_size * in_dim == my_input_accessor[0].domain.get_volume()); } - 
assert(my_input_accessor[0].data_type == DT_INT32 || my_input_accessor[0].data_type == DT_INT64); Kernels::Embedding::forward_kernel_wrapper(m, @@ -1505,7 +1504,11 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -1548,7 +1551,19 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - Kernels::Softmax::forward_kernel_wrapper( + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); + } else { + assert(false); + } + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::forward_kernel_wrapper( m, my_input_accessor[0], my_output_accessor[0]); break; } @@ -1558,10 +1573,24 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (my_input_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int64_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int32_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type"); + } break; } case OP_TRANSPOSE: { @@ -1597,6 +1626,39 @@ __host__ void FusedOp::forward_task(Task const *task, m, my_input_accessor[0], my_output_accessor[0], gamma, beta); break; } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_output_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_RESIDUAL_LAYERNORM: { + assert(false && "Operator ResidualLayerNorm does not support " + "the forward() task"); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(false && "Operator AddBiasResidualLayerNorm does not support " + "the forward() task"); + break; + } + case 
OP_RESIDUAL_LAYERNORM: { assert(false && "Operator ResidualLayerNorm does not support " "the forward() task"); @@ -1981,7 +2043,11 @@ __host__ void FusedOp::backward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -2000,7 +2066,8 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == + // my_output_accessor[0].domain); Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; Kernels::Pool2D::backward_kernel_wrapper( m, @@ -2028,10 +2095,51 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_grad_accessor[0].domain.get_volume() == my_output_grad_accessor[0].domain.get_volume()); - Kernels::Reshape::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); + if (my_input_grad_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_int64_ptr(), + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].get_int32_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::backward_kernel_wrapper( + m, my_input_grad_accessor[0], my_output_grad_accessor[0]); break; } case OP_TRANSPOSE: { @@ -2049,6 +2157,46 @@ __host__ void FusedOp::backward_task(Task const *task, my_output_grad_accessor[0].domain); break; } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + 
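// Illustrative sketch, not part of this patch: in the OP_RESHAPE backward branch
// above, the DT_INT32 case calls Kernels::Reshape::forward_kernel_wrapper while
// the DT_INT64 and DT_FLOAT cases call the backward wrapper; if that asymmetry is
// unintentional, a single type dispatch keeps both paths uniform. Everything
// below (DataType, Accessor, dispatch_reshape_backward) is a simplified stand-in
// written for this example, not the FlexFlow API; whether gradients are copied or
// accumulated should follow the framework's convention.
#include <cassert>
#include <cstddef>
#include <cstdint>

enum class DataType { INT32, INT64, FLOAT };

struct Accessor {      // minimal stand-in for a tensor accessor
  DataType data_type;
  void *ptr;
  size_t volume;
};

template <typename T>
void reshape_backward(T *input_grad, T const *output_grad, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    input_grad[i] += output_grad[i]; // reshape mixes no elements: pass grads through
  }
}

void dispatch_reshape_backward(Accessor const &in_grad, Accessor const &out_grad) {
  assert(in_grad.volume == out_grad.volume);
  assert(in_grad.data_type == out_grad.data_type);
  switch (in_grad.data_type) {
    case DataType::INT64:
      reshape_backward(static_cast<int64_t *>(in_grad.ptr),
                       static_cast<int64_t const *>(out_grad.ptr), in_grad.volume);
      break;
    case DataType::INT32:
      reshape_backward(static_cast<int32_t *>(in_grad.ptr),
                       static_cast<int32_t const *>(out_grad.ptr), in_grad.volume);
      break;
    case DataType::FLOAT:
      reshape_backward(static_cast<float *>(in_grad.ptr),
                       static_cast<float const *>(out_grad.ptr), in_grad.volume);
      break;
  }
}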
LayerNorm::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + gamma.get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } default: assert(false && "Fusion currently does not support type"); } @@ -2058,13 +2206,17 @@ __host__ void FusedOp::backward_task(Task const *task, assert(ooff == 0); // for (int i = 0; i < fused->numWeights; i++) // print_tensor(weight_grad_ptr[i], - // weight_grad_domain[i].get_volume(), "[Fused:backward:weight_grad]"); + // weight_grad_domain[i].get_volume(), + // "[Fused:backward:weight_grad]"); // for (int i = 0; i < fused->numInputs; i++) - // print_tensor(input_grad_ptr[i], input_grad_domain[i].get_volume(), + // print_tensor(input_grad_ptr[i], + // input_grad_domain[i].get_volume(), // "[Fused:backward:input_grad]"); // for (int i = 0; i < fused->numOutputs; i++) // print_tensor(output_grad_ptr[i], - // output_grad_domain[i].get_volume(), "[Fused:backward:output_grad]"); + // output_grad_domain[i].get_volume(), + // "[Fused:backward:output_grad]"); +} } -}; // namespace FlexFlow +; // namespace FlexFlow diff --git a/src/ops/fused.cu b/src/ops/fused.cu index cab28181da..8871faf6f7 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -24,6 +24,7 @@ #include "flexflow/ops/fused.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" +#include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/ops/kernels/dropout_kernels.h" @@ -1367,9 +1368,7 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + m, my_input_accessor[0], my_output_accessor[0]); break; } case OP_LINEAR: { @@ -1450,8 +1449,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::forward_kernel_wrapper(m, my_input_accessor[0], @@ -1528,7 +1527,11 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - 
case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -1564,6 +1567,34 @@ __host__ void FusedOp::forward_task(Task const *task, my_input_accessor[0].domain.get_volume()); break; } + case OP_RESHAPE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + assert(my_input_accessor[0].data_type == + my_output_accessor[0].data_type); + if (my_input_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int64_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int32_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type"); + } + break; + } case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); @@ -1573,18 +1604,22 @@ __host__ void FusedOp::forward_task(Task const *task, SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; Kernels::Softmax::forward_kernel_wrapper( m, my_input_accessor[0], my_output_accessor[0]); + // if (my_input_accessor[0].data_type == DT_FLOAT) { + // Kernels::Softmax::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr()); + // } else { + // assert(false); + // } break; } - case OP_RESHAPE: { + case OP_ALLREDUCE: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); break; } case OP_TRANSPOSE: { @@ -1594,6 +1629,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + assert(my_input_accessor[0].data_type == + my_output_accessor[0].data_type); Kernels::Transpose::forward_kernel_wrapper( m, my_input_accessor[0].get_float_ptr(), @@ -1606,6 +1643,7 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); if (m->elementwise_affine) { assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); } @@ -1620,6 +1658,29 @@ __host__ void FusedOp::forward_task(Task const *task, m, my_input_accessor[0], my_output_accessor[0], gamma, beta); break; } 
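// Illustrative sketch, not part of this patch: the OP_CAST forward branch added
// above enumerates the int32->int64 and int32->float pairs explicitly. A
// templated element-wise cast kernel expresses the same operation for any pair
// of types; cast_forward and cast_i32_to_f32 are hypothetical names used only
// here, not the FlexFlow Kernels::Cast API.
#include <cstdint>
#include <cuda_runtime.h>

template <typename IDT, typename ODT>
__global__ void cast_forward(IDT const *input, ODT *output, size_t volume) {
  size_t i = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
  if (i < volume) {
    output[i] = static_cast<ODT>(input[i]); // plain element-wise conversion
  }
}

// Example launch for the int32 -> float pair handled above.
void cast_i32_to_f32(int32_t const *in, float *out, size_t volume,
                     cudaStream_t stream) {
  constexpr int threads = 256;
  int const blocks = static_cast<int>((volume + threads - 1) / threads);
  cast_forward<int32_t, float><<<blocks, threads, 0, stream>>>(in, out, volume);
}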
+ case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_output_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_RESIDUAL_LAYERNORM: { assert(false && "Operator ResidualLayerNorm does not support " "the forward() task"); @@ -1655,7 +1716,6 @@ __host__ void FusedOp::forward_task(Task const *task, // print_tensor(output_ptr[i], output_domain[i].get_volume(), // "[Fused:forward:output]"); } - /* regions[...](I): input regions[...](I): weight @@ -1908,9 +1968,7 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; Kernels::Dropout::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr()); + m, my_output_grad_accessor[0], my_input_grad_accessor[0]); break; } case OP_EW_ADD: @@ -1922,8 +1980,9 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + // assert(my_input_accessor[0].domain == + // my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::backward_kernel_wrapper( m, @@ -1939,7 +1998,8 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; - assert(my_input_accessor[0].data_type == DT_INT64); + assert(my_input_accessor[0].data_type == DT_INT64 || + my_input_accessor[0].data_type == DT_INT32); int in_dim, out_dim, effective_batch_size; if (m->aggr == AGGR_MODE_NONE) { in_dim = 1; @@ -2004,7 +2064,11 @@ __host__ void FusedOp::backward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -2023,7 +2087,8 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == + // my_output_accessor[0].domain); Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; Kernels::Pool2D::backward_kernel_wrapper( m, @@ -2051,10 +2116,51 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_grad_accessor[0].domain.get_volume() == my_output_grad_accessor[0].domain.get_volume()); - 
Kernels::Reshape::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); + if (my_input_grad_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_int64_ptr(), + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].get_int32_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::backward_kernel_wrapper( + m, + my_input_grad_accessor[0], + my_output_grad_accessor[0], + my_output_accessor[0], + my_input_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::backward_kernel_wrapper( + m, my_input_grad_accessor[0], my_output_grad_accessor[0]); break; } case OP_TRANSPOSE: { @@ -2072,6 +2178,46 @@ __host__ void FusedOp::backward_task(Task const *task, my_output_grad_accessor[0].domain); break; } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::backward_kernel_wrapper(m, + my_output_grad_accessor[0], + my_input_accessor[0], + my_input_grad_accessor[0], + gamma, + my_weight_grad_accessor[0], + my_weight_grad_accessor[1]); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + + break; + } default: assert(false && "Fusion currently does not support type"); } @@ -2083,11 +2229,11 @@ __host__ void FusedOp::backward_task(Task const *task, // print_tensor(weight_grad_ptr[i], // weight_grad_domain[i].get_volume(), "[Fused:backward:weight_grad]"); // for (int i = 
0; i < fused->numInputs; i++) - // print_tensor(input_grad_ptr[i], input_grad_domain[i].get_volume(), + // print_tensor(input_grad_ptr[i], + // input_grad_domain[i].get_volume(), // "[Fused:backward:input_grad]"); // for (int i = 0; i < fused->numOutputs; i++) // print_tensor(output_grad_ptr[i], // output_grad_domain[i].get_volume(), "[Fused:backward:output_grad]"); } - }; // namespace FlexFlow diff --git a/src/ops/kernels/dropout_kernels.cpp b/src/ops/kernels/dropout_kernels.cpp index c8b1887fd4..96cc246956 100644 --- a/src/ops/kernels/dropout_kernels.cpp +++ b/src/ops/kernels/dropout_kernels.cpp @@ -30,6 +30,10 @@ DropoutMeta::DropoutMeta(FFHandler handler, Domain const &output_domain) : OpMeta(handler, dropout) { profiling = dropout->profiling; + rate = dropout->rate; + seed = dropout->seed; + input_type[0] = dropout->data_type; + output_type[0] = dropout->data_type; inference_debugging = dropout->inference_debugging; checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); @@ -79,20 +83,68 @@ DropoutMeta::~DropoutMeta(void) { namespace Kernels { namespace Dropout { +__global__ void dropout_forward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, i, 0, &state); + float rand = hiprand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + +__global__ void dropout_backward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, i, 0, &state); + float rand = hiprand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + void forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + + Internal::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + input.domain.get_volume(), + stream); + + // printf("dropout %d\n", input.domain.get_volume()); + // assert(false); } void backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::backward_kernel(m, output_grad_ptr, input_grad_ptr, stream); + Internal::backward_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); } namespace Internal { @@ -100,35 +152,58 @@ namespace Internal { void forward_kernel(DropoutMeta *m, float const *input_ptr, float *output_ptr, + size_t num_elements, hipStream_t stream) { checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + int parallelism = num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(dropout_forward_kernel), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->seed, + m->rate, + num_elements, + input_ptr, + output_ptr); - checkCUDNN(miopenDropoutForward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor /* not used */, - 
m->inputTensor, - input_ptr, - m->outputTensor, - output_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + // checkCUDNN(miopenDropoutForward(m->handle.dnn, + // m->dropoutDesc, + // m->inputTensor /* not used */, + // m->inputTensor, + // input_ptr, + // m->outputTensor, + // output_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } void backward_kernel(DropoutMeta *m, float const *output_grad_ptr, float *input_grad_ptr, + size_t num_elements, hipStream_t stream) { checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - - checkCUDNN(miopenDropoutBackward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor /* not used */, - m->outputTensor, - output_grad_ptr, - m->inputTensor, - input_grad_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + int parallelism = num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(dropout_backward_kernel), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->seed, + m->rate, + num_elements, + output_grad_ptr, + input_grad_ptr); + // checkCUDNN(miopenDropoutBackward(m->handle.dnn, + // m->dropoutDesc, + // m->inputTensor /* not used */, + // m->outputTensor, + // output_grad_ptr, + // m->inputTensor, + // input_grad_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } } // namespace Internal diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index d65b951f51..176afdf90b 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -29,6 +29,10 @@ DropoutMeta::DropoutMeta(FFHandler handler, Domain const &output_domain) : OpMeta(handler, dropout) { profiling = dropout->profiling; + rate = dropout->rate; + seed = dropout->seed; + input_type[0] = dropout->data_type; + output_type[0] = dropout->data_type; inference_debugging = dropout->inference_debugging; checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); @@ -75,20 +79,97 @@ DropoutMeta::~DropoutMeta(void) { namespace Kernels { namespace Dropout { +__global__ void dropout_forward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + curandStatePhilox4_32_10_t state; + curand_init(seed, i, 0, &state); + float rand = curand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + +__global__ void dropout_backward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + curandStatePhilox4_32_10_t state; + curand_init(seed, i, 0, &state); + float rand = curand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + void forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + Internal::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + input.domain.get_volume(), + stream); + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + 
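// Illustrative sketch, not part of this patch: in the custom dropout kernels
// above, the launch passes (m->seed, m->rate, ...) into kernels declared as
// (float p, long long seed, ...), the keep/drop test compares input_ptr[i]
// (or the gradient) rather than the generated random number against p, and the
// scale is 1/p rather than 1/(1-p). If standard inverted dropout is the intent,
// the forward pass would look like the sketch below; dropout_fwd is a
// hypothetical name. The backward pass must regenerate the same mask, which the
// per-element Philox init with a shared seed provides, as long as both passes
// test the random number rather than the tensor values.
#include <cuda_runtime.h>
#include <curand_kernel.h>

__global__ void dropout_fwd(float rate,              // probability of dropping an element
                            unsigned long long seed, // shared by forward and backward
                            size_t num_elements,
                            float const *input,
                            float *output) {
  size_t i = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
  if (i >= num_elements) {
    return;
  }
  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, 0, &state);          // reproducible per-element stream
  float const r = curand_uniform(&state);   // uniform in (0, 1]
  float const scale = 1.0f / (1.0f - rate); // inverted dropout keeps E[output] = input
  output[i] = (r < rate) ? 0.0f : input[i] * scale;
}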
checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf(" [dropout] forward time = %.2lfms\n", elapsed); + } } void backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::backward_kernel(m, output_grad_ptr, input_grad_ptr, stream); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + Internal::backward_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf(" [dropout] backward time = %.2lfms\n", elapsed); + } } namespace Internal { @@ -96,33 +177,48 @@ namespace Internal { void forward_kernel(DropoutMeta *m, float const *input_ptr, float *output_ptr, + size_t num_elements, cudaStream_t stream) { checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - checkCUDNN(cudnnDropoutForward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor, - input_ptr, - m->outputTensor, - output_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + int parallelism = num_elements; + dropout_forward_kernel<<>>( + m->seed, m->rate, num_elements, input_ptr, output_ptr); + + // checkCUDNN(cudnnDropoutForward(m->handle.dnn, + // m->dropoutDesc, + // m->inputTensor, + // input_ptr, + // m->outputTensor, + // output_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } void backward_kernel(DropoutMeta *m, float const *output_grad_ptr, float *input_grad_ptr, + size_t num_elements, cudaStream_t stream) { checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + int parallelism = num_elements; + dropout_backward_kernel<<>>( + m->seed, m->rate, num_elements, output_grad_ptr, input_grad_ptr); - checkCUDNN(cudnnDropoutBackward(m->handle.dnn, - m->dropoutDesc, - m->outputTensor, - output_grad_ptr, - m->inputTensor, - input_grad_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + // checkCUDNN(cudnnDropoutBackward(m->handle.dnn, + // m->dropoutDesc, + // m->outputTensor, + // output_grad_ptr, + // m->inputTensor, + // input_grad_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } } // namespace Internal diff --git a/src/ops/kernels/element_binary_kernels.cpp b/src/ops/kernels/element_binary_kernels.cpp index a65372de85..8ca4d35f54 100644 --- a/src/ops/kernels/element_binary_kernels.cpp +++ b/src/ops/kernels/element_binary_kernels.cpp @@ -73,13 +73,13 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, GenericTensorAccessorW const &out) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } + // print_tensor(in1_ptr, in1_domain.get_volume(), "input1:"); // print_tensor(in2_ptr, in2_domain.get_volume(), "input2:"); Internal::forward_kernel( @@ -201,6 +201,21 @@ __global__ void elewise_binary_forward_kernel(coord_t volume, } } +// for simplicity, assume the replicate dimension is the batchsize +__global__ void + elewise_binary_forward_kernel_broadcast2(float const *in1_ptr, + float const 
*in2_ptr, + float *output_ptr, + size_t volume, + size_t batch_size, + size_t replicate_size) { + CUDA_KERNEL_LOOP(i, volume) { + size_t batch = i / replicate_size; + output_ptr[i] = + in1_ptr[i] + in2_ptr[batch * replicate_size + i % replicate_size]; + } +} + __global__ void elewise_binary_backward_kernel(coord_t volume, float const alpha, float const beta, @@ -248,7 +263,6 @@ void forward_kernel(ElementBinaryMeta const *m, hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - float alpha1 = 1.0f, alpha2 = 1.0f, beta = 0.0f; switch (m->op_type) { case OP_EW_SUB: @@ -287,6 +301,19 @@ void forward_kernel(ElementBinaryMeta const *m, &alpha1, m->outputTensor, out_ptr)); + } else if (m->op_type == OP_EW_ADD && m->broadcast_input2) { + int parallelism = m->batch_size * m->replicate_size; + hipLaunchKernelGGL(elewise_binary_forward_kernel_broadcast2, + GET_BLOCKS(parallelism), + CUDA_NUM_THREADS, + 0, + stream, + in1_ptr, + in2_ptr, + out_ptr, + m->batch_size * m->replicate_size, + m->batch_size, + m->replicate_size); } else { checkCUDNN(miopenOpTensor(m->handle.dnn, m->opDesc, diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index fa31c5adff..ca95f8dade 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -33,6 +33,7 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( outputTensor, input_domain, softmax->data_type)); dim = softmax->dim; + last_layer = softmax->last_layer; profiling = softmax->profiling; inference_debugging = softmax->inference_debugging; std::strcpy(op_name, softmax->name); @@ -78,7 +79,9 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &outputs, + size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -93,12 +96,14 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, Internal::backward_kernel(m, input_grad.get_float_ptr(), output_grad.get_float_ptr(), + outputs.get_float_ptr(), output_grad.domain.get_volume(), stream); } else if (m->output_type[0] == DT_HALF) { Internal::backward_kernel(m, input_grad.get_half_ptr(), output_grad.get_half_ptr(), + outputs.get_half_ptr(), output_grad.domain.get_volume(), stream); } else { @@ -250,124 +255,142 @@ template void backward_kernel(SoftmaxMeta const *m, DT *input_grad_ptr, DT const *output_grad_ptr, + DT const *output_ptr, size_t num_elements, hipStream_t stream) { - checkCUDA(hipMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(DT), - hipMemcpyDeviceToDevice, - stream)); -} + if (m->last_layer) { + checkCUDA(hipMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(float), + hipMemcpyDeviceToDevice, + stream)); + } else { + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(miopenSoftmaxBackward_V2(m->handle.dnn, + &alpha, + m->inputTensor, + output_ptr, + m->inputTensor, + output_grad_ptr, + &beta, + m->inputTensor, + input_grad_ptr, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + } -template -void inference_kernel(SoftmaxMeta const *m, - BatchConfig const *bc, - DT const *input_ptr, - DT *output_ptr, - int num_classes, - hipStream_t stream) { - checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + template + void inference_kernel(SoftmaxMeta const *m, + BatchConfig const 
*bc, + DT const *input_ptr, + DT *output_ptr, + int num_classes, + hipStream_t stream) { + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - float alpha = 1.0f, beta = 0.0f; - miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - checkCUDNN(miopenSet4dTensorDescriptor(m->outputTensor, - cudnn_data_type, - bc->num_active_tokens(), - num_classes, - 1, - 1)); - checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, - &alpha, - m->outputTensor, - input_ptr, - &beta, - m->outputTensor, - output_ptr, - MIOPEN_SOFTMAX_ACCURATE, - MIOPEN_SOFTMAX_MODE_CHANNEL)); -} + float alpha = 1.0f, beta = 0.0f; + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + checkCUDNN(miopenSet4dTensorDescriptor(m->outputTensor, + cudnn_data_type, + bc->num_active_tokens(), + num_classes, + 1, + 1)); + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &alpha, + m->outputTensor, + input_ptr, + &beta, + m->outputTensor, + output_ptr, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + } -template -__global__ void sparse_categorical_crossentropy_loss_peft_backward( - DT *input_grad, - DT const *output_grad, - BatchConfig::TokenId const *token_ids, - int num_tokens, - int num_classes) { - CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { - int class_idx = i % num_classes; - int token_idx = i / num_classes; - input_grad[i] = output_grad[i]; - if (class_idx == token_ids[token_idx]) { - input_grad[i] = input_grad[i] - (DT)1.0f; + template + __global__ void sparse_categorical_crossentropy_loss_peft_backward( + DT * input_grad, + DT const *output_grad, + BatchConfig::TokenId const *token_ids, + int num_tokens, + int num_classes) { + CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { + int class_idx = i % num_classes; + int token_idx = i / num_classes; + input_grad[i] = output_grad[i]; + if (class_idx == token_ids[token_idx]) { + input_grad[i] = input_grad[i] - (DT)1.0f; + } } } -} -template -void peft_bwd_kernel(SoftmaxMeta const *m, - BatchConfig const *bc, - DT *input_grad_ptr, - DT const *output_grad_ptr, - int num_classes, - hipStream_t stream) { - BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; - int tokens_previous_requests = 0; - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - // Skip non-PEFT requests - if (!bc->requestsInfo[i].peft_bwd) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; - continue; - } - int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; - // shift labels by 1 position to the left (ignore first token label) - for (int j = 0; j < num_bwd_tokens; j++) { - token_ids[j] = bc->tokensInfo[j + tokens_previous_requests + 1].token_id; - } + template + void peft_bwd_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int num_classes, + hipStream_t stream) { + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (!bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = + bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } - DT scale_factor = 
1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); - // ignore last token - checkCUDA(hipMemsetAsync(input_grad_ptr + - (tokens_previous_requests + - bc->requestsInfo[i].num_tokens_in_batch - 1) * - num_classes, - 0, - num_classes * sizeof(DT), - stream)); - checkCUDA(hipMemcpyAsync(m->handle.workSpace, - token_ids, - sizeof(BatchConfig::TokenId) * num_bwd_tokens, - hipMemcpyHostToDevice, - stream)); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sparse_categorical_crossentropy_loss_peft_backward
), - GET_BLOCKS(num_bwd_tokens * num_classes), - CUDA_NUM_THREADS, - 0, - stream, - input_grad_ptr + tokens_previous_requests * num_classes, - output_grad_ptr + tokens_previous_requests * num_classes, - static_cast(m->handle.workSpace), - num_bwd_tokens, - num_classes); - // scale - hipLaunchKernelGGL(HIP_KERNEL_NAME(scale_kernel
), - GET_BLOCKS(num_bwd_tokens * num_classes), - CUDA_NUM_THREADS, - 0, - stream, - input_grad_ptr + tokens_previous_requests * num_classes, - num_bwd_tokens * num_classes, - DT(0.0), - scale_factor); + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); + // ignore last token + checkCUDA(hipMemsetAsync( + input_grad_ptr + (tokens_previous_requests + + bc->requestsInfo[i].num_tokens_in_batch - 1) * + num_classes, + 0, + num_classes * sizeof(DT), + stream)); + checkCUDA(hipMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + hipMemcpyHostToDevice, + stream)); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + sparse_categorical_crossentropy_loss_peft_backward
), + GET_BLOCKS(num_bwd_tokens * num_classes), + CUDA_NUM_THREADS, + 0, + stream, + input_grad_ptr + tokens_previous_requests * num_classes, + output_grad_ptr + tokens_previous_requests * num_classes, + static_cast(m->handle.workSpace), + num_bwd_tokens, + num_classes); + // scale + hipLaunchKernelGGL(HIP_KERNEL_NAME(scale_kernel
), + GET_BLOCKS(num_bwd_tokens * num_classes), + CUDA_NUM_THREADS, + 0, + stream, + input_grad_ptr + + tokens_previous_requests * num_classes, + num_bwd_tokens * num_classes, + DT(0.0), + scale_factor); - tokens_previous_requests += num_bwd_tokens + 1; + tokens_previous_requests += num_bwd_tokens + 1; + } + assert(tokens_previous_requests == bc->num_active_tokens()); } - assert(tokens_previous_requests == bc->num_active_tokens()); -} } // namespace Internal } // namespace Softmax diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 16f1219bf6..27e2249978 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -32,6 +32,7 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( outputTensor, input_domain, softmax->data_type)); dim = softmax->dim; + last_layer = softmax->last_layer; profiling = softmax->profiling; inference_debugging = softmax->inference_debugging; std::strcpy(op_name, softmax->name); @@ -77,7 +78,9 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &outputs, + size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -92,12 +95,14 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, Internal::backward_kernel(m, input_grad.get_float_ptr(), output_grad.get_float_ptr(), + outputs.get_float_ptr(), output_grad.domain.get_volume(), stream); } else if (m->output_type[0] == DT_HALF) { Internal::backward_kernel(m, input_grad.get_half_ptr(), output_grad.get_half_ptr(), + outputs.get_half_ptr(), output_grad.domain.get_volume(), stream); } else { @@ -249,13 +254,30 @@ template void backward_kernel(SoftmaxMeta const *m, DT *input_grad_ptr, DT const *output_grad_ptr, + DT const *output_ptr, size_t num_elements, cudaStream_t stream) { - checkCUDA(cudaMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(DT), - cudaMemcpyDeviceToDevice, - stream)); + + if (m->last_layer) { + checkCUDA(cudaMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); + } else { + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->inputTensor, + output_ptr, + m->inputTensor, + output_grad_ptr, + &beta, + m->inputTensor, + input_grad_ptr)); + } } template diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 3161987d60..bf66504fbe 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -63,7 +63,7 @@ LayerNormParams LayerNorm::get_params() const { return params; } -Tensor FFModel::layer_norm(const Tensor input, +Tensor FFModel::layer_norm(Tensor const input, std::vector const &axes, bool elementwise_affine, float eps, @@ -133,6 +133,7 @@ Tensor FFModel::layer_norm(const Tensor input, ln, 0, true /*create_grad*/); + if (num_weights > 0) { assert(elementwise_affine); int numdims = axes.size(); @@ -206,7 +207,7 @@ LayerNorm::LayerNorm(FFModel &model, LayerNorm::LayerNorm(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, std::vector const &_axes, bool _elementwise_affine, bool _use_bias, @@ -233,45 +234,85 @@ LayerNorm::LayerNorm(FFModel &model, for (int i = 0; i < axes.size(); i++) { M *= inputs[0]->dims[axes[i]].size; } - int 
num_replicas = 1; - for (int i = 0; i < inputs[0]->num_dims; i++) { - if (inputs[0]->dims[i].is_replica_dim) { - num_replicas *= inputs[0]->dims[i].size; - } - } effective_num_elements = M; - effective_batch_size = (inputs[0]->get_volume() / num_replicas) / M; - assert(use_bias == (numWeights == 2)); + effective_batch_size = inputs[0]->get_volume() / M; + assert(elementwise_affine == (numWeights == 2)); if (numWeights > 0 && allocate_weights) { - assert(elementwise_affine); - ParallelTensorShape beta_gamma_shape = _input->get_shape(); - for (int i = axes.size(); i < beta_gamma_shape.num_dims - 1; i++) { - beta_gamma_shape.dims[i].size = 1; + ParallelDim dims[axes.size() + 1]; + int num_dims = axes.size(); + for (int i = 0; i < num_dims; i++) { + dims[i] = inputs[0]->dims[i]; } + assert(numInputs == 1); + dims[num_dims].degree = inputs[0]->dims[inputs[0]->num_dims - 1].degree; + dims[num_dims].size = dims[num_dims].degree; + dims[num_dims].parallel_idx = + inputs[0]->dims[inputs[0]->num_dims - 1].parallel_idx; + dims[num_dims].is_replica_dim = true; + num_dims += 1; + int seed = std::rand(); - Initializer *gamma_initializer = new UniformInitializer(seed, 1.0f, 1.0f); - weights[0] = model.create_parallel_weight_legion_ordering( - beta_gamma_shape.num_dims, // axes.size(), - beta_gamma_shape.dims, - _input->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - gamma_initializer, - CHOSEN_SYNC_TYPE); - if (numWeights == 2) { - assert(use_bias); - Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 0.0f); - weights[1] = model.create_parallel_weight_legion_ordering( - beta_gamma_shape.num_dims, //.size(), - beta_gamma_shape.dims, - _input->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - beta_initializer, - CHOSEN_SYNC_TYPE); - } + Initializer *gamma_initializer = new UniformInitializer(seed, 0.0f, 1.0f); + Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 1.0f); + weights[0] = + model.create_parallel_weight_legion_ordering(num_dims, + dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + gamma_initializer, + CHOSEN_SYNC_TYPE); + weights[1] = + model.create_parallel_weight_legion_ordering(num_dims, + dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + beta_initializer, + CHOSEN_SYNC_TYPE); } + // ======= } +// int num_replicas = 1; +// for (int i = 0; i < inputs[0]->num_dims; i++) { +// if (inputs[0]->dims[i].is_replica_dim) { +// num_replicas *= inputs[0]->dims[i].size; +// } +// } +// effective_num_elements = M; +// effective_batch_size = (inputs[0]->get_volume() / num_replicas) / M; +// assert(use_bias == (numWeights == 2)); +// if (numWeights > 0 && allocate_weights) { +// assert(elementwise_affine); +// ParallelTensorShape beta_gamma_shape = _input->get_shape(); +// for (int i = axes.size(); i < beta_gamma_shape.num_dims - 1; i++) { +// beta_gamma_shape.dims[i].size = 1; +// } +// int seed = std::rand(); +// Initializer *gamma_initializer = new +// UniformInitializer(seed, 1.0f, 1.0f); weights[0] = +// model.create_parallel_weight_legion_ordering( +// beta_gamma_shape.num_dims, // axes.size(), +// beta_gamma_shape.dims, +// _input->data_type, +// NULL /*owner_op*/, +// true /*create_grad*/, +// gamma_initializer, +// CHOSEN_SYNC_TYPE); +// if (numWeights == 2) { +// assert(use_bias); +// Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, +// 0.0f); weights[1] = model.create_parallel_weight_legion_ordering( +// beta_gamma_shape.num_dims, //.size(), +// beta_gamma_shape.dims, +// 
_input->data_type, +// NULL /*owner_op*/, +// true /*create_grad*/, +// beta_initializer, +// CHOSEN_SYNC_TYPE); +// } +// } +// } void LayerNorm::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -593,6 +634,10 @@ void LayerNorm::forward_task(Task const *task, assert(task->regions.size() == regions.size()); float const *in_ptr = NULL; float *out_ptr = NULL, *gamma_ptr = NULL, *beta_ptr = NULL; + // <<<<<<< HEAD + // GenericTensorAccessorR in; + // GenericTensorAccessorW out, gamma, beta; + // ======= GenericTensorAccessorR in, gamma, beta; GenericTensorAccessorW out; @@ -609,12 +654,34 @@ void LayerNorm::forward_task(Task const *task, out = helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(in_domain == out_domain); - assert(in_domain.get_volume() == - m->effective_num_elements * m->effective_batch_size); + // assert(in_domain.get_volume() == + // m->effective_num_elements * m->effective_batch_size); + if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 4)); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); + // <<<<<<< HEAD + // // gamma_ptr = helperGetTensorPointerRW( + // // regions[2], task->regions[2], FID_DATA, ctx, runtime); + // gamma = helperGetGenericTensorAccessorRW( + // m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, + // runtime); + // Domain beta_domain = runtime->get_index_space_domain( + // ctx, task->regions[3].region.get_index_space()); + // // beta_ptr = helperGetTensorPointerRW( + // // regions[3], task->regions[3], FID_DATA, ctx, runtime); + // beta = helperGetGenericTensorAccessorRW( + // m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, + // runtime); + // assert(gamma_domain == beta_domain); + // assert(gamma_domain.get_volume() == m->effective_num_elements); + // int numdims = gamma_domain.get_dim() - 1; + // for (int i = 0; i < numdims; i++) { + // int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; + // int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; + // assert(g_d == in_d); + // ======= gamma = helperGetGenericTensorAccessorRO( m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); if (m->use_bias) { @@ -836,8 +903,8 @@ void LayerNorm::backward_task(Task const *task, Domain in_grad_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); assert(in_domain == out_grad_domain); - assert(in_domain.get_volume() == - m->effective_num_elements * m->effective_batch_size); + // assert(in_domain.get_volume() == + // m->effective_num_elements * m->effective_batch_size); if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 6)); @@ -906,6 +973,7 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, // FIXME please add gamma_ptr and beta_ptr after finish the implementation float *gamma_ptr = NULL, *beta_ptr = NULL; + GenericTensorAccessorW gamma_acc; GenericTensorAccessorW beta_acc; diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 27d314e21e..318ed9f5e3 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -109,11 +109,11 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, T *Y) { __shared__ float m_shared[C10_WARP_SIZE]; __shared__ float v_shared[C10_WARP_SIZE]; - const int64_t i = blockIdx.x; + int64_t const i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; + int64_t const 
index = i * N + j; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } @@ -131,7 +131,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, using T_ACC = T; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); const T_ACC beta_v = @@ -151,18 +151,19 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *beta_ptr, hipStream_t stream) { - LayerNormFusedForwardKernel - <<effective_batch_size, - std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), - 0, - stream>>>(m->effective_num_elements, - m->eps, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - out_ptr); + hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormFusedForwardKernel) + m->effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream, + m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + out_ptr); } /*static*/ @@ -329,11 +330,11 @@ __global__ void ComputeInternalGradientsCUDAKernel( using T_ACC = T; __shared__ T_ACC ds_shared[C10_WARP_SIZE]; __shared__ T_ACC db_shared[C10_WARP_SIZE]; - const int64_t i = blockIdx.x; + int64_t const i = blockIdx.x; T_ACC sum1 = 0; T_ACC sum2 = 0; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); sum1 += @@ -358,7 +359,7 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, T *c1, T *c2) { using T_ACC = T; - const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + int64_t const index = blockIdx.x * blockDim.x + threadIdx.x; if (index < M) { const T_ACC s = T_ACC(1) / static_cast((int)N); const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * @@ -381,12 +382,12 @@ __global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, T *dg, T *db) { using T_ACC = T; - const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + int64_t const j = blockIdx.x * blockDim.x + threadIdx.x; if (j < N) { T_ACC sum1 = 0; T_ACC sum2 = 0; for (int64_t i = 0; i < M; ++i) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; sum1 += dg == nullptr ? T_ACC(0) : static_cast(dY[index]) * (static_cast(X[index]) - @@ -415,17 +416,17 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, using T_ACC = T; __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; - const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + int64_t const j = blockIdx.x * blockDim.x + threadIdx.x; T_ACC dg_sum1 = 0; T_ACC dg_sum2 = 0; T_ACC db_sum1 = 0; T_ACC db_sum2 = 0; if (j < N) { for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { - const int64_t i1 = i; - const int64_t i2 = i + blockDim.y; - const int64_t index1 = i1 * N + j; - const int64_t index2 = i2 * N + j; + int64_t const i1 = i; + int64_t const i2 = i + blockDim.y; + int64_t const index1 = i1 * N + j; + int64_t const index2 = i2 * N + j; dg_sum1 += dg == nullptr ? 
T_ACC(0) : static_cast(dY[index1]) * (static_cast(X[index1]) - @@ -452,7 +453,7 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, sum1 = WarpReduceSum(sum1); sum2 = WarpReduceSum(sum2); if (threadIdx.x == 0) { - const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + int64_t const j = blockIdx.x * blockDim.x + threadIdx.y; if (j < N) { if (dg != nullptr) { dg[j] = sum1; @@ -467,7 +468,7 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, sum1 = WarpReduceSum(sum1); sum2 = WarpReduceSum(sum2); if (threadIdx.x == 0) { - const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + int64_t const j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; if (j < N) { if (dg != nullptr) { dg[j] = sum1; @@ -489,8 +490,8 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, int const N, T *buf) { auto const i1 = blockIdx.x; - const T mean_val = mean[i1]; - const T rstd_val = rstd[i1]; + T const mean_val = mean[i1]; + T const rstd_val = rstd[i1]; T stats_x1{0}, stats_x2{0}; constexpr int unroll = 4; auto l = unroll * threadIdx.x; @@ -503,16 +504,16 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, #pragma unroll for (int k = 0; k < unroll; k++) { T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); - const T c_h = static_cast(X_i[l + k]); - const T c_loss = static_cast(dY_i[l + k]); + T const c_h = static_cast(X_i[l + k]); + T const c_loss = static_cast(dY_i[l + k]); stats_x1 += c_loss * gamma_val; stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; } } for (; l < N; l++) { T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); - const T c_h = static_cast(X_i[l]); - const T c_loss = static_cast(dY_i[l]); + T const c_h = static_cast(X_i[l]); + T const c_loss = static_cast(dY_i[l]); stats_x1 += c_loss * gamma_val; stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; } @@ -530,8 +531,8 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, T term1 = (T(1) / fH) * rstd_val; for (int l = threadIdx.x; l < N; l += blockDim.x) { - const T x = X_i[l]; - const T dy = dY_i[l]; + T const x = X_i[l]; + T const dy = dY_i[l]; T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); T f_grad_input = fH * gamma_val * dy; f_grad_input -= (x - mean_val) * rstd_val * stats_x2; @@ -565,67 +566,87 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, T *gamma_grad_ptr, T *beta_grad_ptr, hipStream_t stream) { - const int64_t M = m->effective_batch_size; - const int64_t N = m->effective_num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - input_ptr, - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); - const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - ComputeGradientFusedParamsCUDAKernel - <<>>(M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); + int64_t const M = m->effective_batch_size; + int64_t const N = m->effective_num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + input_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + int64_t const B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); int const warp_size = C10_WARP_SIZE; int const num_threads = 128; - const dim3 blocks(M); + dim3 const blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); - layer_norm_grad_input_kernel<<>>( - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - input_grad_ptr, - N); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly - const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; - GammaBetaBackwardSimpleCUDAKernel - <<>>(M, - N, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_grad_ptr, - beta_grad_ptr); + int64_t const B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), + , + B, + kCUDANumThreads, + 0, + stream >, + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } else { - const int64_t B = + int64_t const B = (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; constexpr int kThreadX = kColwiseReduceTileSize; constexpr int kThreadY = kColwiseReduceTileSize / 2; - GammaBetaBackwardCUDAKernel - <<>>( - M, - N, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_grad_ptr, - beta_grad_ptr); + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), + B, + dim3(kThreadX, kThreadY), + 0, + stream, + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } } } @@ -637,11 +658,11 @@ void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, T *input_grad_ptr, T const *gamma_ptr, hipStream_t stream) { - const int64_t M = m->effective_batch_size; - const int64_t N = 
m->effective_num_elements; + int64_t const M = m->effective_batch_size; + int64_t const N = m->effective_num_elements; int const warp_size = C10_WARP_SIZE; int const num_threads = 128; - const dim3 blocks(M); + dim3 const blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); layer_norm_grad_input_kernel<<>>( output_grad_ptr, @@ -711,4 +732,13 @@ void LayerNorm::backward_kernel_wrapper( } } -} // namespace FlexFlow +template void + LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, + float const *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *gamma_ptr, + float *gamma_grad_ptr, + float *beta_grad_ptr); + +}; // namespace FlexFlow diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 0801d11617..b118aabd6e 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -36,6 +36,26 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, inference_debugging = ln->inference_debugging; eps = ln->eps; DataType data_type = ln->data_type; + // <<<<<<< HEAD + // checkCUDA( + // cudaMalloc(&mean_ptr, data_type_size(data_type) * + // effective_batch_size)); + // checkCUDA( + // cudaMalloc(&rstd_ptr, data_type_size(data_type) * + // effective_batch_size)); + // checkCUDA( + // cudaMalloc(&ds_ptr, data_type_size(data_type) * + // effective_batch_size)); + // checkCUDA( + // cudaMalloc(&db_ptr, data_type_size(data_type) * + // effective_batch_size)); + // checkCUDA( + // cudaMalloc(&scale_ptr, data_type_size(data_type) * + // effective_batch_size)); + // checkCUDA( + // cudaMalloc(&bias_ptr, data_type_size(data_type) * + // effective_batch_size)); + // ======= size_t totalSize = effective_batch_size * data_type_size(data_type) * 6; gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); mean_ptr = gpu_mem_allocator.allocate_instance_untyped( @@ -57,6 +77,7 @@ LayerNormMeta::~LayerNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { reserveInst.destroy(); } + // >>>>>>> xinhao/merged_bert } template @@ -108,11 +129,11 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, T *Y) { __shared__ float m_shared[C10_WARP_SIZE]; __shared__ float v_shared[C10_WARP_SIZE]; - const int64_t i = blockIdx.x; + int64_t const i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } @@ -130,7 +151,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, using T_ACC = T; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); const T_ACC beta_v = @@ -149,7 +170,6 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *gamma_ptr, T const *beta_ptr, cudaStream_t stream) { - LayerNormFusedForwardKernel <<effective_batch_size, std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), @@ -179,6 +199,7 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + if (m->input_type[0] == DT_FLOAT) { LayerNorm::forward_kernel( m, @@ -328,11 +349,11 @@ __global__ void ComputeInternalGradientsCUDAKernel( using T_ACC = T; __shared__ T_ACC ds_shared[C10_WARP_SIZE]; __shared__ T_ACC db_shared[C10_WARP_SIZE]; - const int64_t i = blockIdx.x; + int64_t const i = blockIdx.x; T_ACC sum1 = 0; T_ACC sum2 = 0; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); sum1 += @@ -357,7 +378,7 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, T *c1, T *c2) { using T_ACC = T; - const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + int64_t const index = blockIdx.x * blockDim.x + threadIdx.x; if (index < M) { const T_ACC s = T_ACC(1) / static_cast((int)N); const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * @@ -380,12 +401,12 @@ __global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, T *dg, T *db) { using T_ACC = T; - const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + int64_t const j = blockIdx.x * blockDim.x + threadIdx.x; if (j < N) { T_ACC sum1 = 0; T_ACC sum2 = 0; for (int64_t i = 0; i < M; ++i) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; sum1 += dg == nullptr ? T_ACC(0) : static_cast(dY[index]) * (static_cast(X[index]) - @@ -414,17 +435,17 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, using T_ACC = T; __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; - const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + int64_t const j = blockIdx.x * blockDim.x + threadIdx.x; T_ACC dg_sum1 = 0; T_ACC dg_sum2 = 0; T_ACC db_sum1 = 0; T_ACC db_sum2 = 0; if (j < N) { for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { - const int64_t i1 = i; - const int64_t i2 = i + blockDim.y; - const int64_t index1 = i1 * N + j; - const int64_t index2 = i2 * N + j; + int64_t const i1 = i; + int64_t const i2 = i + blockDim.y; + int64_t const index1 = i1 * N + j; + int64_t const index2 = i2 * N + j; dg_sum1 += dg == nullptr ? 
T_ACC(0) : static_cast(dY[index1]) * (static_cast(X[index1]) - @@ -451,7 +472,7 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, sum1 = WarpReduceSum(sum1); sum2 = WarpReduceSum(sum2); if (threadIdx.x == 0) { - const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + int64_t const j = blockIdx.x * blockDim.x + threadIdx.y; if (j < N) { if (dg != nullptr) { dg[j] = sum1; @@ -466,7 +487,7 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, sum1 = WarpReduceSum(sum1); sum2 = WarpReduceSum(sum2); if (threadIdx.x == 0) { - const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + int64_t const j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; if (j < N) { if (dg != nullptr) { dg[j] = sum1; @@ -488,8 +509,8 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, int const N, T *buf) { auto const i1 = blockIdx.x; - const T mean_val = mean[i1]; - const T rstd_val = rstd[i1]; + T const mean_val = mean[i1]; + T const rstd_val = rstd[i1]; T stats_x1{0}, stats_x2{0}; constexpr int unroll = 4; auto l = unroll * threadIdx.x; @@ -502,16 +523,16 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, #pragma unroll for (int k = 0; k < unroll; k++) { T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); - const T c_h = static_cast(X_i[l + k]); - const T c_loss = static_cast(dY_i[l + k]); + T const c_h = static_cast(X_i[l + k]); + T const c_loss = static_cast(dY_i[l + k]); stats_x1 += c_loss * gamma_val; stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; } } for (; l < N; l++) { T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); - const T c_h = static_cast(X_i[l]); - const T c_loss = static_cast(dY_i[l]); + T const c_h = static_cast(X_i[l]); + T const c_loss = static_cast(dY_i[l]); stats_x1 += c_loss * gamma_val; stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; } @@ -529,8 +550,8 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, T term1 = (T(1) / fH) * rstd_val; for (int l = threadIdx.x; l < N; l += blockDim.x) { - const T x = X_i[l]; - const T dy = dY_i[l]; + T const x = X_i[l]; + T const dy = dY_i[l]; T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); T f_grad_input = fH * gamma_val * dy; f_grad_input -= (x - mean_val) * rstd_val * stats_x2; @@ -564,8 +585,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, T *gamma_grad_ptr, T *beta_grad_ptr, cudaStream_t stream) { - const int64_t M = m->effective_batch_size; - const int64_t N = m->effective_num_elements; + int64_t const M = m->effective_batch_size; + int64_t const N = m->effective_num_elements; ComputeInternalGradientsCUDAKernel <<>>( N, @@ -574,7 +595,7 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, gamma_ptr, static_cast(m->ds_ptr), static_cast(m->db_ptr)); - const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + int64_t const B = (M + kCUDANumThreads - 1) / kCUDANumThreads; ComputeGradientFusedParamsCUDAKernel <<>>(M, N, @@ -586,7 +607,7 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, static_cast(m->bias_ptr)); int const warp_size = C10_WARP_SIZE; int const num_threads = 128; - const dim3 blocks(M); + dim3 const blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); layer_norm_grad_input_kernel<<>>( output_grad_ptr, @@ -600,7 +621,7 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly - const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + int64_t const B = (N + kCUDANumThreads - 1) / kCUDANumThreads; GammaBetaBackwardSimpleCUDAKernel <<>>(M, N, @@ -611,7 +632,7 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, gamma_grad_ptr, beta_grad_ptr); } else { - const int64_t B = + int64_t const B = (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; constexpr int kThreadX = kColwiseReduceTileSize; constexpr int kThreadY = kColwiseReduceTileSize / 2; @@ -636,11 +657,11 @@ void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, T *input_grad_ptr, T const *gamma_ptr, cudaStream_t stream) { - const int64_t M = m->effective_batch_size; - const int64_t N = m->effective_num_elements; + int64_t const M = m->effective_batch_size; + int64_t const N = m->effective_num_elements; int const warp_size = C10_WARP_SIZE; int const num_threads = 128; - const dim3 blocks(M); + dim3 const blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); layer_norm_grad_input_kernel<<>>( output_grad_ptr, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 20ad762b62..6e8130df01 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -33,7 +33,7 @@ using namespace FlexFlow::Kernels::Linear; static constexpr int KERNEL_IDX = 0; static constexpr int BIAS_IDX = 1; -Tensor FFModel::dense(const Tensor input, +Tensor FFModel::dense(Tensor const input, int outDim, ActiMode activation, bool use_bias, @@ -157,7 +157,7 @@ Op *Linear::create_operator_from_layer( Linear::Linear(FFModel &model, Linear const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights) : Linear(model, other.layer_guid, @@ -194,7 +194,7 @@ Linear::Linear(FFModel &model, Linear::Linear(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int out_dim, ActiMode _activation, RegularizerMode _kernel_reg_type, @@ -248,6 +248,23 @@ Linear::Linear(FFModel &model, } } + kernel_shape.dims[0].size = this->in_channels; + bias_shape.dims[0].degree = _input->dims[_input->num_dims - 1].degree; + bias_shape.dims[0].parallel_idx = + _input->dims[_input->num_dims - 1].parallel_idx; + bias_shape.dims[1].size = bias_shape.dims[1].degree = 1; + 
bias_shape.dims[1].parallel_idx = -1; + bias_shape.dims[bias_shape.num_dims - 1].size = + bias_shape.dims[bias_shape.num_dims - 1].degree = 1; + for (int i = 0; i < input_shape.num_dims - 1; i++) { + if (_input->dims[i].degree > 1) { + bias_shape.dims[bias_shape.num_dims - 1].size *= _input->dims[i].degree; + bias_shape.dims[bias_shape.num_dims - 1].degree *= _input->dims[i].degree; + bias_shape.dims[bias_shape.num_dims - 1].parallel_idx = + _input->dims[i].parallel_idx; + } + } + if (allocate_weights) { Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); if (quantization_type != DT_NONE) { @@ -854,7 +871,15 @@ void Linear::forward_task_with_dim(Task const *task, int out_dim = acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1; int batch_size = acc_output.rect.volume() / out_dim; assert(acc_output.rect.volume() == static_cast(out_dim * batch_size)); - assert(acc_input.rect.volume() == static_cast(in_dim * batch_size)); + // assert(acc_input.rect.volume() == static_cast(in_dim * + // batch_size)); + assert(acc_kernel.rect.volume() == static_cast(in_dim * out_dim)); + // float const *acc_bias_ptr = NULL; + // if (m->use_bias) { + // TensorAccessorR acc_bias( + // ======= + // assert(acc_input.rect.volume() == static_cast(in_dim * + // batch_size)); // assert(acc_kernel.rect.volume() == static_cast(in_dim * out_dim)); DT const *acc_bias_ptr = nullptr; if (m->use_bias && @@ -1056,18 +1081,25 @@ void Linear::backward_task_with_dim(Task const *task, static_cast(in_dim * out_dim)); DT *acc_bias_grad_ptr = nullptr; if (m->use_bias) { - TensorAccessorW acc_bias_grad(regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); + // <<<<<<< HEAD + TensorAccessorW acc_bias_grad(regions[rid], + task->regions[rid], + FID_DATA, + ctx, + runtime, + true /*readOutput*/); + // ======= + // TensorAccessorW acc_bias_grad(regions[rid], + // task->regions[rid], + // FID_DATA, + // ctx, + // runtime, + // true /*readOutput*/); rid++; assert(acc_bias_grad.rect.volume() == static_cast(out_dim)); acc_bias_grad_ptr = static_cast
(acc_bias_grad.ptr); } assert(rid == regions.size()); - backward_kernel_wrapper(m, acc_input.ptr, input_grad, @@ -1464,7 +1496,7 @@ bool LinearParams::is_valid(ParallelTensorShape const &input_shape) const { * It takes a the input tensor as a parameter, instead of the input's * ParallelTensorShape. */ -void LinearParams::solve_dims(const ParallelTensor input, +void LinearParams::solve_dims(ParallelTensor const input, ParallelDim output_dims[MAX_TENSOR_DIM], int *output_ndims, ParallelDim kernel_dims[MAX_TENSOR_DIM], diff --git a/src/ops/reshape.cc b/src/ops/reshape.cc index 4e7fd2eb96..9970d7359c 100644 --- a/src/ops/reshape.cc +++ b/src/ops/reshape.cc @@ -47,7 +47,7 @@ bool ReshapeParams::is_valid(ParallelTensorShape const &input) const { return input.is_valid(); } -Tensor FFModel::reshape(const Tensor input, +Tensor FFModel::reshape(Tensor const input, std::vector const &shape, char const *name) { Layer *reshape = new Layer(this, @@ -80,9 +80,14 @@ Op *Reshape::create_operator_from_layer( return new Reshape(model, layer->layer_guid, inputs[0], shape, layer->name); } +bool match_pattern(std::vector const &_shape) { + return (_shape.size() == 4 && _shape[1] == 1 && _shape[2] == 1 && + _shape[3] == 512); +} + Reshape::Reshape(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor input, + ParallelTensor const input, std::vector const &_shape, char const *name) : Op(model, @@ -106,19 +111,60 @@ Reshape::Reshape(FFModel &model, if (input->dims[i].is_replica_dim) { num_replica_dims++; } + // std::cout << "reshape input size: " << input->dims[i].size + // << ", parallelidx: " << input->dims[i].parallel_idx << ". + // degree: " << input->dims[i].degree + // << "is replicate dim: " << input->dims[i].is_replica_dim << + // "\n"; } + + // assert(false); // assert that all replica dims are leading dims for (int i = 0; i < num_replica_dims; i++) { assert(input->dims[input->num_dims - 1 - i].is_replica_dim); } int numdim = (int)_shape.size(); ParallelDim dims[MAX_TENSOR_DIM]; + + bool expanded = numdim >= input->num_dims; + bool aggregation = numdim < input->num_dims - 1; + for (int i = 0; i < numdim; i++) { - dims[i].size = _shape[numdim - 1 - i]; - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = false; + if (expanded && i < numdim - 1 && + _shape[i] * _shape[i + 1] == input->dims[numdim - i - 2].size) { + dims[numdim - i - 1].size = _shape[i]; + dims[numdim - i - 1].degree = input->dims[numdim - i - 2].degree; + dims[numdim - i - 1].parallel_idx = + input->dims[numdim - i - 2].parallel_idx; + dims[numdim - i - 1].is_replica_dim = + input->dims[numdim - i - 2].is_replica_dim; + std::cout << "expand dim i:" << i << ", " << dims[numdim - i - 1].degree + << ", " << dims[numdim - i - 1].size << "\n"; + } else if (aggregation && + (_shape[i] == input->dims[input->num_dims - 2 - i].size * + input->dims[input->num_dims - 3 - i].size)) { + // inherit + dims[numdim - i - 1].size = _shape[i]; + dims[numdim - i - 1].degree = input->dims[input->num_dims - 2 - i].degree; + dims[numdim - i - 1].parallel_idx = + input->dims[input->num_dims - 2 - i].parallel_idx; + dims[numdim - i - 1].is_replica_dim = + input->dims[input->num_dims - 2 - i].is_replica_dim; + // std::cout << "agree i: " << i <<", " << _shape[i] << "\n"; + } else { + dims[numdim - i - 1].size = _shape[i]; + dims[numdim - i - 1].degree = 1; + dims[numdim - i - 1].parallel_idx = -1; + dims[numdim - i - 1].is_replica_dim = false; + } } + + // for (int i = 0; i < numdim; i++) { + // dims[i].size = _shape[numdim - 1 
- i]; + // dims[i].degree = 1; + // dims[i].parallel_idx = -1; + // dims[i].is_replica_dim = false; + // } // copy all replica dims for (int i = 0; i < num_replica_dims; i++) { dims[i + numdim] = input->dims[input->num_dims - 1 - i]; @@ -131,6 +177,23 @@ Reshape::Reshape(FFModel &model, } dims[numdim - 1 - i] = input->dims[input->num_dims - 1 - i]; } + + // TODO temporary fix for input to attention QK, fix it after fuse the + // attention block + if (match_pattern(_shape) && model.config.tensor_parallelism_degree > 1) { + // number of heads + + dims[2].size = 12; + dims[2].degree = model.config.tensor_parallelism_degree; + dims[2].parallel_idx = 0; + dims[2].is_replica_dim = true; + + dims[4].size = 1; + dims[4].degree = 1; + dims[4].parallel_idx = -1; + dims[4].is_replica_dim = false; + } + outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, input->data_type, this); assert(outputs[0]->get_volume() == inputs[0]->get_volume()); @@ -138,7 +201,7 @@ Reshape::Reshape(FFModel &model, Reshape::Reshape(FFModel &model, ReshapeParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) : Reshape(model, params.layer_guid, input, params.shape, params.name) {} diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index a02d88b98b..a6ad76eef5 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -49,6 +49,7 @@ void Softmax::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->dim); sez.serialize(strlen(this->name)); + sez.serialize(this->last_layer); sez.serialize(this->name, strlen(this->name)); } @@ -68,12 +69,17 @@ Node Softmax::deserialize(FFModel &ff, dez.deserialize(dim); size_t name_len; char name[MAX_OPNAME] = {0}; + + bool last_layer; + dez.deserialize(name_len); + dez.deserialize(last_layer); dez.deserialize(name, name_len); SoftmaxParams params; params.layer_guid = layer_guid; params.dim = dim; + params.last_layer = last_layer; strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } @@ -92,8 +98,9 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(const Tensor _input, +Tensor FFModel::softmax(Tensor const _input, int dim, + bool last_layer, DataType data_type, char const *name) { if (data_type == DT_NONE) { @@ -115,6 +122,8 @@ Tensor FFModel::softmax(const Tensor _input, sm->outputs[0] = create_tensor_legion_ordering( numdims, dims, data_type, sm, 0, true /*create_grad*/); sm->add_int_property("softmax_dim", dim); + + sm->add_int_property("last_layer", last_layer); layers.push_back(sm); return sm->outputs[0]; } @@ -126,17 +135,21 @@ Op *Softmax::create_operator_from_layer( long long value; layer->get_int_property("softmax_dim", value); int dim = (int)value; + layer->get_int_property("last_layer", value); + bool last_layer = (bool)value; return new Softmax(model, layer->layer_guid, inputs[0], (inputs[0]->num_dims - 1 - dim) % inputs[0]->num_dims, + last_layer, layer->name); } Softmax::Softmax(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _dim, + bool _last_layer, char const *name) : Op(model, OP_SOFTMAX, @@ -146,7 +159,7 @@ Softmax::Softmax(FFModel &model, 0 /*weights*/, 1 /*outputs*/, _input), - dim(_dim) { + dim(_dim), last_layer(_last_layer) { // Currently assume we always perform softmax along the inner most dim assert(dim == 0); layer_guid = _layer_guid; @@ -160,9 +173,14 @@ Softmax::Softmax(FFModel &model, Softmax::Softmax(FFModel &model, SoftmaxParams 
const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) - : Softmax(model, params.layer_guid, input, params.dim, params.name) {} + : Softmax(model, + params.layer_guid, + input, + params.dim, + params.last_layer, + name) {} void Softmax::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -346,6 +364,13 @@ void Softmax::backward(FFModel const &ff) { EXCLUSIVE, outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); + + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); runtime->execute_index_space(ctx, launcher); } @@ -360,7 +385,10 @@ void Softmax::backward_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper(m, input_grad, output_grad); + GenericTensorAccessorR outputs = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + backward_kernel_wrapper( + m, input_grad, output_grad, outputs, outputs.domain.get_volume()); } FutureMap Softmax::inference(FFModel const &ff, @@ -571,13 +599,22 @@ bool Softmax::measure_operator_cost(Simulator *sim, float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - GenericTensorAccessorW output_grad_acc( + GenericTensorAccessorR output_grad_acc( DT_FLOAT, sub_output.get_domain(), output_grad_ptr); assert(output_grad_ptr != NULL); + float *output_ptr = + (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorR output_acc( + DT_FLOAT, sub_output.get_domain(), output_ptr); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + backward = [&] { - backward_kernel_wrapper(m, input_grad_acc, output_grad_acc); + backward_kernel_wrapper(m, + input_grad_acc, + output_grad_acc, + output_acc, + sub_output.get_volume()); }; } @@ -610,6 +647,7 @@ size_t hash::operator()( size_t key = 0; hash_combine(key, params.layer_guid.id); hash_combine(key, params.dim); + hash_combine(key, params.last_layer); return key; } }; // namespace std diff --git a/src/ops/split.cc b/src/ops/split.cc index 92cfbd49e9..b9fb5375a7 100644 --- a/src/ops/split.cc +++ b/src/ops/split.cc @@ -413,6 +413,17 @@ void Split::backward_task(Task const *task, split->numOutputs); } +tl::optional Split::as_dot() const { + RecordFormatter rr; + RecordFormatter r; + + r << this->inputs[0]->get_shape().as_dot(); + r << this->outputs[0]->get_shape().as_dot(); + rr << r; + + return rr; +} + bool Split::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { diff --git a/src/ops/transpose.cc b/src/ops/transpose.cc index bffde477de..765b6d4585 100644 --- a/src/ops/transpose.cc +++ b/src/ops/transpose.cc @@ -57,7 +57,7 @@ TransposeParams Transpose::get_params() const { return params; } -Tensor FFModel::transpose(const Tensor input, +Tensor FFModel::transpose(Tensor const input, std::vector const &_perm, char const *name) { Layer *transpose = new Layer(this, @@ -99,12 +99,12 @@ Op *Transpose::create_operator_from_layer( Transpose::Transpose(FFModel &model, TransposeParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) : Transpose(model, input, params.perm, params.name) {} Transpose::Transpose(FFModel &model, - const 
ParallelTensor input, + ParallelTensor const input, std::vector const &_perm, char const *name) : Op(model, diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 52c4ec2e28..2893a68e06 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -63,7 +63,7 @@ AllReduceParams AllReduce::get_params() const { } AllReduce::AllReduce(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, int _allreduce_legion_dim, char const *name) : ParallelOp(model, OP_ALLREDUCE, name, _input), @@ -202,6 +202,7 @@ void AllReduce::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); assert(numOutputs == 1); assert(numInputs == 1); IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, @@ -342,6 +343,7 @@ void AllReduce::inference_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input.data_type == output.data_type); + inference_kernel_wrapper(m, bc, input, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -412,6 +414,7 @@ void AllReduce::peft_bwd_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input_grad.data_type == output_grad.data_type); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index ce9c032350..d12b8e3c4d 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -76,7 +76,7 @@ Combine::Combine(FFModel &model, params.name) {} Combine::Combine(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, int _combine_legion_dim, int _combine_degree, char const *name) @@ -272,6 +272,7 @@ void Combine::forward(FFModel const &ff) { assert(numInputs == 1); assert(inputs[0]->data_type == outputs[0]->data_type); DataType data_type = inputs[0]->data_type; + set_argumentmap_for_forward(ff, argmap); IndexLauncher launcher(COMBINE_FWD_TASK_ID, outputs[0]->parallel_is, TaskArgument(nullptr, 0), diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp b/src/parallel_ops/kernels/allreduce_kernels.cpp index 7067035465..82c2b1dad9 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cpp +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -32,6 +32,7 @@ void forward_kernel_wrapper(AllReduceMeta const *m, checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, @@ -49,7 +50,25 @@ void forward_kernel_wrapper(AllReduceMeta const *m, void backward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - assert(false && "To be implemented"); + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); +#ifdef FF_USE_NCCL + // ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + // std::cout <<"input volume: " << input.domain.get_volume() << "\n"; + // print_tensor((float*)input.ptr, 32, "input ptr"); + ncclDataType_t nccl_data_type = 
ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + output_grad.domain.get_volume(), + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif } void inference_kernel_wrapper(AllReduceMeta const *m, diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index 3041f9adf9..09d37e101c 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -49,7 +49,25 @@ void forward_kernel_wrapper(AllReduceMeta const *m, void backward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - assert(false && "To be implemented"); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); +#ifdef FF_USE_NCCL + // ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + // std::cout <<"input volume: " << input.domain.get_volume() << "\n"; + // print_tensor((float*)input.ptr, 32, "input ptr"); + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + output_grad.domain.get_volume(), + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif } void inference_kernel_wrapper(AllReduceMeta const *m, @@ -68,6 +86,7 @@ void inference_kernel_wrapper(AllReduceMeta const *m, output.ptr, num_elements, nccl_data_type, + ncclSum, m->handle.ncclComm, stream)); diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index f49e0d4eb0..23bb8a52e9 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -83,6 +83,45 @@ template void backward_kernel(float const *output_grad_ptr, size_t num_elements, size_t num_replicas); +template void forward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(double const *output_grad_ptr, + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + +template void forward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int64_t const *output_grad_ptr, + int64_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + +template void forward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int32_t const *output_grad_ptr, + int32_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + } // namespace Replicate } // namespace Kernels } // namespace FlexFlow diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index 0b5c434aa6..6705d04339 100644 --- 
a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -75,6 +75,42 @@ template void backward_kernel(float const *output_grad_ptr, float *input_grad_ptr, size_t num_elements, size_t num_replicas); +template void forward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(double const *output_grad_ptr, + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); +template void forward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int32_t const *output_grad_ptr, + int32_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); +template void forward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int64_t const *output_grad_ptr, + int64_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); } // namespace Replicate } // namespace Kernels diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index 7306e04334..2254f3e828 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -64,7 +64,7 @@ ReductionParams Reduction::get_params() const { } Reduction::Reduction(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, int _reduction_legion_dim, int _reduction_degree, char const *name) diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 38215fc903..b9af7fb0cd 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -63,7 +63,7 @@ ReplicateParams Replicate::get_params() const { } Replicate::Replicate(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, int _replicate_legion_dim, int _replicate_degree, char const *name) @@ -263,10 +263,11 @@ void Replicate::forward(FFModel const &ff) { parallel_is = outputs[0]->parallel_is; assert(numOutputs == 1); assert(numInputs == 1); + DataType data_type = inputs[0]->data_type; set_argumentmap_for_forward(ff, argmap); IndexLauncher launcher(REPLICATE_FWD_TASK_ID, outputs[0]->parallel_is, - TaskArgument(NULL, 0), + TaskArgument(&data_type, sizeof(DataType)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -337,7 +338,7 @@ void Replicate::backward(FFModel const &ff) { assert(numInputs == 1); IndexLauncher launcher(REPLICATE_BWD_TASK_ID, inputs[0]->parallel_is, - TaskArgument(NULL, 0), + TaskArgument(&data_type, sizeof(DataType)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -395,12 +396,42 @@ bool Replicate::append_parallel_op_info( return true; } +// static OpMeta *Replicate::init_task(Task const *task, +// std::vector const +// ®ions, Context ctx, Runtime *runtime) +// {} + +/*static*/ void Replicate::forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); + // <<<<<<< HEAD + // DataType data_type = *((DataType *)task->args); + // if (data_type == DT_FLOAT) { + // forward_task_with_type(task, regions, ctx, runtime); + 
// } else if (data_type == DT_DOUBLE) { + // forward_task_with_type(task, regions, ctx, runtime); + // } else if (data_type == DT_INT32) { + // forward_task_with_type(task, regions, ctx, runtime); + // } else if (data_type == DT_INT64) { + // forward_task_with_type(task, regions, ctx, runtime); + // } else { + // assert(false && "Unsupported data type in Replicate forward"); + // } + // } + + // template + // void Replicate::forward_task_with_type( + // Task const *task, + // std::vector const ®ions, + // Context ctx, + // Runtime *runtime) { + // assert(regions.size() == 2); + // assert(task->regions.size() == 2); + // ======= ReplicateMeta const *m = *((ReplicateMeta **)task->local_args); if (m->inference_debugging) { @@ -417,6 +448,14 @@ void Replicate::forward_task(Task const *task, assert(output_domain.hi()[i] == input_domain.hi()[i]); } assert(input_domain.get_volume() == output_domain.get_volume()); + // <<<<<<< HEAD + // T const *input_ptr = helperGetTensorPointerRO( + // regions[0], task->regions[0], FID_DATA, ctx, runtime); + // T *output_ptr = helperGetTensorPointerRW( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + + // forward_kernel(input_ptr, output_ptr, input_domain.get_volume()); + // ======= GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -474,6 +513,28 @@ void Replicate::backward_task(Task const *task, Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); + DataType data_type = *((DataType *)task->args); + if (data_type == DT_FLOAT) { + backward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_DOUBLE) { + backward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT32) { + backward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT64) { + backward_task_with_type(task, regions, ctx, runtime); + } else { + assert(false && "Unsupported data type in Embedding forward"); + } +} + +template +void Replicate::backward_task_with_type( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); Domain output_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); Domain input_grad_domain = runtime->get_index_space_domain( @@ -485,12 +546,12 @@ void Replicate::backward_task(Task const *task, } size_t num_elements = input_grad_domain.get_volume(); size_t num_replicas = output_grad_domain.get_volume() / num_elements; - float const *output_grad_ptr = helperGetTensorPointerRO( + T const *output_grad_ptr = helperGetTensorPointerRO( regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *input_grad_ptr = helperGetTensorPointerRW( + T *input_grad_ptr = helperGetTensorPointerRW( regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel( + backward_kernel( output_grad_ptr, input_grad_ptr, num_elements, num_replicas); } diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 386a0c940b..1f2ff5062c 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -62,6 +62,15 @@ __global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { dst[i] = src[i]; } } +template +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + coord_t origin_size, + coord_t size) { + CUDA_KERNEL_LOOP(i, size) { + dst[i] = src[i % origin_size]; + } +} template __global__ void @@ -660,6 +669,14 @@ 
template __global__ void copy_kernel(half *dst, half const *src, coord_t size); template __global__ void copy_kernel(float *dst, float const *src, coord_t size); +template __global__ void copy_kernel_with_replicate(float *dst, + float const *src, + coord_t origin_size, + coord_t size); +template __global__ void copy_kernel_with_replicate( + int32_t *dst, int32_t const *src, coord_t origin_size, coord_t size); +template __global__ void copy_kernel_with_replicate( + int64_t *dst, int64_t const *src, coord_t origin_size, coord_t size); template __global__ void copy_kernel(double *dst, double const *src, coord_t size); template __global__ void diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 1a38782e81..6b9f1ddc22 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -71,7 +71,7 @@ using FlexFlow::MachineView; Legion::Logger log_graph("graph"); Legion::Logger log_simplify("graph_simplify"); -const Node Node::INVALID_NODE = Node(); +Node const Node::INVALID_NODE = Node(); Node::Node(void) : guid(0), ptr(NULL) {} @@ -1899,6 +1899,7 @@ namespace { std::pair, std::unordered_map> try_one_lambda(std::pair &lambda, Task const *task, + // FFModel *model, std::shared_ptr &cached_simulator, bool perform_memory_search) { // Create a new fresh model @@ -1916,6 +1917,76 @@ std::pair, std::unordered_map> model->config.workersPerNode, model->config.cpusPerNode, model->all_valid_views); + // <<<<<<< HEAD + if (model->config.only_data_parallel) { + Graph *graph = new Graph(model); + graph->print_dot(); + std::unordered_map op_to_node_map; + for (FlexFlow::Op const *dstOp : model->operators) { + Node dstNode; + dstNode.ptr = dstOp; + dstNode.guid = model->node_global_guid++; + op_to_node_map[dstOp] = dstNode; + for (int j = 0; j < dstOp->numInputs; j++) { + FlexFlow::Op const *srcOp = dstOp->inputs[j]->owner_op; + assert(op_to_node_map.find(srcOp) != op_to_node_map.end()); + Node srcNode = op_to_node_map[srcOp]; + graph->add_edge(srcNode, dstNode, dstOp->inputs[j]->owner_idx, j); + } + } + graph->print_dot(); + std::unique_ptr curr_best_graph; + std::unordered_map curr_optimal_views; + curr_best_graph = std::unique_ptr(graph); + MachineView data_parallel_view; + data_parallel_view.device_type = MachineView::GPU; + data_parallel_view.ndims = 1; + data_parallel_view.dim[0] = + model->config.numNodes * model->config.workersPerNode; + data_parallel_view.stride[0] = 1; + data_parallel_view.start_device_id = 0; + // Currently assume a 1D machine view is needed + assert(model->config.data_parallelism_degree == 1 || + model->config.tensor_parallelism_degree == 1); + int degree = model->config.data_parallelism_degree * + model->config.tensor_parallelism_degree; + for (auto const &node : curr_best_graph->inEdges) { + Op const *op = node.first.ptr; + MachineView mv; + mv.device_type = MachineView::GPU; + mv.ndims = 1; + int total_parallel_degree = 1; + for (int i = 0; i < op->outputs[0]->num_dims; i++) { + total_parallel_degree *= op->outputs[0]->dims[i].degree; + } + mv.dim[0] = total_parallel_degree; + mv.stride[0] = 1; + mv.start_device_id = 0; + // std::cout << mv.start_device_id + degree - 1 << "\n"; + // std::cout << model->config.numNodes << "\n"; + // std::cout << model->config.workersPerNode << "\n"; + // assert(false); + assert(mv.start_device_id + degree - 1 < + model->config.numNodes * model->config.workersPerNode); + curr_optimal_views[node.first] = mv; + for (int i = 0; i < node.first.ptr->numOutputs; i++) { + assert(node.first.ptr->outputs[i]->is_valid_machine_view(mv)); + } + } + // 
for (auto const &node : curr_best_graph->inEdges) { + // curr_optimal_views[node.first] = data_parallel_view; + // } + return std::make_pair(std::move(curr_best_graph), curr_optimal_views); + } + + // Runtime *runtime = model->config.lg_hlr; + // Context ctx = model->config.lg_ctx; + // Task const *task = runtime->get_current_task(ctx); + // Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + // .only_kind(Memory::GPU_FB_MEM) + // .best_affinity_to(task->target_proc) + // .first(); + // ======= Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MachineModel *machine; if (model->config.machine_model_version == 0) { @@ -1949,6 +2020,16 @@ std::pair, std::unordered_map> std::unique_ptr curr_best_graph; std::unordered_map curr_optimal_views; + // <<<<<<< HEAD + // // Main step to optimize the PCG of an FFModel + // model->graph_optimize(model->config.search_budget, + // model->config.only_data_parallel, + // curr_best_graph, + // curr_optimal_views, + // perform_memory_search, + // MemoryOptimConfig{lambda.first}, + // lambda.second); + // ======= if (model->config.only_data_parallel) { Graph *graph = new Graph(model); std::unordered_map op_to_node_map; @@ -2104,11 +2185,18 @@ bool is_valid_strategy( * @param runtime Not used * @return GraphOptimalViewSerialized Serialized optimal PCG */ + GraphOptimalViewSerialized Graph::graph_optimize_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + // FFModel *model = *((FFModel **)task->args); + // return Graph::graph_optimize_wrapper(model); + // } + + // /*static*/ + // GraphOptimalViewSerialized Graph::graph_optimize_wrapper(FFModel *model) { auto model_config = (*((FFModel **)task->args))->config; bool perform_memory_search = model_config.perform_memory_search; float memory_threshold = model_config.device_mem; @@ -2422,6 +2510,13 @@ GraphOptimalViewSerialized sez.serialize(reduction->name, strlen(reduction->name)); break; } + case OP_ALLREDUCE: { + AllReduce *allreduce = (AllReduce *)op; + sez.serialize(allreduce->allreduce_dim); + sez.serialize(strlen(allreduce->name)); + sez.serialize(allreduce->name, strlen(allreduce->name)); + break; + } case OP_COMBINE: { Combine *combine = (Combine *)op; sez.serialize(combine->combine_dim); @@ -2430,13 +2525,13 @@ GraphOptimalViewSerialized sez.serialize(combine->name, strlen(combine->name)); break; } - case OP_ALLREDUCE: { - AllReduce *allreduce = (AllReduce *)op; - sez.serialize(allreduce->allreduce_dim); - sez.serialize(strlen(allreduce->name)); - sez.serialize(allreduce->name, strlen(allreduce->name)); - break; - } + // case OP_ALLREDUCE: { + // AllReduce *allreduce = (AllReduce *)op; + // sez.serialize(allreduce->allreduce_dim); + // sez.serialize(strlen(allreduce->name)); + // sez.serialize(allreduce->name, strlen(allreduce->name)); + // break; + // } case OP_PARALLEL_IDENTITY: { ParallelIdentity *parallel_identity = (ParallelIdentity *)op; sez.serialize(parallel_identity->parallel_identity_dim); @@ -3140,6 +3235,17 @@ void FFModel::deserialize_graph_optimal_view( node = get_or_create_node(inputs[0], params); break; } + // case OP_ALLREDUCE: { + // assert(num_inputs == 1); + // int allreduce_dim; + // dez.deserialize(allreduce_dim); + // size_t name_len; + // char name[MAX_OPNAME] = {0}; + // dez.deserialize(name_len); + // dez.deserialize(name, name_len); + // node = get_or_create_node(inputs[0], {allreduce_dim}); + // break; + // } case OP_FUSED_PARALLEL: { assert(num_inputs == 1); FusedParallelOpParams params; diff --git 
a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 057be8f443..aa2244d43f 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -56,6 +56,16 @@ __global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { } } +template +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + coord_t origin_size, + coord_t size) { + CUDA_KERNEL_LOOP(i, size) { + dst[i] = src[i % origin_size]; + } +} + template __global__ void copy_kernel_discrete(DT *dst, const DT *src, coord_t size, size_t *index) { @@ -551,6 +561,57 @@ miopenStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( return miopenStatusBadParm; } +miopenStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, + Domain domain) { + int dims[MAX_TENSOR_DIM]; + switch (domain.get_dim()) { + case 1: { + Rect<1> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + return miopenSet4dTensorDescriptor(tensor, miopenFloat, dims[0], 1, 1, 1); + } + case 2: { + Rect<2> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[1], dims[0], 1, 1); + } + case 3: { + Rect<3> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[2] * dims[1], dims[0], 1, 1); + } + case 4: { + Rect<4> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[3] * dims[2] * dims[1], dims[0], 1, 1); + } + case 5: { + Rect<5> rect = domain; + int leading_dim_size = rect.hi[4] - rect.lo[4] + 1; + assert(leading_dim_size == 1); + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[3], dims[2], dims[1], dims[0]); + } + default: + assert(false && "Unsupported dim number"); + } + return miopenStatusBadParm; +} + miopenDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { case DT_HALF: @@ -680,6 +741,16 @@ template __global__ void copy_kernel(half *dst, half const *src, coord_t size); template __global__ void copy_kernel(float *dst, float const *src, coord_t size); + +template __global__ void copy_kernel_with_replicate(float *dst, + float const *src, + coord_t origin_size, + coord_t size); +template __global__ void copy_kernel_with_replicate( + int32_t *dst, int32_t const *src, coord_t origin_size, coord_t size); +template __global__ void copy_kernel_with_replicate( + int64_t *dst, int64_t const *src, coord_t origin_size, coord_t size); + template __global__ void copy_kernel(double *dst, double const *src, coord_t size); template __global__ void diff --git a/src/runtime/machine_view.cc b/src/runtime/machine_view.cc index dadece7691..44dff5a2da 100644 --- a/src/runtime/machine_view.cc +++ b/src/runtime/machine_view.cc @@ -1,4 +1,5 @@ #include "flexflow/machine_view.h" +#include "flexflow/utils/hash_utils.h" namespace FlexFlow { @@ -47,15 +48,15 @@ size_t MachineView::num_parts() const { } size_t MachineView::hash() const { - size_t ret = 17; - ret = ret * 31 + std::hash()(device_type); - ret = ret * 31 + std::hash()(ndims); - ret = ret * 31 + std::hash()(start_device_id); + size_t h 
= 0; + hash_combine(h, device_type); + hash_combine(h, ndims); + hash_combine(h, start_device_id); for (int i = 0; i < ndims; i++) { - ret = ret * 31 + std::hash()(dim[i]); - ret = ret * 31 + std::hash()(stride[i]); + hash_combine(h, dim[i]); + hash_combine(h, stride[i]); } - return ret; + return h; } int MachineView::get_device_id(DomainPoint const &p) const { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index f46630db3c..68034f7c69 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -96,10 +96,10 @@ Op::Op(FFModel &model, int numWeights, bool allocate_weights, int numOutputs, - const ParallelTensor input1, - const ParallelTensor input2, - const ParallelTensor input3, - const ParallelTensor input4) + ParallelTensor const input1, + ParallelTensor const input2, + ParallelTensor const input3, + ParallelTensor const input4) : Op(model, otype, dtype, @@ -119,10 +119,10 @@ Op::Op(FFModel &model, int _numInputs, int _numWeights, int _numOutputs, - const ParallelTensor _input1, - const ParallelTensor _input2, - const ParallelTensor _input3, - const ParallelTensor _input4) + ParallelTensor const _input1, + ParallelTensor const _input2, + ParallelTensor const _input3, + ParallelTensor const _input4) : op_type(_otype), data_type(_dtype), op_guid(model.op_global_guid++), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs), profiling(model.config.profiling), @@ -606,9 +606,15 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task, } } ncclComm_t ncclComm; + fprintf(stderr, "Before ncclCommInitRank\n"); checkNCCL(ncclCommInitRank(&ncclComm, allRanks, ncclId, myRank)); - // fprintf(stderr, "ncclComm(%p) allRanks(%d) myRank(%d) ncclId(%p)\n", - // ncclComm, allRanks, myRank, ncclId); + fprintf(stderr, + "After ncclCommInitRank ncclComm(%p) allRanks(%d) myRank(%d) " + "ncclId(%p)\n", + ncclComm, + allRanks, + myRank, + ncclId); return ncclComm; } @@ -1040,9 +1046,9 @@ void Op::register_output_parallel_dims( operation); } -int Op::get_output_to_input_dim_mapping(const ParallelTensor output, +int Op::get_output_to_input_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor input) { + ParallelTensor const input) { int output_idx = -1, input_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1075,9 +1081,9 @@ int Op::get_output_to_input_dim_mapping(const ParallelTensor output, return -1; } -int Op::get_output_to_weight_dim_mapping(const ParallelTensor output, +int Op::get_output_to_weight_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor weight) { + ParallelTensor const weight) { int output_idx = -1, weight_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1143,6 +1149,9 @@ bool Op::check_output_input_weight_parallel_dims(bool allocate_weights) const { break; } + printf("other dim degree: %d, input dim degree %d\n", + other_dim.degree, + input_dim.degree); assert(other_dim.degree == input_dim.degree); assert(other_dim.parallel_idx == input_dim.parallel_idx); } @@ -1152,18 +1161,25 @@ bool Op::check_output_input_weight_parallel_dims(bool allocate_weights) const { bool Op::check_output_input_weight_same_parallel_is() const { assert(numOutputs > 0); IndexSpace parallel_is = outputs[0]->parallel_is; + std::cout << "output space: " + << ", " << parallel_is << "\n"; for (int i = 0; i < numOutputs; i++) { if (outputs[i]->parallel_is != parallel_is) { + std::cout << "output mismatch" + << "\n"; return false; } } for (int i = 0; i < numInputs; i++) { + 
std::cout << "input space: " << i << ", " << inputs[i]->parallel_is << "\n"; if (inputs[i]->parallel_is != parallel_is) { return false; } } for (int i = 0; i < numWeights; i++) { if (weights[i]->parallel_is != parallel_is) { + std::cout << "weight mismatch" + << "\n"; return false; } } @@ -1205,7 +1221,7 @@ void Op::set_argumentmap_for_init(FFModel const &ff, ArgumentMap &argmap) { for (PointInRectIterator it(rect); it(); it++) { \ FFHandler handle = ff.handlers[view.get_device_id(*it)]; \ if (ff.config.computationMode == COMP_MODE_TRAINING && \ - op_type == OP_WEIGHT) { \ + (op_type == OP_WEIGHT || op_type == OP_ALLREDUCE)) { \ ncclComm_t *nccl_comms = ff.find_nccl_comms(view); \ handle.ncclComm = nccl_comms[idx++]; \ } \ @@ -1527,6 +1543,7 @@ FFRuntime::FFRuntime(FFConfig &config) { Context ctx = config.lg_ctx; ArgumentMap argmap; + Domain domain = runtime->get_index_space_domain(ctx, config.all_gpu_task_is); Rect<1> task_rect = domain; // int rank = 0; @@ -1759,7 +1776,7 @@ Tensor FFModel::create_tensor(int numdim, } ParallelTensor FFModel::create_parallel_tensor(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1792,7 +1809,7 @@ Tensor FFModel::create_tensor_legion_ordering(int numdim, ParallelTensor FFModel::create_parallel_tensor_legion_ordering(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1842,7 +1859,7 @@ Tensor FFModel::create_tensor(int const dims[], } template -ParallelTensor FFModel::create_parallel_tensor(const ParallelDim dims[], +ParallelTensor FFModel::create_parallel_tensor(ParallelDim const dims[], DataType data_type, Op const *owner_op, int owner_idx, @@ -1923,7 +1940,7 @@ Parameter FFModel::create_weight(int numdim, } template -ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], +ParallelParameter FFModel::create_parallel_weight(ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1954,7 +1971,7 @@ ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], } ParallelParameter FFModel::create_parallel_weight(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1974,7 +1991,7 @@ ParallelParameter FFModel::create_parallel_weight(int numdim, ParallelParameter FFModel::create_parallel_weight_legion_ordering( int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -2188,7 +2205,7 @@ void FFModel::map_weight_with_dim(ParallelTensor weight, } bool FFModel::get_parallel_tensor_from_tensor( - const Tensor tensor, ParallelTensor ¶llel_tensor) const { + Tensor const tensor, ParallelTensor ¶llel_tensor) const { // check if tensor->parallel_tensor is already set if (tensor->parallel_tensor != nullptr) { parallel_tensor = tensor->parallel_tensor; @@ -2225,7 +2242,7 @@ bool FFModel::get_parallel_tensor_from_tensor( } void FFModel::create_disjoint_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], IndexSpace const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2248,7 +2265,7 @@ void FFModel::create_disjoint_partition(int num_dims, template void FFModel::create_disjoint_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], IndexSpaceT const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2281,7 +2298,7 @@ void 
FFModel::create_disjoint_partition_with_dim2( } void FFModel::create_aliased_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpace const &part_is, LogicalRegion const ®ion, @@ -2305,7 +2322,7 @@ void FFModel::create_aliased_partition(int num_dims, template void FFModel::create_aliased_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpaceT const &part_is, LogicalRegion const ®ion, @@ -2342,7 +2359,7 @@ void FFModel::create_aliased_partition_with_dim2( } template -void FFModel::create_disjoint_partition(const ParallelTensor tensor, +void FFModel::create_disjoint_partition(ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2390,7 +2407,7 @@ void FFModel::create_disjoint_partition(const ParallelTensor tensor, template void FFModel::create_data_parallel_partition_with_diff_dims( - const ParallelTensor tensor, + ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2772,7 +2789,7 @@ IndexSpace FFModel::get_task_is(ParallelConfig const &pc) const { return get_task_is(view); } -IndexSpace FFModel::get_or_create_task_is(const ParallelTensor tensor) { +IndexSpace FFModel::get_or_create_task_is(ParallelTensor const tensor) { MachineView view; view.ndims = 0; for (int i = 0; i < tensor->num_dims; i++) { @@ -2943,6 +2960,10 @@ void FFModel::update() { } } +void FFModel::unified_update() { + optimizer->unified_update(parameters); +} + Op *FFModel::get_final_operator() const { int idx = operators.size() - 1; while (operators[idx]->op_type == OP_INPUT || @@ -3112,12 +3133,21 @@ Op *FFModel::create_operator_from_layer( dims[num_dims].degree = 1; dims[num_dims].parallel_idx = -1; dims[num_dims].is_replica_dim = true; - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1) { + if (config.tensor_parallelism_degree > 1 && + ((num_inputs != 1) || + config.computationMode == COMP_MODE_INFERENCE)) { dims[num_dims].size *= config.tensor_parallelism_degree; dims[num_dims].degree *= config.tensor_parallelism_degree; dims[num_dims].parallel_idx = 0; } + // TODO temporary fix for input to attention QK, fix it after fuse the + // attention block + else if (config.tensor_parallelism_degree > 1) { + // n heads + dims[num_dims].size *= 12; + dims[num_dims].degree *= config.tensor_parallelism_degree; + dims[num_dims].parallel_idx = 0; + } // create_parallel_tensor adds an NoOp into operators ParallelTensor pt = create_parallel_tensor_legion_ordering(num_dims + 1, @@ -3132,12 +3162,21 @@ Op *FFModel::create_operator_from_layer( assert(tensor->parallel_tensor == nullptr); tensor->parallel_tensor = pt; // start from data parllel tensor - if (config.only_data_parallel && - config.computationMode == COMP_MODE_TRAINING) { - Repartition *part = new Repartition( - *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); - operators.push_back(part); - } + // if (config.only_data_parallel && + // config.computationMode == COMP_MODE_TRAINING) { + // Repartition *part = new Repartition( + // *this, pt, num_dims - 1, config.numNodes * + // config.workersPerNode); + // operators.push_back(part); + // } + num_inputs++; + // if (config.only_data_parallel && + // config.computationMode == COMP_MODE_TRAINING) { + // Repartition *part = new Repartition( + // *this, pt, num_dims - 1, config.numNodes * + // config.workersPerNode); + // operators.push_back(part); 
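// FFModel::unified_update() above simply forwards to
// optimizer->unified_update(parameters), letting the optimizer fuse many
// per-parameter updates into a few batched launches. A hedged sketch of how a
// training step might call it in place of the per-parameter update() path
// (the surrounding loop is illustrative; only unified_update() is added by
// this patch, the other methods follow the existing FFModel training API):
//
//   for (int epoch = 0; epoch < num_epochs; epoch++) {
//     for (int iter = 0; iter < iterations_per_epoch; iter++) {
//       model.forward();
//       model.zero_gradients();
//       model.backward();
//       model.unified_update();   // fused Adam + NCCL path added below
//     }
//   }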
+ // } return operators[operators.size() - 1]; } case OP_MULTIHEAD_ATTENTION: { @@ -3358,6 +3397,17 @@ Op *FFModel::create_operator_from_layer( } } +bool FFModel::is_transformer_block(int layer_idx) const { + auto const &l = layers[layer_idx]; + if (l->op_type == OP_DROPOUT && layer_idx >= 4 && + layers[layer_idx - 1]->op_type == OP_LINEAR && + layers[layer_idx - 2]->op_type == OP_RESHAPE && + layers[layer_idx - 3]->op_type == OP_TRANSPOSE && + layers[layer_idx - 4]->op_type == OP_BATCHMATMUL) { + return true; + } + return false; +} bool FFModel::is_mlp_block(int layer_idx) const { auto const &l = layers[layer_idx]; // standard opt relu @@ -3491,8 +3541,8 @@ bool FFModel::need_to_add_parallel_identity(int layer_idx) const { } void FFModel::create_operators_from_layers() { - std::map tensors_to_parallel_tensors; - std::map + std::map tensors_to_parallel_tensors; + std::map op_before_allreduce_tensors_to_parallel_tensors; std::map transformer_layer_allreduce_count; std::map transformer_layer_parallel_identity_count; @@ -3525,6 +3575,15 @@ void FFModel::create_operators_from_layers() { partitioned_inputs.push_back(comb->outputs[0]); operators.push_back(comb); op = create_operator_from_layer(l, partitioned_inputs); + } else if (config.computationMode == COMP_MODE_TRAINING && + config.tensor_parallelism_degree > 1 && + l->op_type == OP_LAYERNORM && layer_idx == layers.size() - 6) { + std::vector partitioned_inputs; + Combine *comb = + new Combine(*this, inputs[0], 3, config.tensor_parallelism_degree); + partitioned_inputs.push_back(comb->outputs[0]); + operators.push_back(comb); + op = create_operator_from_layer(l, partitioned_inputs); } else { op = create_operator_from_layer(l, inputs); } @@ -3613,13 +3672,32 @@ void FFModel::create_operators_from_layers() { tensors_to_parallel_tensors[l->outputs[l->numOutputs - 1]] = parallel_identity->outputs[0]; op = parallel_identity; - } else { - assert(op->numOutputs == l->numOutputs); - for (int i = 0; i < op->numOutputs; i++) { - assert(tensors_to_parallel_tensors.find(l->outputs[i]) == - tensors_to_parallel_tensors.end()); - tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; - } + } else if (config.computationMode == COMP_MODE_TRAINING && + config.tensor_parallelism_degree > 1 && + (is_transformer_block(layer_idx) || is_mlp_block(layer_idx) || + // llama mlp layer + (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_GELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) || + // LLAMA without element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 5 && + layers[layer_idx - 1]->op_type == OP_EW_MUL && + layers[layer_idx - 2]->op_type == OP_EW_MUL && + layers[layer_idx - 3]->op_type == OP_SIGMOID && + layers[layer_idx - 4]->op_type == OP_LINEAR && + layers[layer_idx - 5]->op_type == OP_LINEAR))) { + assert(op->numOutputs == 1); + AllReduce *allreduce = + new AllReduce(*this, op->outputs[0], op->outputs[0]->num_dims - 1); + operators.push_back(allreduce); + op = allreduce; + } + + assert(op->numOutputs == l->numOutputs); + for (int i = 0; i < op->numOutputs; i++) { + assert(tensors_to_parallel_tensors.find(l->outputs[i]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; } // if the operator has op_type==OP_LORA, and the second-to-last operator in // the operators vector has op_type==OP_ALLREDUCE, move the operator before @@ -3656,6 +3734,15 @@ void FFModel::compile(LossType loss_type, // Launch the graph optimize task { FFModel *model = this; 
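// Why create_operators_from_layers() above appends an AllReduce after a
// transformer or MLP block under tensor parallelism: when a linear layer's
// weight is partitioned along the reduction dimension, each shard computes a
// partial product Y_k = X_k * W_k and the true output is Y = sum_k Y_k, so
// the shards must be summed before the next non-parallel layer. The insertion
// itself is exactly the lines from the hunk above:
//
//   assert(op->numOutputs == 1);
//   AllReduce *allreduce =
//       new AllReduce(*this, op->outputs[0], op->outputs[0]->num_dims - 1);
//   operators.push_back(allreduce);
//   op = allreduce;
//
// i.e. the reduction runs over the replica (last) dimension of the block's
// single output tensor.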
+ // PCG::GraphOptimalViewSerialized ret; + // if (false) { + // TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, + // TaskArgument(&model, sizeof(FFModel *))); + // Future future = runtime->execute_task(ctx, launcher); + // ret = future.get_result(); + // } else { + // ret = PCG::Graph::graph_optimize_wrapper(this); + // } TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, TaskArgument(&model, sizeof(FFModel *))); Future future = runtime->execute_task(ctx, launcher); @@ -3799,6 +3886,71 @@ void FFModel::compile(LossType loss_type, } } + int degree = + config.data_parallelism_degree * config.tensor_parallelism_degree; + + for (int op_idx = 0; op_idx < operators.size(); op_idx++) { + Op const *op = operators[op_idx]; + // Skip weight operators + if (op->op_type == OP_WEIGHT) { + continue; + } + // Get machine views + std::vector machine_views; + for (int j = 0; j < config.data_parallelism_degree; j++) { + MachineView mv; + mv.device_type = MachineView::GPU; + mv.ndims = 1; + // mv.start_device_id = 0; + mv.stride[0] = 1; + int parallel_degree = 1; + for (int k = 0; k < op->outputs[0]->num_dims; k++) { + parallel_degree *= op->outputs[0]->dims[k].degree; + } + mv.dim[0] = parallel_degree; + mv.start_device_id = 0; + // if (mv != op->outputs[0]->machine_view) { + // std::cout << "start: " << + // op->outputs[0]->machine_view.start_device_id + // << ", mv.ndims " << op->outputs[0]->machine_view.ndims + // << ", mv.stride[0] " << + // op->outputs[0]->machine_view.stride[0] + // << ", mv.dim[0] " << op->outputs[0]->machine_view.dim[0] + // << "\n"; + // std::cout << "parallel_degree: " << parallel_degree << "\n"; + // std::cout << "op type: " << op->name << "\n"; + // } + assert(mv == op->outputs[0]->machine_view); + machine_views.push_back(mv); + } + for (int i = 0; i < op->numOutputs; i++) { + ParallelTensor pt_base = op->outputs[i]; + + if (op->op_type == OP_REPLICATE) { + assert(op->numInputs == 1 && op->numOutputs == 1); + } + std::vector list; + bool found_parallel_tensor = false; + if (!found_parallel_tensor) { + for (int j = 0; j < config.data_parallelism_degree; j++) { + // Copy the metadata from pt_base to pt + ParallelTensor pt = new ParallelTensorBase(*pt_base); + pt->region = + runtime->create_logical_region(ctx, + pt_base->region.get_index_space(), + pt_base->region.get_field_space()); + pt->part = runtime->get_logical_partition( + ctx, pt->region, pt_base->part.get_index_partition()); + pt->machine_view = machine_views[j]; + Domain part_domain = + runtime->get_index_space_domain(ctx, pt_base->parallel_is); + assert(pt->machine_view.get_domain() == part_domain); + list.push_back(pt); + } + } + } + } + // Perform fusion optimizations if (config.perform_fusion) { fprintf(stderr, "Applying fusion optimizations during compilation...\n"); @@ -4321,45 +4473,46 @@ void FFIterationConfig::reset() { // Default Config Parameters struct DefaultConfig { - const static int epochs = 1; + static int const epochs = 1; // const static int iterations = 1; - const static int batchSize = 64; - const static bool profiling = false; - const static bool benchmarking = false; - const static bool inference_debugging = false; + + static int const batchSize = 64; + static bool const profiling = false; + static bool const benchmarking = false; + static bool const inference_debugging = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; - const static size_t workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB - const static int numNodes = 1; - const static int 
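// The compile-time loop above checks that every non-weight operator's output
// already carries the expected 1-D machine view: mv.dim[0] is the product of
// the output's per-dimension partition degrees. A small worked example (the
// degrees are illustrative):
//
//   // output dims with degrees {1, 1, 4, 1} under 4-way tensor parallelism
//   int parallel_degree = 1;
//   for (int k = 0; k < op->outputs[0]->num_dims; k++) {
//     parallel_degree *= op->outputs[0]->dims[k].degree;   // 1 * 1 * 4 * 1 = 4
//   }
//   mv.ndims = 1;
//   mv.dim[0] = parallel_degree;   // spans 4 devices
//   mv.stride[0] = 1;
//   mv.start_device_id = 0;        // devices 0..3
//   assert(mv == op->outputs[0]->machine_view);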
workersPerNode = 0; - const static int cpusPerNode = 0; - const static size_t searchBudget = -1; - const static size_t simulatorWorkSpaceSize = + static size_t const workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB + static int const numNodes = 1; + static int const workersPerNode = 0; + static int const cpusPerNode = 0; + static size_t const searchBudget = -1; + static size_t const simulatorWorkSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2 GB constexpr static float searchAlpha = 1.2f; - const static bool searchOverlapBackwardUpdate = false; - const static size_t offloadReserveSpaceSize = + static bool const searchOverlapBackwardUpdate = false; + static size_t const offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB // PEFT related fields - const static bool enablePeft = false; - const static size_t peftActivationReserveSpaceSize = + static bool const enablePeft = false; + static size_t const peftActivationReserveSpaceSize = (size_t)1 * 1024 * 1024 * 1024; // 1GB - const static size_t peftWeightReserveSpaceSize = + static size_t const peftWeightReserveSpaceSize = (size_t)1 * 1024 * 1024 * 1024; // 1GB - const static bool cpuOffload = false; - const static bool onlyDataParallel = true; - const static bool enableSampleParallel = true; - const static bool enableParameterParallel = false; - const static bool enableAttributeParallel = false; - const static bool enableInplaceOptimizations = false; - const static bool allowTensorOpMathConversion = false; - const static int machine_model_version = 0; - const static int simulator_segment_size = 16777216; // 16 MB - const static int simulator_max_num_segments = 1; - const static int base_optimize_threshold = 10; - const static bool enable_control_replication = true; + static bool const cpuOffload = false; + static bool const onlyDataParallel = true; + static bool const enableSampleParallel = true; + static bool const enableParameterParallel = false; + static bool const enableAttributeParallel = false; + static bool const enableInplaceOptimizations = false; + static bool const allowTensorOpMathConversion = false; + static int const machine_model_version = 0; + static int const simulator_segment_size = 16777216; // 16 MB + static int const simulator_max_num_segments = 1; + static int const base_optimize_threshold = 10; + static bool const enable_control_replication = true; // The default python data loader type is 2 to enable control replication - const static int python_data_loader_type = 2; + static int const python_data_loader_type = 2; }; FFConfig::FFConfig() { @@ -4440,6 +4593,9 @@ FFConfig::FFConfig() { Rect<1> task_rect(Point<1>(0), Point<1>(workersPerNode * numNodes - 1)); // Create an index space for tasks running on all GPUs all_gpu_task_is = runtime->create_index_space(lg_ctx, task_rect); + // <<<<<<< HEAD + // field_space = runtime->create_field_space(lg_ctx); + // ======= // field_space = runtime->create_field_space(lg_ctx); } @@ -4539,11 +4695,13 @@ void FFConfig::parse_args(char **argv, int argc) { tensor_parallelism_degree = std::stoi(argv[++i]); continue; } + // pipeline parallelism degree if (!strcmp(argv[i], "-pipeline-parallelism-degree")) { pipeline_parallelism_degree = std::stoi(argv[++i]); continue; } + if ((!strcmp(argv[i], "--enable-parameter-parallel"))) { enable_parameter_parallel = true; continue; @@ -7388,7 +7546,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } - // FusedParallelOp { TaskVariantRegistrar registrar(FUSED_PARALLELOP_FWD_TASK_ID, @@ -7482,6 +7639,23 @@ void 
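// The -tensor-parallelism-degree and -pipeline-parallelism-degree flags parsed
// above follow the same pattern as the existing switches: the value after the
// flag is consumed with std::stoi(argv[++i]). A hedged note on how the three
// degrees are usually chosen (the product constraint below is a common
// convention and is not enforced by this hunk):
//
//   // e.g. 8 GPUs total: 2-way data x 2-way tensor x 2-way pipeline
//   int total_devices = numNodes * workersPerNode;
//   assert(data_parallelism_degree * tensor_parallelism_degree *
//              pipeline_parallelism_degree ==
//          total_devices);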
register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ADAM_UNIFY_UPD_NCCL_TASK_ID, + "Adam unified NCCL Update"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AdamOptimizer::nccl_unified_update_task>( + registrar, "Adam unified NCCL Update Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } #endif // Initializer { diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index c42a0c9aa6..91a16e8db7 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -333,6 +333,7 @@ void AdamOptimizer::init(void) { Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; Initializer *initializer = new ZeroInitializer(); + reservedWorkSpaceSize = 0; for (size_t i = 0; i < model->parameters.size(); i++) { ParallelTensor p = model->parameters[i]; Domain domain = @@ -381,6 +382,7 @@ void AdamOptimizer::update(const ParallelTensor p) { assert(v_values.find(p->region) != v_values.end()); assert(m_values.find(p->region) != m_values.end()); assert(p->owner_op != NULL); + reservedWorkSpaceSize += p->get_volume() * sizeof(float); if (p->sync_type == ParameterSyncType::PS) { TaskLauncher launcher(ADAM_UPD_PS_TASK_ID, TaskArgument(this, sizeof(AdamOptimizer)), @@ -492,6 +494,119 @@ void AdamOptimizer::update(const ParallelTensor p) { } } +void SGDOptimizer::unified_update(std::vector const parameters) { + //todo +} + +void AdamOptimizer::unified_update(std::vector const parameters) { + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; + const ParallelTensor p0 = parameters.at(0); + ArgumentMap argmap; + Domain domain = runtime->get_index_space_domain(ctx, p0->parallel_is); + switch (domain.get_dim()) { +#define DIMFUNC(DIM) \ + case DIM: { \ + Rect rect = domain; \ + int idx = 0; \ + for (PointInRectIterator it(rect); it(); it++) { \ + OpMeta *mp = p0->owner_op->meta[idx++]; \ + argmap.set_point(*it, TaskArgument(&mp, sizeof(OpMeta *))); \ + } \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: + assert(false); + } + + int offset = 0; + int processed_parameters_num = 0; + // printf("param size: %d\n", parameters.size()); + + size_t workSpaceSize = model->handlers->workSpaceSize * + model->config.workersPerNode * model->config.numNodes; + + while (processed_parameters_num < parameters.size()) { + parameters_num = 0; + + for(int i = processed_parameters_num; i < parameters.size(); i++){ + const ParallelTensor p = parameters.at(i); + assert(v_values.find(p->region) != v_values.end()); + assert(m_values.find(p->region) != m_values.end()); + assert(p->owner_op != NULL); + if (reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= workSpaceSize) { + break; + } + reservedWorkSpaceSize += p->get_volume() * sizeof(float); + parameters_num += 1; + assert(p->sync_type == ParameterSyncType::NCCL); + assert(p->parallel_is != IndexSpace::NO_SPACE); + } + + // printf("parameters_num: %d %zu, %zu, %d\n", parameters_num, + // reservedWorkSpaceSize, model->handlers->workSpaceSize, + // parameters.size()); + assert(processed_parameters_num <= parameters.size()); + + IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, + p0->parallel_is, + TaskArgument(this, sizeof(AdamOptimizer)), + argmap, + Predicate::TRUE_PRED, + false /*must_epoch*/, + 0 /*mapper_id*/, + 
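// The while loop above batches parameters greedily: starting from
// processed_parameters_num it keeps admitting tensors while their combined
// float payload still fits in the NCCL workspace, then launches one fused
// update task for that batch and repeats until all parameters are processed.
// A condensed sketch of the packing decision, using the same variables as the
// code above:
//
//   size_t budget = workSpaceSize;   // per-handler workspace * #workers * #nodes
//   parameters_num = 0;
//   for (int i = processed_parameters_num; i < (int)parameters.size(); i++) {
//     size_t bytes = parameters.at(i)->get_volume() * sizeof(float);
//     if (reservedWorkSpaceSize + bytes >= budget) {
//       break;                       // this batch is full; launch and continue
//     }
//     reservedWorkSpaceSize += bytes;
//     parameters_num++;
//   }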
p0->machine_view.hash()); + // launch a unified task + for (int j = 0; j < parameters_num; j++) { + const ParallelTensor p = parameters.at(processed_parameters_num + j); + + // regions[0]: region_grad + launcher.add_region_requirement(RegionRequirement(p->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + p->region_grad)); + launcher.add_field(offset, FID_DATA); + // regions[1]: region + launcher.add_region_requirement(RegionRequirement( + p->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, p->region)); + launcher.add_field(offset + 1, FID_DATA); + // regions[2]: w_region + launcher.add_region_requirement( + RegionRequirement(v_values[p->region]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + v_values[p->region]->region)); + launcher.add_field(offset + 2, FID_DATA); + // regions[3]: m_region + launcher.add_region_requirement( + RegionRequirement(m_values[p->region]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + m_values[p->region]->region)); + launcher.add_field(offset + 3, FID_DATA); + offset += 4; + } + + // update alpha, beta + for (int i = 0; i < parameters_num; i++) { + this->next(); + } + launcher.concurrent = true; + FutureMap fm = runtime->execute_index_space(ctx, launcher); + // runtime->execute_must_epoch(ctx, must_epoch_launcher); + runtime->issue_execution_fence(ctx); + reservedWorkSpaceSize = 0; + offset = 0; + processed_parameters_num += parameters_num; + } + parameters_num = 0; +} + void AdamOptimizer::ps_update_task(Task const *task, std::vector const ®ions, Context ctx, @@ -605,6 +720,72 @@ void AdamOptimizer::nccl_update_task(Task const *task, nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); } + +void AdamOptimizer::nccl_unified_update_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // assert(regions.size() == 4); + // assert(task->regions.size() == 4); + AdamOptimizer const *op = (AdamOptimizer *)task->args; + OpMeta const *meta = *((OpMeta **)task->local_args); + // FFHandler handler = *((FFHandler*) task->local_args); + Domain domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + + // float const *w_grad_ptr[op->parameters_num]; + // float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], + // *m_ptr[op->parameters_num]; + + // hipMalloc(w_grad_ptr, sizeof(float*) * op->parameters_num); + // hipMalloc(w_ptr, sizeof(float*) * op->parameters_num); + // hipMalloc(v_ptr, sizeof(float*) * op->parameters_num); + // hipMalloc(m_ptr, sizeof(float*) * op->parameters_num); + GenericTensorAccessorR accWGrads[op->parameters_num]; + GenericTensorAccessorW accWs[op->parameters_num]; + GenericTensorAccessorW accVs[op->parameters_num]; + GenericTensorAccessorW accMs[op->parameters_num]; + size_t *size = new size_t[op->parameters_num]; + int offset = 0; + + // printf("parameters_num: %d\n", op->parameters_num); + + for (int i = 0; i < op->parameters_num; i++) { + accWGrads[i] = helperGetGenericTensorAccessorRO(DataType::DT_FLOAT, + regions[offset], + task->regions[offset], + FID_DATA, + ctx, + runtime); + accWs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, + regions[offset + 1], + task->regions[offset + 1], + FID_DATA, + ctx, + runtime); + accVs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, + regions[offset + 2], + task->regions[offset + 2], + FID_DATA, + ctx, + runtime); + accMs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, + regions[offset + 3], + task->regions[offset + 3], + FID_DATA, + ctx, + 
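// Each parameter in the fused launch above contributes four regions in a
// fixed order (grad, weight, v, m), so the task body below reads
// regions[offset + 0..3] with offset advancing by 4 per parameter. The
// host-side this->next() calls advance Adam's bias-correction state once per
// packed parameter, matching what the GPU loop recomputes per tensor:
//
//   beta1_t *= beta1;
//   beta2_t *= beta2;
//   alpha_t  = alpha * sqrt(1.0 - beta2_t) / (1.0 - beta1_t);
//
// i.e. the standard Adam step size alpha_t = alpha * sqrt(1 - beta2^t) / (1 - beta1^t).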
runtime); + offset += 4; + + size[i] = accWGrads[i].domain.get_volume(); + // w_grad_ptr[i] = accWGrad.get_float_ptr(); + // w_ptr[i] = accW.get_float_ptr(); + // v_ptr[i] = accV.get_float_ptr(); + // m_ptr[i] = accM.get_float_ptr(); + } + nccl_unified_update_task_gpu(op, meta, accWGrads, size, accWs, accVs, accMs); +} #endif }; // namespace FlexFlow diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index 59efaf5256..67f2541f92 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -204,6 +204,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, m_ptr, v_ptr, w_ptr); + // checkCUDA(hipDeviceSynchronize()); } @@ -245,6 +246,74 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, w_ptr); // checkCUDA(hipDeviceSynchronize()); } + +__host__ void AdamOptimizer::nccl_unified_update_task_gpu( + AdamOptimizer const *op, + OpMeta const *meta, + GenericTensorAccessorR *accWGrads, + size_t *size, + GenericTensorAccessorW *accWs, + GenericTensorAccessorW *accVs, + GenericTensorAccessorW *accMs) { + + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + // assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); + + void *workSpace_ptr = meta->handle.workSpace; + + for (int i = 0; i < op->parameters_num; i++) { + hipMemcpyAsync(workSpace_ptr, + accWGrads[i].get_float_ptr(), + size[i] * sizeof(float), + hipMemcpyDeviceToDevice, + stream); + workSpace_ptr = + static_cast(workSpace_ptr) + size[i] * sizeof(float); + } + + // do allreduce once + checkNCCL(ncclAllReduce(meta->handle.workSpace, + (float *)meta->handle.workSpace, + meta->handle.workSpaceSize, + ncclFloat, + ncclSum, + meta->handle.ncclComm, + stream)); + + workSpace_ptr = static_cast(meta->handle.workSpace); + float alpha_t = op->alpha_t; + float beta1_t = op->beta1_t; + float beta2_t = op->beta2_t; + for (int i = 0; i < op->parameters_num; i++) { + // update + // printf("update %d\n", i); + hipLaunchKernelGGL(HIP_KERNEL_NAME(adam_update), + GET_BLOCKS(size[i]), + CUDA_NUM_THREADS, + 0, + stream, + size[i], + alpha_t, + op->beta1, + op->beta2, + op->weight_decay, + op->epsilon, + static_cast(workSpace_ptr), + accMs[i].get_float_ptr(), + accVs[i].get_float_ptr(), + accWs[i].get_float_ptr()); + workSpace_ptr = + static_cast(workSpace_ptr) + size[i] * sizeof(float); + + // update + beta1_t *= op->beta1; + beta2_t *= op->beta2; + alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t); + } + + // checkCUDA(hipDeviceSynchronize()); +} #endif }; // namespace FlexFlow diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index df37e3b135..50c986e146 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -216,6 +216,103 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, w_ptr); // checkCUDA(cudaDeviceSynchronize()); } + +__host__ void AdamOptimizer::nccl_unified_update_task_gpu( + AdamOptimizer const *op, + OpMeta const *meta, + GenericTensorAccessorR *accWGrads, + size_t *size, + GenericTensorAccessorW *accWs, + GenericTensorAccessorW *accVs, + GenericTensorAccessorW *accMs) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + // assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); + + cudaEvent_t t_start, t_start1, t_start2, t_end; + cudaEventCreate(&t_start); + cudaEventCreate(&t_start1); + cudaEventCreate(&t_start2); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + cudaEventRecord(t_start1, 
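// The HIP path above implements the fused update in three phases: (1) copy
// every gradient into one contiguous workspace, (2) issue a single
// ncclAllReduce over that buffer, (3) walk the buffer and apply adam_update
// once per parameter. A condensed sketch, assuming the reduce count is the
// number of floats actually packed (total_elems below is illustrative; the
// hunk above passes the raw workspace size instead):
//
//   size_t total_elems = 0;
//   char *dst = static_cast<char *>(meta->handle.workSpace);
//   for (int i = 0; i < op->parameters_num; i++) {
//     hipMemcpyAsync(dst, accWGrads[i].get_float_ptr(),
//                    size[i] * sizeof(float), hipMemcpyDeviceToDevice, stream);
//     dst += size[i] * sizeof(float);
//     total_elems += size[i];
//   }
//   checkNCCL(ncclAllReduce(meta->handle.workSpace, meta->handle.workSpace,
//                           total_elems, ncclFloat, ncclSum,
//                           meta->handle.ncclComm, stream));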
stream); + cudaEventRecord(t_start2, stream); + + void *allocate_ptr; + // = meta->handle.workSpace; + checkCUDA( + cudaMalloc(&allocate_ptr,meta->handle.workSpaceSize)); + + void *workSpace_ptr = allocate_ptr; + + for (int i = 0; i < op->parameters_num; i++) { + cudaMemcpyAsync(workSpace_ptr, + accWGrads[i].get_float_ptr(), + size[i] * sizeof(float), + cudaMemcpyDeviceToDevice, + stream); + workSpace_ptr = + static_cast(workSpace_ptr) + size[i] * sizeof(float); + } + + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start1, t_end)); + cudaEventDestroy(t_start1); + printf("[optimizer] data copy time = %.2lfms\n", elapsed); + + // do allreduce once + checkNCCL(ncclAllReduce(meta->handle.workSpace, + (float *)meta->handle.workSpace, + meta->handle.workSpaceSize, + ncclFloat, + ncclSum, + meta->handle.ncclComm, + stream)); + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start2, t_end)); + cudaEventDestroy(t_start2); + printf("[optimizer] allreduce time = %.2lfms\n", elapsed); + + // workSpace_ptr = static_cast(meta->handle.workSpace); + workSpace_ptr = static_cast(allocate_ptr); + float alpha_t = op->alpha_t; + float beta1_t = op->beta1_t; + float beta2_t = op->beta2_t; + for (int i = 0; i < op->parameters_num; i++) { + // update + // printf("update %d\n", i); + adam_update<<>>( + size[i], + alpha_t, + op->beta1, + op->beta2, + op->weight_decay, + op->epsilon, + static_cast(workSpace_ptr), + accMs[i].get_float_ptr(), + accVs[i].get_float_ptr(), + accWs[i].get_float_ptr()); + workSpace_ptr = + static_cast(workSpace_ptr) + size[i] * sizeof(float); + + // update + beta1_t *= op->beta1; + beta2_t *= op->beta2; + alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t); + } + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + checkCUDA(cudaFree(allocate_ptr)); + printf("[optimizer] total time = %.2lfms\n", elapsed); +} #endif }; // namespace FlexFlow diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 8f1be15fd1..f26affd989 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -136,10 +136,15 @@ bool ParallelTensorShape::operator!=(ParallelTensorShape const &other) const { size_t ParallelTensorShape::get_piece_size() const { size_t piece_size = data_type_size(this->data_type); + return piece_size * this->get_piece_num_elements(); +} + +size_t ParallelTensorShape::get_piece_num_elements() const { + size_t piece_num_elements = 1; for (int i = 0; i < this->num_dims; i++) { - piece_size *= this->dims[i].size / this->dims[i].degree; + piece_num_elements *= this->dims[i].size / this->dims[i].degree; } - return piece_size; + return piece_num_elements; } RecordFormatter ParallelTensorShape::as_dot() const { @@ -270,7 +275,7 @@ void ParallelTensorBase::attach_raw_ptr(FFConfig &config, Runtime *runtime = config.lg_hlr; AttachLauncher launcher(EXTERNAL_INSTANCE, region, region); std::vector fields(1, FID_DATA); - const Memory local_sysmem = + Memory const local_sysmem = Machine::MemoryQuery(Machine::get_machine()) .has_affinity_to(runtime->get_executing_processor(ctx)) .only_kind(Memory::SYSTEM_MEM) @@ -668,12 +673,35 @@ bool ParallelTensorBase::set_tensor(FFModel const *ff, } else if (sync_type == 
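// get_piece_size() above is now expressed as
// data_type_size(data_type) * get_piece_num_elements(), where the element
// count of one piece divides each dimension's size by its partition degree.
// A small worked example (shapes are illustrative):
//
//   // dims: {size = 64, degree = 4}, {size = 1024, degree = 1}, DT_FLOAT
//   size_t piece_num_elements = (64 / 4) * (1024 / 1);       // 16 * 1024 = 16384
//   size_t piece_size = sizeof(float) * piece_num_elements;  // 65536 bytes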
ParameterSyncType::PS) { num_replicas = 1; } else { - num_replicas = 1; + for (int i = 0; i < this->num_dims; i++) { + if (this->dims[i].is_replica_dim) { + num_replicas *= this->dims[i].size; + } + } + // num_replicas = 1; } for (size_t i = 0; i < dim_sizes.size(); i++) { volume = volume * dim_sizes[i]; } - RegionRequirement req(region, WRITE_ONLY, EXCLUSIVE, region); + // Debug prints + { + std::string tensor_name; + if (owner_op == nullptr) { + tensor_name = "No OwnerOp"; + } else { + tensor_name = std::string(owner_op->name); + } + std::ostringstream oss; + for (int i = 0; i < dim_sizes.size(); i++) { + oss << dim_sizes[i] << ", "; + } + printf("%s num_replicas(%zu) volume(%zu) dims(%s)\n", + tensor_name.c_str(), + num_replicas, + volume, + oss.str().c_str()); + } + RegionRequirement req(region, READ_WRITE, EXCLUSIVE, region); req.add_field(FID_DATA); InlineLauncher launcher(req); PhysicalRegion pr = runtime->map_region(ctx, launcher); diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 9b6510fe5e..54047f3219 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -58,7 +58,7 @@ using namespace Legion; Legion::Logger log_xfers("xfers"); Legion::Logger log_xfer_matches("xfer_matches"); -const TensorX TensorX::NO_TX = TensorX(); +TensorX const TensorX::NO_TX = TensorX(); bool TensorX::operator==(TensorX const &other) const { return this->op == other.op && this->idx == other.idx; @@ -156,7 +156,7 @@ tl::optional TensorX::to_tensor(GraphXfer const *xfer) const { } } -OpX::OpX(const OperatorType _type, +OpX::OpX(OperatorType const _type, int num_inputs, int num_outputs, TensorX const &input0, @@ -178,7 +178,7 @@ OpX::OpX(const OperatorType _type, } } -OpX::OpX(const OperatorType _type, +OpX::OpX(OperatorType const _type, int num_inputs, int num_outputs, TensorX const *input_array) @@ -614,8 +614,9 @@ void GraphXfer::run( SimplificationSettings const &simplification_settings, int &num_matches_found, int &num_matches_rejected) { - // printf("run: depth(%d) srcOps.size(%zu) graph.size(%zu) candidates(%zu)\n", - // depth, srcOps.size(), graph->inEdges.size(), candidates.size()); + // printf("run: depth(%d) srcOps.size(%zu) graph.size(%zu) + // candidates(%zu)\n", depth, srcOps.size(), graph->inEdges.size(), + // candidates.size()); if (depth >= (int)srcOps.size()) { // Create dst operators bool pass = true; @@ -1215,6 +1216,7 @@ void Graph::export_strategy_computation_graph( for (auto const &node : s.get_nodes(*this)) { // Add node if (strategy.find(node) == strategy.end()) { + dot.add_node(node, {{"label", node.to_string()}}); // Check FusedParallel node here and print out the detailed information if (node.ptr->op_type == OperatorType::OP_FUSED_PARALLEL) { RecordFormatter rf; @@ -1925,6 +1927,7 @@ void GraphSearchHelper::graph_optimize( this->logger->debug() << "Starting graph optimization"; Graph *graph = this->construct_graph(); + graph->print_dot(); graph->duplicate_input_nodes(); std::unordered_map empty_strategy; if (!this->config.export_strategy_computation_graph_file.empty()) { @@ -1991,8 +1994,8 @@ void GraphSearchHelper::graph_optimize_with_memory( Graph *graph = this->construct_graph(); // The input nodes may need to be duplicated because the PCG was constructed - // to have one input node for one input, but the actual execution graph should - // have the distributed version of inputs (i.e. multiple nodes). + // to have one input node for one input, but the actual execution graph + // should have the distributed version of inputs (i.e. 
multiple nodes). graph->duplicate_input_nodes(); // Export an empty schedule if needed. @@ -2278,7 +2281,8 @@ std::unique_ptr GraphSearchHelper::base_optimize( int budget = model->config.search_budget; if (budget == 0) { log_xfers.warning() - << "Base search budget is set to 0. This is probably not what you want " + << "Base search budget is set to 0. This is probably not what you " + "want " "(use the --budget flag to set the base search budget)"; } for (int iter = 0; iter < budget || budget == -1; iter++) { @@ -2375,7 +2379,8 @@ std::unique_ptr GraphSearchHelper::base_optimize_with_memory( int budget = model->config.search_budget; if (budget == 0) { log_xfers.warning() - << "Base search budget is set to 0. This is probably not what you want " + << "Base search budget is set to 0. This is probably not what you " + "want " "(use the --budget flag to set the base search budget)"; } @@ -2547,8 +2552,8 @@ void GraphSearchHelper::try_cache_result( /** * @brief Get the cost/result of PCG if sequentially split it. * - * @details This function is to combine the search results from DP sub-problems. - * The sub-problems are solved by generic_sequence_optimize(). + * @details This function is to combine the search results from DP + * sub-problems. The sub-problems are solved by generic_sequence_optimize(). */ template T GraphSearchHelper::execute_sequence_split( @@ -2727,8 +2732,8 @@ T GraphSearchHelper::generic_sequence_optimize( // this->generic_sequence_optimize(post_graph.get(), // sink_node, output_shape, bottleneck_output_shape); // this->logger->debug() << "Cost of post_graph (" << - // bottleneck_output_shape << "): " << post_cost; float current_cost - // = pre_cost + post_cost; + // bottleneck_output_shape << "): " << post_cost; float + // current_cost = pre_cost + post_cost; current_cost = this->execute_sequence_split(pre_graph, post_graph, @@ -2790,10 +2795,10 @@ T GraphSearchHelper::generic_sequence_optimize_with_memory( tl::optional const &input_shape) { TAG_ENTER(this->logger); - // Try to find the result from cache first. But this will only get the cached - // result if the returned type is float. The float number means the best run - // time cost with only machine quantity (without distinguishing machine - // identities). + // Try to find the result from cache first. But this will only get the + // cached result if the returned type is float. The float number means the + // best run time cost with only machine quantity (without distinguishing + // machine identities). 
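// The sequence-optimization comments above describe the DP scheme used by
// GraphSearchHelper: the PCG is split at a bottleneck tensor shape, the two
// halves are optimized independently, and results are memoized by a hash of
// (graph, sink node, output shape, input shape) via gs_dp_state_hash, which
// is used just below. A hedged sketch of the recurrence (the helper names
// other than gs_dp_state_hash are illustrative, not this file's API):
//
//   float best = std::numeric_limits<float>::infinity();
//   for (ParallelTensorShape const &mid : candidate_bottleneck_shapes) {
//     float pre_cost  = optimize_subgraph(pre_graph,  /*output_shape=*/mid);
//     float post_cost = optimize_subgraph(post_graph, /*input_shape=*/mid);
//     best = std::min(best, pre_cost + post_cost);
//   }
//   cache[gs_dp_state_hash(graph, sink_node, output_shape, input_shape)] = best;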
size_t hash = gs_dp_state_hash(graph, sink_node, output_shape, input_shape); tl::optional cached = this->try_get_cost_from_cache(hash); if (cached.has_value()) { @@ -3623,6 +3628,7 @@ void FFModel::graph_optimize( this->graph_search->graph_optimize( budget, only_data_parallel, best_graph, optimal_views); } + best_graph->print_dot(); } bool FFModel::convert_graph_to_operators( @@ -3754,8 +3760,12 @@ bool FFModel::convert_graph_to_operators( case OP_SOFTMAX: { assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; - new_op = new Softmax( - *this, softmax->layer_guid, inputs[0], softmax->dim, softmax->name); + new_op = new Softmax(*this, + softmax->layer_guid, + inputs[0], + softmax->dim, + softmax->last_layer, + softmax->name); break; } case OP_COMBINE: { @@ -3814,6 +3824,12 @@ bool FFModel::convert_graph_to_operators( parallel_identity->name); break; } + // case OP_ALLREDUCE: { + // assert(inList.size() == 1); + // AllReduce *allreduce = (AllReduce *)node.ptr; + // new_op = new AllReduce(*this, inputs[0], allreduce->allreduce_dim); + // break; + // } case OP_FUSED_PARALLEL: { assert(inList.size() == 1); FusedParallelOp *fused = (FusedParallelOp *)node.ptr;