From 243615200b0a737e85d4c90fe4d96707780d66ad Mon Sep 17 00:00:00 2001 From: Teresa Noyola Date: Mon, 17 Apr 2023 17:46:19 -0400 Subject: [PATCH 01/72] push changes to print dot graph --- config/config.linux | 6 +- examples/python/pytorch/mt5/mt5_ff.py | 89 ++++++++++++---------- examples/python/pytorch/mt5/mt5_torch.py | 4 +- include/flexflow/graph.h | 2 +- include/flexflow/ops/split.h | 2 + python/flexflow/torch/model.py | 49 ++++++++++-- src/metrics_functions/metrics_functions.cu | 2 +- src/ops/element_unary.cc | 3 +- src/ops/linear.cc | 4 +- src/ops/split.cc | 11 +++ src/runtime/ffconst_utils.cc | 2 + src/runtime/graph.cc | 1 + src/runtime/model.cc | 6 ++ src/runtime/substitution.cc | 4 + 14 files changed, 130 insertions(+), 55 deletions(-) diff --git a/config/config.linux b/config/config.linux index 017243408b..72fe466a7f 100755 --- a/config/config.linux +++ b/config/config.linux @@ -14,7 +14,7 @@ #INSTALL_DIR= # set build type -BUILD_TYPE=${BUILD_TYPE:-Release} +BUILD_TYPE=Debug # set CUDA Arch to the desired GPU architecture(s) to target (e.g. pass "FF_CUDA_ARCH=60" for Pascal). # To pass more than one value, separate architecture numbers with a comma (e.g. FF_CUDA_ARCH=70,75). @@ -23,10 +23,10 @@ BUILD_TYPE=${BUILD_TYPE:-Release} FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"} # set CUDNN dir in case cmake cannot autodetect a path -CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} +CUDNN_DIR=${CUDNN_DIR:-"/sw/summit/cuda/11.0.3"} #/usr/local/cuda # set CUDA dir in case cmake cannot autodetect a path -CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"} +CUDA_DIR=${CUDA_DIR:-"/sw/summit/cuda/11.0.3"} # enable Python FF_USE_PYTHON=${FF_USE_PYTHON:-ON} diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index 41b84a269e..dccd9ba48c 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -5,14 +5,14 @@ import numpy as np from flexflow.core import * from flexflow.torch.model import PyTorchModel -from transformers import MT5ForConditionalGeneration, T5Tokenizer - +#from transformers import MT5ForConditionalGeneration, T5Tokenizer +from transformers import BertForMaskedLM, BertTokenizer sys.path.append("./examples/python/pytorch/mt5") from mt5_torch import DataPreparer, get_dataloaders, set_seed BASE_DIR = "examples/python/pytorch/mt5" DATA_DIR = os.path.join(BASE_DIR, "data") -NUMPY_DIR = os.path.join(DATA_DIR, "numpy") +NUMPY_DIR = os.path.join(DATA_DIR, "numpy_candle") def data_to_numpy() -> None: @@ -28,7 +28,8 @@ def data_to_numpy() -> None: """ model_params = { "SEED": 42, - "MODEL": "google/mt5-small", + #"MODEL": "google/mt5-small", + "MODEL": "bert-base-uncased", "TRAIN_BATCH_SIZE": None, # use the full dataset as one batch "EVAL_BATCH_SIZE": None, # use the full dataset as one batch "TRAIN_EPOCHS": 1, # unused @@ -36,7 +37,8 @@ def data_to_numpy() -> None: "MAX_TARGET_TEXT_LENGTH": 48, } set_seed(model_params) - tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"]) + #tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"]) + tokenizer = BertTokenizer.from_pretrained(model_params["MODEL"]) print("Getting dataloaders...") train_loader, eval_loader = get_dataloaders(tokenizer, model_params) assert len(train_loader) == 1 @@ -81,28 +83,31 @@ def preprocess_train() -> None: def top_level_task(): ffconfig = FFConfig() ffmodel = FFModel(ffconfig) - model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") - + #model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") + model = 
BertForMaskedLM.from_pretrained("bert-base-uncased") + #model = BertModel.from_pretrained("bert-base-uncased") # Load train data as numpy arrays print("Loading data...") - ids = np.load(os.path.join(NUMPY_DIR, "train_source_ids.npy")) - mask = np.load(os.path.join(NUMPY_DIR, "train_source_mask.npy")) - y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) - lm_labels = np.load(os.path.join(NUMPY_DIR, "train_lm_labels.npy")) + ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")) + mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")) + #y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) + lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")) batch_size = ffconfig.batch_size input_ids_shape = (batch_size, ids.shape[1]) attention_mask_shape = (batch_size, mask.shape[1]) - decoder_input_ids_shape = (batch_size, y_ids.shape[1]) + #decoder_input_ids_shape = (batch_size, y_ids.shape[1]) input_tensors = [ ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask - ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids + #ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids ] encoder_seq_length = ids.shape[1] - decoder_seq_length = y_ids.shape[1] - seq_length = (encoder_seq_length, decoder_seq_length) - input_names = ["input_ids", "attention_mask", "decoder_input_ids"] + #decoder_seq_length = y_ids.shape[1] + #seq_length = (encoder_seq_length, decoder_seq_length) + seq_length = encoder_seq_length + #input_names = ["input_ids", "attention_mask", "decoder_input_ids"] + input_names = ["input_ids", "attention_mask"] print("Tracing the model...") hf_model = PyTorchModel( @@ -110,6 +115,8 @@ def top_level_task(): batch_size=batch_size, seq_length=seq_length, ) output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True) + #from flexflow.torch.model import file_to_ff + #file_to_ff("mt5.ff", ffmodel, input_tensors) ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) print("Compiling the model...") @@ -123,9 +130,12 @@ def top_level_task(): ) print("Creating data loaders...") + print('id_dtype', ids.dtype) + print('mask_dtype', mask.dtype) + print('labels_dtype', lm_labels.dtype) input_ids_dl = ffmodel.create_data_loader(input_tensors[0], ids) attention_mask_dl = ffmodel.create_data_loader(input_tensors[1], mask) - decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) + #decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) # NOTE: We cast down the label tensor data to 32-bit to accommodate the # label tensor's required dtype labels_dl = ffmodel.create_data_loader( @@ -138,31 +148,32 @@ def top_level_task(): print("Training...") epochs = ffconfig.epochs ffmodel.fit( - x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], + #x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], + x=[input_ids_dl, attention_mask_dl], y=labels_dl, batch_size=batch_size, epochs=epochs, ) if __name__ == "__main__": - # Generate the .tsv files if needed - if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \ - not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")): - DataPreparer.data_to_tsv() - # Convert the .tsv files to .npy if needed - if not os.path.exists(NUMPY_DIR): - os.mkdir(NUMPY_DIR) - prefixes = ["train_", "eval_"] - suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"] - npy_filenames = [ - pre + suf for pre, suf in 
itertools.product(prefixes, suffixes) - ] - if any( - not os.path.exists(os.path.join(NUMPY_DIR, filename)) - for filename in npy_filenames - ): - data_to_numpy() - # Preprocess the training data if needed - if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \ - not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")): - preprocess_train() + ## Generate the .tsv files if needed + #if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \ + # not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")): + # DataPreparer.data_to_tsv() + ## Convert the .tsv files to .npy if needed + #if not os.path.exists(NUMPY_DIR): + # os.mkdir(NUMPY_DIR) + #prefixes = ["train_", "eval_"] + #suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"] + #npy_filenames = [ + # pre + suf for pre, suf in itertools.product(prefixes, suffixes) + #] + #if any( + # not os.path.exists(os.path.join(NUMPY_DIR, filename)) + # for filename in npy_filenames + #): + # data_to_numpy() + ## Preprocess the training data if needed + #if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \ + # not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")): + # preprocess_train() top_level_task() diff --git a/examples/python/pytorch/mt5/mt5_torch.py b/examples/python/pytorch/mt5/mt5_torch.py index 78886eed6c..4d741c44a5 100644 --- a/examples/python/pytorch/mt5/mt5_torch.py +++ b/examples/python/pytorch/mt5/mt5_torch.py @@ -7,7 +7,7 @@ import os import numpy as np -import pandas as pd +#import pandas as pd import torch from torch.utils.data import DataLoader, Dataset from transformers import MT5ForConditionalGeneration, T5Tokenizer @@ -311,5 +311,5 @@ def TorchMT5Trainer( "MAX_TARGET_TEXT_LENGTH": 48, "LEARNING_RATE": 1e-4, } - device = torch.device(0) + device = torch.device('cpu') TorchMT5Trainer(model_params, device) diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h index e52911853e..800f750f42 100644 --- a/include/flexflow/graph.h +++ b/include/flexflow/graph.h @@ -90,7 +90,7 @@ struct NodeCompare { struct GraphOptimalViewSerialized { #ifdef LEGION_MAX_RETURN_SIZE - static const size_t buffer_size = LEGION_MAX_RETURN_SIZE - 8; + static const size_t buffer_size = 4*LEGION_MAX_RETURN_SIZE - 8; #else static const size_t buffer_size = 1024 * 1024 - 8; #endif diff --git a/include/flexflow/ops/split.h b/include/flexflow/ops/split.h index 633268ffbf..6c0736a76f 100644 --- a/include/flexflow/ops/split.h +++ b/include/flexflow/ops/split.h @@ -50,6 +50,8 @@ class Split : public Op { Params get_params() const; + tl::optional as_dot() const override; + public: int legion_axis; std::vector splits; diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 65b1669e99..2e6a2a23f3 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -1,4 +1,4 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# Copyright 2020 Stanford University, Los Alamos National Laboratory # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,7 +16,6 @@ from collections import OrderedDict from enum import Enum from typing import List -import copy import numpy as np from flexflow.core.flexflow_cffi import Tensor, NormInitializer @@ -26,6 +25,7 @@ try: import torch + print(torch.__version__) from torch.fx.immutable_collections import immutable_dict except: pass @@ -931,7 +931,7 @@ def construct_node(node): elif name.find("contiguous") >= 0: return ContiguousNode(node) elif name.find("tanh") >= 0: return TanhFNode(node) elif name.find("gelu") >= 0: return GeluFNode(node) - assert 0, f"Unknown function or method: {name}" + assert 0, f"Unknown function or method: {name} {node}" @staticmethod def is_right_scalar_op(node): @@ -1424,7 +1424,7 @@ def is_truncate(slice_elem, old_size): stop = old_size if slice_elem.stop == None else slice_elem.stop new_size = stop - start return new_size < old_size - + def is_single_element(slice_elem): return isinstance(slice_elem, int) @@ -1441,6 +1441,7 @@ def is_single_element(slice_elem): # Match dimensions from right to left new_shape = [] # append then reverse j = len(shape) - 1 + import copy curr_tensor = copy.copy(tensor) for slice_elem in reversed(slices): @@ -1482,7 +1483,38 @@ def is_single_element(slice_elem): new_shape.reverse() return ffmodel.reshape(input=curr_tensor, shape=new_shape, name=name,) - + + + +# """Returns a reshaped tensor based on the given slices.""" +# def is_colon(slice_elem): +# """Returns if the slice is equivalent to `:`.""" +# return slice_elem == slice(None, None, None) +# +# def is_unsqueeze(slice_elem): +# """Returns if the slice is equivalent to unsqueezing that +# dimension.""" +# return slice_elem is None +# shape = tensor.dims +# # Match dimensions from right to left +# new_shape = [] # append then reverse +# j = len(shape) - 1 +# for slice_elem in reversed(slices): +# if is_colon(slice_elem): +# assert j >= 0 +# new_shape.append(shape[j]) +# j -= 1 +# elif is_unsqueeze(slice_elem): +# new_shape.append(1) +# else: +# assert 0, f"Unsupported slice element: {slice_elem}" +# new_shape.reverse() +# return ffmodel.reshape( +# input=tensor, shape=new_shape, name=name, +# ) + + + @staticmethod def strings_to_slices(strings: List[str]): # Extract slice elements @@ -1751,7 +1783,7 @@ def __init__(self, node): def parse(self): s = [self.name] scalar = self.innodes[1] - if type(scalar) is not int or type(scalar) is not float: + if not isinstance(scalar, [int, float]): assert 0, "FlexFlow does not support tensor floor division" innodes = (self.innodes[0],) s.append(self.parse_inoutnodes(innodes)) @@ -2440,6 +2472,11 @@ def _trace_model(self): batch_size=self.batch_size, sequence_length=self.seq_length, ) + + #import pickle + #with open('symbolic_trace', 'rb') as f: + #traced = pickle.load(f) + else: traced = torch.fx.symbolic_trace(self.model) diff --git a/src/metrics_functions/metrics_functions.cu b/src/metrics_functions/metrics_functions.cu index 2e037eb472..b68b10d873 100644 --- a/src/metrics_functions/metrics_functions.cu +++ b/src/metrics_functions/metrics_functions.cu @@ -29,7 +29,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = -1; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 60112bfdc9..b50f54bbf0 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ 
-125,7 +125,8 @@ Tensor FFModel::tanh(const Tensor x, char const *name) { } Tensor FFModel::identity(const Tensor x, char const *name) { - return this->unary(OP_IDENTITY, x, false /*inplace*/, name); + //return this->unary(OP_IDENTITY, x, false /*inplace*/, name); + return x; } Tensor FFModel::gelu(const Tensor x, char const *name) { diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 668c8d070a..4a37f7c02e 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -418,7 +418,7 @@ void Linear::forward_task_with_dim(Task const *task, assert(acc_kernel.rect.volume() == static_cast(in_dim * out_dim)); float const *acc_bias_ptr = NULL; if (m->use_bias) { - TensorAccessorR acc_bias( + TensorAccessorR acc_bias( regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(acc_bias.rect.volume() == static_cast(out_dim)); acc_bias_ptr = acc_bias.ptr; @@ -604,7 +604,7 @@ void Linear::backward_task_with_dim(Task const *task, static_cast(in_dim * out_dim)); float *acc_bias_grad_ptr = NULL; if (m->use_bias) { - TensorAccessorW acc_bias_grad(regions[rid], + TensorAccessorW acc_bias_grad(regions[rid], task->regions[rid], FID_DATA, ctx, diff --git a/src/ops/split.cc b/src/ops/split.cc index 4f60cb96f0..3517852942 100644 --- a/src/ops/split.cc +++ b/src/ops/split.cc @@ -330,6 +330,17 @@ void Split::backward_task(Task const *task, split->numOutputs); } +tl::optional Split::as_dot() const { + RecordFormatter rr; + RecordFormatter r; + + r << this->inputs[0]->get_shape().as_dot(); + r << this->outputs[0]->get_shape().as_dot(); + rr << r; + + return rr; +} + bool Split::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index d8f4e6e179..7ab9201113 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -172,6 +172,8 @@ std::string get_operator_type_name(OperatorType type) { return "Pipeline"; case OP_FUSED_PARALLEL: return "FusedParallelOp"; + case OP_GELU: + return "Gelu"; default: throw std::runtime_error("Operator type unsupported: " + std::to_string(type)); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index ad298f5c93..77bb29cf47 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1588,6 +1588,7 @@ GraphOptimalViewSerialized std::unordered_map optimal_views; if (model->config.only_data_parallel) { Graph *graph = new Graph(model); + graph->print_dot(); std::unordered_map op_to_node_map; for (FlexFlow::Op const *dstOp : model->operators) { Node dstNode; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index ca1ab33343..5318684f53 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2631,10 +2631,16 @@ Op *FFModel::create_operator_from_layer( tensor->parallel_tensor = pt; // start from data parllel tensor if (config.only_data_parallel) { + if (pt->dims[num_dims-1].size == 1) { + Replicate *repl = new Replicate( + *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); + operators.push_back(repl); + } else { Repartition *part = new Repartition( *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); operators.push_back(part); } + } return operators[operators.size() - 1]; } case OP_MULTIHEAD_ATTENTION: { diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 9d3d6057f1..2925eb7555 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -1892,6 +1892,7 @@ void GraphSearchHelper::graph_optimize( this->logger->debug() << "Starting graph optimization"; Graph 
*graph = this->construct_graph(); + graph->print_dot(); graph->duplicate_input_nodes(); std::unordered_map empty_strategy; if (!this->config.export_strategy_computation_graph_file.empty()) { @@ -1929,6 +1930,8 @@ void GraphSearchHelper::graph_optimize( } } best_graph->print_strategy_computation_graph(optimal.views); + //std::cout << "PCG:" << std::endl; + //best_graph->print_dot(); optimal_views = real_optimal_views; } @@ -3117,6 +3120,7 @@ void FFModel::graph_optimize( std::unordered_map &optimal_views) { this->graph_search->graph_optimize( budget, only_data_parallel, best_graph, optimal_views); + best_graph->print_dot(); } bool FFModel::convert_graph_to_operators( From 7030ae09677f4157118bb79dbeeab3a89354cc79 Mon Sep 17 00:00:00 2001 From: Teresa Noyola Date: Fri, 21 Apr 2023 16:45:26 -0400 Subject: [PATCH 02/72] padded input to 512, added isExact to slice_tensor --- examples/python/pytorch/mt5/mt5_ff.py | 9 +++++++++ python/flexflow/torch/model.py | 14 +++++++++++++- src/runtime/model.cc | 3 ++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index dccd9ba48c..fa0801a517 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -3,6 +3,7 @@ import sys import numpy as np +import torch from flexflow.core import * from flexflow.torch.model import PyTorchModel #from transformers import MT5ForConditionalGeneration, T5Tokenizer @@ -89,9 +90,17 @@ def top_level_task(): # Load train data as numpy arrays print("Loading data...") ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")) + ids = np.pad(ids, ((0,0), (0,17)), 'constant') + #ids = np.random.randint(0, 5, (1000, 512)) + #print('ids_shape', ids.shape) + #print('ids', ids) mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")) + mask = np.pad(mask, ((0,0), (0,17)), 'constant') + #mask = np.random.randint(0, 2, (1000, 512)) #y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")) + lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant') + #lm_labels = np.random.randint(-1, 5, (1000, 512)) batch_size = ffconfig.batch_size input_ids_shape = (batch_size, ids.shape[1]) diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 2e6a2a23f3..ecd455a292 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -1409,6 +1409,7 @@ def to_ff(self, ffmodel, node_to_output): @staticmethod def slice_tensor(ffmodel, tensor, slices, name): + """Returns a reshaped tensor based on the given slices.""" def is_colon(slice_elem): """Returns if the slice is equivalent to `:`.""" @@ -1428,6 +1429,12 @@ def is_truncate(slice_elem, old_size): def is_single_element(slice_elem): return isinstance(slice_elem, int) + def is_exact(slice_elem, old_size): + start = 0 if slice_elem.start == None else slice_elem.start + stop = old_size if slice_elem.stop == None else slice_elem.stop + new_size = stop - start + return new_size == old_size + shape = tensor.dims # Fewer slices than input dimensions @@ -1457,6 +1464,8 @@ def is_single_element(slice_elem): curr_tensor = ffmodel.split(input=curr_tensor, sizes=splits, axis=j, name=name)[0] new_shape.append(1) j -= 1 + elif is_exact(slice_elem, shape[j]): + pass elif is_truncate(slice_elem, shape[j]): assert j >= 0 start = 0 if slice_elem.start == None else slice_elem.start @@ -1482,7 +1491,10 @@ def is_single_element(slice_elem): assert 0, f"Unsupported slice 
element: {slice_elem}" new_shape.reverse() - return ffmodel.reshape(input=curr_tensor, shape=new_shape, name=name,) + if len(new_shape) == 0: + return curr_tensor + else: + return ffmodel.reshape(input=curr_tensor, shape=new_shape, name=name,) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5318684f53..55e76d5071 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2634,7 +2634,8 @@ Op *FFModel::create_operator_from_layer( if (pt->dims[num_dims-1].size == 1) { Replicate *repl = new Replicate( *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); - operators.push_back(repl); + repl->outputs[0]->dims[num_dims-1].is_replica_dim = true; + operators.push_back(repl); } else { Repartition *part = new Repartition( *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); From 280d29a47e5366fcfd968769aa7b64499f7fa7e4 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Sun, 7 May 2023 16:52:58 -0700 Subject: [PATCH 03/72] Add multiprecision support to replicate --- include/flexflow/parallel_ops/replicate.h | 14 ++++++ src/parallel_ops/replicate.cc | 59 ++++++++++++++++++++--- 2 files changed, 65 insertions(+), 8 deletions(-) diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index 381f690cdc..44ab0c20a5 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -39,6 +39,20 @@ class Replicate : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + + template + static void + forward_task_with_type(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + + template + static void backward_task_with_type(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 031166e63e..16919bb739 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -143,7 +143,7 @@ void Replicate::forward(FFModel const &ff) { assert(numInputs == 1); IndexLauncher launcher(REPLICATE_FWD_TASK_ID, outputs[0]->parallel_is, - TaskArgument(NULL, 0), + TaskArgument(&data_type, sizeof(DataType)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -169,7 +169,7 @@ void Replicate::backward(FFModel const &ff) { assert(numInputs == 1); IndexLauncher launcher(REPLICATE_BWD_TASK_ID, inputs[0]->parallel_is, - TaskArgument(NULL, 0), + TaskArgument(&data_type, sizeof(DataType)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -222,7 +222,29 @@ bool Replicate::append_parallel_op_info( return true; } +/*static*/ void Replicate::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + DataType data_type = *((DataType *)task->args); + if (data_type == DT_FLOAT) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_DOUBLE) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT32) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT64) { + forward_task_with_type(task, regions, ctx, runtime); + } else { + assert(false && "Unsupported data type in Replicate forward"); + } +} + +template +void Replicate::forward_task_with_type(Task const *task, std::vector 
const ®ions, Context ctx, Runtime *runtime) { @@ -238,15 +260,36 @@ void Replicate::forward_task(Task const *task, assert(output_domain.hi()[i] == input_domain.hi()[i]); } assert(input_domain.get_volume() == output_domain.get_volume()); - float const *input_ptr = helperGetTensorPointerRO( + T const *input_ptr = helperGetTensorPointerRO( regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *output_ptr = helperGetTensorPointerRW( + T *output_ptr = helperGetTensorPointerRW( regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel(input_ptr, output_ptr, input_domain.get_volume()); + forward_kernel(input_ptr, output_ptr, input_domain.get_volume()); } void Replicate::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + DataType data_type = *((DataType *)task->args); + if (data_type == DT_FLOAT) { + backward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_DOUBLE) { + backward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT32) { + backward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT64) { + backward_task_with_type(task, regions, ctx, runtime); + } else { + assert(false && "Unsupported data type in Embedding forward"); + } +} + +template +void Replicate::backward_task_with_type(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { @@ -263,12 +306,12 @@ void Replicate::backward_task(Task const *task, } size_t num_elements = input_grad_domain.get_volume(); size_t num_replicas = output_grad_domain.get_volume() / num_elements; - float const *output_grad_ptr = helperGetTensorPointerRO( + T const *output_grad_ptr = helperGetTensorPointerRO( regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *input_grad_ptr = helperGetTensorPointerRW( + T *input_grad_ptr = helperGetTensorPointerRW( regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel( + backward_kernel( output_grad_ptr, input_grad_ptr, num_elements, num_replicas); } From 8e84401cf5e7b623f6a39ab968c6563a06ec060d Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Sun, 7 May 2023 16:56:46 -0700 Subject: [PATCH 04/72] Add explicit template instantiations for replicate kernels --- src/parallel_ops/kernels/replicate_kernels.cu | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index de208d2aed..0055bbaea0 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -68,6 +68,42 @@ template void backward_kernel(float const *output_grad_ptr, float *input_grad_ptr, size_t num_elements, size_t num_replicas); +template void forward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(double const *output_grad_ptr, + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); +template void forward_kernel(float const *input_ptr, + int32_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(float const *input_ptr, + int32_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(float const *output_grad_ptr, + int32_t *input_grad_ptr, + size_t 
num_elements, + size_t num_replicas); +template void forward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(double const *output_grad_ptr, + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); } // namespace Replicate } // namespace Kernels From a2ddb91c3b94d11f2446326c29b34cd5638f382b Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Sun, 7 May 2023 17:00:37 -0700 Subject: [PATCH 05/72] Fix incorrect instantiations --- src/parallel_ops/kernels/replicate_kernels.cu | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index 0055bbaea0..2b8ff55eac 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -80,28 +80,28 @@ template void backward_kernel(double const *output_grad_ptr, double *input_grad_ptr, size_t num_elements, size_t num_replicas); -template void forward_kernel(float const *input_ptr, +template void forward_kernel(int32_t const *input_ptr, int32_t *output_ptr, size_t num_elements); template __global__ void - replicate_backward_kernel(float const *input_ptr, + replicate_backward_kernel(int32_t const *input_ptr, int32_t *output_ptr, size_t num_elements, size_t num_replicas); -template void backward_kernel(float const *output_grad_ptr, +template void backward_kernel(int32_t const *output_grad_ptr, int32_t *input_grad_ptr, size_t num_elements, size_t num_replicas); -template void forward_kernel(double const *input_ptr, - double *output_ptr, +template void forward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, size_t num_elements); template __global__ void - replicate_backward_kernel(double const *input_ptr, - double *output_ptr, + replicate_backward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, size_t num_elements, size_t num_replicas); -template void backward_kernel(double const *output_grad_ptr, - double *input_grad_ptr, +template void backward_kernel(int64_t const *output_grad_ptr, + int64_t *input_grad_ptr, size_t num_elements, size_t num_replicas); From 52cc8e8b1231c5a4bc5c14f9c7c0012a8f23b464 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Mon, 8 May 2023 15:22:10 -0700 Subject: [PATCH 06/72] Add nop init_task for replicate --- include/flexflow/parallel_ops/replicate.h | 4 ++++ src/parallel_ops/replicate.cc | 9 ++++++++- src/runtime/model.cc | 7 +++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index 44ab0c20a5..9514d055f3 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -31,6 +31,10 @@ class Replicate : public ParallelOp { bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( std::vector ¶llel_ops) const override; + static void init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 16919bb739..dec5b3acbe 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -115,7 +115,7 @@ void 
Replicate::init(FFModel const &ff) { Runtime *runtime = ff.config.lg_hlr; assert(numOutputs == 1); assert(numInputs == 1); - IndexLauncher launcher(REPLICATE_FWD_TASK_ID, + IndexLauncher launcher(REPLICATE_INIT_TASK_ID, outputs[0]->parallel_is, TaskArgument(NULL, 0), argmap, @@ -141,6 +141,7 @@ void Replicate::forward(FFModel const &ff) { Runtime *runtime = ff.config.lg_hlr; assert(numOutputs == 1); assert(numInputs == 1); + DataType data_type = inputs[0]->data_type; IndexLauncher launcher(REPLICATE_FWD_TASK_ID, outputs[0]->parallel_is, TaskArgument(&data_type, sizeof(DataType)), @@ -222,6 +223,12 @@ bool Replicate::append_parallel_op_info( return true; } +void Replicate::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { +} + /*static*/ void Replicate::forward_task(Task const *task, std::vector const ®ions, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 55e76d5071..1e95823b57 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4446,6 +4446,13 @@ void register_flexflow_internal_tasks() { registrar, "Combine Backward Task"); } // Replicate + { + TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Replicate Init Task"); + } { TaskVariantRegistrar registrar(REPLICATE_FWD_TASK_ID, "Replicate Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); From 9eb530a3fb22aa9058ed381bb3bdefd17a6534d5 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Mon, 8 May 2023 15:31:25 -0700 Subject: [PATCH 07/72] Fix replicate init_task registration --- src/runtime/model.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 1e95823b57..2e41c6af0d 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4450,7 +4450,7 @@ void register_flexflow_internal_tasks() { TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( + Runtime::preregister_task_variant( registrar, "Replicate Init Task"); } { From d7a219ca4e6af68b544d7747d04408e4b15f440b Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Wed, 10 May 2023 11:33:04 -0700 Subject: [PATCH 08/72] Hopefully print hip errors --- include/flexflow/utils/hip_helper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 6970832231..2ea09770d6 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -19,7 +19,7 @@ do { \ std::stringstream _error; \ if (status != miopenStatusSuccess) { \ - _error << "CUDNN failure: " << status; \ + _error << "CUDNN failure: " << miopenGetErrorString(status); \ FatalError(_error.str()); \ } \ } while (0) From f323a6e75c4909aba86d30a85386d533739f9413 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Wed, 10 May 2023 11:38:26 -0700 Subject: [PATCH 09/72] Instantiate extra hip replicate kernels --- .../kernels/replicate_kernels.cpp | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index 29f1d30d1f..ff9751ee34 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -76,6 +76,45 @@ 
template void backward_kernel(float const *output_grad_ptr, size_t num_elements, size_t num_replicas); +template void forward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(double const *output_grad_ptr, + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + +template void forward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int64_t const *output_grad_ptr, + int64_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + +template void forward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int32_t const *output_grad_ptr, + int32_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + } // namespace Replicate } // namespace Kernels } // namespace FlexFlow From 9a75f248d31f0d98f52610176d1a6e94142f9009 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 10 May 2023 18:57:28 +0000 Subject: [PATCH 10/72] fix --- src/parallel_ops/replicate.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index dec5b3acbe..6b24d98abb 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -75,7 +75,7 @@ Replicate::Replicate(FFModel &model, dims[replicate_dim].degree *= replicate_degree; ParallelTensorBase::update_parallel_ids(numdim, dims); outputs[0] = model.create_parallel_tensor_legion_ordering( - numdim, dims, DT_FLOAT, this); + numdim, dims, _input->data_type, this); // inputs[0]->print("Replicate::input"); // outputs[0]->print("Replicate::output"); } From c9277f34b912f3b6ed2445d35b2be725d6d4dd50 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 10 May 2023 19:04:51 +0000 Subject: [PATCH 11/72] debug changs --- src/ops/element_binary.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index b90e85588b..6adea1480c 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -381,7 +381,7 @@ void ElementBinary::forward(FFModel const &ff) { set_argumentmap_for_forward(ff, argmap); IndexLauncher launcher(ELEMENTBINARY_FWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(ElementBinary)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -448,7 +448,7 @@ __host__ void std::vector const ®ions, Context ctx, Runtime *runtime) { - // const ElementBinary* ele = (const ElementBinary*) task->args; + const ElementBinary* ele = (const ElementBinary*) task->args; ElementBinaryMeta const *m = *((ElementBinaryMeta **)task->local_args); Domain in1_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); From fe66561263fb5bf2f0e95b416b5e922cdfea567d Mon Sep 17 00:00:00 2001 From: Teresa Noyola Date: Fri, 12 May 2023 17:01:30 -0400 Subject: [PATCH 12/72] Add slice_tensor fix --- python/flexflow/torch/model.py | 18 ++++++++++++++++-- src/runtime/graph.cc | 1 + 2 files changed, 17 insertions(+), 2 
deletions(-) diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index ecd455a292..aa80758843 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -1410,6 +1410,9 @@ def to_ff(self, ffmodel, node_to_output): @staticmethod def slice_tensor(ffmodel, tensor, slices, name): + print('slices', slices) + old_shape = tensor.dims + print('old_shape', tensor.dims) """Returns a reshaped tensor based on the given slices.""" def is_colon(slice_elem): """Returns if the slice is equivalent to `:`.""" @@ -1430,12 +1433,15 @@ def is_single_element(slice_elem): return isinstance(slice_elem, int) def is_exact(slice_elem, old_size): + if slice_elem is None: + return False start = 0 if slice_elem.start == None else slice_elem.start stop = old_size if slice_elem.stop == None else slice_elem.stop new_size = stop - start return new_size == old_size shape = tensor.dims + print('input dims', tensor.dims) # Fewer slices than input dimensions diff = len(shape) - len(slices) @@ -1452,9 +1458,14 @@ def is_exact(slice_elem, old_size): curr_tensor = copy.copy(tensor) for slice_elem in reversed(slices): - if is_colon(slice_elem): + print('slice_elem', slice_elem) + if is_colon(slice_elem) or is_exact(slice_elem, shape[j]): + print('shape', shape) assert j >= 0 + print('j', j) + print('new_shape_bef', new_shape) new_shape.append(shape[j]) + print('new_shape_aft', new_shape) j -= 1 elif is_unsqueeze(slice_elem): new_shape.append(1) @@ -1465,7 +1476,7 @@ def is_exact(slice_elem, old_size): new_shape.append(1) j -= 1 elif is_exact(slice_elem, shape[j]): - pass + print('exact') elif is_truncate(slice_elem, shape[j]): assert j >= 0 start = 0 if slice_elem.start == None else slice_elem.start @@ -1494,6 +1505,9 @@ def is_exact(slice_elem, old_size): if len(new_shape) == 0: return curr_tensor else: + print('new_shape', new_shape) + if old_shape == new_shape: + return curr_tensor return ffmodel.reshape(input=curr_tensor, shape=new_shape, name=name,) diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 77bb29cf47..4b55a39104 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1602,6 +1602,7 @@ GraphOptimalViewSerialized graph->add_edge(srcNode, dstNode, dstOp->inputs[j]->owner_idx, j); } } + graph->print_dot(); best_graph = std::unique_ptr(graph); MachineView data_parallel_view; data_parallel_view.device_type = MachineView::GPU; From 11777485a15c88f6c981a0b2a5116ec3a48d5671 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 12 May 2023 14:14:06 -0700 Subject: [PATCH 13/72] Add logging for metrics --- src/metrics_functions/metrics_functions.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/metrics_functions/metrics_functions.cc b/src/metrics_functions/metrics_functions.cc index e8ccbfe2e4..7244b06925 100644 --- a/src/metrics_functions/metrics_functions.cc +++ b/src/metrics_functions/metrics_functions.cc @@ -15,6 +15,7 @@ #include "flexflow/metrics_functions.h" #include "flexflow/model.h" +#include namespace FlexFlow { @@ -90,6 +91,8 @@ void Metrics::compute(FFModel *model, false /*must*/, 0 /*mapper_id*/, logit->machine_view.hash()); + std::cout << "logit shape: " << logit->get_shape() << std::endl; + std::cout << "label shape: " << label->get_shape() << std::endl; launcher.add_region_requirement(RegionRequirement( logit->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, logit->region)); launcher.add_field(0, FID_DATA); @@ -154,6 +157,7 @@ PerfMetrics assert(acc_label.rect.lo[0] == acc_label.rect.hi[0]); // Cannot measure 
categorical_crossentropy w/ sparse labels // Use measure_sparse_categorical_crossentropy instead + std::cout << "num_classes: " << num_classes << std::endl; assert(!me->measure_categorical_crossentropy); Metrics::update_metrics_sparse_label_kernel_wrapper(acc_logit.ptr, acc_label.ptr, From 5b7cacea6c72bf779f26f8298e054e4bbd2d1eaf Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 12 May 2023 14:21:01 -0700 Subject: [PATCH 14/72] Add the cuda metrics hack to hip kernel as well --- src/metrics_functions/metrics_functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metrics_functions/metrics_functions.cpp b/src/metrics_functions/metrics_functions.cpp index 90d727b9b1..d30686be24 100644 --- a/src/metrics_functions/metrics_functions.cpp +++ b/src/metrics_functions/metrics_functions.cpp @@ -30,7 +30,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = -1; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { From e798a91d49a267550ee902206f0ade158ba7eb47 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 12 May 2023 14:44:05 -0700 Subject: [PATCH 15/72] Add parallel dim pretty printing --- gdb/pretty_print.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/gdb/pretty_print.py b/gdb/pretty_print.py index 4cccc9b76b..aac8d56444 100644 --- a/gdb/pretty_print.py +++ b/gdb/pretty_print.py @@ -80,6 +80,17 @@ def to_string(self): toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]') return f'ParallelTensorBase<{" ".join(toks)}>' +class ParallelDimPrinter: + def __init__(self, val): + self.val = val + + def to_string(self): + size = self.val['size'] + degree = self.val['degree'] + parallel_idx = self.val['parallel_idx'] + return f'ParallelDim' + + def build_pretty_printer(): pp = gdb.printing.RegexpCollectionPrettyPrinter( "flexflow") @@ -89,6 +100,7 @@ def build_pretty_printer(): pp.add_printer('Domain', '^Legion::Domain$', DomainPrinter) pp.add_printer('ParallelTensorShape', '^FlexFlow::ParallelTensorShape$', TensorShapePrinter) pp.add_printer('ParallelTensorBase', '^FlexFlow::ParallelTensorBase$', ParallelTensorBasePrinter) + pp.add_printer('ParallelDim', '^FlexFlow::ParallelDim$', ParallelDimPrinter) return pp gdb.printing.register_pretty_printer( From 90541cfa8f03a9f8d5b4da8c6ed2d9d163d65b68 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 12 May 2023 22:21:14 +0000 Subject: [PATCH 16/72] [Embedding] bug fix --- src/ops/embedding.cc | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 3b53213b91..18bf7e324e 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -148,33 +148,23 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { int const OUT_CHANNELS = Output::OUT_CHANNELS; if (aggr == AGGR_MODE_NONE) { int num_dims = input->num_dims + 1; - for (int i = 1; i < num_dims - 1; i++) { + for (int i = 1; i < num_dims; i++) { output_dims[i] = input->dims[i - 1]; } assert(OUT_CHANNELS == 0); output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; - // Currently do not support parallelizing over the replica dim - output_dims[num_dims - 1].size = 1; - output_dims[num_dims - 1].degree = 1; - output_dims[num_dims - 1].parallel_idx = -1; - output_dims[num_dims - 
1].is_replica_dim = true; return num_dims; } else { int num_dims = input->num_dims; - for (int i = 1; i < num_dims - 1; i++) { + for (int i = 1; i < num_dims; i++) { output_dims[i] = input->dims[i]; } assert(OUT_CHANNELS == 0); output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; - // Currently do not support parallelizing over the replica dim - output_dims[num_dims - 1].size = 1; - output_dims[num_dims - 1].degree = 1; - output_dims[num_dims - 1].parallel_idx = -1; - output_dims[num_dims - 1].is_replica_dim = true; return num_dims; } // const int REPLICA = this->output_vocab_size_replica_dim(); From 786214379ac1c45a74197e154860435c89db4e0c Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 12 May 2023 15:40:42 -0700 Subject: [PATCH 17/72] Add replica dim to pretty print --- gdb/pretty_print.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/gdb/pretty_print.py b/gdb/pretty_print.py index aac8d56444..ec551002db 100644 --- a/gdb/pretty_print.py +++ b/gdb/pretty_print.py @@ -77,7 +77,13 @@ def to_string(self): size = dim['size'] degree = dim['degree'] parallel_idx = dim['parallel_idx'] - toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]') + tok = f'{i}=[s={size} d={degree} pi={parallel_idx} ' + if dim['is_replica_dim']: + tok += 'r=t' + else: + tok += 'r=f' + tok += ']' + toks.append() return f'ParallelTensorBase<{" ".join(toks)}>' class ParallelDimPrinter: @@ -88,7 +94,12 @@ def to_string(self): size = self.val['size'] degree = self.val['degree'] parallel_idx = self.val['parallel_idx'] - return f'ParallelDim' + tok = f's={size} d={degree} pi={parallel_idx} ' + if dim['is_replica_dim']: + tok += 'r=t' + else: + tok += 'r=f' + return f'ParallelDim<{tok}>' def build_pretty_printer(): From ef43c36343e6c0191bb790546e4fd441aabf1cf7 Mon Sep 17 00:00:00 2001 From: Teresa Noyola Date: Fri, 12 May 2023 16:29:28 -0700 Subject: [PATCH 18/72] Fix replicate issue with python hack --- gdb/pretty_print.py | 2 +- python/flexflow/torch/model.py | 5 +++++ src/runtime/model.cc | 22 +++++++++++----------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/gdb/pretty_print.py b/gdb/pretty_print.py index ec551002db..0c384884f0 100644 --- a/gdb/pretty_print.py +++ b/gdb/pretty_print.py @@ -83,7 +83,7 @@ def to_string(self): else: tok += 'r=f' tok += ']' - toks.append() + toks.append(tok) return f'ParallelTensorBase<{" ".join(toks)}>' class ParallelDimPrinter: diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index aa80758843..2d0c388e3e 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -2353,6 +2353,9 @@ def to_ff(self, ffmodel, node_to_output): def attr_to_ff_tensor(self, ffmodel): torch_tensor = self.attr + assert (torch_tensor.shape[0] == 1) + batch_size = ffmodel._ffconfig.batch_size + torch_tensor = np.repeat(torch_tensor, batch_size, axis=0) ff_dtype = Node.torch_to_ff_dtype(torch_tensor.dtype) requires_grad = torch_tensor.requires_grad @@ -2367,6 +2370,8 @@ def attr_to_ff_tensor(self, ffmodel): ff_dtype = DataType.DT_FLOAT np_tensor = np_tensor.astype(np.float32) + print('attr: ', torch_tensor.shape) + assert (torch_tensor.shape[0] == batch_size) ff_tensor = ffmodel.create_tensor( torch_tensor.shape, ff_dtype, requires_grad, ) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2e41c6af0d..d9ee74ff4a 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2630,17 +2630,17 @@ Op 
*FFModel::create_operator_from_layer( assert(tensor->parallel_tensor == nullptr); tensor->parallel_tensor = pt; // start from data parllel tensor - if (config.only_data_parallel) { - if (pt->dims[num_dims-1].size == 1) { - Replicate *repl = new Replicate( - *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); - repl->outputs[0]->dims[num_dims-1].is_replica_dim = true; - operators.push_back(repl); - } else { - Repartition *part = new Repartition( - *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); - operators.push_back(part); - } + if (config.only_data_parallel && config.numNodes * config.workersPerNode > 1) { + if (pt->dims[num_dims-1].size == 1) { + Replicate *repl = new Replicate( + *this, pt, num_dims, config.numNodes * config.workersPerNode); + repl->outputs[0]->dims[num_dims].is_replica_dim = true; + operators.push_back(repl); + } else { + Repartition *part = new Repartition( + *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); + operators.push_back(part); + } } return operators[operators.size() - 1]; } From dd8090e2c3ea2d8c6e6de328fdc929f495d0173e Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Sat, 20 May 2023 16:18:27 -0700 Subject: [PATCH 19/72] Use local json submodule --- cmake/json.cmake | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cmake/json.cmake b/cmake/json.cmake index 63ac50b203..3cf57a7864 100644 --- a/cmake/json.cmake +++ b/cmake/json.cmake @@ -1,4 +1 @@ -include(FetchContent) - -FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz) -FetchContent_MakeAvailable(json) \ No newline at end of file +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/json) From 0dc618772d52a934c5b6c3b254d7bf3c44b043c2 Mon Sep 17 00:00:00 2001 From: Teresa Noyola Date: Sat, 20 May 2023 19:30:32 -0400 Subject: [PATCH 20/72] ofi conduit-related fixes --- config/config.inc | 4 +++- config/config.linux | 19 +++++-------------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/config/config.inc b/config/config.inc index da043b2880..0ec6a4696f 100644 --- a/config/config.inc +++ b/config/config.inc @@ -76,6 +76,8 @@ if [ "$FF_USE_GASNET" = "ON" ]; then SET_GASNET+=" -DFF_GASNET_CONDUIT=mpi" elif [ "$FF_GASNET_CONDUIT" = "udp" ]; then SET_GASNET+=" -DFF_GASNET_CONDUIT=udp" + elif [ "$FF_GASNET_CONDUIT" = "ofi" ]; then + SET_GASNET+=" -DFF_GASNET_CONDUIT=ofi" fi elif [ "$FF_USE_GASNET" = "OFF" ]; then SET_GASNET="-DFF_USE_GASNET=OFF" @@ -176,7 +178,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then chmod +x "$(pwd)/nvidia_hipcc" SET_CXX="-DCMAKE_CXX_COMPILER=$(pwd)/nvidia_hipcc -DCMAKE_CXX_LINKER=$(pwd)/nvidia_hipcc" else - SET_CXX="-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_CXX_LINKER=/opt/rocm/bin/hipcc" + SET_CXX="-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc -DCMAKE_CXX_LINKER=$ROCM_PATH/bin/hipcc -DHIP_PATH=$ROCM_PATH/hip" fi fi fi diff --git a/config/config.linux b/config/config.linux index 72fe466a7f..ccbe78cefc 100755 --- a/config/config.linux +++ b/config/config.linux @@ -14,7 +14,7 @@ #INSTALL_DIR= # set build type -BUILD_TYPE=Debug +BUILD_TYPE=${BUILD_TYPE:-Release} # set CUDA Arch to the desired GPU architecture(s) to target (e.g. pass "FF_CUDA_ARCH=60" for Pascal). # To pass more than one value, separate architecture numbers with a comma (e.g. FF_CUDA_ARCH=70,75). 
@@ -32,10 +32,10 @@ CUDA_DIR=${CUDA_DIR:-"/sw/summit/cuda/11.0.3"} FF_USE_PYTHON=${FF_USE_PYTHON:-ON} # enable GASNet -FF_USE_GASNET=${FF_USE_GASNET:-OFF} +FF_USE_GASNET=${FF_USE_GASNET:-ON} # select GASNET conduit -FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} +FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ofi} # build C++ examples FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF} @@ -76,14 +76,5 @@ function get_build_configs() { BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} FF_USE_GASNET=${FF_USE_GASNET} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}" } -if [ -n "$1" ]; then - if [ "$1" != "get-docker-configs" ]; then - . $(dirname $0)/config.inc - # You can pass the name of the variable you want to print out as $1. This - # is used in the python setup script to get the cmake config - echo "${!1}" - fi -else - . $(dirname $0)/config.inc - run_cmake $* -fi +. $(dirname $0)/config.inc +run_cmake $* From 0950ac71d7a64bb4f0b6f6b096b24ef0155e6dd9 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Mon, 22 May 2023 13:22:11 -0700 Subject: [PATCH 21/72] Add mpi flags for hip --- config/config.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/config.inc b/config/config.inc index 0ec6a4696f..940e741934 100644 --- a/config/config.inc +++ b/config/config.inc @@ -178,7 +178,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then chmod +x "$(pwd)/nvidia_hipcc" SET_CXX="-DCMAKE_CXX_COMPILER=$(pwd)/nvidia_hipcc -DCMAKE_CXX_LINKER=$(pwd)/nvidia_hipcc" else - SET_CXX="-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc -DCMAKE_CXX_LINKER=$ROCM_PATH/bin/hipcc -DHIP_PATH=$ROCM_PATH/hip" + SET_CXX="-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc -DCMAKE_CXX_LINKER=$ROCM_PATH/bin/hipcc -DHIP_PATH=$ROCM_PATH/hip -DCMAKE_CXX_FLAGS='-I${MPICH_DIR}/include' -DCMAKE_EXE_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi' -DCMAKE_SHARED_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi'" fi fi fi From 4b06040b3f8cc9bb1d116037e2e5bdbac7dd9e35 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 24 May 2023 18:32:26 +0000 Subject: [PATCH 22/72] fix fusion bug --- src/ops/fused.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 618d957169..e3de838021 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -110,7 +110,9 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { // in forward and backward assert(!op->is_parallel_op()); // Currently don't consider nested fusion - assert(op->op_type != OP_FUSED); + if (op->op_type == OP_FUSED) { + return false; + } MachineView my_view = outputs[0]->machine_view; MachineView op_view = op->outputs[0]->machine_view; if (my_view == op_view) { From 99e9f959fbe069d865ceccb9d324c2956f663085 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 24 May 2023 18:37:07 +0000 Subject: [PATCH 23/72] increase the max number of regions in a ZeroInitMeta from 64 to 128 --- include/flexflow/initializer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/flexflow/initializer.h b/include/flexflow/initializer.h index 062530a655..3c44d1184a 100644 --- a/include/flexflow/initializer.h +++ b/include/flexflow/initializer.h @@ -46,7 +46,7 @@ 
class GlorotUniform : public Initializer { class Op; struct ZeroInitMeta { - static int const MAX_NUM_REGIONS = 64; + static int const MAX_NUM_REGIONS = 128; int num_regions; Op *op_ptr; DataType data_types[MAX_NUM_REGIONS]; From 282c44a865abcf81b88d062704252760c5295909 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 24 May 2023 18:48:49 +0000 Subject: [PATCH 24/72] support mixed precision --- src/ops/fused.cu | 54 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index ca2a331984..3633c68505 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -414,10 +414,25 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + assert(my_input_accessor[0].data_type == my_output_accessor[0].data_type); + if (my_input_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int64_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int32_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int64_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type"); + } break; } case OP_TRANSPOSE: { @@ -427,12 +442,31 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); + assert(my_input_accessor[0].data_type == my_output_accessor[0].data_type); + if (my_input_accessor[0].data_type == DT_INT64) { + Kernels::Transpose::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int64_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); + } else if (my_input_accessor[0].data_type == DT_INT32) { + Kernels::Transpose::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int32_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); + } else if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Transpose::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); + } else { + assert(false && "Unsupported data type"); + } break; } default: { From 992dcb925a9f832b062c09039baf381f0e0250fc Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 24 May 2023 18:53:19 +0000 Subject: [PATCH 25/72] undo changes to Fused::Transpose --- config/config.linux | 4 ++-- src/ops/fused.cu | 30 ++++++------------------------ 2 files changed, 8 insertions(+), 26 deletions(-) diff --git 
a/config/config.linux b/config/config.linux index ccbe78cefc..c630462034 100755 --- a/config/config.linux +++ b/config/config.linux @@ -23,10 +23,10 @@ BUILD_TYPE=${BUILD_TYPE:-Release} FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"} # set CUDNN dir in case cmake cannot autodetect a path -CUDNN_DIR=${CUDNN_DIR:-"/sw/summit/cuda/11.0.3"} #/usr/local/cuda +CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} #/usr/local/cuda # set CUDA dir in case cmake cannot autodetect a path -CUDA_DIR=${CUDA_DIR:-"/sw/summit/cuda/11.0.3"} +CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"} # enable Python FF_USE_PYTHON=${FF_USE_PYTHON:-ON} diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 3633c68505..720f1ed693 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -443,30 +443,12 @@ __host__ void FusedOp::forward_task(Task const *task, my_output_accessor[0].domain.get_volume()); TransposeMeta *m = (TransposeMeta *)metas->meta[op]; assert(my_input_accessor[0].data_type == my_output_accessor[0].data_type); - if (my_input_accessor[0].data_type == DT_INT64) { - Kernels::Transpose::forward_kernel_wrapper( - m, - my_input_accessor[0].get_int64_ptr(), - my_output_accessor[0].get_int64_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); - } else if (my_input_accessor[0].data_type == DT_INT32) { - Kernels::Transpose::forward_kernel_wrapper( - m, - my_input_accessor[0].get_int32_ptr(), - my_output_accessor[0].get_int32_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); - } else if (my_input_accessor[0].data_type == DT_FLOAT) { - Kernels::Transpose::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); - } else { - assert(false && "Unsupported data type"); - } + Kernels::Transpose::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); break; } default: { From f52877415e1cee78f90171d9589fb12963ebb34d Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 24 May 2023 18:54:38 +0000 Subject: [PATCH 26/72] undo changes to config.linux --- config/config.linux | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/config.linux b/config/config.linux index c630462034..ccbe78cefc 100755 --- a/config/config.linux +++ b/config/config.linux @@ -23,10 +23,10 @@ BUILD_TYPE=${BUILD_TYPE:-Release} FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"} # set CUDNN dir in case cmake cannot autodetect a path -CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} #/usr/local/cuda +CUDNN_DIR=${CUDNN_DIR:-"/sw/summit/cuda/11.0.3"} #/usr/local/cuda # set CUDA dir in case cmake cannot autodetect a path -CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"} +CUDA_DIR=${CUDA_DIR:-"/sw/summit/cuda/11.0.3"} # enable Python FF_USE_PYTHON=${FF_USE_PYTHON:-ON} From a68150db548c52e85943c68f9410b23e26a4eb86 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 2 Jun 2023 22:16:15 +0000 Subject: [PATCH 27/72] try to fix layernorm --- python/flexflow/torch/model.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 2d0c388e3e..5ad5ebea88 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -653,14 +653,25 @@ def string_to_ff(string, ffmodel, node_to_output): data = Node.StringData(string) name = data.name input_tensor = node_to_output[data.innodes[0]] - return 
ffmodel.identity(input=input_tensor, name=name) - # TODO: Change to ffmodel.layernorm() once supported + axes = [len(input_tensor.dims) - 1] + return ffmodel.layer_norm( + input=input_tensor, + axes=axes, + elementwise_affine=True, + eps=1e-6, + name=name, + ) def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] - return ffmodel.identity(input=input_tensor, name=self.name) - # TODO: Change to ffmodel.layernorm() once supported - + axes = [len(input_tensor.dims) - 1] + return ffmodel.layer_norm( + input=input_tensor, + axes=axes, + elementwise_affine=True, + eps=1e-6, + name=name, + ) class T5LayerNormNode(Node): """ From 2bf9afc43d8f81fcaf44f158200622f97bc0a661 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 2 Jun 2023 22:21:27 +0000 Subject: [PATCH 28/72] fix typo --- python/flexflow/torch/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 5ad5ebea88..11e0c16e48 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -670,7 +670,7 @@ def to_ff(self, ffmodel, node_to_output): axes=axes, elementwise_affine=True, eps=1e-6, - name=name, + name=self.name, ) class T5LayerNormNode(Node): From f6f7a327f268daa434471ace50025fb940123c89 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 2 Jun 2023 19:20:42 -0700 Subject: [PATCH 29/72] Add possible layernorm fix --- src/ops/layer_norm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 5d7fff3410..a8c429cd82 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -182,7 +182,7 @@ LayerNorm::LayerNorm(FFModel &model, M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; } effective_num_elements = M; - effective_batch_size = inputs[0]->get_volume() / M; + effective_batch_size = inputs[0]->get_shape().get_piece_size() / M; if (numWeights > 0 && allocate_weights) { int kernel_dims = 2; assert(false); From 5e03b0a536b44723a72fd8ba60be27eca652234b Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 2 Jun 2023 19:33:03 -0700 Subject: [PATCH 30/72] Fix additional layernorm bug due to get_piece_size return size in bytes --- include/flexflow/parallel_tensor.h | 1 + src/runtime/parallel_tensor.cc | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/flexflow/parallel_tensor.h b/include/flexflow/parallel_tensor.h index aed99c8204..c8791275dd 100644 --- a/include/flexflow/parallel_tensor.h +++ b/include/flexflow/parallel_tensor.h @@ -100,6 +100,7 @@ struct ParallelTensorShape { RecordFormatter as_dot() const; size_t get_piece_size() const; + size_t get_piece_num_elements() const; bool is_valid() const; int get_num_replica_dims() const; diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 963ad8af73..479bde3898 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -135,12 +135,18 @@ bool ParallelTensorShape::operator!=(ParallelTensorShape const &other) const { size_t ParallelTensorShape::get_piece_size() const { size_t piece_size = data_type_size(this->data_type); + return piece_size * this->get_piece_num_elements(); +} + +size_t ParallelTensorShape::get_piece_num_elements() const { + size_t piece_num_elements = 1; for (int i = 0; i < this->num_dims; i++) { - piece_size *= this->dims[i].size / this->dims[i].degree; + piece_num_elements *= this->dims[i].size / this->dims[i].degree; } - return piece_size; + return 
piece_num_elements; } + RecordFormatter ParallelTensorShape::as_dot() const { RecordFormatter r; for (int i = 0; i < this->num_dims; i++) { From 53fb8bd042b244df65a14770cc9f0606c4e0e6cd Mon Sep 17 00:00:00 2001 From: Teresa Noyola Date: Fri, 2 Jun 2023 20:00:24 -0700 Subject: [PATCH 31/72] Bugfixes --- gdb/pretty_print.py | 6 +++++- src/ops/layer_norm.cc | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/gdb/pretty_print.py b/gdb/pretty_print.py index 0c384884f0..e6fbe298ce 100644 --- a/gdb/pretty_print.py +++ b/gdb/pretty_print.py @@ -61,7 +61,11 @@ def to_string(self): size = dim['size'] degree = dim['degree'] parallel_idx = dim['parallel_idx'] - toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]') + if dim['is_replica_dim']: + is_replica = 'r=t' + else: + is_replica = 'r=f' + toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx} {is_replica}]') return f'TensorShape<{" ".join(toks)}>' class ParallelTensorBasePrinter: diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index a8c429cd82..e8c65b4b03 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -182,7 +182,7 @@ LayerNorm::LayerNorm(FFModel &model, M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; } effective_num_elements = M; - effective_batch_size = inputs[0]->get_shape().get_piece_size() / M; + effective_batch_size = inputs[0]->get_shape().get_piece_num_elements() / M; if (numWeights > 0 && allocate_weights) { int kernel_dims = 2; assert(false); From 449a14c2369e6e886b7d54503b76f2da97899135 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 2 Jun 2023 20:01:22 -0700 Subject: [PATCH 32/72] Actually check elementwise_affine --- src/ops/layer_norm.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index e8c65b4b03..07584cc5b1 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -63,6 +63,7 @@ Tensor FFModel::layer_norm(const Tensor input, float eps, char const *name) { // FIXME: currently disable elementwise_affine + assert(!elementwise_affine); elementwise_affine = false; // axes must be the last axes.size() dimensions for (int i = 0; i < axes.size(); i++) { From c737be64070e276c9276adb72b4d413e2a19652a Mon Sep 17 00:00:00 2001 From: Teresa Noyola Date: Mon, 5 Jun 2023 16:58:59 -0700 Subject: [PATCH 33/72] Revert "Actually check elementwise_affine" This reverts commit 449a14c2369e6e886b7d54503b76f2da97899135. 
--- src/ops/layer_norm.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 07584cc5b1..e8c65b4b03 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -63,7 +63,6 @@ Tensor FFModel::layer_norm(const Tensor input, float eps, char const *name) { // FIXME: currently disable elementwise_affine - assert(!elementwise_affine); elementwise_affine = false; // axes must be the last axes.size() dimensions for (int i = 0; i < axes.size(); i++) { From a98e09d71ab03683e846b92bd52e442603895b59 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Mon, 5 Jun 2023 21:15:34 -0700 Subject: [PATCH 34/72] Change optimizer to adam with correct hyperparams --- examples/python/pytorch/mt5/mt5_ff.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index fa0801a517..08af8d88a7 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -126,7 +126,8 @@ def top_level_task(): output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True) #from flexflow.torch.model import file_to_ff #file_to_ff("mt5.ff", ffmodel, input_tensors) - ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) + ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, adam_beta1=0.9, adam_beta2=0.98, weight_decay=0.0, adam_epsilon=2e-8) + # ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) print("Compiling the model...") ffmodel.compile( From 4bec811060862a7f882d2e6e81fbcd9c70e2d9b6 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 4 Jul 2023 23:26:34 +0000 Subject: [PATCH 35/72] fix training bert model. --- examples/python/pytorch/mt5/mt5_ff.py | 26 +- include/flexflow/model.h | 6 +- .../flexflow/ops/kernels/softmax_kernels.h | 6 +- include/flexflow/ops/layer_norm.h | 11 +- include/flexflow/ops/softmax.h | 2 + include/flexflow/ops/softmax_params.h | 1 + include/flexflow/utils/cuda_helper.h | 6 + python/flexflow/core/flexflow_cffi.py | 23 +- python/flexflow/torch/model.py | 51 ++-- python/flexflow_c.cc | 7 +- python/flexflow_c.h | 1 + src/loss_functions/loss_functions.cc | 11 +- src/loss_functions/loss_functions.cu | 59 ++-- src/metrics_functions/metrics_functions.cc | 6 +- src/metrics_functions/metrics_functions.cu | 18 +- src/ops/element_unary.cu | 5 +- src/ops/kernels/softmax.cu | 37 ++- src/ops/layer_norm.cc | 254 +++++++++++++++--- src/ops/layer_norm.cu | 248 ++++++++++++----- src/ops/softmax.cc | 46 +++- src/runtime/cuda_helper.cu | 77 +++++- src/runtime/graph.cc | 6 +- src/runtime/substitution.cc | 9 +- 23 files changed, 720 insertions(+), 196 deletions(-) diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index 08af8d88a7..c2868e9d1e 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -64,8 +64,8 @@ def preprocess_train() -> None: y_shape = y.shape assert len(y.shape) == 2, \ "`y` should have shape (num examples, sequence length)" - y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) - lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) + y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) + lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) y_ids[:, :] = y[:, :-1] lm_labels[:, :] = y[:, 1:] @@ -89,26 +89,29 @@ def top_level_task(): #model = BertModel.from_pretrained("bert-base-uncased") # Load train data as numpy arrays print("Loading data...") - ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")) + ids = 
np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")).astype('int32') ids = np.pad(ids, ((0,0), (0,17)), 'constant') #ids = np.random.randint(0, 5, (1000, 512)) #print('ids_shape', ids.shape) #print('ids', ids) - mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")) + mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")).astype('int32') mask = np.pad(mask, ((0,0), (0,17)), 'constant') #mask = np.random.randint(0, 2, (1000, 512)) #y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) - lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")) + lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")).astype('int32') lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant') #lm_labels = np.random.randint(-1, 5, (1000, 512)) + position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() + token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() + batch_size = ffconfig.batch_size input_ids_shape = (batch_size, ids.shape[1]) attention_mask_shape = (batch_size, mask.shape[1]) #decoder_input_ids_shape = (batch_size, y_ids.shape[1]) input_tensors = [ - ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids - ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask + ffmodel.create_tensor(input_ids_shape, DataType.DT_INT32), # input_ids + ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT32), # attention_mask #ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids ] encoder_seq_length = ids.shape[1] @@ -126,7 +129,7 @@ def top_level_task(): output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True) #from flexflow.torch.model import file_to_ff #file_to_ff("mt5.ff", ffmodel, input_tensors) - ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, adam_beta1=0.9, adam_beta2=0.98, weight_decay=0.0, adam_epsilon=2e-8) + ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, beta1=0.9, beta2=0.98, weight_decay=0.0, epsilon=2e-8) # ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) print("Compiling the model...") @@ -138,6 +141,9 @@ def top_level_task(): MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, ], ) + + # load weights here + ffmodel.load_bert_pretrained(checkpoint=model) print("Creating data loaders...") print('id_dtype', ids.dtype) @@ -148,6 +154,8 @@ def top_level_task(): #decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) # NOTE: We cast down the label tensor data to 32-bit to accommodate the # label tensor's required dtype + token_type_ids_dl = ffmodel.create_data_loader(input_tensors[2], token_type_ids) + position_id_dl = ffmodel.create_data_loader(input_tensors[3], position_id) labels_dl = ffmodel.create_data_loader( ffmodel.label_tensor, lm_labels.astype("int32") ) @@ -159,7 +167,7 @@ def top_level_task(): epochs = ffconfig.epochs ffmodel.fit( #x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], - x=[input_ids_dl, attention_mask_dl], + x=[input_ids_dl, attention_mask_dl, position_id_dl, token_type_ids_dl], y=labels_dl, batch_size=batch_size, epochs=epochs, ) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index c6bc6929ad..0496d5fa8f 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -472,6 +472,7 @@ class FFModel { std::vector const &axes, bool elementwise_affine, float eps, + DataType data_type = DT_NONE, char const *name = NULL); // Add a batch_norm layer Tensor @@ -518,7 +519,10 @@ class FFModel { // Add a flat layer Tensor flat(const 
Tensor input, char const *name = NULL); // Add a softmax layer - Tensor softmax(const Tensor input, int dim = -1, char const *name = NULL); + Tensor softmax(const Tensor input, + int dim = -1, + bool last_layer = false, + char const *name = NULL); // Create input tensors and constants Tensor transpose(const Tensor input, std::vector const &perm, diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 81b34d8558..9aec9f57c9 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -20,6 +20,7 @@ class SoftmaxMeta : public OpMeta { #endif bool profiling; int dim; + bool last_layer; char op_name[MAX_OPNAME]; }; @@ -33,6 +34,7 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements); namespace Internal { @@ -40,8 +42,10 @@ void forward_kernel(SoftmaxMeta const *m, float const *input_ptr, float *output_ptr, ffStream_t stream); -void backward_kernel(float *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements, ffStream_t stream); } // namespace Internal diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 8273b9ab52..552b9cf365 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -66,12 +66,11 @@ class LayerNorm : public Op { T *gamma_ptr, T *beta_ptr, ffStream_t stream); - template static void forward_kernel_wrapper(LayerNormMeta const *m, - T const *input_ptr, - T *output_ptr, - T *gamma_ptr, - T *beta_ptr); + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorW &gamma, + GenericTensorAccessorW &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, @@ -105,7 +104,7 @@ class LayerNormMeta : public OpMeta { bool elementwise_affine; int64_t effective_batch_size, effective_num_elements; float eps; - float *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; + void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; char op_name[MAX_OPNAME]; }; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 25a20315bd..2616294a3a 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -15,6 +15,7 @@ class Softmax : public Op { Softmax(FFModel &model, const ParallelTensor logit, int dim, + bool _last_layer, char const *name); Softmax(FFModel &model, Params const ¶ms, @@ -64,6 +65,7 @@ class Softmax : public Op { public: int dim; + bool last_layer; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/softmax_params.h b/include/flexflow/ops/softmax_params.h index d805d9966d..545e3a5cb9 100644 --- a/include/flexflow/ops/softmax_params.h +++ b/include/flexflow/ops/softmax_params.h @@ -7,6 +7,7 @@ namespace FlexFlow { struct SoftmaxParams { int dim; + bool last_layer; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(SoftmaxParams const &, SoftmaxParams const &); diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 46e323b186..a4b2be0a66 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -132,9 +132,15 @@ __host__ void updateGAS(float *para_ptr, template void print_tensor(T const *ptr, size_t num_elements, 
char const *prefix); +template +void save_tensor(T const *ptr, size_t num_elements, char const *file_name); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain); +cudnnStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(cudnnTensorDescriptor_t tensor, + Legion::Domain domain, + DataType data_type = DT_FLOAT); cudaDataType_t ff_to_cuda_datatype(DataType type); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 42339d781c..4c01057109 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1595,7 +1595,7 @@ def flat(self, input, name=None): self.add_layer(OpType.FLAT, name) return Tensor(handle, owner_op_type=OpType.FLAT) - def softmax(self, input, axis=-1, name=None): + def softmax(self, input, axis=-1, last_layer=False, name=None): """Softmax activation function. :param input: the input Tensor. @@ -1607,7 +1607,7 @@ def softmax(self, input, axis=-1, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, c_name) + handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, last_layer, c_name) self.add_layer(OpType.SOFTMAX, name) return Tensor(handle, owner_op_type=OpType.SOFTMAX) @@ -2041,6 +2041,25 @@ def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): ff_tensor.set_tensor(self, np_tensor) print("Compiled ffmodel!") + def load_bert_pretrained(self, checkpoint=None): + # store weights in dict + weights_dict = {} + for name, params in checkpoint.named_parameters(): + weights_dict[name.replace("LayerNorm", "layer_norm").replace(".", "_")] = params.detach().cpu().numpy() + print(name.replace("LayerNorm", "layer_norm").replace(".", "_")) + # some weights not in params + weights_dict['cls_predictions_decoder_weight'] = checkpoint.cls.predictions.decoder.weight.detach().cpu().numpy() + weights_dict['cls_predictions_decoder_bias'] = checkpoint.cls.predictions.decoder.bias.detach().cpu().numpy() + for i in range (self._nb_layers): + layer = self._layers[i] + if (layer.name + "_weight") in weights_dict: + print('weight: ' + layer.name) + weight = layer.get_parameter_by_id(0); + weight.set_tensor(self, weights_dict[layer.name + "_weight"]) + if (layer.name + "_bias") in weights_dict: + print('bias: ' + layer.name) + bias = layer.get_parameter_by_id(1); + bias.set_tensor(self, weights_dict[layer.name + "_bias"]) def fit(self, x=None, y=None, batch_size=None, epochs=1): """Trains the model for a fixed number of epochs (iterations on a dataset). 
diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 11e0c16e48..8ebac2146c 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -664,12 +664,13 @@ def string_to_ff(string, ffmodel, node_to_output): def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] - axes = [len(input_tensor.dims) - 1] + axes = [0] + eps = self.module.eps return ffmodel.layer_norm( input=input_tensor, axes=axes, elementwise_affine=True, - eps=1e-6, + eps=eps, name=self.name, ) @@ -1197,16 +1198,24 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) - return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, name=self.name, - ) - + if self.scalar_pos == FunctionNode.ScalarPosition.RIGHT: + return ffmodel.scalar_sub( + input=input_tensor, scalar=scalar, inplace=False, name=self.name, + ) + else: + negative_input = ffmodel.scalar_multiply( + input=input_tensor, scalar=-1, inplace=False, name=self.name + '_negative', + ) + return ffmodel.scalar_sub( + input=negative_input, scalar=-scalar, inplace=False, name=self.name, + ) + class ScalarTrueDivNode(FunctionNode): def __init__(self, node): @@ -1231,15 +1240,16 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] scalar = self.innodes[1] assert type(scalar) is float + return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, name=self.name, + input=input_tensor, scalar=scalar, inplace=False, name=self.name, ) @@ -1652,14 +1662,14 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, name=self.name, + input=input_tensor, scalar=scalar, inplace=False, name=self.name, ) @@ -2359,11 +2369,13 @@ def string_to_ff(string, ffmodel, node_to_output): "since attributes require access to the PyTorch model" ) - def to_ff(self, ffmodel, node_to_output): - return self.attr_to_ff_tensor(ffmodel) + def to_ff(self, ffmodel, node_to_output, input_tensors): + return self.attr_to_ff_tensor(ffmodel, input_tensors) + + def attr_to_ff_tensor(self, ffmodel, input_tensors): + - def attr_to_ff_tensor(self, ffmodel): - torch_tensor = self.attr + torch_tensor = self.attr assert (torch_tensor.shape[0] == 1) batch_size = ffmodel._ffconfig.batch_size torch_tensor = np.repeat(torch_tensor, batch_size, axis=0) @@ -2382,15 +2394,16 @@ def attr_to_ff_tensor(self, ffmodel): np_tensor = np_tensor.astype(np.float32) print('attr: ', torch_tensor.shape) - assert (torch_tensor.shape[0] == batch_size) + assert (torch_tensor.shape[0] == batch_size) 
ff_tensor = ffmodel.create_tensor( - torch_tensor.shape, ff_dtype, requires_grad, + torch_tensor.shape, ff_dtype, True, ) # delay set_tensor, add to ffmodel ffmodel.attr_tensors[ff_tensor] = np_tensor # ff_tensor.set_tensor( # ffmodel, np_tensor # ) + input_tensors.append(ff_tensor) return ff_tensor @@ -2472,7 +2485,7 @@ def to_ff(self, ffmodel, node_to_output, output_tensors): # `CrossEntropyLoss()` implementation logits = node_to_output[other["logits"].name] softmax_logits = ffmodel.softmax( - input=logits, name=self.name, + input=logits, last_layer=True, name=self.name, ) output_tensors[:] += [softmax_logits] else: @@ -2606,6 +2619,8 @@ def torch_to_ff(self, ffmodel, input_tensors, verbose=False): elif isinstance(node, OutputNode): node.to_ff(ffmodel, node_to_output, output_tensors) node_output = None + elif isinstance(node, AttributeNode): + node_output = node.to_ff(ffmodel, node_to_output, input_tensors) else: node_output = node.to_ff(ffmodel, node_to_output) diff --git a/python/flexflow_c.cc b/python/flexflow_c.cc index 74a5da6ce1..fd688c0c6a 100644 --- a/python/flexflow_c.cc +++ b/python/flexflow_c.cc @@ -568,8 +568,8 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, for (int i = 0; i < n; i++) { axes_vec.push_back(axes[i]); } - Tensor tensor = - handle->layer_norm(input, axes_vec, elementwise_affine, eps, name); + Tensor tensor = handle->layer_norm( + input, axes_vec, elementwise_affine, eps, input->data_type, name); DEBUG_PRINT("[LayerNorm] new Tensor %p, input %p, elementwise_affine %d, eps " "%f, name %s", tensor, @@ -730,10 +730,11 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_, const flexflow_tensor_t input_, int dim, + bool last_layer, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor tensor = handle->softmax(input, dim, name); + Tensor tensor = handle->softmax(input, dim, last_layer, name); DEBUG_PRINT( "[Softmax] new Tensor %p, input %p, name %s", tensor, input, name); return FFCObjectWrapper::wrap(tensor); diff --git a/python/flexflow_c.h b/python/flexflow_c.h index fb64c78fd2..5409002e5e 100644 --- a/python/flexflow_c.h +++ b/python/flexflow_c.h @@ -276,6 +276,7 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle, const flexflow_tensor_t input, int dim, + bool last_layer, char const *name); flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle, diff --git a/src/loss_functions/loss_functions.cc b/src/loss_functions/loss_functions.cc index ae89c3d469..d887ee9243 100644 --- a/src/loss_functions/loss_functions.cc +++ b/src/loss_functions/loss_functions.cc @@ -49,6 +49,8 @@ void Loss::backward(FFModel *model, if (loss_type == LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE) { assert(logit->get_volume() == label->get_volume()); scale_factor = 2.0f / logit->get_volume(); + } else if (loss_type == LOSS_SPARSE_CATEGORICAL_CROSSENTROPY) { + scale_factor = 1.0f; } else { scale_factor = 1.0f / model->config.batchSize; } @@ -131,9 +133,12 @@ void Loss::backward_task_with_dim(Task const *task, regions[2], task->regions[2], FID_DATA, ctx, runtime); // assertion the outter-most dim is replica dim and replica degree is 1 assert(acc_logit.rect.hi[NDIM - 1] == acc_logit.rect.lo[NDIM - 1]); - int num_samples = - acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; - int num_classes = 
acc_logit.rect.volume() / num_samples; + + int num_classes = acc_logit.rect.hi[0] - acc_logit.rect.lo[0] + 1; + int num_samples = acc_logit.rect.volume() / num_classes; + // int num_samples = + // acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; + // int num_classes = acc_logit.rect.volume() / num_samples; assert(acc_logit_grad.rect == acc_logit.rect); int k = 1; if (loss->repl_labels) { diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index 01766347b0..edd8f03fa4 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -18,6 +18,7 @@ namespace FlexFlow { +int const MASK_TOKEN = -100; using namespace Legion; __global__ void @@ -32,6 +33,25 @@ __global__ void } } +__global__ void + sparse_categorical_crossentropy_loss_backward_with_mask(float *logit_grad, + int const *label, + coord_t num_samples, + coord_t num_classes, + int const k, + float *num) { + CUDA_KERNEL_LOOP(i, num_samples * num_classes) { + int sample_id = i / num_classes; + int label_idx = label[i / (k * num_classes)]; + if (label_idx != MASK_TOKEN && (i == sample_id * num_classes + label_idx)) { + logit_grad[i] -= 1.0f; + atomicAdd(&num[0], 1.0f); + } else if (label_idx == MASK_TOKEN) { + logit_grad[i] = 0.0f; + } + } +} + __global__ void categorical_crossentropy_loss_backward(float *logit_grad, float const *logit, float const *label, @@ -74,14 +94,25 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_ptr, logit_volume * sizeof(float), cudaMemcpyDeviceToDevice)); - sparse_categorical_crossentropy_loss_backward<<>>( - logit_grad_ptr, label_ptr, num_samples, num_classes, k); - // Scale logit gradients by op->scale_factor + // calculate the scale factor inside kernel; + assert(scale_factor == 1.0f); + float *num; + checkCUDA(cudaMalloc(&num, sizeof(float))); + float effective_tokens; + int parallelism = num_samples * num_classes; + // sparse_categorical_crossentropy_loss_backward<<>>( + // logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); + sparse_categorical_crossentropy_loss_backward_with_mask<<< + GET_BLOCKS(parallelism), + CUDA_NUM_THREADS, + 0, + stream>>>(logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); + cudaMemcpy(&effective_tokens, num, sizeof(float), cudaMemcpyDeviceToHost); scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); + logit_grad_ptr, logit_grad_volume, 0, 1.0f / effective_tokens); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( @@ -122,19 +153,17 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( logit_grad_ptr, logit_grad_volume, 0, scale_factor); } -void Loss::identity_loss_backward_kernel_wrapper( - float *loss_grad_ptr, - float const *loss_ptr, - size_t loss_volume, - size_t loss_grad_volume, - float scale_factor) { +void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, + float const *loss_ptr, + size_t loss_volume, + size_t loss_grad_volume, + float scale_factor) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); identity_loss_backward<<>>( - loss_grad_ptr, loss_ptr, loss_volume); + stream>>>(loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( loss_grad_ptr, loss_grad_volume, 0, scale_factor); diff --git a/src/metrics_functions/metrics_functions.cc b/src/metrics_functions/metrics_functions.cc index 7244b06925..8c7e23ad8a 100644 --- a/src/metrics_functions/metrics_functions.cc +++ 
b/src/metrics_functions/metrics_functions.cc @@ -91,8 +91,8 @@ void Metrics::compute(FFModel *model, false /*must*/, 0 /*mapper_id*/, logit->machine_view.hash()); - std::cout << "logit shape: " << logit->get_shape() << std::endl; - std::cout << "label shape: " << label->get_shape() << std::endl; + // std::cout << "logit shape: " << logit->get_shape() << std::endl; + // std::cout << "label shape: " << label->get_shape() << std::endl; launcher.add_region_requirement(RegionRequirement( logit->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, logit->region)); launcher.add_field(0, FID_DATA); @@ -157,7 +157,7 @@ PerfMetrics assert(acc_label.rect.lo[0] == acc_label.rect.hi[0]); // Cannot measure categorical_crossentropy w/ sparse labels // Use measure_sparse_categorical_crossentropy instead - std::cout << "num_classes: " << num_classes << std::endl; + // std::cout << "num_classes: " << num_classes << std::endl; assert(!me->measure_categorical_crossentropy); Metrics::update_metrics_sparse_label_kernel_wrapper(acc_logit.ptr, acc_label.ptr, diff --git a/src/metrics_functions/metrics_functions.cu b/src/metrics_functions/metrics_functions.cu index b68b10d873..8c584c397c 100644 --- a/src/metrics_functions/metrics_functions.cu +++ b/src/metrics_functions/metrics_functions.cu @@ -19,6 +19,7 @@ namespace FlexFlow { float const LOG_MIN_VALUE = 0.00000001f; +int const MASK_TOKEN = -100; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, @@ -29,7 +30,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = 0; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { @@ -38,14 +39,19 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, } } assert(my_label >= 0); - atomicAdd(&(perf->train_all), 1); - if (labels[b] == my_label) { - atomicAdd(&(perf->train_correct), 1); + if (labels[b] != MASK_TOKEN) { + atomicAdd(&(perf->train_all), 1); + if (labels[b] == my_label) { + atomicAdd(&(perf->train_correct), 1); + } } } if (metrics.measure_sparse_categorical_crossentropy) { - float my_logit = max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); - atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + if (labels[b] != MASK_TOKEN) { + float my_logit = + max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + } } if (metrics.measure_mean_squared_error || metrics.measure_root_mean_squared_error || diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index d6e5bcfdc3..187e60282f 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -202,8 +202,9 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * erfc(-input[i] * M_SQRT1_2) - - 0.5 * M_SQRT1_2 * input[i] * exp(-input[i] * input[i] * 0.5))); + (0.5 * erfc(-input[i] * M_SQRT1_2) + + 0.5 * M_SQRT1_2 * input[i] * + ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5f)))); break; } case OP_RSQRT: { diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index d83d9952c9..e163c9a0c7 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -26,8 +26,10 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, Domain const &input_domain) : OpMeta(handler) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); - 
checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + inputTensor, input_domain, softmax->data_type)); dim = softmax->dim; + last_layer = softmax->last_layer; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); } @@ -66,6 +68,7 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -77,7 +80,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, cudaEventRecord(t_start, stream); } Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + m, input_grad_ptr, output_grad_ptr, output_ptr, num_elements, stream); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -113,15 +116,33 @@ void forward_kernel(SoftmaxMeta const *m, output_ptr)); } -void backward_kernel(float *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements, cudaStream_t stream) { - checkCUDA(cudaMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(float), - cudaMemcpyDeviceToDevice, - stream)); + + if (m->last_layer) { + checkCUDA(cudaMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); + } else { + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->inputTensor, + output_ptr, + m->inputTensor, + output_grad_ptr, + &beta, + m->inputTensor, + input_grad_ptr)); + } } } // namespace Internal diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index e8c65b4b03..915041d2bb 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -61,10 +61,27 @@ Tensor FFModel::layer_norm(const Tensor input, std::vector const &axes, bool elementwise_affine, float eps, + DataType data_type, char const *name) { - // FIXME: currently disable elementwise_affine - elementwise_affine = false; - // axes must be the last axes.size() dimensions + // In PyTorch, axes must be the sizes of the last axes.size() dimensions of + // the input tensor. However, since the tensor dimensions are reversed in + // FlexFlow (batch size is the last dimension), we require that axes must be + // the sizes of the FIRST axes.size() dimensions of the input tensor. + + // Another difference is that in PyTorch, the axes vector should contain the + // sizes of the dimensions with respect to which you want to compute the + // layernorm. In FlexFlow, instead, axes should contain the INDICES of the + // dimensions in question. We do this because the size of a dimension might be + // different when splitting a tensor in model parallelism. 
+ assert( + axes.size() <= input->num_dims && + "number of axes must be less than tensor dimensions"); // input does not + // have replica + // dimension here + for (int i = 0; i < axes.size(); i++) { + assert(axes[i] == i && "axes must be the first axes.size() dimensions"); + } +#ifdef DEADCODE for (int i = 0; i < axes.size(); i++) { bool found = false; for (int j = 0; j < axes.size(); j++) { @@ -76,15 +93,33 @@ Tensor FFModel::layer_norm(const Tensor input, assert(false && "axes must be the last axes.size() dimensions"); } } +#endif + if (data_type == DT_NONE) { + data_type = input->data_type; + } int num_weights = elementwise_affine ? 2 : 0; - Layer *ln = new Layer(this, - OP_LAYERNORM, - DT_FLOAT, - name, - 1 /*inputs*/, - num_weights, - 1 /*outputs*/, - input); + Layer *ln = nullptr; + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for layer_norm"); + ln = new Layer(this, + OP_LAYERNORM, + data_type, + name, + 1 /*inputs*/, + num_weights, + 1 /*outputs*/, + casted_input); + } else { + ln = new Layer(this, + OP_LAYERNORM, + data_type, + name, + 1 /*inputs*/, + num_weights, + 1 /*outputs*/, + input); + } + ln->outputs[0] = create_tensor_legion_ordering(input->num_dims, input->dims, input->data_type, @@ -92,19 +127,19 @@ Tensor FFModel::layer_norm(const Tensor input, 0, true /*create_grad*/); if (num_weights == 2) { - int M = 1; - for (int i = 0; i < axes.size(); i++) { - M *= input->dims[input->num_dims - 1 - axes[i]]; + int numdims = axes.size(); + int dims[numdims]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[axes[i]]; } - int dims[1] = {M}; - ln->weights[0] = create_weight_legion_ordering(1, + ln->weights[0] = create_weight_legion_ordering(numdims, dims, input->data_type, ln, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); - ln->weights[1] = create_weight_legion_ordering(1, + ln->weights[1] = create_weight_legion_ordering(numdims, dims, input->data_type, ln, @@ -179,19 +214,36 @@ LayerNorm::LayerNorm(FFModel &model, ParallelDim output_dims[MAX_TENSOR_DIM]; int M = 1; for (int i = 0; i < axes.size(); i++) { - M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; + M *= inputs[0]->dims[axes[i]].size; } effective_num_elements = M; - effective_batch_size = inputs[0]->get_shape().get_piece_num_elements() / M; + effective_batch_size = inputs[0]->get_volume() / M; + assert(elementwise_affine == (numWeights == 2)); if (numWeights > 0 && allocate_weights) { - int kernel_dims = 2; - assert(false); - // weights[0] = model.create_parallel_weight_legion_ordering( - // kernel_dims, - } else { - // do nothing + ParallelDim dims[axes.size()]; + for (int i = 0; i < axes.size(); i++) { + dims[i] = inputs[0]->dims[i]; + } + int seed = std::rand(); + Initializer *gamma_initializer = new UniformInitializer(seed, 0.0f, 1.0f); + Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 1.0f); + weights[0] = + model.create_parallel_weight_legion_ordering(axes.size(), + dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + gamma_initializer, + CHOSEN_SYNC_TYPE); + weights[1] = + model.create_parallel_weight_legion_ordering(axes.size(), + dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + beta_initializer, + CHOSEN_SYNC_TYPE); } - return; } void LayerNorm::init(FFModel const &ff) { @@ -221,6 +273,20 @@ void LayerNorm::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(1, FID_DATA); + if (elementwise_affine) { + 
launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -233,6 +299,8 @@ OpMeta *LayerNorm::init_task(Task const *task, LayerNorm *ln = (LayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); LayerNormMeta *meta = new LayerNormMeta(handle, ln); + meta->input_type[0] = ln->inputs[0]->data_type; + meta->output_type[0] = ln->outputs[0]->data_type; return meta; } @@ -292,14 +360,21 @@ void LayerNorm::forward_task(Task const *task, assert(task->regions.size() == regions.size()); float const *in_ptr = NULL; float *out_ptr = NULL, *gamma_ptr = NULL, *beta_ptr = NULL; + GenericTensorAccessorR in; + GenericTensorAccessorW out, gamma, beta; + Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); + // in_ptr = helperGetTensorPointerRO( + // regions[0], task->regions[0], FID_DATA, ctx, runtime); + in = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); + // out_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + out = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(in_domain == out_domain); assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); @@ -307,20 +382,28 @@ void LayerNorm::forward_task(Task const *task, assert(regions.size() == 4); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - gamma_ptr = helperGetTensorPointerRW( - regions[2], task->regions[2], FID_DATA, ctx, runtime); + // gamma_ptr = helperGetTensorPointerRW( + // regions[2], task->regions[2], FID_DATA, ctx, runtime); + gamma = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); Domain beta_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - beta_ptr = helperGetTensorPointerRW( - regions[3], task->regions[3], FID_DATA, ctx, runtime); + // beta_ptr = helperGetTensorPointerRW( + // regions[3], task->regions[3], FID_DATA, ctx, runtime); + beta = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(gamma_domain == beta_domain); assert(gamma_domain.get_volume() == m->effective_num_elements); + int numdims = gamma_domain.get_dim(); + for (int i = 0; i < numdims; i++) { + int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; + int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; + assert(g_d == in_d); + } } else { assert(regions.size() == 2); } - - LayerNorm::forward_kernel_wrapper( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr); + LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); } void LayerNorm::backward(FFModel const &ff) { @@ -447,7 +530,100 
@@ void LayerNorm::backward_task(Task const *task, bool LayerNorm::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { - return false; + ParallelTensorBase sub_output, sub_input; + if (!outputs[0]->get_sub_tensor(mv, sub_output)) { + return false; + } + if (!inputs[0]->get_sub_tensor(mv, sub_input)) { + return false; + } + Domain input_domain = sub_input.get_domain(); + Domain output_domain = sub_output.get_domain(); + LayerNormMeta *m = new LayerNormMeta(sim->handler, this); + + sim->free_all(); + float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + assert(in_ptr != NULL); + GenericTensorAccessorR input1_acc(inputs[0]->data_type, input_domain, in_ptr); + cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + float *out_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + assert(out_ptr != NULL); + GenericTensorAccessorW output_acc( + outputs[0]->data_type, output_domain, out_ptr); + cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + // FIXME please add gamma_ptr and beta_ptr after finish the implementation + float *gamma_ptr = NULL, *beta_ptr = NULL; + GenericTensorAccessorW gamma_acc; + GenericTensorAccessorW beta_acc; + + bool out_of_memory = + (in_ptr == NULL) || (out_ptr == NULL) || + (((gamma_ptr == NULL) || (beta_ptr == NULL)) && (m->elementwise_affine)); + if (out_of_memory) { + cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + return true; + } + + std::function forward, backward; + forward = [&] { + forward_kernel_wrapper(m, input1_acc, output_acc, gamma_acc, beta_acc); + }; + + if (sim->computationMode == COMP_MODE_TRAINING) { + float *in_grad_ptr = + (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + assert(in_grad_ptr != NULL); + cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + float *out_grad_ptr = NULL; + out_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + assert(out_grad_ptr != NULL); + cost_metrics.outputs_memory += + cost_metrics.total_mem_diff_from(sim->offset); + + float *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + + out_of_memory = (in_grad_ptr == NULL) || (out_grad_ptr == NULL) || + (((gamma_grad_ptr == NULL) || (beta_grad_ptr == NULL)) && + (m->elementwise_affine)); + if (out_of_memory) { + cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + return true; + } + + backward = [&] { + backward_kernel_wrapper(m, + out_grad_ptr, + in_ptr, + in_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr); + }; + } + + inner_measure_operator_cost(sim, forward, backward, cost_metrics); + + if (sim->computationMode == COMP_MODE_TRAINING) { + log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) " + "forward_time(%.4lf) backward_time(%.4lf)\n", + name, + sub_output.get_volume(), + cost_metrics.forward_time, + cost_metrics.backward_time); + } else { + log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) " + "forward_time(%.4lf)\n", + name, + sub_output.get_volume(), + cost_metrics.forward_time); + } + + return true; } void LayerNorm::serialize(Legion::Serializer &sez) const { @@ -512,4 +688,4 @@ size_t hash::operator()( hash_combine(key, params.elementwise_affine); return key; } -}; // namespace std +}; // namespace std \ No newline at end of file diff --git a/src/ops/layer_norm.cu 
b/src/ops/layer_norm.cu index ac477ba2ad..f0539f8405 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -13,6 +13,7 @@ * limitations under the License. */ +#include "flexflow/ffconst_utils.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/utils/cuda_helper.h" @@ -30,12 +31,19 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln) effective_num_elements = ln->effective_num_elements; profiling = ln->profiling; eps = ln->eps; - checkCUDA(cudaMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&db_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + DataType data_type = ln->data_type; + checkCUDA( + cudaMalloc(&mean_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&rstd_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&ds_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&db_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&scale_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&bias_ptr, data_type_size(data_type) * effective_batch_size)); } template @@ -77,26 +85,26 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void - RowwiseMomentsCUDAKernel(int64_t N, T eps, T const *X, T *mean, T *rstd) { - __shared__ T m_shared[C10_WARP_SIZE]; - __shared__ T v_shared[C10_WARP_SIZE]; +__global__ void RowwiseMomentsCUDAKernel( + int64_t N, float eps, T const *X, T *mean, T *rstd) { + __shared__ float m_shared[C10_WARP_SIZE]; + __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - T sum1 = 0; - T sum2 = 0; + float sum1 = 0.0f; + float sum2 = 0.0f; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { - const T scale = T(1) / static_cast(N); + float const scale = float(1) / static_cast(N); sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, T(0)); - mean[i] = sum1; - rstd[i] = rsqrt(sum2 + static_cast(eps)); + sum2 = max(sum2 * scale - sum1 * sum1, float(0)); + mean[i] = static_cast(sum1); + rstd[i] = static_cast(rsqrt(sum2 + eps)); } } @@ -130,27 +138,30 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T *gamma_ptr, T *beta_ptr, cudaStream_t stream) { - RowwiseMomentsCUDAKernel + RowwiseMomentsCUDAKernel <<effective_batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( - m->effective_num_elements, m->eps, in_ptr, m->mean_ptr, m->rstd_ptr); - LayerNormForwardCUDAKernel + m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr)); + LayerNormForwardCUDAKernel <<effective_batch_size, kCUDANumThreads, 0, stream>>>( m->effective_num_elements, in_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_ptr, beta_ptr, out_ptr); } /*static*/ 
-template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - T const *in_ptr, - T *out_ptr, - T *gamma_ptr, - T *beta_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorW &gamma, + GenericTensorAccessorW &beta) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -160,8 +171,24 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - LayerNorm::forward_kernel( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream); + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + gamma.get_float_ptr(), + beta.get_float_ptr(), + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel(m, + input.get_half_ptr(), + output.get_half_ptr(), + gamma.get_half_ptr(), + beta.get_half_ptr(), + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -170,8 +197,8 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventDestroy(t_start); cudaEventDestroy(t_end); printf("[LayerNorm] forward time (CF) = %.2fms\n", elapsed); - print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); - print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); } } @@ -352,6 +379,82 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, } } +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] = f_grad_input; + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, N, buf); +} + /*static*/ template void LayerNorm::backward_kernel(LayerNormMeta const *m, @@ -366,17 +469,34 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, const int64_t N = m->effective_num_elements; ComputeInternalGradientsCUDAKernel <<>>( - N, output_grad_ptr, input_ptr, gamma_ptr, m->ds_ptr, m->db_ptr); + N, + output_grad_ptr, + input_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; ComputeGradientFusedParamsCUDAKernel <<>>(M, N, - m->mean_ptr, - m->rstd_ptr, - m->ds_ptr, - m->db_ptr, - m->scale_ptr, - m->bias_ptr); + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly @@ -386,8 +506,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_grad_ptr, beta_grad_ptr); } else { @@ -396,14 +516,15 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, constexpr int kThreadX = kColwiseReduceTileSize; constexpr int kThreadY = kColwiseReduceTileSize / 2; GammaBetaBackwardCUDAKernel - <<>>(M, - N, - output_grad_ptr, - input_ptr, - m->mean_ptr, - m->rstd_ptr, - gamma_grad_ptr, - beta_grad_ptr); + <<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } } } @@ -419,21 +540,18 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, T *beta_grad_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad_ptr, + input_ptr, + input_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr, + stream); + } } -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - float const *in_ptr, - float *out_ptr, - float *gamma_ptr, - float *beta_ptr); template void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, float const *output_grad_ptr, @@ -443,4 +561,4 @@ template void float *gamma_grad_ptr, float *beta_grad_ptr); -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 029b20afd1..ab65db542e 100644 --- a/src/ops/softmax.cc +++ 
b/src/ops/softmax.cc @@ -52,7 +52,10 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(const Tensor _input, int dim, char const *name) { +Tensor FFModel::softmax(const Tensor _input, + int dim, + bool last_layer, + char const *name) { Layer *sm = new Layer(this, OP_SOFTMAX, DT_FLOAT, @@ -69,6 +72,8 @@ Tensor FFModel::softmax(const Tensor _input, int dim, char const *name) { sm->outputs[0] = create_tensor_legion_ordering( numdims, dims, DT_FLOAT, sm, 0, true /*create_grad*/); sm->add_int_property("softmax_dim", dim); + + sm->add_int_property("last_layer", last_layer); layers.push_back(sm); return sm->outputs[0]; } @@ -80,15 +85,19 @@ Op *Softmax::create_operator_from_layer( long long value; layer->get_int_property("softmax_dim", value); int dim = (int)value; + layer->get_int_property("last_layer", value); + bool last_layer = (bool)value; return new Softmax(model, inputs[0], (inputs[0]->num_dims - 1 - dim) % inputs[0]->num_dims, + last_layer, layer->name); } Softmax::Softmax(FFModel &model, const ParallelTensor _input, int _dim, + bool _last_layer, char const *name) : Op(model, OP_SOFTMAX, @@ -98,7 +107,7 @@ Softmax::Softmax(FFModel &model, 0 /*weights*/, 1 /*outputs*/, _input), - dim(_dim) { + dim(_dim), last_layer(_last_layer) { // Currently assume we always perform softmax along the inner most dim assert(dim == 0); ParallelDim dims[MAX_TENSOR_DIM]; @@ -113,7 +122,7 @@ Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, const ParallelTensor input, char const *name) - : Softmax(model, input, params.dim, name) {} + : Softmax(model, input, params.dim, params.last_layer, name) {} void Softmax::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -283,6 +292,13 @@ void Softmax::backward(FFModel const &ff) { EXCLUSIVE, outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); + + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); runtime->execute_index_space(ctx, launcher); } @@ -315,8 +331,8 @@ void Softmax::backward_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); + assert(regions.size() == 3); + assert(task->regions.size() == 3); // const Softmax* softmax = (Softmax*) task->args; SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); TensorAccessorW acc_input_grad(regions[0], @@ -327,11 +343,16 @@ void Softmax::backward_task_with_dim(Task const *task, true /*readOutput*/); TensorAccessorR acc_output_grad( regions[1], task->regions[1], FID_DATA, ctx, runtime); + TensorAccessorR acc_output( + regions[2], task->regions[1], FID_DATA, ctx, runtime); // make sure the image indices match! 
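// (The assert just below still only compares the two gradient regions. The
//  third region added above carries the softmax forward output Y, which the
//  backward pass needs because cuDNN's softmax backward computes dX from both
//  Y and dY; along the softmax dimension the gradient it produces is, roughly,
//
//    dX[i] = Y[i] * (dY[i] - sum_j Y[j] * dY[j])
//
//  For the last layer the loss kernel already emits gradients with respect to
//  the logits, so the matching kernel in src/ops/kernels/softmax.cu simply
//  copies dY into dX instead of calling cuDNN.)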
assert(acc_input_grad.rect == acc_output_grad.rect); - backward_kernel_wrapper( - m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); + backward_kernel_wrapper(m, + acc_input_grad.ptr, + acc_output_grad.ptr, + acc_output.ptr, + acc_input_grad.rect.volume()); } bool Softmax::get_int_parameter(PMParameter para, int *value) const { @@ -377,11 +398,17 @@ bool Softmax::measure_operator_cost(Simulator *sim, float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_grad_ptr != NULL); + float *output_ptr = + (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper( - m, input_grad_ptr, output_grad_ptr, sub_output.get_volume()); + backward_kernel_wrapper(m, + input_grad_ptr, + output_grad_ptr, + output_ptr, + sub_output.get_volume()); }; } @@ -413,6 +440,7 @@ size_t hash::operator()( FlexFlow::SoftmaxParams const ¶ms) const { size_t key = 0; hash_combine(key, params.dim); + hash_combine(key, params.last_layer); return key; } }; // namespace std diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 53e61b90d9..b6004af14a 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -215,7 +215,7 @@ __host__ void int idx = 0; printf("%s", prefix); for (idx = 0; idx < num_elements; idx++) { - printf(" %.4lf", (float)host_ptr[idx]); + printf(" %.10lf", (float)host_ptr[idx]); if (idx >= 16) { break; } @@ -224,6 +224,76 @@ __host__ void checkCUDA(cudaFreeHost(host_ptr)); } +template +__host__ void + save_tensor(T const *ptr, size_t num_elements, char const *file_name) { + T *host_ptr; + checkCUDA(cudaHostAlloc(&host_ptr, + sizeof(T) * num_elements, + cudaHostAllocPortable | cudaHostAllocMapped)); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost)); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%.8f, ", (float)host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(cudaFreeHost(host_ptr)); +} + +cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( + cudnnTensorDescriptor_t tensor, Domain domain, DataType data_type) { + int dims[MAX_TENSOR_DIM]; + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(data_type); + switch (domain.get_dim()) { + case 1: { + Rect<1> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + return cudnnSetTensor4dDescriptor( + tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[0], 1, 1, 1); + } + case 2: { + Rect<2> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + return cudnnSetTensor4dDescriptor( + tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[1], dims[0], 1, 1); + } + case 3: { + Rect<3> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + return cudnnSetTensor4dDescriptor(tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + dims[2] * dims[1], + dims[0], + 1, + 1); + } + case 4: { + Rect<4> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return cudnnSetTensor4dDescriptor(tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + dims[3] * dims[2] * dims[1], + dims[0], + 1, + 1); + } + default: + assert(false && "Unsupported dim number"); + } + return 
CUDNN_STATUS_BAD_PARAM; +} + cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Domain domain) { int dims[MAX_TENSOR_DIM]; @@ -370,3 +440,8 @@ template __host__ void print_tensor(int32_t const *ptr, size_t rect, char const *prefix); template __host__ void print_tensor(int64_t const *ptr, size_t rect, char const *prefix); +template __host__ void + save_tensor(float const *ptr, size_t rect, char const *file_name); +template __host__ void save_tensor(int32_t const *ptr, + size_t rect, + char const *file_name); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 4b55a39104..5310588477 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1718,6 +1718,7 @@ GraphOptimalViewSerialized case OP_SOFTMAX: { Softmax *softmax = (Softmax *)op; sez.serialize(softmax->dim); + sez.serialize(softmax->last_layer); break; } case OP_REPARTITION: { @@ -2098,8 +2099,11 @@ void FFModel::deserialize_graph_optimal_view( case OP_SOFTMAX: { assert(num_inputs == 1); int softmax_dim; + bool last_layer; dez.deserialize(softmax_dim); - node = get_or_create_node(inputs[0], {softmax_dim}); + dez.deserialize(last_layer); + node = + get_or_create_node(inputs[0], {softmax_dim, last_layer}); break; } case OP_TRANSPOSE: { diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 2925eb7555..6d52e135cd 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -1930,8 +1930,8 @@ void GraphSearchHelper::graph_optimize( } } best_graph->print_strategy_computation_graph(optimal.views); - //std::cout << "PCG:" << std::endl; - //best_graph->print_dot(); + // std::cout << "PCG:" << std::endl; + // best_graph->print_dot(); optimal_views = real_optimal_views; } @@ -3120,7 +3120,7 @@ void FFModel::graph_optimize( std::unordered_map &optimal_views) { this->graph_search->graph_optimize( budget, only_data_parallel, best_graph, optimal_views); - best_graph->print_dot(); + best_graph->print_dot(); } bool FFModel::convert_graph_to_operators( @@ -3221,7 +3221,8 @@ bool FFModel::convert_graph_to_operators( case OP_SOFTMAX: { assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; - new_op = new Softmax(*this, inputs[0], softmax->dim, NULL); + new_op = new Softmax( + *this, inputs[0], softmax->dim, softmax->last_layer, NULL); break; } case OP_COMBINE: { From 2d28c15d92461bc8d74c0397f2aaf887952e4b92 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 4 Jul 2023 19:43:48 -0400 Subject: [PATCH 36/72] revert changes --- examples/python/pytorch/mt5/mt5_ff.py | 26 +- include/flexflow/model.h | 6 +- .../flexflow/ops/kernels/softmax_kernels.h | 6 +- include/flexflow/ops/layer_norm.h | 11 +- include/flexflow/ops/softmax.h | 2 - include/flexflow/ops/softmax_params.h | 1 - include/flexflow/utils/cuda_helper.h | 6 - python/flexflow/core/flexflow_cffi.py | 23 +- python/flexflow/torch/model.py | 51 ++-- python/flexflow_c.cc | 7 +- python/flexflow_c.h | 1 - src/loss_functions/loss_functions.cc | 11 +- src/loss_functions/loss_functions.cu | 59 ++-- src/metrics_functions/metrics_functions.cc | 6 +- src/metrics_functions/metrics_functions.cu | 18 +- src/ops/element_unary.cu | 5 +- src/ops/kernels/softmax.cu | 37 +-- src/ops/layer_norm.cc | 254 +++--------------- src/ops/layer_norm.cu | 248 +++++------------ src/ops/softmax.cc | 46 +--- src/runtime/cuda_helper.cu | 77 +----- src/runtime/graph.cc | 6 +- src/runtime/substitution.cc | 9 +- 23 files changed, 196 insertions(+), 720 deletions(-) diff --git a/examples/python/pytorch/mt5/mt5_ff.py 
b/examples/python/pytorch/mt5/mt5_ff.py index c2868e9d1e..08af8d88a7 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -64,8 +64,8 @@ def preprocess_train() -> None: y_shape = y.shape assert len(y.shape) == 2, \ "`y` should have shape (num examples, sequence length)" - y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) - lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) + y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) + lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) y_ids[:, :] = y[:, :-1] lm_labels[:, :] = y[:, 1:] @@ -89,29 +89,26 @@ def top_level_task(): #model = BertModel.from_pretrained("bert-base-uncased") # Load train data as numpy arrays print("Loading data...") - ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")).astype('int32') + ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")) ids = np.pad(ids, ((0,0), (0,17)), 'constant') #ids = np.random.randint(0, 5, (1000, 512)) #print('ids_shape', ids.shape) #print('ids', ids) - mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")).astype('int32') + mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")) mask = np.pad(mask, ((0,0), (0,17)), 'constant') #mask = np.random.randint(0, 2, (1000, 512)) #y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) - lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")).astype('int32') + lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")) lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant') #lm_labels = np.random.randint(-1, 5, (1000, 512)) - position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() - token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() - batch_size = ffconfig.batch_size input_ids_shape = (batch_size, ids.shape[1]) attention_mask_shape = (batch_size, mask.shape[1]) #decoder_input_ids_shape = (batch_size, y_ids.shape[1]) input_tensors = [ - ffmodel.create_tensor(input_ids_shape, DataType.DT_INT32), # input_ids - ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT32), # attention_mask + ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids + ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask #ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids ] encoder_seq_length = ids.shape[1] @@ -129,7 +126,7 @@ def top_level_task(): output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True) #from flexflow.torch.model import file_to_ff #file_to_ff("mt5.ff", ffmodel, input_tensors) - ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, beta1=0.9, beta2=0.98, weight_decay=0.0, epsilon=2e-8) + ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, adam_beta1=0.9, adam_beta2=0.98, weight_decay=0.0, adam_epsilon=2e-8) # ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) print("Compiling the model...") @@ -141,9 +138,6 @@ def top_level_task(): MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, ], ) - - # load weights here - ffmodel.load_bert_pretrained(checkpoint=model) print("Creating data loaders...") print('id_dtype', ids.dtype) @@ -154,8 +148,6 @@ def top_level_task(): #decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) # NOTE: We cast down the label tensor data to 32-bit to accommodate the # label tensor's required dtype - token_type_ids_dl = ffmodel.create_data_loader(input_tensors[2], token_type_ids) - position_id_dl = 
ffmodel.create_data_loader(input_tensors[3], position_id) labels_dl = ffmodel.create_data_loader( ffmodel.label_tensor, lm_labels.astype("int32") ) @@ -167,7 +159,7 @@ def top_level_task(): epochs = ffconfig.epochs ffmodel.fit( #x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], - x=[input_ids_dl, attention_mask_dl, position_id_dl, token_type_ids_dl], + x=[input_ids_dl, attention_mask_dl], y=labels_dl, batch_size=batch_size, epochs=epochs, ) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 0496d5fa8f..c6bc6929ad 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -472,7 +472,6 @@ class FFModel { std::vector const &axes, bool elementwise_affine, float eps, - DataType data_type = DT_NONE, char const *name = NULL); // Add a batch_norm layer Tensor @@ -519,10 +518,7 @@ class FFModel { // Add a flat layer Tensor flat(const Tensor input, char const *name = NULL); // Add a softmax layer - Tensor softmax(const Tensor input, - int dim = -1, - bool last_layer = false, - char const *name = NULL); + Tensor softmax(const Tensor input, int dim = -1, char const *name = NULL); // Create input tensors and constants Tensor transpose(const Tensor input, std::vector const &perm, diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 9aec9f57c9..81b34d8558 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -20,7 +20,6 @@ class SoftmaxMeta : public OpMeta { #endif bool profiling; int dim; - bool last_layer; char op_name[MAX_OPNAME]; }; @@ -34,7 +33,6 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, - float const *output_ptr, size_t num_elements); namespace Internal { @@ -42,10 +40,8 @@ void forward_kernel(SoftmaxMeta const *m, float const *input_ptr, float *output_ptr, ffStream_t stream); -void backward_kernel(SoftmaxMeta const *m, - float *input_grad_ptr, +void backward_kernel(float *input_grad_ptr, float const *output_grad_ptr, - float const *output_ptr, size_t num_elements, ffStream_t stream); } // namespace Internal diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 552b9cf365..8273b9ab52 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -66,11 +66,12 @@ class LayerNorm : public Op { T *gamma_ptr, T *beta_ptr, ffStream_t stream); + template static void forward_kernel_wrapper(LayerNormMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, - GenericTensorAccessorW &gamma, - GenericTensorAccessorW &beta); + T const *input_ptr, + T *output_ptr, + T *gamma_ptr, + T *beta_ptr); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, @@ -104,7 +105,7 @@ class LayerNormMeta : public OpMeta { bool elementwise_affine; int64_t effective_batch_size, effective_num_elements; float eps; - void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; + float *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; char op_name[MAX_OPNAME]; }; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 2616294a3a..25a20315bd 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -15,7 +15,6 @@ class Softmax : public Op { Softmax(FFModel &model, const ParallelTensor logit, int dim, - bool _last_layer, char const *name); Softmax(FFModel &model, 
Params const ¶ms, @@ -65,7 +64,6 @@ class Softmax : public Op { public: int dim; - bool last_layer; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/softmax_params.h b/include/flexflow/ops/softmax_params.h index 545e3a5cb9..d805d9966d 100644 --- a/include/flexflow/ops/softmax_params.h +++ b/include/flexflow/ops/softmax_params.h @@ -7,7 +7,6 @@ namespace FlexFlow { struct SoftmaxParams { int dim; - bool last_layer; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(SoftmaxParams const &, SoftmaxParams const &); diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index a4b2be0a66..46e323b186 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -132,15 +132,9 @@ __host__ void updateGAS(float *para_ptr, template void print_tensor(T const *ptr, size_t num_elements, char const *prefix); -template -void save_tensor(T const *ptr, size_t num_elements, char const *file_name); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain); -cudnnStatus_t - cudnnSetTensorDescriptorFromDomain4SoftMax(cudnnTensorDescriptor_t tensor, - Legion::Domain domain, - DataType data_type = DT_FLOAT); cudaDataType_t ff_to_cuda_datatype(DataType type); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 4c01057109..42339d781c 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1595,7 +1595,7 @@ def flat(self, input, name=None): self.add_layer(OpType.FLAT, name) return Tensor(handle, owner_op_type=OpType.FLAT) - def softmax(self, input, axis=-1, last_layer=False, name=None): + def softmax(self, input, axis=-1, name=None): """Softmax activation function. :param input: the input Tensor. @@ -1607,7 +1607,7 @@ def softmax(self, input, axis=-1, last_layer=False, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, last_layer, c_name) + handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, c_name) self.add_layer(OpType.SOFTMAX, name) return Tensor(handle, owner_op_type=OpType.SOFTMAX) @@ -2041,25 +2041,6 @@ def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): ff_tensor.set_tensor(self, np_tensor) print("Compiled ffmodel!") - def load_bert_pretrained(self, checkpoint=None): - # store weights in dict - weights_dict = {} - for name, params in checkpoint.named_parameters(): - weights_dict[name.replace("LayerNorm", "layer_norm").replace(".", "_")] = params.detach().cpu().numpy() - print(name.replace("LayerNorm", "layer_norm").replace(".", "_")) - # some weights not in params - weights_dict['cls_predictions_decoder_weight'] = checkpoint.cls.predictions.decoder.weight.detach().cpu().numpy() - weights_dict['cls_predictions_decoder_bias'] = checkpoint.cls.predictions.decoder.bias.detach().cpu().numpy() - for i in range (self._nb_layers): - layer = self._layers[i] - if (layer.name + "_weight") in weights_dict: - print('weight: ' + layer.name) - weight = layer.get_parameter_by_id(0); - weight.set_tensor(self, weights_dict[layer.name + "_weight"]) - if (layer.name + "_bias") in weights_dict: - print('bias: ' + layer.name) - bias = layer.get_parameter_by_id(1); - bias.set_tensor(self, weights_dict[layer.name + "_bias"]) def fit(self, x=None, y=None, batch_size=None, epochs=1): """Trains the model for a fixed number of epochs (iterations on a dataset). diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 8ebac2146c..11e0c16e48 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -664,13 +664,12 @@ def string_to_ff(string, ffmodel, node_to_output): def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] - axes = [0] - eps = self.module.eps + axes = [len(input_tensor.dims) - 1] return ffmodel.layer_norm( input=input_tensor, axes=axes, elementwise_affine=True, - eps=eps, + eps=1e-6, name=self.name, ) @@ -1198,24 +1197,16 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, inplace=False, name=name, + input=input_tensor, scalar=scalar, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) - if self.scalar_pos == FunctionNode.ScalarPosition.RIGHT: - return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, inplace=False, name=self.name, - ) - else: - negative_input = ffmodel.scalar_multiply( - input=input_tensor, scalar=-1, inplace=False, name=self.name + '_negative', - ) - return ffmodel.scalar_sub( - input=negative_input, scalar=-scalar, inplace=False, name=self.name, - ) - + return ffmodel.scalar_sub( + input=input_tensor, scalar=scalar, name=self.name, + ) + class ScalarTrueDivNode(FunctionNode): def __init__(self, node): @@ -1240,16 +1231,15 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, inplace=False, name=name, + input=input_tensor, scalar=scalar, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] scalar = self.innodes[1] assert 
type(scalar) is float - return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, inplace=False, name=self.name, + input=input_tensor, scalar=scalar, name=self.name, ) @@ -1662,14 +1652,14 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, inplace=False, name=name, + input=input_tensor, scalar=scalar, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, inplace=False, name=self.name, + input=input_tensor, scalar=scalar, name=self.name, ) @@ -2369,13 +2359,11 @@ def string_to_ff(string, ffmodel, node_to_output): "since attributes require access to the PyTorch model" ) - def to_ff(self, ffmodel, node_to_output, input_tensors): - return self.attr_to_ff_tensor(ffmodel, input_tensors) - - def attr_to_ff_tensor(self, ffmodel, input_tensors): - + def to_ff(self, ffmodel, node_to_output): + return self.attr_to_ff_tensor(ffmodel) - torch_tensor = self.attr + def attr_to_ff_tensor(self, ffmodel): + torch_tensor = self.attr assert (torch_tensor.shape[0] == 1) batch_size = ffmodel._ffconfig.batch_size torch_tensor = np.repeat(torch_tensor, batch_size, axis=0) @@ -2394,16 +2382,15 @@ def attr_to_ff_tensor(self, ffmodel, input_tensors): np_tensor = np_tensor.astype(np.float32) print('attr: ', torch_tensor.shape) - assert (torch_tensor.shape[0] == batch_size) + assert (torch_tensor.shape[0] == batch_size) ff_tensor = ffmodel.create_tensor( - torch_tensor.shape, ff_dtype, True, + torch_tensor.shape, ff_dtype, requires_grad, ) # delay set_tensor, add to ffmodel ffmodel.attr_tensors[ff_tensor] = np_tensor # ff_tensor.set_tensor( # ffmodel, np_tensor # ) - input_tensors.append(ff_tensor) return ff_tensor @@ -2485,7 +2472,7 @@ def to_ff(self, ffmodel, node_to_output, output_tensors): # `CrossEntropyLoss()` implementation logits = node_to_output[other["logits"].name] softmax_logits = ffmodel.softmax( - input=logits, last_layer=True, name=self.name, + input=logits, name=self.name, ) output_tensors[:] += [softmax_logits] else: @@ -2619,8 +2606,6 @@ def torch_to_ff(self, ffmodel, input_tensors, verbose=False): elif isinstance(node, OutputNode): node.to_ff(ffmodel, node_to_output, output_tensors) node_output = None - elif isinstance(node, AttributeNode): - node_output = node.to_ff(ffmodel, node_to_output, input_tensors) else: node_output = node.to_ff(ffmodel, node_to_output) diff --git a/python/flexflow_c.cc b/python/flexflow_c.cc index fd688c0c6a..74a5da6ce1 100644 --- a/python/flexflow_c.cc +++ b/python/flexflow_c.cc @@ -568,8 +568,8 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, for (int i = 0; i < n; i++) { axes_vec.push_back(axes[i]); } - Tensor tensor = handle->layer_norm( - input, axes_vec, elementwise_affine, eps, input->data_type, name); + Tensor tensor = + handle->layer_norm(input, axes_vec, elementwise_affine, eps, name); DEBUG_PRINT("[LayerNorm] new Tensor %p, input %p, elementwise_affine %d, eps " "%f, name %s", tensor, @@ -730,11 +730,10 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_, const flexflow_tensor_t input_, int dim, - bool last_layer, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor 
tensor = handle->softmax(input, dim, last_layer, name); + Tensor tensor = handle->softmax(input, dim, name); DEBUG_PRINT( "[Softmax] new Tensor %p, input %p, name %s", tensor, input, name); return FFCObjectWrapper::wrap(tensor); diff --git a/python/flexflow_c.h b/python/flexflow_c.h index 5409002e5e..fb64c78fd2 100644 --- a/python/flexflow_c.h +++ b/python/flexflow_c.h @@ -276,7 +276,6 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle, const flexflow_tensor_t input, int dim, - bool last_layer, char const *name); flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle, diff --git a/src/loss_functions/loss_functions.cc b/src/loss_functions/loss_functions.cc index d887ee9243..ae89c3d469 100644 --- a/src/loss_functions/loss_functions.cc +++ b/src/loss_functions/loss_functions.cc @@ -49,8 +49,6 @@ void Loss::backward(FFModel *model, if (loss_type == LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE) { assert(logit->get_volume() == label->get_volume()); scale_factor = 2.0f / logit->get_volume(); - } else if (loss_type == LOSS_SPARSE_CATEGORICAL_CROSSENTROPY) { - scale_factor = 1.0f; } else { scale_factor = 1.0f / model->config.batchSize; } @@ -133,12 +131,9 @@ void Loss::backward_task_with_dim(Task const *task, regions[2], task->regions[2], FID_DATA, ctx, runtime); // assertion the outter-most dim is replica dim and replica degree is 1 assert(acc_logit.rect.hi[NDIM - 1] == acc_logit.rect.lo[NDIM - 1]); - - int num_classes = acc_logit.rect.hi[0] - acc_logit.rect.lo[0] + 1; - int num_samples = acc_logit.rect.volume() / num_classes; - // int num_samples = - // acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; - // int num_classes = acc_logit.rect.volume() / num_samples; + int num_samples = + acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; + int num_classes = acc_logit.rect.volume() / num_samples; assert(acc_logit_grad.rect == acc_logit.rect); int k = 1; if (loss->repl_labels) { diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index edd8f03fa4..01766347b0 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -18,7 +18,6 @@ namespace FlexFlow { -int const MASK_TOKEN = -100; using namespace Legion; __global__ void @@ -33,25 +32,6 @@ __global__ void } } -__global__ void - sparse_categorical_crossentropy_loss_backward_with_mask(float *logit_grad, - int const *label, - coord_t num_samples, - coord_t num_classes, - int const k, - float *num) { - CUDA_KERNEL_LOOP(i, num_samples * num_classes) { - int sample_id = i / num_classes; - int label_idx = label[i / (k * num_classes)]; - if (label_idx != MASK_TOKEN && (i == sample_id * num_classes + label_idx)) { - logit_grad[i] -= 1.0f; - atomicAdd(&num[0], 1.0f); - } else if (label_idx == MASK_TOKEN) { - logit_grad[i] = 0.0f; - } - } -} - __global__ void categorical_crossentropy_loss_backward(float *logit_grad, float const *logit, float const *label, @@ -94,25 +74,14 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_ptr, logit_volume * sizeof(float), cudaMemcpyDeviceToDevice)); - // calculate the scale factor inside kernel; - assert(scale_factor == 1.0f); - float *num; - checkCUDA(cudaMalloc(&num, sizeof(float))); - float effective_tokens; - int parallelism = num_samples * num_classes; - // sparse_categorical_crossentropy_loss_backward<<>>( - // logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); - 
sparse_categorical_crossentropy_loss_backward_with_mask<<< - GET_BLOCKS(parallelism), - CUDA_NUM_THREADS, - 0, - stream>>>(logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); - cudaMemcpy(&effective_tokens, num, sizeof(float), cudaMemcpyDeviceToHost); + sparse_categorical_crossentropy_loss_backward<<>>( + logit_grad_ptr, label_ptr, num_samples, num_classes, k); + // Scale logit gradients by op->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, 1.0f / effective_tokens); + logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( @@ -153,17 +122,19 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( logit_grad_ptr, logit_grad_volume, 0, scale_factor); } -void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, - float const *loss_ptr, - size_t loss_volume, - size_t loss_grad_volume, - float scale_factor) { +void Loss::identity_loss_backward_kernel_wrapper( + float *loss_grad_ptr, + float const *loss_ptr, + size_t loss_volume, + size_t loss_grad_volume, + float scale_factor) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); identity_loss_backward<<>>(loss_grad_ptr, loss_ptr, loss_volume); + stream>>>( + loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( loss_grad_ptr, loss_grad_volume, 0, scale_factor); diff --git a/src/metrics_functions/metrics_functions.cc b/src/metrics_functions/metrics_functions.cc index 8c7e23ad8a..7244b06925 100644 --- a/src/metrics_functions/metrics_functions.cc +++ b/src/metrics_functions/metrics_functions.cc @@ -91,8 +91,8 @@ void Metrics::compute(FFModel *model, false /*must*/, 0 /*mapper_id*/, logit->machine_view.hash()); - // std::cout << "logit shape: " << logit->get_shape() << std::endl; - // std::cout << "label shape: " << label->get_shape() << std::endl; + std::cout << "logit shape: " << logit->get_shape() << std::endl; + std::cout << "label shape: " << label->get_shape() << std::endl; launcher.add_region_requirement(RegionRequirement( logit->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, logit->region)); launcher.add_field(0, FID_DATA); @@ -157,7 +157,7 @@ PerfMetrics assert(acc_label.rect.lo[0] == acc_label.rect.hi[0]); // Cannot measure categorical_crossentropy w/ sparse labels // Use measure_sparse_categorical_crossentropy instead - // std::cout << "num_classes: " << num_classes << std::endl; + std::cout << "num_classes: " << num_classes << std::endl; assert(!me->measure_categorical_crossentropy); Metrics::update_metrics_sparse_label_kernel_wrapper(acc_logit.ptr, acc_label.ptr, diff --git a/src/metrics_functions/metrics_functions.cu b/src/metrics_functions/metrics_functions.cu index 8c584c397c..b68b10d873 100644 --- a/src/metrics_functions/metrics_functions.cu +++ b/src/metrics_functions/metrics_functions.cu @@ -19,7 +19,6 @@ namespace FlexFlow { float const LOG_MIN_VALUE = 0.00000001f; -int const MASK_TOKEN = -100; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, @@ -30,7 +29,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = 0; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { @@ -39,19 +38,14 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, } } assert(my_label >= 0); - if (labels[b] != 
MASK_TOKEN) { - atomicAdd(&(perf->train_all), 1); - if (labels[b] == my_label) { - atomicAdd(&(perf->train_correct), 1); - } + atomicAdd(&(perf->train_all), 1); + if (labels[b] == my_label) { + atomicAdd(&(perf->train_correct), 1); } } if (metrics.measure_sparse_categorical_crossentropy) { - if (labels[b] != MASK_TOKEN) { - float my_logit = - max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); - atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); - } + float my_logit = max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); } if (metrics.measure_mean_squared_error || metrics.measure_root_mean_squared_error || diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index 187e60282f..d6e5bcfdc3 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -202,9 +202,8 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * erfc(-input[i] * M_SQRT1_2) + - 0.5 * M_SQRT1_2 * input[i] * - ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5f)))); + (0.5 * erfc(-input[i] * M_SQRT1_2) - + 0.5 * M_SQRT1_2 * input[i] * exp(-input[i] * input[i] * 0.5))); break; } case OP_RSQRT: { diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index e163c9a0c7..d83d9952c9 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -26,10 +26,8 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, Domain const &input_domain) : OpMeta(handler) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); - checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( - inputTensor, input_domain, softmax->data_type)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); dim = softmax->dim; - last_layer = softmax->last_layer; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); } @@ -68,7 +66,6 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, - float const *output_ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -80,7 +77,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, cudaEventRecord(t_start, stream); } Internal::backward_kernel( - m, input_grad_ptr, output_grad_ptr, output_ptr, num_elements, stream); + input_grad_ptr, output_grad_ptr, num_elements, stream); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -116,33 +113,15 @@ void forward_kernel(SoftmaxMeta const *m, output_ptr)); } -void backward_kernel(SoftmaxMeta const *m, - float *input_grad_ptr, +void backward_kernel(float *input_grad_ptr, float const *output_grad_ptr, - float const *output_ptr, size_t num_elements, cudaStream_t stream) { - - if (m->last_layer) { - checkCUDA(cudaMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(float), - cudaMemcpyDeviceToDevice, - stream)); - } else { - float alpha = 1.0f, beta = 0.0f; - checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - m->inputTensor, - output_ptr, - m->inputTensor, - output_grad_ptr, - &beta, - m->inputTensor, - input_grad_ptr)); - } + checkCUDA(cudaMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); } } // namespace Internal diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 915041d2bb..e8c65b4b03 100644 --- a/src/ops/layer_norm.cc +++ 
b/src/ops/layer_norm.cc @@ -61,27 +61,10 @@ Tensor FFModel::layer_norm(const Tensor input, std::vector const &axes, bool elementwise_affine, float eps, - DataType data_type, char const *name) { - // In PyTorch, axes must be the sizes of the last axes.size() dimensions of - // the input tensor. However, since the tensor dimensions are reversed in - // FlexFlow (batch size is the last dimension), we require that axes must be - // the sizes of the FIRST axes.size() dimensions of the input tensor. - - // Another difference is that in PyTorch, the axes vector should contain the - // sizes of the dimensions with respect to which you want to compute the - // layernorm. In FlexFlow, instead, axes should contain the INDICES of the - // dimensions in question. We do this because the size of a dimension might be - // different when splitting a tensor in model parallelism. - assert( - axes.size() <= input->num_dims && - "number of axes must be less than tensor dimensions"); // input does not - // have replica - // dimension here - for (int i = 0; i < axes.size(); i++) { - assert(axes[i] == i && "axes must be the first axes.size() dimensions"); - } -#ifdef DEADCODE + // FIXME: currently disable elementwise_affine + elementwise_affine = false; + // axes must be the last axes.size() dimensions for (int i = 0; i < axes.size(); i++) { bool found = false; for (int j = 0; j < axes.size(); j++) { @@ -93,33 +76,15 @@ Tensor FFModel::layer_norm(const Tensor input, assert(false && "axes must be the last axes.size() dimensions"); } } -#endif - if (data_type == DT_NONE) { - data_type = input->data_type; - } int num_weights = elementwise_affine ? 2 : 0; - Layer *ln = nullptr; - if (data_type != input->data_type) { - Tensor casted_input = cast(input, data_type, "type cast for layer_norm"); - ln = new Layer(this, - OP_LAYERNORM, - data_type, - name, - 1 /*inputs*/, - num_weights, - 1 /*outputs*/, - casted_input); - } else { - ln = new Layer(this, - OP_LAYERNORM, - data_type, - name, - 1 /*inputs*/, - num_weights, - 1 /*outputs*/, - input); - } - + Layer *ln = new Layer(this, + OP_LAYERNORM, + DT_FLOAT, + name, + 1 /*inputs*/, + num_weights, + 1 /*outputs*/, + input); ln->outputs[0] = create_tensor_legion_ordering(input->num_dims, input->dims, input->data_type, @@ -127,19 +92,19 @@ Tensor FFModel::layer_norm(const Tensor input, 0, true /*create_grad*/); if (num_weights == 2) { - int numdims = axes.size(); - int dims[numdims]; - for (int i = 0; i < numdims; i++) { - dims[i] = input->dims[axes[i]]; + int M = 1; + for (int i = 0; i < axes.size(); i++) { + M *= input->dims[input->num_dims - 1 - axes[i]]; } - ln->weights[0] = create_weight_legion_ordering(numdims, + int dims[1] = {M}; + ln->weights[0] = create_weight_legion_ordering(1, dims, input->data_type, ln, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); - ln->weights[1] = create_weight_legion_ordering(numdims, + ln->weights[1] = create_weight_legion_ordering(1, dims, input->data_type, ln, @@ -214,36 +179,19 @@ LayerNorm::LayerNorm(FFModel &model, ParallelDim output_dims[MAX_TENSOR_DIM]; int M = 1; for (int i = 0; i < axes.size(); i++) { - M *= inputs[0]->dims[axes[i]].size; + M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; } effective_num_elements = M; - effective_batch_size = inputs[0]->get_volume() / M; - assert(elementwise_affine == (numWeights == 2)); + effective_batch_size = inputs[0]->get_shape().get_piece_num_elements() / M; if (numWeights > 0 && allocate_weights) { - ParallelDim dims[axes.size()]; - for (int i = 0; i < axes.size(); i++) { - 
dims[i] = inputs[0]->dims[i]; - } - int seed = std::rand(); - Initializer *gamma_initializer = new UniformInitializer(seed, 0.0f, 1.0f); - Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 1.0f); - weights[0] = - model.create_parallel_weight_legion_ordering(axes.size(), - dims, - _input->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - gamma_initializer, - CHOSEN_SYNC_TYPE); - weights[1] = - model.create_parallel_weight_legion_ordering(axes.size(), - dims, - _input->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - beta_initializer, - CHOSEN_SYNC_TYPE); + int kernel_dims = 2; + assert(false); + // weights[0] = model.create_parallel_weight_legion_ordering( + // kernel_dims, + } else { + // do nothing } + return; } void LayerNorm::init(FFModel const &ff) { @@ -273,20 +221,6 @@ void LayerNorm::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(1, FID_DATA); - if (elementwise_affine) { - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(3, FID_DATA); - } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -299,8 +233,6 @@ OpMeta *LayerNorm::init_task(Task const *task, LayerNorm *ln = (LayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); LayerNormMeta *meta = new LayerNormMeta(handle, ln); - meta->input_type[0] = ln->inputs[0]->data_type; - meta->output_type[0] = ln->outputs[0]->data_type; return meta; } @@ -360,21 +292,14 @@ void LayerNorm::forward_task(Task const *task, assert(task->regions.size() == regions.size()); float const *in_ptr = NULL; float *out_ptr = NULL, *gamma_ptr = NULL, *beta_ptr = NULL; - GenericTensorAccessorR in; - GenericTensorAccessorW out, gamma, beta; - Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - // in_ptr = helperGetTensorPointerRO( - // regions[0], task->regions[0], FID_DATA, ctx, runtime); - in = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + in_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - // out_ptr = helperGetTensorPointerWO( - // regions[1], task->regions[1], FID_DATA, ctx, runtime); - out = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + out_ptr = helperGetTensorPointerWO( + regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(in_domain == out_domain); assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); @@ -382,28 +307,20 @@ void LayerNorm::forward_task(Task const *task, assert(regions.size() == 4); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - // gamma_ptr = helperGetTensorPointerRW( - // regions[2], task->regions[2], FID_DATA, ctx, runtime); - gamma = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + gamma_ptr = helperGetTensorPointerRW( + regions[2], task->regions[2], FID_DATA, ctx, runtime); Domain beta_domain = 
runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - // beta_ptr = helperGetTensorPointerRW( - // regions[3], task->regions[3], FID_DATA, ctx, runtime); - beta = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + beta_ptr = helperGetTensorPointerRW( + regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(gamma_domain == beta_domain); assert(gamma_domain.get_volume() == m->effective_num_elements); - int numdims = gamma_domain.get_dim(); - for (int i = 0; i < numdims; i++) { - int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; - int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; - assert(g_d == in_d); - } } else { assert(regions.size() == 2); } - LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); + + LayerNorm::forward_kernel_wrapper( + m, in_ptr, out_ptr, gamma_ptr, beta_ptr); } void LayerNorm::backward(FFModel const &ff) { @@ -530,100 +447,7 @@ void LayerNorm::backward_task(Task const *task, bool LayerNorm::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { - ParallelTensorBase sub_output, sub_input; - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - Domain input_domain = sub_input.get_domain(); - Domain output_domain = sub_output.get_domain(); - LayerNormMeta *m = new LayerNormMeta(sim->handler, this); - - sim->free_all(); - float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(in_ptr != NULL); - GenericTensorAccessorR input1_acc(inputs[0]->data_type, input_domain, in_ptr); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - float *out_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(out_ptr != NULL); - GenericTensorAccessorW output_acc( - outputs[0]->data_type, output_domain, out_ptr); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - // FIXME please add gamma_ptr and beta_ptr after finish the implementation - float *gamma_ptr = NULL, *beta_ptr = NULL; - GenericTensorAccessorW gamma_acc; - GenericTensorAccessorW beta_acc; - - bool out_of_memory = - (in_ptr == NULL) || (out_ptr == NULL) || - (((gamma_ptr == NULL) || (beta_ptr == NULL)) && (m->elementwise_affine)); - if (out_of_memory) { - cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - return true; - } - - std::function forward, backward; - forward = [&] { - forward_kernel_wrapper(m, input1_acc, output_acc, gamma_acc, beta_acc); - }; - - if (sim->computationMode == COMP_MODE_TRAINING) { - float *in_grad_ptr = - (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(in_grad_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - float *out_grad_ptr = NULL; - out_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(out_grad_ptr != NULL); - cost_metrics.outputs_memory += - cost_metrics.total_mem_diff_from(sim->offset); - - float *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; - - out_of_memory = (in_grad_ptr == NULL) || (out_grad_ptr == NULL) || - (((gamma_grad_ptr == NULL) || (beta_grad_ptr == NULL)) && - (m->elementwise_affine)); - if (out_of_memory) { - cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - return true; - } - - backward = 
[&] { - backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); - }; - } - - inner_measure_operator_cost(sim, forward, backward, cost_metrics); - - if (sim->computationMode == COMP_MODE_TRAINING) { - log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) " - "forward_time(%.4lf) backward_time(%.4lf)\n", - name, - sub_output.get_volume(), - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) " - "forward_time(%.4lf)\n", - name, - sub_output.get_volume(), - cost_metrics.forward_time); - } - - return true; + return false; } void LayerNorm::serialize(Legion::Serializer &sez) const { @@ -688,4 +512,4 @@ size_t hash::operator()( hash_combine(key, params.elementwise_affine); return key; } -}; // namespace std \ No newline at end of file +}; // namespace std diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index f0539f8405..ac477ba2ad 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -13,7 +13,6 @@ * limitations under the License. */ -#include "flexflow/ffconst_utils.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/utils/cuda_helper.h" @@ -31,19 +30,12 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln) effective_num_elements = ln->effective_num_elements; profiling = ln->profiling; eps = ln->eps; - DataType data_type = ln->data_type; - checkCUDA( - cudaMalloc(&mean_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&rstd_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&ds_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&db_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&scale_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&bias_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA(cudaMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); + checkCUDA(cudaMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); + checkCUDA(cudaMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); + checkCUDA(cudaMalloc(&db_ptr, sizeof(float) * effective_batch_size)); + checkCUDA(cudaMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); + checkCUDA(cudaMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); } template @@ -85,26 +77,26 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void RowwiseMomentsCUDAKernel( - int64_t N, float eps, T const *X, T *mean, T *rstd) { - __shared__ float m_shared[C10_WARP_SIZE]; - __shared__ float v_shared[C10_WARP_SIZE]; +__global__ void + RowwiseMomentsCUDAKernel(int64_t N, T eps, T const *X, T *mean, T *rstd) { + __shared__ T m_shared[C10_WARP_SIZE]; + __shared__ T v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - float sum1 = 0.0f; - float sum2 = 0.0f; + T sum1 = 0; + T sum2 = 0; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { - float const scale = float(1) / static_cast(N); + const T scale = T(1) / static_cast(N); sum1 *= 
scale; - sum2 = max(sum2 * scale - sum1 * sum1, float(0)); - mean[i] = static_cast(sum1); - rstd[i] = static_cast(rsqrt(sum2 + eps)); + sum2 = max(sum2 * scale - sum1 * sum1, T(0)); + mean[i] = sum1; + rstd[i] = rsqrt(sum2 + static_cast(eps)); } } @@ -138,30 +130,27 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T *gamma_ptr, T *beta_ptr, cudaStream_t stream) { - RowwiseMomentsCUDAKernel + RowwiseMomentsCUDAKernel <<effective_batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( - m->effective_num_elements, - m->eps, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr)); - LayerNormForwardCUDAKernel + m->effective_num_elements, m->eps, in_ptr, m->mean_ptr, m->rstd_ptr); + LayerNormForwardCUDAKernel <<effective_batch_size, kCUDANumThreads, 0, stream>>>( m->effective_num_elements, in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), + m->mean_ptr, + m->rstd_ptr, gamma_ptr, beta_ptr, out_ptr); } /*static*/ +template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, - GenericTensorAccessorW &gamma, - GenericTensorAccessorW &beta) { + T const *in_ptr, + T *out_ptr, + T *gamma_ptr, + T *beta_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -171,24 +160,8 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - if (m->input_type[0] == DT_FLOAT) { - LayerNorm::forward_kernel(m, - input.get_float_ptr(), - output.get_float_ptr(), - gamma.get_float_ptr(), - beta.get_float_ptr(), - stream); - } else if (m->input_type[0] == DT_HALF) { - LayerNorm::forward_kernel(m, - input.get_half_ptr(), - output.get_half_ptr(), - gamma.get_half_ptr(), - beta.get_half_ptr(), - stream); - } else { - assert(false && "unsupport datatype in layernorm"); - } - + LayerNorm::forward_kernel( + m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -197,8 +170,8 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventDestroy(t_start); cudaEventDestroy(t_end); printf("[LayerNorm] forward time (CF) = %.2fms\n", elapsed); - // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); - // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); } } @@ -379,82 +352,6 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, } } -template -__device__ __inline__ void compute_gI(T const *__restrict__ dY, - T const *__restrict__ X, - T const *__restrict__ mean, - T const *__restrict__ rstd, - T const *__restrict__ gamma, - T *dX, - int const N, - T *buf) { - auto const i1 = blockIdx.x; - const T mean_val = mean[i1]; - const T rstd_val = rstd[i1]; - T stats_x1{0}, stats_x2{0}; - constexpr int unroll = 4; - auto l = unroll * threadIdx.x; - T const *X_i = X + i1 * N; - T const *dY_i = dY + i1 * N; - T *dX_i = dX + i1 * N; - // vectorized reads don't improve perf, so use regular unrolling - - for (; l + unroll - 1 < N; l += blockDim.x * unroll) { -#pragma unroll - for (int k = 0; k < unroll; k++) { - T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l + k]) : T(1); - const T c_h = static_cast(X_i[l + k]); - const T c_loss = static_cast(dY_i[l + k]); - stats_x1 += c_loss * gamma_val; - stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; - } - } - for (; l < N; l++) { - T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); - const T c_h = static_cast(X_i[l]); - const T c_loss = static_cast(dY_i[l]); - stats_x1 += c_loss * gamma_val; - stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; - } - - stats_x1 = BlockReduceSum(stats_x1, buf); - stats_x2 = BlockReduceSum(stats_x2, buf); - if (threadIdx.x == 0) { - buf[0] = stats_x1; - buf[1] = stats_x2; - } - __syncthreads(); - stats_x1 = buf[0]; - stats_x2 = buf[1]; - T fH = N; - T term1 = (T(1) / fH) * rstd_val; - - for (int l = threadIdx.x; l < N; l += blockDim.x) { - const T x = X_i[l]; - const T dy = dY_i[l]; - T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); - T f_grad_input = fH * gamma_val * dy; - f_grad_input -= (x - mean_val) * rstd_val * stats_x2; - f_grad_input -= stats_x1; - f_grad_input *= term1; - dX_i[l] = f_grad_input; - } -} - -template -__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, - T const *__restrict__ X, - T const *__restrict__ mean, - T const *__restrict__ rstd, - T const *__restrict__ gamma, - T *dX, - int const N) { - alignas(sizeof(double)) extern __shared__ char s_data1[]; - T *buf = reinterpret_cast(&s_data1); - - compute_gI(dY, X, mean, rstd, gamma, dX, N, buf); -} - /*static*/ template void LayerNorm::backward_kernel(LayerNormMeta const *m, @@ -469,34 +366,17 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, const int64_t N = m->effective_num_elements; ComputeInternalGradientsCUDAKernel <<>>( - N, - output_grad_ptr, - input_ptr, - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); + N, output_grad_ptr, input_ptr, gamma_ptr, m->ds_ptr, m->db_ptr); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; ComputeGradientFusedParamsCUDAKernel <<>>(M, N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); - int const warp_size = C10_WARP_SIZE; - int const num_threads = 128; - const dim3 blocks(M); - int nshared = (num_threads / warp_size) * sizeof(T); - layer_norm_grad_input_kernel<<>>( - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - input_grad_ptr, - N); + m->mean_ptr, + m->rstd_ptr, + m->ds_ptr, + m->db_ptr, + m->scale_ptr, + m->bias_ptr); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly @@ -506,8 +386,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), + m->mean_ptr, + m->rstd_ptr, gamma_grad_ptr, beta_grad_ptr); } else { @@ -516,15 +396,14 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, constexpr int kThreadX = kColwiseReduceTileSize; constexpr int kThreadY = kColwiseReduceTileSize / 2; GammaBetaBackwardCUDAKernel - <<>>( - M, - N, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_grad_ptr, - beta_grad_ptr); + <<>>(M, + N, + output_grad_ptr, + input_ptr, + m->mean_ptr, + m->rstd_ptr, + gamma_grad_ptr, + beta_grad_ptr); } } } @@ -540,18 +419,21 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, T *beta_grad_ptr) { cudaStream_t stream; 
checkCUDA(get_legion_stream(&stream)); - if (m->output_type[0] == DT_FLOAT) { - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); - } + LayerNorm::backward_kernel(m, + output_grad_ptr, + input_ptr, + input_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr, + stream); } +template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, + float const *in_ptr, + float *out_ptr, + float *gamma_ptr, + float *beta_ptr); template void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, float const *output_grad_ptr, @@ -561,4 +443,4 @@ template void float *gamma_grad_ptr, float *beta_grad_ptr); -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index ab65db542e..029b20afd1 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -52,10 +52,7 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(const Tensor _input, - int dim, - bool last_layer, - char const *name) { +Tensor FFModel::softmax(const Tensor _input, int dim, char const *name) { Layer *sm = new Layer(this, OP_SOFTMAX, DT_FLOAT, @@ -72,8 +69,6 @@ Tensor FFModel::softmax(const Tensor _input, sm->outputs[0] = create_tensor_legion_ordering( numdims, dims, DT_FLOAT, sm, 0, true /*create_grad*/); sm->add_int_property("softmax_dim", dim); - - sm->add_int_property("last_layer", last_layer); layers.push_back(sm); return sm->outputs[0]; } @@ -85,19 +80,15 @@ Op *Softmax::create_operator_from_layer( long long value; layer->get_int_property("softmax_dim", value); int dim = (int)value; - layer->get_int_property("last_layer", value); - bool last_layer = (bool)value; return new Softmax(model, inputs[0], (inputs[0]->num_dims - 1 - dim) % inputs[0]->num_dims, - last_layer, layer->name); } Softmax::Softmax(FFModel &model, const ParallelTensor _input, int _dim, - bool _last_layer, char const *name) : Op(model, OP_SOFTMAX, @@ -107,7 +98,7 @@ Softmax::Softmax(FFModel &model, 0 /*weights*/, 1 /*outputs*/, _input), - dim(_dim), last_layer(_last_layer) { + dim(_dim) { // Currently assume we always perform softmax along the inner most dim assert(dim == 0); ParallelDim dims[MAX_TENSOR_DIM]; @@ -122,7 +113,7 @@ Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, const ParallelTensor input, char const *name) - : Softmax(model, input, params.dim, params.last_layer, name) {} + : Softmax(model, input, params.dim, name) {} void Softmax::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -292,13 +283,6 @@ void Softmax::backward(FFModel const &ff) { EXCLUSIVE, outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); - - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); runtime->execute_index_space(ctx, launcher); } @@ -331,8 +315,8 @@ void Softmax::backward_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); + assert(regions.size() == 2); + assert(task->regions.size() == 2); // const Softmax* softmax = (Softmax*) task->args; SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); TensorAccessorW acc_input_grad(regions[0], @@ -343,16 +327,11 @@ void Softmax::backward_task_with_dim(Task const *task, true /*readOutput*/); TensorAccessorR acc_output_grad( regions[1], 
task->regions[1], FID_DATA, ctx, runtime); - TensorAccessorR acc_output( - regions[2], task->regions[1], FID_DATA, ctx, runtime); // make sure the image indices match! assert(acc_input_grad.rect == acc_output_grad.rect); - backward_kernel_wrapper(m, - acc_input_grad.ptr, - acc_output_grad.ptr, - acc_output.ptr, - acc_input_grad.rect.volume()); + backward_kernel_wrapper( + m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); } bool Softmax::get_int_parameter(PMParameter para, int *value) const { @@ -398,17 +377,11 @@ bool Softmax::measure_operator_cost(Simulator *sim, float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_grad_ptr != NULL); - float *output_ptr = - (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper(m, - input_grad_ptr, - output_grad_ptr, - output_ptr, - sub_output.get_volume()); + backward_kernel_wrapper( + m, input_grad_ptr, output_grad_ptr, sub_output.get_volume()); }; } @@ -440,7 +413,6 @@ size_t hash::operator()( FlexFlow::SoftmaxParams const ¶ms) const { size_t key = 0; hash_combine(key, params.dim); - hash_combine(key, params.last_layer); return key; } }; // namespace std diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index b6004af14a..53e61b90d9 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -215,7 +215,7 @@ __host__ void int idx = 0; printf("%s", prefix); for (idx = 0; idx < num_elements; idx++) { - printf(" %.10lf", (float)host_ptr[idx]); + printf(" %.4lf", (float)host_ptr[idx]); if (idx >= 16) { break; } @@ -224,76 +224,6 @@ __host__ void checkCUDA(cudaFreeHost(host_ptr)); } -template -__host__ void - save_tensor(T const *ptr, size_t num_elements, char const *file_name) { - T *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(T) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpy( - host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost)); - FILE *tensor_file; - tensor_file = fopen(file_name, "w"); - for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.8f, ", (float)host_ptr[i]); - } - - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); -} - -cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( - cudnnTensorDescriptor_t tensor, Domain domain, DataType data_type) { - int dims[MAX_TENSOR_DIM]; - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(data_type); - switch (domain.get_dim()) { - case 1: { - Rect<1> rect = domain; - dims[0] = rect.hi[0] - rect.lo[0] + 1; - return cudnnSetTensor4dDescriptor( - tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[0], 1, 1, 1); - } - case 2: { - Rect<2> rect = domain; - dims[0] = rect.hi[0] - rect.lo[0] + 1; - dims[1] = rect.hi[1] - rect.lo[1] + 1; - return cudnnSetTensor4dDescriptor( - tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[1], dims[0], 1, 1); - } - case 3: { - Rect<3> rect = domain; - dims[0] = rect.hi[0] - rect.lo[0] + 1; - dims[1] = rect.hi[1] - rect.lo[1] + 1; - dims[2] = rect.hi[2] - rect.lo[2] + 1; - return cudnnSetTensor4dDescriptor(tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - dims[2] * dims[1], - dims[0], - 1, - 1); - } - case 4: { - Rect<4> rect = domain; - dims[0] = rect.hi[0] - rect.lo[0] + 1; - dims[1] = rect.hi[1] - rect.lo[1] + 1; - dims[2] = rect.hi[2] - rect.lo[2] + 1; - dims[3] = rect.hi[3] - rect.lo[3] + 1; - return cudnnSetTensor4dDescriptor(tensor, - CUDNN_TENSOR_NCHW, 
- cudnn_data_type, - dims[3] * dims[2] * dims[1], - dims[0], - 1, - 1); - } - default: - assert(false && "Unsupported dim number"); - } - return CUDNN_STATUS_BAD_PARAM; -} - cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Domain domain) { int dims[MAX_TENSOR_DIM]; @@ -440,8 +370,3 @@ template __host__ void print_tensor(int32_t const *ptr, size_t rect, char const *prefix); template __host__ void print_tensor(int64_t const *ptr, size_t rect, char const *prefix); -template __host__ void - save_tensor(float const *ptr, size_t rect, char const *file_name); -template __host__ void save_tensor(int32_t const *ptr, - size_t rect, - char const *file_name); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 5310588477..4b55a39104 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1718,7 +1718,6 @@ GraphOptimalViewSerialized case OP_SOFTMAX: { Softmax *softmax = (Softmax *)op; sez.serialize(softmax->dim); - sez.serialize(softmax->last_layer); break; } case OP_REPARTITION: { @@ -2099,11 +2098,8 @@ void FFModel::deserialize_graph_optimal_view( case OP_SOFTMAX: { assert(num_inputs == 1); int softmax_dim; - bool last_layer; dez.deserialize(softmax_dim); - dez.deserialize(last_layer); - node = - get_or_create_node(inputs[0], {softmax_dim, last_layer}); + node = get_or_create_node(inputs[0], {softmax_dim}); break; } case OP_TRANSPOSE: { diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 6d52e135cd..2925eb7555 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -1930,8 +1930,8 @@ void GraphSearchHelper::graph_optimize( } } best_graph->print_strategy_computation_graph(optimal.views); - // std::cout << "PCG:" << std::endl; - // best_graph->print_dot(); + //std::cout << "PCG:" << std::endl; + //best_graph->print_dot(); optimal_views = real_optimal_views; } @@ -3120,7 +3120,7 @@ void FFModel::graph_optimize( std::unordered_map &optimal_views) { this->graph_search->graph_optimize( budget, only_data_parallel, best_graph, optimal_views); - best_graph->print_dot(); + best_graph->print_dot(); } bool FFModel::convert_graph_to_operators( @@ -3221,8 +3221,7 @@ bool FFModel::convert_graph_to_operators( case OP_SOFTMAX: { assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; - new_op = new Softmax( - *this, inputs[0], softmax->dim, softmax->last_layer, NULL); + new_op = new Softmax(*this, inputs[0], softmax->dim, NULL); break; } case OP_COMBINE: { From 2025d565e8bdcc9a03b8a0e84d64e7b69a77cb1d Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Wed, 5 Jul 2023 13:40:54 -0400 Subject: [PATCH 37/72] fix bert training issue. 
(#832) --- examples/python/pytorch/mt5/mt5_ff.py | 26 +- include/flexflow/model.h | 6 +- .../flexflow/ops/kernels/softmax_kernels.h | 6 +- include/flexflow/ops/layer_norm.h | 11 +- include/flexflow/ops/softmax.h | 2 + include/flexflow/ops/softmax_params.h | 1 + include/flexflow/parallel_ops/replicate.h | 17 +- include/flexflow/utils/cuda_helper.h | 6 + python/flexflow/core/flexflow_cffi.py | 23 +- python/flexflow/torch/model.py | 51 ++-- python/flexflow_c.cc | 7 +- python/flexflow_c.h | 1 + src/loss_functions/loss_functions.cc | 11 +- src/loss_functions/loss_functions.cu | 59 ++-- src/metrics_functions/metrics_functions.cc | 6 +- src/metrics_functions/metrics_functions.cu | 18 +- src/ops/element_unary.cu | 5 +- src/ops/kernels/softmax.cu | 37 ++- src/ops/layer_norm.cc | 254 +++++++++++++++--- src/ops/layer_norm.cu | 248 ++++++++++++----- src/ops/softmax.cc | 46 +++- src/runtime/cuda_helper.cu | 77 +++++- src/runtime/graph.cc | 6 +- src/runtime/substitution.cc | 9 +- 24 files changed, 729 insertions(+), 204 deletions(-) diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index 08af8d88a7..c2868e9d1e 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -64,8 +64,8 @@ def preprocess_train() -> None: y_shape = y.shape assert len(y.shape) == 2, \ "`y` should have shape (num examples, sequence length)" - y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) - lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) + y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) + lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) y_ids[:, :] = y[:, :-1] lm_labels[:, :] = y[:, 1:] @@ -89,26 +89,29 @@ def top_level_task(): #model = BertModel.from_pretrained("bert-base-uncased") # Load train data as numpy arrays print("Loading data...") - ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")) + ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")).astype('int32') ids = np.pad(ids, ((0,0), (0,17)), 'constant') #ids = np.random.randint(0, 5, (1000, 512)) #print('ids_shape', ids.shape) #print('ids', ids) - mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")) + mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")).astype('int32') mask = np.pad(mask, ((0,0), (0,17)), 'constant') #mask = np.random.randint(0, 2, (1000, 512)) #y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) - lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")) + lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")).astype('int32') lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant') #lm_labels = np.random.randint(-1, 5, (1000, 512)) + position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() + token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() + batch_size = ffconfig.batch_size input_ids_shape = (batch_size, ids.shape[1]) attention_mask_shape = (batch_size, mask.shape[1]) #decoder_input_ids_shape = (batch_size, y_ids.shape[1]) input_tensors = [ - ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids - ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask + ffmodel.create_tensor(input_ids_shape, DataType.DT_INT32), # input_ids + ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT32), # attention_mask #ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids ] encoder_seq_length = ids.shape[1] @@ 
-126,7 +129,7 @@ def top_level_task(): output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True) #from flexflow.torch.model import file_to_ff #file_to_ff("mt5.ff", ffmodel, input_tensors) - ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, adam_beta1=0.9, adam_beta2=0.98, weight_decay=0.0, adam_epsilon=2e-8) + ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, beta1=0.9, beta2=0.98, weight_decay=0.0, epsilon=2e-8) # ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) print("Compiling the model...") @@ -138,6 +141,9 @@ def top_level_task(): MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, ], ) + + # load weights here + ffmodel.load_bert_pretrained(checkpoint=model) print("Creating data loaders...") print('id_dtype', ids.dtype) @@ -148,6 +154,8 @@ def top_level_task(): #decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) # NOTE: We cast down the label tensor data to 32-bit to accommodate the # label tensor's required dtype + token_type_ids_dl = ffmodel.create_data_loader(input_tensors[2], token_type_ids) + position_id_dl = ffmodel.create_data_loader(input_tensors[3], position_id) labels_dl = ffmodel.create_data_loader( ffmodel.label_tensor, lm_labels.astype("int32") ) @@ -159,7 +167,7 @@ def top_level_task(): epochs = ffconfig.epochs ffmodel.fit( #x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], - x=[input_ids_dl, attention_mask_dl], + x=[input_ids_dl, attention_mask_dl, position_id_dl, token_type_ids_dl], y=labels_dl, batch_size=batch_size, epochs=epochs, ) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index c6bc6929ad..0496d5fa8f 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -472,6 +472,7 @@ class FFModel { std::vector const &axes, bool elementwise_affine, float eps, + DataType data_type = DT_NONE, char const *name = NULL); // Add a batch_norm layer Tensor @@ -518,7 +519,10 @@ class FFModel { // Add a flat layer Tensor flat(const Tensor input, char const *name = NULL); // Add a softmax layer - Tensor softmax(const Tensor input, int dim = -1, char const *name = NULL); + Tensor softmax(const Tensor input, + int dim = -1, + bool last_layer = false, + char const *name = NULL); // Create input tensors and constants Tensor transpose(const Tensor input, std::vector const &perm, diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 81b34d8558..9aec9f57c9 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -20,6 +20,7 @@ class SoftmaxMeta : public OpMeta { #endif bool profiling; int dim; + bool last_layer; char op_name[MAX_OPNAME]; }; @@ -33,6 +34,7 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements); namespace Internal { @@ -40,8 +42,10 @@ void forward_kernel(SoftmaxMeta const *m, float const *input_ptr, float *output_ptr, ffStream_t stream); -void backward_kernel(float *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements, ffStream_t stream); } // namespace Internal diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 8273b9ab52..552b9cf365 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -66,12 +66,11 @@ class LayerNorm : public Op { T *gamma_ptr, T 
*beta_ptr, ffStream_t stream); - template static void forward_kernel_wrapper(LayerNormMeta const *m, - T const *input_ptr, - T *output_ptr, - T *gamma_ptr, - T *beta_ptr); + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorW &gamma, + GenericTensorAccessorW &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, @@ -105,7 +104,7 @@ class LayerNormMeta : public OpMeta { bool elementwise_affine; int64_t effective_batch_size, effective_num_elements; float eps; - float *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; + void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; char op_name[MAX_OPNAME]; }; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 25a20315bd..2616294a3a 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -15,6 +15,7 @@ class Softmax : public Op { Softmax(FFModel &model, const ParallelTensor logit, int dim, + bool _last_layer, char const *name); Softmax(FFModel &model, Params const ¶ms, @@ -64,6 +65,7 @@ class Softmax : public Op { public: int dim; + bool last_layer; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/softmax_params.h b/include/flexflow/ops/softmax_params.h index d805d9966d..545e3a5cb9 100644 --- a/include/flexflow/ops/softmax_params.h +++ b/include/flexflow/ops/softmax_params.h @@ -7,6 +7,7 @@ namespace FlexFlow { struct SoftmaxParams { int dim; + bool last_layer; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(SoftmaxParams const &, SoftmaxParams const &); diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index 9514d055f3..ac41a6437e 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -32,9 +32,9 @@ class Replicate : public ParallelOp { bool append_parallel_op_info( std::vector ¶llel_ops) const override; static void init_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -46,16 +46,17 @@ class Replicate : public ParallelOp { template static void - forward_task_with_type(Legion::Task const *task, + forward_task_with_type(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); template - static void backward_task_with_type(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); + static void backward_task_with_type( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 46e323b186..a4b2be0a66 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -132,9 +132,15 @@ __host__ void updateGAS(float *para_ptr, template void print_tensor(T const *ptr, size_t num_elements, char const *prefix); +template +void save_tensor(T const *ptr, size_t num_elements, char const *file_name); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain); +cudnnStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(cudnnTensorDescriptor_t tensor, + Legion::Domain 
domain, + DataType data_type = DT_FLOAT); cudaDataType_t ff_to_cuda_datatype(DataType type); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 42339d781c..4c01057109 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1595,7 +1595,7 @@ def flat(self, input, name=None): self.add_layer(OpType.FLAT, name) return Tensor(handle, owner_op_type=OpType.FLAT) - def softmax(self, input, axis=-1, name=None): + def softmax(self, input, axis=-1, last_layer=False, name=None): """Softmax activation function. :param input: the input Tensor. @@ -1607,7 +1607,7 @@ def softmax(self, input, axis=-1, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, c_name) + handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, last_layer, c_name) self.add_layer(OpType.SOFTMAX, name) return Tensor(handle, owner_op_type=OpType.SOFTMAX) @@ -2041,6 +2041,25 @@ def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): ff_tensor.set_tensor(self, np_tensor) print("Compiled ffmodel!") + def load_bert_pretrained(self, checkpoint=None): + # store weights in dict + weights_dict = {} + for name, params in checkpoint.named_parameters(): + weights_dict[name.replace("LayerNorm", "layer_norm").replace(".", "_")] = params.detach().cpu().numpy() + print(name.replace("LayerNorm", "layer_norm").replace(".", "_")) + # some weights not in params + weights_dict['cls_predictions_decoder_weight'] = checkpoint.cls.predictions.decoder.weight.detach().cpu().numpy() + weights_dict['cls_predictions_decoder_bias'] = checkpoint.cls.predictions.decoder.bias.detach().cpu().numpy() + for i in range (self._nb_layers): + layer = self._layers[i] + if (layer.name + "_weight") in weights_dict: + print('weight: ' + layer.name) + weight = layer.get_parameter_by_id(0); + weight.set_tensor(self, weights_dict[layer.name + "_weight"]) + if (layer.name + "_bias") in weights_dict: + print('bias: ' + layer.name) + bias = layer.get_parameter_by_id(1); + bias.set_tensor(self, weights_dict[layer.name + "_bias"]) def fit(self, x=None, y=None, batch_size=None, epochs=1): """Trains the model for a fixed number of epochs (iterations on a dataset). 
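Note on the new load_bert_pretrained helper above: it copies a HuggingFace BERT checkpoint into the traced FlexFlow layers by name, after replacing "LayerNorm" with "layer_norm" and "." with "_", and it separately copies cls.predictions.decoder.weight/.bias, which named_parameters() does not return. A condensed usage sketch, mirroring the calls this patch makes in mt5_ff.py (variable names are the illustrative ones used there; the PyTorchModel.torch_to_ff tracing step is elided):

    from flexflow.core import *
    from transformers import BertForMaskedLM

    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    model = BertForMaskedLM.from_pretrained("bert-base-uncased")
    # ... trace `model` into `ffmodel` via PyTorchModel(...).torch_to_ff(...) ...
    ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, beta1=0.9, beta2=0.98,
                                weight_decay=0.0, epsilon=2e-8)
    ffmodel.compile(
        optimizer=ffoptimizer,
        loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
        metrics=[MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY],
    )
    # Copy the pretrained HuggingFace weights into FlexFlow layers whose
    # names match (weight -> parameter 0, bias -> parameter 1).
    ffmodel.load_bert_pretrained(checkpoint=model)

After this, ffmodel.fit(...) proceeds as shown in the mt5_ff.py hunk earlier in this patch.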
diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 11e0c16e48..8ebac2146c 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -664,12 +664,13 @@ def string_to_ff(string, ffmodel, node_to_output): def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] - axes = [len(input_tensor.dims) - 1] + axes = [0] + eps = self.module.eps return ffmodel.layer_norm( input=input_tensor, axes=axes, elementwise_affine=True, - eps=1e-6, + eps=eps, name=self.name, ) @@ -1197,16 +1198,24 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) - return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, name=self.name, - ) - + if self.scalar_pos == FunctionNode.ScalarPosition.RIGHT: + return ffmodel.scalar_sub( + input=input_tensor, scalar=scalar, inplace=False, name=self.name, + ) + else: + negative_input = ffmodel.scalar_multiply( + input=input_tensor, scalar=-1, inplace=False, name=self.name + '_negative', + ) + return ffmodel.scalar_sub( + input=negative_input, scalar=-scalar, inplace=False, name=self.name, + ) + class ScalarTrueDivNode(FunctionNode): def __init__(self, node): @@ -1231,15 +1240,16 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] scalar = self.innodes[1] assert type(scalar) is float + return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, name=self.name, + input=input_tensor, scalar=scalar, inplace=False, name=self.name, ) @@ -1652,14 +1662,14 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, name=self.name, + input=input_tensor, scalar=scalar, inplace=False, name=self.name, ) @@ -2359,11 +2369,13 @@ def string_to_ff(string, ffmodel, node_to_output): "since attributes require access to the PyTorch model" ) - def to_ff(self, ffmodel, node_to_output): - return self.attr_to_ff_tensor(ffmodel) + def to_ff(self, ffmodel, node_to_output, input_tensors): + return self.attr_to_ff_tensor(ffmodel, input_tensors) + + def attr_to_ff_tensor(self, ffmodel, input_tensors): + - def attr_to_ff_tensor(self, ffmodel): - torch_tensor = self.attr + torch_tensor = self.attr assert (torch_tensor.shape[0] == 1) batch_size = ffmodel._ffconfig.batch_size torch_tensor = np.repeat(torch_tensor, batch_size, axis=0) @@ -2382,15 +2394,16 @@ def attr_to_ff_tensor(self, ffmodel): np_tensor = np_tensor.astype(np.float32) print('attr: ', torch_tensor.shape) - assert (torch_tensor.shape[0] == batch_size) + assert (torch_tensor.shape[0] == batch_size) 
ff_tensor = ffmodel.create_tensor( - torch_tensor.shape, ff_dtype, requires_grad, + torch_tensor.shape, ff_dtype, True, ) # delay set_tensor, add to ffmodel ffmodel.attr_tensors[ff_tensor] = np_tensor # ff_tensor.set_tensor( # ffmodel, np_tensor # ) + input_tensors.append(ff_tensor) return ff_tensor @@ -2472,7 +2485,7 @@ def to_ff(self, ffmodel, node_to_output, output_tensors): # `CrossEntropyLoss()` implementation logits = node_to_output[other["logits"].name] softmax_logits = ffmodel.softmax( - input=logits, name=self.name, + input=logits, last_layer=True, name=self.name, ) output_tensors[:] += [softmax_logits] else: @@ -2606,6 +2619,8 @@ def torch_to_ff(self, ffmodel, input_tensors, verbose=False): elif isinstance(node, OutputNode): node.to_ff(ffmodel, node_to_output, output_tensors) node_output = None + elif isinstance(node, AttributeNode): + node_output = node.to_ff(ffmodel, node_to_output, input_tensors) else: node_output = node.to_ff(ffmodel, node_to_output) diff --git a/python/flexflow_c.cc b/python/flexflow_c.cc index 74a5da6ce1..fd688c0c6a 100644 --- a/python/flexflow_c.cc +++ b/python/flexflow_c.cc @@ -568,8 +568,8 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, for (int i = 0; i < n; i++) { axes_vec.push_back(axes[i]); } - Tensor tensor = - handle->layer_norm(input, axes_vec, elementwise_affine, eps, name); + Tensor tensor = handle->layer_norm( + input, axes_vec, elementwise_affine, eps, input->data_type, name); DEBUG_PRINT("[LayerNorm] new Tensor %p, input %p, elementwise_affine %d, eps " "%f, name %s", tensor, @@ -730,10 +730,11 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_, const flexflow_tensor_t input_, int dim, + bool last_layer, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor tensor = handle->softmax(input, dim, name); + Tensor tensor = handle->softmax(input, dim, last_layer, name); DEBUG_PRINT( "[Softmax] new Tensor %p, input %p, name %s", tensor, input, name); return FFCObjectWrapper::wrap(tensor); diff --git a/python/flexflow_c.h b/python/flexflow_c.h index fb64c78fd2..5409002e5e 100644 --- a/python/flexflow_c.h +++ b/python/flexflow_c.h @@ -276,6 +276,7 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle, const flexflow_tensor_t input, int dim, + bool last_layer, char const *name); flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle, diff --git a/src/loss_functions/loss_functions.cc b/src/loss_functions/loss_functions.cc index ae89c3d469..d887ee9243 100644 --- a/src/loss_functions/loss_functions.cc +++ b/src/loss_functions/loss_functions.cc @@ -49,6 +49,8 @@ void Loss::backward(FFModel *model, if (loss_type == LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE) { assert(logit->get_volume() == label->get_volume()); scale_factor = 2.0f / logit->get_volume(); + } else if (loss_type == LOSS_SPARSE_CATEGORICAL_CROSSENTROPY) { + scale_factor = 1.0f; } else { scale_factor = 1.0f / model->config.batchSize; } @@ -131,9 +133,12 @@ void Loss::backward_task_with_dim(Task const *task, regions[2], task->regions[2], FID_DATA, ctx, runtime); // assertion the outter-most dim is replica dim and replica degree is 1 assert(acc_logit.rect.hi[NDIM - 1] == acc_logit.rect.lo[NDIM - 1]); - int num_samples = - acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; - int num_classes = 
acc_logit.rect.volume() / num_samples; + + int num_classes = acc_logit.rect.hi[0] - acc_logit.rect.lo[0] + 1; + int num_samples = acc_logit.rect.volume() / num_classes; + // int num_samples = + // acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; + // int num_classes = acc_logit.rect.volume() / num_samples; assert(acc_logit_grad.rect == acc_logit.rect); int k = 1; if (loss->repl_labels) { diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index 01766347b0..edd8f03fa4 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -18,6 +18,7 @@ namespace FlexFlow { +int const MASK_TOKEN = -100; using namespace Legion; __global__ void @@ -32,6 +33,25 @@ __global__ void } } +__global__ void + sparse_categorical_crossentropy_loss_backward_with_mask(float *logit_grad, + int const *label, + coord_t num_samples, + coord_t num_classes, + int const k, + float *num) { + CUDA_KERNEL_LOOP(i, num_samples * num_classes) { + int sample_id = i / num_classes; + int label_idx = label[i / (k * num_classes)]; + if (label_idx != MASK_TOKEN && (i == sample_id * num_classes + label_idx)) { + logit_grad[i] -= 1.0f; + atomicAdd(&num[0], 1.0f); + } else if (label_idx == MASK_TOKEN) { + logit_grad[i] = 0.0f; + } + } +} + __global__ void categorical_crossentropy_loss_backward(float *logit_grad, float const *logit, float const *label, @@ -74,14 +94,25 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_ptr, logit_volume * sizeof(float), cudaMemcpyDeviceToDevice)); - sparse_categorical_crossentropy_loss_backward<<>>( - logit_grad_ptr, label_ptr, num_samples, num_classes, k); - // Scale logit gradients by op->scale_factor + // calculate the scale factor inside kernel; + assert(scale_factor == 1.0f); + float *num; + checkCUDA(cudaMalloc(&num, sizeof(float))); + float effective_tokens; + int parallelism = num_samples * num_classes; + // sparse_categorical_crossentropy_loss_backward<<>>( + // logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); + sparse_categorical_crossentropy_loss_backward_with_mask<<< + GET_BLOCKS(parallelism), + CUDA_NUM_THREADS, + 0, + stream>>>(logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); + cudaMemcpy(&effective_tokens, num, sizeof(float), cudaMemcpyDeviceToHost); scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); + logit_grad_ptr, logit_grad_volume, 0, 1.0f / effective_tokens); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( @@ -122,19 +153,17 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( logit_grad_ptr, logit_grad_volume, 0, scale_factor); } -void Loss::identity_loss_backward_kernel_wrapper( - float *loss_grad_ptr, - float const *loss_ptr, - size_t loss_volume, - size_t loss_grad_volume, - float scale_factor) { +void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, + float const *loss_ptr, + size_t loss_volume, + size_t loss_grad_volume, + float scale_factor) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); identity_loss_backward<<>>( - loss_grad_ptr, loss_ptr, loss_volume); + stream>>>(loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( loss_grad_ptr, loss_grad_volume, 0, scale_factor); diff --git a/src/metrics_functions/metrics_functions.cc b/src/metrics_functions/metrics_functions.cc index 7244b06925..8c7e23ad8a 100644 --- a/src/metrics_functions/metrics_functions.cc +++ 
b/src/metrics_functions/metrics_functions.cc @@ -91,8 +91,8 @@ void Metrics::compute(FFModel *model, false /*must*/, 0 /*mapper_id*/, logit->machine_view.hash()); - std::cout << "logit shape: " << logit->get_shape() << std::endl; - std::cout << "label shape: " << label->get_shape() << std::endl; + // std::cout << "logit shape: " << logit->get_shape() << std::endl; + // std::cout << "label shape: " << label->get_shape() << std::endl; launcher.add_region_requirement(RegionRequirement( logit->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, logit->region)); launcher.add_field(0, FID_DATA); @@ -157,7 +157,7 @@ PerfMetrics assert(acc_label.rect.lo[0] == acc_label.rect.hi[0]); // Cannot measure categorical_crossentropy w/ sparse labels // Use measure_sparse_categorical_crossentropy instead - std::cout << "num_classes: " << num_classes << std::endl; + // std::cout << "num_classes: " << num_classes << std::endl; assert(!me->measure_categorical_crossentropy); Metrics::update_metrics_sparse_label_kernel_wrapper(acc_logit.ptr, acc_label.ptr, diff --git a/src/metrics_functions/metrics_functions.cu b/src/metrics_functions/metrics_functions.cu index b68b10d873..8c584c397c 100644 --- a/src/metrics_functions/metrics_functions.cu +++ b/src/metrics_functions/metrics_functions.cu @@ -19,6 +19,7 @@ namespace FlexFlow { float const LOG_MIN_VALUE = 0.00000001f; +int const MASK_TOKEN = -100; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, @@ -29,7 +30,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = 0; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { @@ -38,14 +39,19 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, } } assert(my_label >= 0); - atomicAdd(&(perf->train_all), 1); - if (labels[b] == my_label) { - atomicAdd(&(perf->train_correct), 1); + if (labels[b] != MASK_TOKEN) { + atomicAdd(&(perf->train_all), 1); + if (labels[b] == my_label) { + atomicAdd(&(perf->train_correct), 1); + } } } if (metrics.measure_sparse_categorical_crossentropy) { - float my_logit = max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); - atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + if (labels[b] != MASK_TOKEN) { + float my_logit = + max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + } } if (metrics.measure_mean_squared_error || metrics.measure_root_mean_squared_error || diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index d6e5bcfdc3..187e60282f 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -202,8 +202,9 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * erfc(-input[i] * M_SQRT1_2) - - 0.5 * M_SQRT1_2 * input[i] * exp(-input[i] * input[i] * 0.5))); + (0.5 * erfc(-input[i] * M_SQRT1_2) + + 0.5 * M_SQRT1_2 * input[i] * + ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5f)))); break; } case OP_RSQRT: { diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index d83d9952c9..e163c9a0c7 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -26,8 +26,10 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, Domain const &input_domain) : OpMeta(handler) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); - 
checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + inputTensor, input_domain, softmax->data_type)); dim = softmax->dim; + last_layer = softmax->last_layer; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); } @@ -66,6 +68,7 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -77,7 +80,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, cudaEventRecord(t_start, stream); } Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + m, input_grad_ptr, output_grad_ptr, output_ptr, num_elements, stream); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -113,15 +116,33 @@ void forward_kernel(SoftmaxMeta const *m, output_ptr)); } -void backward_kernel(float *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements, cudaStream_t stream) { - checkCUDA(cudaMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(float), - cudaMemcpyDeviceToDevice, - stream)); + + if (m->last_layer) { + checkCUDA(cudaMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); + } else { + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->inputTensor, + output_ptr, + m->inputTensor, + output_grad_ptr, + &beta, + m->inputTensor, + input_grad_ptr)); + } } } // namespace Internal diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index e8c65b4b03..915041d2bb 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -61,10 +61,27 @@ Tensor FFModel::layer_norm(const Tensor input, std::vector const &axes, bool elementwise_affine, float eps, + DataType data_type, char const *name) { - // FIXME: currently disable elementwise_affine - elementwise_affine = false; - // axes must be the last axes.size() dimensions + // In PyTorch, axes must be the sizes of the last axes.size() dimensions of + // the input tensor. However, since the tensor dimensions are reversed in + // FlexFlow (batch size is the last dimension), we require that axes must be + // the sizes of the FIRST axes.size() dimensions of the input tensor. + + // Another difference is that in PyTorch, the axes vector should contain the + // sizes of the dimensions with respect to which you want to compute the + // layernorm. In FlexFlow, instead, axes should contain the INDICES of the + // dimensions in question. We do this because the size of a dimension might be + // different when splitting a tensor in model parallelism. 
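+  // Illustrative example (editorial sketch): a PyTorch nn.LayerNorm(hidden)
+  // applied to an input of logical shape (batch, seq, hidden) corresponds,
+  // with FlexFlow's reversed dimension order (hidden, seq, batch), to
+  //   layer_norm(input, /*axes=*/{0}, /*elementwise_affine=*/true, eps);
+  // i.e. axes holds the index of the innermost (hidden) dimension rather
+  // than its size.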
+ assert( + axes.size() <= input->num_dims && + "number of axes must be less than tensor dimensions"); // input does not + // have replica + // dimension here + for (int i = 0; i < axes.size(); i++) { + assert(axes[i] == i && "axes must be the first axes.size() dimensions"); + } +#ifdef DEADCODE for (int i = 0; i < axes.size(); i++) { bool found = false; for (int j = 0; j < axes.size(); j++) { @@ -76,15 +93,33 @@ Tensor FFModel::layer_norm(const Tensor input, assert(false && "axes must be the last axes.size() dimensions"); } } +#endif + if (data_type == DT_NONE) { + data_type = input->data_type; + } int num_weights = elementwise_affine ? 2 : 0; - Layer *ln = new Layer(this, - OP_LAYERNORM, - DT_FLOAT, - name, - 1 /*inputs*/, - num_weights, - 1 /*outputs*/, - input); + Layer *ln = nullptr; + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for layer_norm"); + ln = new Layer(this, + OP_LAYERNORM, + data_type, + name, + 1 /*inputs*/, + num_weights, + 1 /*outputs*/, + casted_input); + } else { + ln = new Layer(this, + OP_LAYERNORM, + data_type, + name, + 1 /*inputs*/, + num_weights, + 1 /*outputs*/, + input); + } + ln->outputs[0] = create_tensor_legion_ordering(input->num_dims, input->dims, input->data_type, @@ -92,19 +127,19 @@ Tensor FFModel::layer_norm(const Tensor input, 0, true /*create_grad*/); if (num_weights == 2) { - int M = 1; - for (int i = 0; i < axes.size(); i++) { - M *= input->dims[input->num_dims - 1 - axes[i]]; + int numdims = axes.size(); + int dims[numdims]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[axes[i]]; } - int dims[1] = {M}; - ln->weights[0] = create_weight_legion_ordering(1, + ln->weights[0] = create_weight_legion_ordering(numdims, dims, input->data_type, ln, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); - ln->weights[1] = create_weight_legion_ordering(1, + ln->weights[1] = create_weight_legion_ordering(numdims, dims, input->data_type, ln, @@ -179,19 +214,36 @@ LayerNorm::LayerNorm(FFModel &model, ParallelDim output_dims[MAX_TENSOR_DIM]; int M = 1; for (int i = 0; i < axes.size(); i++) { - M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; + M *= inputs[0]->dims[axes[i]].size; } effective_num_elements = M; - effective_batch_size = inputs[0]->get_shape().get_piece_num_elements() / M; + effective_batch_size = inputs[0]->get_volume() / M; + assert(elementwise_affine == (numWeights == 2)); if (numWeights > 0 && allocate_weights) { - int kernel_dims = 2; - assert(false); - // weights[0] = model.create_parallel_weight_legion_ordering( - // kernel_dims, - } else { - // do nothing + ParallelDim dims[axes.size()]; + for (int i = 0; i < axes.size(); i++) { + dims[i] = inputs[0]->dims[i]; + } + int seed = std::rand(); + Initializer *gamma_initializer = new UniformInitializer(seed, 0.0f, 1.0f); + Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 1.0f); + weights[0] = + model.create_parallel_weight_legion_ordering(axes.size(), + dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + gamma_initializer, + CHOSEN_SYNC_TYPE); + weights[1] = + model.create_parallel_weight_legion_ordering(axes.size(), + dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + beta_initializer, + CHOSEN_SYNC_TYPE); } - return; } void LayerNorm::init(FFModel const &ff) { @@ -221,6 +273,20 @@ void LayerNorm::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(1, FID_DATA); + if (elementwise_affine) { + 
launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -233,6 +299,8 @@ OpMeta *LayerNorm::init_task(Task const *task, LayerNorm *ln = (LayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); LayerNormMeta *meta = new LayerNormMeta(handle, ln); + meta->input_type[0] = ln->inputs[0]->data_type; + meta->output_type[0] = ln->outputs[0]->data_type; return meta; } @@ -292,14 +360,21 @@ void LayerNorm::forward_task(Task const *task, assert(task->regions.size() == regions.size()); float const *in_ptr = NULL; float *out_ptr = NULL, *gamma_ptr = NULL, *beta_ptr = NULL; + GenericTensorAccessorR in; + GenericTensorAccessorW out, gamma, beta; + Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); + // in_ptr = helperGetTensorPointerRO( + // regions[0], task->regions[0], FID_DATA, ctx, runtime); + in = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); + // out_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + out = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(in_domain == out_domain); assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); @@ -307,20 +382,28 @@ void LayerNorm::forward_task(Task const *task, assert(regions.size() == 4); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - gamma_ptr = helperGetTensorPointerRW( - regions[2], task->regions[2], FID_DATA, ctx, runtime); + // gamma_ptr = helperGetTensorPointerRW( + // regions[2], task->regions[2], FID_DATA, ctx, runtime); + gamma = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); Domain beta_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - beta_ptr = helperGetTensorPointerRW( - regions[3], task->regions[3], FID_DATA, ctx, runtime); + // beta_ptr = helperGetTensorPointerRW( + // regions[3], task->regions[3], FID_DATA, ctx, runtime); + beta = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(gamma_domain == beta_domain); assert(gamma_domain.get_volume() == m->effective_num_elements); + int numdims = gamma_domain.get_dim(); + for (int i = 0; i < numdims; i++) { + int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; + int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; + assert(g_d == in_d); + } } else { assert(regions.size() == 2); } - - LayerNorm::forward_kernel_wrapper( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr); + LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); } void LayerNorm::backward(FFModel const &ff) { @@ -447,7 +530,100 
@@ void LayerNorm::backward_task(Task const *task, bool LayerNorm::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { - return false; + ParallelTensorBase sub_output, sub_input; + if (!outputs[0]->get_sub_tensor(mv, sub_output)) { + return false; + } + if (!inputs[0]->get_sub_tensor(mv, sub_input)) { + return false; + } + Domain input_domain = sub_input.get_domain(); + Domain output_domain = sub_output.get_domain(); + LayerNormMeta *m = new LayerNormMeta(sim->handler, this); + + sim->free_all(); + float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + assert(in_ptr != NULL); + GenericTensorAccessorR input1_acc(inputs[0]->data_type, input_domain, in_ptr); + cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + float *out_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + assert(out_ptr != NULL); + GenericTensorAccessorW output_acc( + outputs[0]->data_type, output_domain, out_ptr); + cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + // FIXME please add gamma_ptr and beta_ptr after finish the implementation + float *gamma_ptr = NULL, *beta_ptr = NULL; + GenericTensorAccessorW gamma_acc; + GenericTensorAccessorW beta_acc; + + bool out_of_memory = + (in_ptr == NULL) || (out_ptr == NULL) || + (((gamma_ptr == NULL) || (beta_ptr == NULL)) && (m->elementwise_affine)); + if (out_of_memory) { + cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + return true; + } + + std::function forward, backward; + forward = [&] { + forward_kernel_wrapper(m, input1_acc, output_acc, gamma_acc, beta_acc); + }; + + if (sim->computationMode == COMP_MODE_TRAINING) { + float *in_grad_ptr = + (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + assert(in_grad_ptr != NULL); + cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + float *out_grad_ptr = NULL; + out_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + assert(out_grad_ptr != NULL); + cost_metrics.outputs_memory += + cost_metrics.total_mem_diff_from(sim->offset); + + float *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + + out_of_memory = (in_grad_ptr == NULL) || (out_grad_ptr == NULL) || + (((gamma_grad_ptr == NULL) || (beta_grad_ptr == NULL)) && + (m->elementwise_affine)); + if (out_of_memory) { + cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + return true; + } + + backward = [&] { + backward_kernel_wrapper(m, + out_grad_ptr, + in_ptr, + in_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr); + }; + } + + inner_measure_operator_cost(sim, forward, backward, cost_metrics); + + if (sim->computationMode == COMP_MODE_TRAINING) { + log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) " + "forward_time(%.4lf) backward_time(%.4lf)\n", + name, + sub_output.get_volume(), + cost_metrics.forward_time, + cost_metrics.backward_time); + } else { + log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) " + "forward_time(%.4lf)\n", + name, + sub_output.get_volume(), + cost_metrics.forward_time); + } + + return true; } void LayerNorm::serialize(Legion::Serializer &sez) const { @@ -512,4 +688,4 @@ size_t hash::operator()( hash_combine(key, params.elementwise_affine); return key; } -}; // namespace std +}; // namespace std \ No newline at end of file diff --git a/src/ops/layer_norm.cu 
b/src/ops/layer_norm.cu index ac477ba2ad..f0539f8405 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -13,6 +13,7 @@ * limitations under the License. */ +#include "flexflow/ffconst_utils.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/utils/cuda_helper.h" @@ -30,12 +31,19 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln) effective_num_elements = ln->effective_num_elements; profiling = ln->profiling; eps = ln->eps; - checkCUDA(cudaMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&db_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + DataType data_type = ln->data_type; + checkCUDA( + cudaMalloc(&mean_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&rstd_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&ds_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&db_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&scale_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&bias_ptr, data_type_size(data_type) * effective_batch_size)); } template @@ -77,26 +85,26 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void - RowwiseMomentsCUDAKernel(int64_t N, T eps, T const *X, T *mean, T *rstd) { - __shared__ T m_shared[C10_WARP_SIZE]; - __shared__ T v_shared[C10_WARP_SIZE]; +__global__ void RowwiseMomentsCUDAKernel( + int64_t N, float eps, T const *X, T *mean, T *rstd) { + __shared__ float m_shared[C10_WARP_SIZE]; + __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - T sum1 = 0; - T sum2 = 0; + float sum1 = 0.0f; + float sum2 = 0.0f; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { - const T scale = T(1) / static_cast(N); + float const scale = float(1) / static_cast(N); sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, T(0)); - mean[i] = sum1; - rstd[i] = rsqrt(sum2 + static_cast(eps)); + sum2 = max(sum2 * scale - sum1 * sum1, float(0)); + mean[i] = static_cast(sum1); + rstd[i] = static_cast(rsqrt(sum2 + eps)); } } @@ -130,27 +138,30 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T *gamma_ptr, T *beta_ptr, cudaStream_t stream) { - RowwiseMomentsCUDAKernel + RowwiseMomentsCUDAKernel <<effective_batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( - m->effective_num_elements, m->eps, in_ptr, m->mean_ptr, m->rstd_ptr); - LayerNormForwardCUDAKernel + m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr)); + LayerNormForwardCUDAKernel <<effective_batch_size, kCUDANumThreads, 0, stream>>>( m->effective_num_elements, in_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_ptr, beta_ptr, out_ptr); } /*static*/ 
-template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - T const *in_ptr, - T *out_ptr, - T *gamma_ptr, - T *beta_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorW &gamma, + GenericTensorAccessorW &beta) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -160,8 +171,24 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - LayerNorm::forward_kernel( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream); + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + gamma.get_float_ptr(), + beta.get_float_ptr(), + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel(m, + input.get_half_ptr(), + output.get_half_ptr(), + gamma.get_half_ptr(), + beta.get_half_ptr(), + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -170,8 +197,8 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventDestroy(t_start); cudaEventDestroy(t_end); printf("[LayerNorm] forward time (CF) = %.2fms\n", elapsed); - print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); - print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); } } @@ -352,6 +379,82 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, } } +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] = f_grad_input; + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, N, buf); +} + /*static*/ template void LayerNorm::backward_kernel(LayerNormMeta const *m, @@ -366,17 +469,34 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, const int64_t N = m->effective_num_elements; ComputeInternalGradientsCUDAKernel <<>>( - N, output_grad_ptr, input_ptr, gamma_ptr, m->ds_ptr, m->db_ptr); + N, + output_grad_ptr, + input_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; ComputeGradientFusedParamsCUDAKernel <<>>(M, N, - m->mean_ptr, - m->rstd_ptr, - m->ds_ptr, - m->db_ptr, - m->scale_ptr, - m->bias_ptr); + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly @@ -386,8 +506,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_grad_ptr, beta_grad_ptr); } else { @@ -396,14 +516,15 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, constexpr int kThreadX = kColwiseReduceTileSize; constexpr int kThreadY = kColwiseReduceTileSize / 2; GammaBetaBackwardCUDAKernel - <<>>(M, - N, - output_grad_ptr, - input_ptr, - m->mean_ptr, - m->rstd_ptr, - gamma_grad_ptr, - beta_grad_ptr); + <<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } } } @@ -419,21 +540,18 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, T *beta_grad_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad_ptr, + input_ptr, + input_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr, + stream); + } } -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - float const *in_ptr, - float *out_ptr, - float *gamma_ptr, - float *beta_ptr); template void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, float const *output_grad_ptr, @@ -443,4 +561,4 @@ template void float *gamma_grad_ptr, float *beta_grad_ptr); -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 029b20afd1..ab65db542e 100644 --- a/src/ops/softmax.cc +++ 
b/src/ops/softmax.cc @@ -52,7 +52,10 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(const Tensor _input, int dim, char const *name) { +Tensor FFModel::softmax(const Tensor _input, + int dim, + bool last_layer, + char const *name) { Layer *sm = new Layer(this, OP_SOFTMAX, DT_FLOAT, @@ -69,6 +72,8 @@ Tensor FFModel::softmax(const Tensor _input, int dim, char const *name) { sm->outputs[0] = create_tensor_legion_ordering( numdims, dims, DT_FLOAT, sm, 0, true /*create_grad*/); sm->add_int_property("softmax_dim", dim); + + sm->add_int_property("last_layer", last_layer); layers.push_back(sm); return sm->outputs[0]; } @@ -80,15 +85,19 @@ Op *Softmax::create_operator_from_layer( long long value; layer->get_int_property("softmax_dim", value); int dim = (int)value; + layer->get_int_property("last_layer", value); + bool last_layer = (bool)value; return new Softmax(model, inputs[0], (inputs[0]->num_dims - 1 - dim) % inputs[0]->num_dims, + last_layer, layer->name); } Softmax::Softmax(FFModel &model, const ParallelTensor _input, int _dim, + bool _last_layer, char const *name) : Op(model, OP_SOFTMAX, @@ -98,7 +107,7 @@ Softmax::Softmax(FFModel &model, 0 /*weights*/, 1 /*outputs*/, _input), - dim(_dim) { + dim(_dim), last_layer(_last_layer) { // Currently assume we always perform softmax along the inner most dim assert(dim == 0); ParallelDim dims[MAX_TENSOR_DIM]; @@ -113,7 +122,7 @@ Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, const ParallelTensor input, char const *name) - : Softmax(model, input, params.dim, name) {} + : Softmax(model, input, params.dim, params.last_layer, name) {} void Softmax::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -283,6 +292,13 @@ void Softmax::backward(FFModel const &ff) { EXCLUSIVE, outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); + + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); runtime->execute_index_space(ctx, launcher); } @@ -315,8 +331,8 @@ void Softmax::backward_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); + assert(regions.size() == 3); + assert(task->regions.size() == 3); // const Softmax* softmax = (Softmax*) task->args; SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); TensorAccessorW acc_input_grad(regions[0], @@ -327,11 +343,16 @@ void Softmax::backward_task_with_dim(Task const *task, true /*readOutput*/); TensorAccessorR acc_output_grad( regions[1], task->regions[1], FID_DATA, ctx, runtime); + TensorAccessorR acc_output( + regions[2], task->regions[1], FID_DATA, ctx, runtime); // make sure the image indices match! 
assert(acc_input_grad.rect == acc_output_grad.rect); - backward_kernel_wrapper( - m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); + backward_kernel_wrapper(m, + acc_input_grad.ptr, + acc_output_grad.ptr, + acc_output.ptr, + acc_input_grad.rect.volume()); } bool Softmax::get_int_parameter(PMParameter para, int *value) const { @@ -377,11 +398,17 @@ bool Softmax::measure_operator_cost(Simulator *sim, float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_grad_ptr != NULL); + float *output_ptr = + (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper( - m, input_grad_ptr, output_grad_ptr, sub_output.get_volume()); + backward_kernel_wrapper(m, + input_grad_ptr, + output_grad_ptr, + output_ptr, + sub_output.get_volume()); }; } @@ -413,6 +440,7 @@ size_t hash::operator()( FlexFlow::SoftmaxParams const ¶ms) const { size_t key = 0; hash_combine(key, params.dim); + hash_combine(key, params.last_layer); return key; } }; // namespace std diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 53e61b90d9..b6004af14a 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -215,7 +215,7 @@ __host__ void int idx = 0; printf("%s", prefix); for (idx = 0; idx < num_elements; idx++) { - printf(" %.4lf", (float)host_ptr[idx]); + printf(" %.10lf", (float)host_ptr[idx]); if (idx >= 16) { break; } @@ -224,6 +224,76 @@ __host__ void checkCUDA(cudaFreeHost(host_ptr)); } +template +__host__ void + save_tensor(T const *ptr, size_t num_elements, char const *file_name) { + T *host_ptr; + checkCUDA(cudaHostAlloc(&host_ptr, + sizeof(T) * num_elements, + cudaHostAllocPortable | cudaHostAllocMapped)); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost)); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%.8f, ", (float)host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(cudaFreeHost(host_ptr)); +} + +cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( + cudnnTensorDescriptor_t tensor, Domain domain, DataType data_type) { + int dims[MAX_TENSOR_DIM]; + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(data_type); + switch (domain.get_dim()) { + case 1: { + Rect<1> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + return cudnnSetTensor4dDescriptor( + tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[0], 1, 1, 1); + } + case 2: { + Rect<2> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + return cudnnSetTensor4dDescriptor( + tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[1], dims[0], 1, 1); + } + case 3: { + Rect<3> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + return cudnnSetTensor4dDescriptor(tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + dims[2] * dims[1], + dims[0], + 1, + 1); + } + case 4: { + Rect<4> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return cudnnSetTensor4dDescriptor(tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + dims[3] * dims[2] * dims[1], + dims[0], + 1, + 1); + } + default: + assert(false && "Unsupported dim number"); + } + return 
CUDNN_STATUS_BAD_PARAM; +} + cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Domain domain) { int dims[MAX_TENSOR_DIM]; @@ -370,3 +440,8 @@ template __host__ void print_tensor(int32_t const *ptr, size_t rect, char const *prefix); template __host__ void print_tensor(int64_t const *ptr, size_t rect, char const *prefix); +template __host__ void + save_tensor(float const *ptr, size_t rect, char const *file_name); +template __host__ void save_tensor(int32_t const *ptr, + size_t rect, + char const *file_name); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 4b55a39104..5310588477 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1718,6 +1718,7 @@ GraphOptimalViewSerialized case OP_SOFTMAX: { Softmax *softmax = (Softmax *)op; sez.serialize(softmax->dim); + sez.serialize(softmax->last_layer); break; } case OP_REPARTITION: { @@ -2098,8 +2099,11 @@ void FFModel::deserialize_graph_optimal_view( case OP_SOFTMAX: { assert(num_inputs == 1); int softmax_dim; + bool last_layer; dez.deserialize(softmax_dim); - node = get_or_create_node(inputs[0], {softmax_dim}); + dez.deserialize(last_layer); + node = + get_or_create_node(inputs[0], {softmax_dim, last_layer}); break; } case OP_TRANSPOSE: { diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 2925eb7555..6d52e135cd 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -1930,8 +1930,8 @@ void GraphSearchHelper::graph_optimize( } } best_graph->print_strategy_computation_graph(optimal.views); - //std::cout << "PCG:" << std::endl; - //best_graph->print_dot(); + // std::cout << "PCG:" << std::endl; + // best_graph->print_dot(); optimal_views = real_optimal_views; } @@ -3120,7 +3120,7 @@ void FFModel::graph_optimize( std::unordered_map &optimal_views) { this->graph_search->graph_optimize( budget, only_data_parallel, best_graph, optimal_views); - best_graph->print_dot(); + best_graph->print_dot(); } bool FFModel::convert_graph_to_operators( @@ -3221,7 +3221,8 @@ bool FFModel::convert_graph_to_operators( case OP_SOFTMAX: { assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; - new_op = new Softmax(*this, inputs[0], softmax->dim, NULL); + new_op = new Softmax( + *this, inputs[0], softmax->dim, softmax->last_layer, NULL); break; } case OP_COMBINE: { From 5f793c1ef5dfee3e26932d171550f42c44ece2cb Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Tue, 25 Jul 2023 05:40:45 -0700 Subject: [PATCH 38/72] Improve machine_view hash --- src/runtime/machine_view.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/runtime/machine_view.cc b/src/runtime/machine_view.cc index dadece7691..0e3112da12 100644 --- a/src/runtime/machine_view.cc +++ b/src/runtime/machine_view.cc @@ -1,4 +1,5 @@ #include "flexflow/machine_view.h" +#include "flexflow/utils/hash_utils.h" namespace FlexFlow { @@ -47,13 +48,13 @@ size_t MachineView::num_parts() const { } size_t MachineView::hash() const { - size_t ret = 17; - ret = ret * 31 + std::hash()(device_type); - ret = ret * 31 + std::hash()(ndims); - ret = ret * 31 + std::hash()(start_device_id); + size_t h = 0; + hash_combine(h, device_type); + hash_combine(h, ndims); + hash_combine(start_device_id); for (int i = 0; i < ndims; i++) { - ret = ret * 31 + std::hash()(dim[i]); - ret = ret * 31 + std::hash()(stride[i]); + hash_combine(h, dim[i]); + hash_combine(h, stride[i]); } return ret; } From 2c09397312c005e29eb03d63b9b48c7cda1c6791 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Tue, 25 Jul 
2023 05:43:08 -0700 Subject: [PATCH 39/72] Fix bugs in improved hashing --- src/runtime/machine_view.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/machine_view.cc b/src/runtime/machine_view.cc index 0e3112da12..44dff5a2da 100644 --- a/src/runtime/machine_view.cc +++ b/src/runtime/machine_view.cc @@ -51,12 +51,12 @@ size_t MachineView::hash() const { size_t h = 0; hash_combine(h, device_type); hash_combine(h, ndims); - hash_combine(start_device_id); + hash_combine(h, start_device_id); for (int i = 0; i < ndims; i++) { hash_combine(h, dim[i]); hash_combine(h, stride[i]); } - return ret; + return h; } int MachineView::get_device_id(DomainPoint const &p) const { From 862e9d7cac0a87c3af9adf5c17959458665ed861 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 25 Jul 2023 22:36:15 +0000 Subject: [PATCH 40/72] fix weight dimension in layernorm --- src/ops/layer_norm.cc | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 915041d2bb..062eb28955 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -220,15 +220,24 @@ LayerNorm::LayerNorm(FFModel &model, effective_batch_size = inputs[0]->get_volume() / M; assert(elementwise_affine == (numWeights == 2)); if (numWeights > 0 && allocate_weights) { - ParallelDim dims[axes.size()]; - for (int i = 0; i < axes.size(); i++) { + ParallelDim dims[axes.size() + 1]; + int num_dims = axes.size(); + for (int i = 0; i < num_dims; i++) { dims[i] = inputs[0]->dims[i]; } + assert(numInputs == 1); + dims[num_dims].degree = inputs[0]->dims[inputs[0]->num_dims - 2].degree; + dims[num_dims].size = dims[num_dims].degree; + dims[num_dims].parallel_idx = + inputs[0]->dims[inputs[0]->num_dims - 2].parallel_idx; + dims[num_dims].is_replica_dim = true; + num_dims += 1; + int seed = std::rand(); Initializer *gamma_initializer = new UniformInitializer(seed, 0.0f, 1.0f); Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 1.0f); weights[0] = - model.create_parallel_weight_legion_ordering(axes.size(), + model.create_parallel_weight_legion_ordering(num_dims, dims, _input->data_type, NULL /*owner_op*/, @@ -236,7 +245,7 @@ LayerNorm::LayerNorm(FFModel &model, gamma_initializer, CHOSEN_SYNC_TYPE); weights[1] = - model.create_parallel_weight_legion_ordering(axes.size(), + model.create_parallel_weight_legion_ordering(num_dims, dims, _input->data_type, NULL /*owner_op*/, @@ -376,8 +385,9 @@ void LayerNorm::forward_task(Task const *task, out = helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(in_domain == out_domain); - assert(in_domain.get_volume() == - m->effective_num_elements * m->effective_batch_size); + // assert(in_domain.get_volume() == + // m->effective_num_elements * m->effective_batch_size); + if (m->elementwise_affine) { assert(regions.size() == 4); Domain gamma_domain = runtime->get_index_space_domain( @@ -394,7 +404,7 @@ void LayerNorm::forward_task(Task const *task, m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(gamma_domain == beta_domain); assert(gamma_domain.get_volume() == m->effective_num_elements); - int numdims = gamma_domain.get_dim(); + int numdims = gamma_domain.get_dim() - 1; for (int i = 0; i < numdims; i++) { int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; @@ -495,8 +505,8 @@ void LayerNorm::backward_task(Task const *task, in_grad_ptr = 
helperGetTensorPointerRW( regions[2], task->regions[2], FID_DATA, ctx, runtime); assert(in_domain == out_grad_domain); - assert(in_domain.get_volume() == - m->effective_num_elements * m->effective_batch_size); + // assert(in_domain.get_volume() == + // m->effective_num_elements * m->effective_batch_size); if (m->elementwise_affine) { assert(regions.size() == 6); Domain gamma_domain = runtime->get_index_space_domain( From 2eee87525591144db8d96817dbe8d300f89719cf Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 11 Aug 2023 19:21:22 +0000 Subject: [PATCH 41/72] fix `preregister_task_variant` issue, linting --- include/flexflow/graph.h | 2 +- src/ops/element_binary.cc | 2 +- src/ops/element_unary.cc | 2 +- src/ops/fused.cu | 6 ++- src/ops/linear.cc | 10 ++-- .../kernels/replicate_kernels.cpp | 48 +++++++++---------- src/parallel_ops/kernels/replicate_kernels.cu | 48 +++++++++---------- src/parallel_ops/replicate.cc | 35 +++++++------- src/runtime/model.cc | 18 +++++-- src/runtime/parallel_tensor.cc | 1 - 10 files changed, 91 insertions(+), 81 deletions(-) diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h index 5a0754a426..c4fed8ff58 100644 --- a/include/flexflow/graph.h +++ b/include/flexflow/graph.h @@ -91,7 +91,7 @@ struct NodeCompare { struct GraphOptimalViewSerialized { #ifdef LEGION_MAX_RETURN_SIZE - static const size_t buffer_size = 4*LEGION_MAX_RETURN_SIZE - 8; + static const size_t buffer_size = 4 * LEGION_MAX_RETURN_SIZE - 8; #else static const size_t buffer_size = 1024 * 1024 - 8; #endif diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 6adea1480c..0e0d8e7c31 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -448,7 +448,7 @@ __host__ void std::vector const ®ions, Context ctx, Runtime *runtime) { - const ElementBinary* ele = (const ElementBinary*) task->args; + ElementBinary const *ele = (ElementBinary const *)task->args; ElementBinaryMeta const *m = *((ElementBinaryMeta **)task->local_args); Domain in1_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index aa50cf935e..46643b655b 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -131,7 +131,7 @@ Tensor FFModel::tanh(const Tensor x, char const *name) { } Tensor FFModel::identity(const Tensor x, char const *name) { - //return this->unary(OP_IDENTITY, x, false /*inplace*/, name); + // return this->unary(OP_IDENTITY, x, false /*inplace*/, name); return x; } diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 720f1ed693..15072513a7 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -414,7 +414,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); - assert(my_input_accessor[0].data_type == my_output_accessor[0].data_type); + assert(my_input_accessor[0].data_type == + my_output_accessor[0].data_type); if (my_input_accessor[0].data_type == DT_INT64) { Kernels::Reshape::forward_kernel_wrapper( my_input_accessor[0].get_int64_ptr(), @@ -442,7 +443,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - assert(my_input_accessor[0].data_type == my_output_accessor[0].data_type); + assert(my_input_accessor[0].data_type == + my_output_accessor[0].data_type); 
Kernels::Transpose::forward_kernel_wrapper( m, my_input_accessor[0].get_float_ptr(), diff --git a/src/ops/linear.cc b/src/ops/linear.cc index cfea321dda..d5257b9c3e 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -624,11 +624,11 @@ void Linear::backward_task_with_dim(Task const *task, float *acc_bias_grad_ptr = NULL; if (m->use_bias) { TensorAccessorW acc_bias_grad(regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); + task->regions[rid], + FID_DATA, + ctx, + runtime, + true /*readOutput*/); rid++; assert(acc_bias_grad.rect.volume() == static_cast(out_dim)); acc_bias_grad_ptr = static_cast(acc_bias_grad.ptr); diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index ff9751ee34..c66995877e 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -77,43 +77,43 @@ template void backward_kernel(float const *output_grad_ptr, size_t num_replicas); template void forward_kernel(double const *input_ptr, - double *output_ptr, - size_t num_elements); + double *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(double const *input_ptr, - double *output_ptr, - size_t num_elements, - size_t num_replicas); + double *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(double const *output_grad_ptr, - double *input_grad_ptr, - size_t num_elements, - size_t num_replicas); + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); template void forward_kernel(int64_t const *input_ptr, - int64_t *output_ptr, - size_t num_elements); + int64_t *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(int64_t const *input_ptr, - int64_t *output_ptr, - size_t num_elements, - size_t num_replicas); + int64_t *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(int64_t const *output_grad_ptr, - int64_t *input_grad_ptr, - size_t num_elements, - size_t num_replicas); + int64_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); template void forward_kernel(int32_t const *input_ptr, - int32_t *output_ptr, - size_t num_elements); + int32_t *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(int32_t const *input_ptr, - int32_t *output_ptr, - size_t num_elements, - size_t num_replicas); + int32_t *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(int32_t const *output_grad_ptr, - int32_t *input_grad_ptr, - size_t num_elements, - size_t num_replicas); + int32_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); } // namespace Replicate } // namespace Kernels diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index 2b8ff55eac..6ed4f424cf 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -69,41 +69,41 @@ template void backward_kernel(float const *output_grad_ptr, size_t num_elements, size_t num_replicas); template void forward_kernel(double const *input_ptr, - double *output_ptr, - size_t num_elements); + double *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(double const *input_ptr, - double *output_ptr, - size_t num_elements, - size_t num_replicas); + double *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(double 
const *output_grad_ptr, - double *input_grad_ptr, - size_t num_elements, - size_t num_replicas); + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); template void forward_kernel(int32_t const *input_ptr, - int32_t *output_ptr, - size_t num_elements); + int32_t *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(int32_t const *input_ptr, - int32_t *output_ptr, - size_t num_elements, - size_t num_replicas); + int32_t *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(int32_t const *output_grad_ptr, - int32_t *input_grad_ptr, - size_t num_elements, - size_t num_replicas); + int32_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); template void forward_kernel(int64_t const *input_ptr, - int64_t *output_ptr, - size_t num_elements); + int64_t *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(int64_t const *input_ptr, - int64_t *output_ptr, - size_t num_elements, - size_t num_replicas); + int64_t *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(int64_t const *output_grad_ptr, - int64_t *input_grad_ptr, - size_t num_elements, - size_t num_replicas); + int64_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); } // namespace Replicate } // namespace Kernels diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 1714684f75..322ab061e5 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -230,15 +230,14 @@ bool Replicate::append_parallel_op_info( void Replicate::init_task(Task const *task, std::vector const ®ions, - Context ctx, - Runtime *runtime) { -} + Context ctx, + Runtime *runtime) {} /*static*/ void Replicate::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { + std::vector const ®ions, + Context ctx, + Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); DataType data_type = *((DataType *)task->args); @@ -256,10 +255,11 @@ void Replicate::forward_task(Task const *task, } template -void Replicate::forward_task_with_type(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +void Replicate::forward_task_with_type( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); Domain input_domain = runtime->get_index_space_domain( @@ -281,9 +281,9 @@ void Replicate::forward_task_with_type(Task const *task, } void Replicate::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { + std::vector const ®ions, + Context ctx, + Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); DataType data_type = *((DataType *)task->args); @@ -301,10 +301,11 @@ void Replicate::backward_task(Task const *task, } template -void Replicate::backward_task_with_type(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +void Replicate::backward_task_with_type( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); Domain output_grad_domain = runtime->get_index_space_domain( diff --git a/src/runtime/model.cc b/src/runtime/model.cc index b5a2dcbd0c..fc5dc7d740 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2635,11 +2635,12 @@ Op *FFModel::create_operator_from_layer( 
assert(tensor->parallel_tensor == nullptr); tensor->parallel_tensor = pt; // start from data parllel tensor - if (config.only_data_parallel && config.numNodes * config.workersPerNode > 1) { - if (pt->dims[num_dims-1].size == 1) { + if (config.only_data_parallel && + config.numNodes * config.workersPerNode > 1) { + if (pt->dims[num_dims - 1].size == 1) { Replicate *repl = new Replicate( *this, pt, num_dims, config.numNodes * config.workersPerNode); - repl->outputs[0]->dims[num_dims].is_replica_dim = true; + repl->outputs[0]->dims[num_dims].is_replica_dim = true; operators.push_back(repl); } else { Repartition *part = new Repartition( @@ -5137,8 +5138,15 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Replicate Init Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Replicate Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(REPLICATE_FWD_TASK_ID, "Replicate Forward"); diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 479bde3898..6cc0d0067e 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -146,7 +146,6 @@ size_t ParallelTensorShape::get_piece_num_elements() const { return piece_num_elements; } - RecordFormatter ParallelTensorShape::as_dot() const { RecordFormatter r; for (int i = 0; i < this->num_dims; i++) { From b9d13327861a161915a8b78bc408b4a4ccb5718f Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 14 Aug 2023 14:16:19 -0400 Subject: [PATCH 42/72] try to run graph_optimize on each node --- SPECINFER.md | 145 +++++++++++++++++++++++++++++++++++++++ include/flexflow/graph.h | 2 + src/runtime/graph.cc | 23 +++++-- src/runtime/model.cc | 16 +++-- 4 files changed, 174 insertions(+), 12 deletions(-) create mode 100644 SPECINFER.md diff --git a/SPECINFER.md b/SPECINFER.md new file mode 100644 index 0000000000..347b394db2 --- /dev/null +++ b/SPECINFER.md @@ -0,0 +1,145 @@ +# FlexFlow Serve: Low-Latency, High-Performance LLM Serving +![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=master) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=master) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=master) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=master) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=master) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=master) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest) + +

+[figure: A SpecInfer Demo]

+ +## What is FlexFlow Serve + +

+[figure: An overview of SpecInfer]

+ +The high computational and memory requirements of generative large language +models (LLMs) make it challenging to serve them quickly and cheaply. +FlexFlow Serve is an open-source compiler and distributed system for +__low latency__, __high performance__ LLM serving. FlexFlow Serve outperforms +existing systems by 1.3-2.0x for single-node, multi-GPU inference and by +1.4-2.4x for multi-node, multi-GPU inference. + +that accelerates generative LLM +inference with __speculative inference__ and __token tree verification__. A key insight +behind SpecInfer is to combine various collectively boost-tuned small speculative +models (SSMs) to jointly predict the LLM’s outputs; the predictions are organized as a +token tree, whose nodes each represent a candidate token sequence. The correctness +of all candidate token sequences represented by a token tree is verified against the +LLM’s output in parallel using a novel tree-based parallel decoding mechanism. +SpecInfer uses an LLM as a token tree verifier instead of an incremental decoder, +which largely reduces the end-to-end inference latency and computational requirement +for serving generative LLMs while provably preserving model quality. + +
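The speculate-then-verify loop described above can be illustrated with a small, self-contained sketch. This is **not** SpecInfer/FlexFlow code: the token-tree layout, the `llm_next_token` stub, and the greedy acceptance rule are simplifying assumptions used only to show how a speculated token tree is walked and how the longest path that agrees with the verifier is accepted.

```c++
// Toy sketch of token-tree verification (illustrative only).
#include <cstdio>
#include <vector>

struct TreeNode {
  int token;                  // candidate token proposed by an SSM
  std::vector<int> children;  // indices of child nodes in the tree vector
};

// Stand-in for the LLM verifier's greedy next-token choice. The real system
// verifies all tree nodes in parallel rather than calling the model per step.
int llm_next_token(std::vector<int> const &prefix) {
  return prefix.empty() ? 0 : (prefix.back() + 1) % 5;  // toy rule
}

// Walk the speculated tree, extending the accepted sequence while some child
// matches the token the verifier would have produced itself.
std::vector<int> verify_token_tree(std::vector<TreeNode> const &tree,
                                   int root,
                                   std::vector<int> accepted) {
  int cur = root;
  for (;;) {
    int expected = llm_next_token(accepted);
    int match = -1;
    for (int c : tree[cur].children) {
      if (tree[c].token == expected) {
        match = c;
        break;
      }
    }
    if (match == -1) {
      break;  // no speculated branch agrees with the verifier; stop here
    }
    accepted.push_back(tree[match].token);
    cur = match;
  }
  return accepted;
}

int main() {
  // Node 0 holds the last prompt token; its descendants are SSM speculations.
  std::vector<TreeNode> tree = {
      {2, {1, 2}},  // 0: prompt tail (token 2)
      {3, {3}},     // 1: one SSM speculates 3, then 4
      {0, {}},      // 2: another SSM speculates 0
      {4, {}},      // 3
  };
  std::vector<int> out = verify_token_tree(tree, /*root=*/0, /*prompt=*/{1, 2});
  for (int t : out) {
    printf("%d ", t);
  }
  printf("\n");  // expected output: 1 2 3 4
  return 0;
}
```

Because all candidate sequences share prefixes in the tree, each distinct token position only has to be checked once, which is what reduces end-to-end latency compared with verifying every candidate sequence separately.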

+[figure: Performance comparison]

+ +## Build/Install SpecInfer +SpecInfer is built on top of FlexFlow. You can build/install SpecInfer by building the inference branch of FlexFlow. Please read the [instructions](../INSTALL.md) for building/installing FlexFlow from source code. If you would like to quickly try SpecInfer, we also provide pre-built Docker packages ([specinfer-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/specinfer-cuda) with a CUDA backend, [specinfer-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/specinfer-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. + +## Run SpecInfer +The source code of the SpecInfer pipeline is available at [this folder](../inference/spec_infer/). The SpecInfer executable will be available at `/build_dir/inference/spec_infer/spec_infer` at compilation. You can use the following command-line arguments to run SpecInfer: + +* `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) +* `-ll:fsize`: size of device memory on each GPU in MB +* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. SpecInfer keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. +* `-llm-model`: the LLM model type as a case-insensitive string (e.g. "opt" or "llama") +* `-llm-weight`: path to the folder that stores the LLM weights +* `-llm-config`: path to the json file that stores the LLM model configs +* `-ssm-model`: the LLM model type as a case-insensitive string (e.g. "opt" or "llama"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. +* `-ssm-weight`: path to the folder that stores the small speculative models' weights. The number of `-ssm-weight`s must match the number of `-ssm-model`s and `-ssm-config`s. +* `-ssm-config`: path to the json file that stores the SSM model configs. The number of `-ssm-config`s must match the number of `-ssm-model`s and `-ssm-weight`s. +* `-tokenizer`: path to the tokenizer file (see [Tokenizers](#tokenizers) for preparing a tokenizer for SpecInfer). +* `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. +* `-prompt`: (optional) path to the prompt file. SpecInfer expects a json format file for prompts, all of which will be served by SpecInfer. In addition, users can also use the following API for registering requests: +* `-output-file`: (optional) filepath to use to save the output of the model, together with the generation latency + + +```c++ +class RequestManager { + RequestGuid register_new_request(std::string const &prompt, int max_sequence_length); +} +``` +For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-190M models for speculative inference. 
+ +```bash +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm/config.json -ssm-model llama -smm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json --use-full-precision -tensor-parallelism-degree 2 -pipeline-parallelism-degree 2 +``` + +### Tokenizers +SpecInfer supports two tokenizers: + +* The SentencePiece tokenizer is used to support the LLaMA model family (e.g., LLaMA-6B, LLaMA-13B, and LLaMA-190M in our demo). We used the pretrained sentencepiece tokenizer from LLAMA, which is also available on Hugging Face (model id: `decapoda-research/llama-7b-hf`). If you are using our LLAMA-160M weights for the demo, however, you should use the tokenizer from the [JackFram/llama-160m](https://huggingface.co/JackFram/llama-160m/resolve/main/tokenizer.model) HuggingFace repo. +* The GPT2 tokenizer is used to support the Open Pre-trained Transformer model family (e.g., OPT-13B and OPT-125M). To use it, download the [vocab](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json) and [merges](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt) files and pass the folder containing them as a parameter. + +### Mixed-precision Support +SpecInfer now supports single-precision floating points and half-precision floating points. By default we use half-precision. Add `--use-full-precision` to the command line to run the demo with single-precision, please make sure to use the correct weight files in the form below. + +### CPU Offloading +SpecInfer offers offloading-based inference for running large models (e.g., llama-7B) on a single GPU. CPU offloading is a choice to save tensors in CPU memory, and only copy the tensor to GPU when doing calculation. Notice that now we selectively offload the largest weight tensors (weights tensor in Linear, Attention). Besides, since the small model occupies considerably less space, it it does not pose a bottleneck for GPU memory, the offloading will bring more runtime space and computational cost, so we only do the offloading for the large model. You can run the offloading example by adding `-offload` and `-offload-reserve-space-size` flags. +#### Quantization +To reduce data transferred between the CPU and GPU, SpecInfer provides int4 and int8 quantization. The compressed tensors are stored on the CPU side. Once copied to the GPU, these tensors undergo decompression and conversion back to their original precision. Please find the compressed weight files in our s3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from [FlexGen](https://github.com/FMInference/FlexGen) project to do the compression manually. The quantization method can be selected using the `--4bit-quantization` and `--8bit-quantization` flags. + +Below is an example command line to use offloading and quantization in SpecInfer. 
+ +```bash +./inference/spec_infer/spec_infer -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm/config.json -ssm-model llama -smm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json --use-full-precision -offload -offload-reserve-space-size 6000 --8bit-quantization +``` + + + +### LLM Weights +The weight files used in our demo are extracted from HuggingFace, and stored in our AWS S3 bucket. + +| Model | Model id on Hugging Face | Storage Location (single precision) | Storage Location (half precision) | +| :---- | :---- | :---- | :---- | +| LLaMA-7B | decapoda-research/llama-7b-hf | s3://specinfer/weights/llama_7B_weights.tar.gz | s3://specinfer/half_weights/llama_7B_weights.tar.gz +| LLaMA-190M | JackFram/llama-160m | s3://specinfer/weights/llama_160M_weights.tar.gz | s3://specinfer/half_weights/llama_160M_weights.tar.gz +| OPT-6.7B | facebook/opt-6.7b | s3://specinfer/weights/opt_6B_weights.tar.gz | s3://specinfer/half_weights/opt_6B_weights.tar.gz +| OPT-125M | facebook/opt-125m | s3://specinfer/weights/opt_125M_weights.tar.gz | s3://specinfer/half_weights/opt_125M_weights.tar.gz + +You can use [this script](../inference/utils/download_llama_weights.py) to automatically download and convert the weights of a HuggingFace LLAMA LLM and a LLAMA SSM to the SpecInfer weight format. The script also downloads the LLAMA tokenizer. If you would like to try the OPT model instead, use [this script](../inference/utils/download_opt_weights.py) to download (and convert) the OPT weights and tokenizer. + +### Prompt Datasets +We have evaluated SpecInfer on the following prompts datasets: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json). + +### Script to run the demo +You can take a look at [this script](../tests/inference_tests.sh), which is run in CI for each new commit, for an example of how to run the demo. + +## Difference between SpecInfer and HuggingFace Assistant Model + +There are two major differences between the two systems. + +* First, the HuggingFace assistant model produces a single candidate token sequence during speculation, while SpecInfer generates and verifies a speculated token tree, whose tokens each represent a candidate token sequence. To deal with the more complex verification task, SpecInfer includes a number of systems and algorithmic optimizations to quickly and efficiently verify all tokens of a token tree in parallel. + +* Second, instead of considering a single assistant model, SpecInfer combines a variety of collectively boost-tuned small speculative models (SSMs) to jointly predict the LLM's outputs. We observe that using multiple boost-tuned SSMs is critical for improving speculative performance. + +## TODOs + +SpecInfer is under active development. We currently focus on the following tasks and strongly welcome all contributions to SpecInfer from bug fixes to new features and extensions. + +* Low-precision and mixed-precision support. 
The current version uses single-precision floating points for computing tree attention. We are actively working on support half-precision floating points, and int4 and int8 quantizations. +* Offloading-based generative LLM inference. Another promising avenue for future work is using speculative inference and token tree verification to reduce the end-to-end inference for offloading-based generative LLM inference. A potential application of this technique is enabling a single commodity GPU to serve LLMs for latency critical tasks. + +## Acknowledgements +This project is initiated by members from CMU, Stanford, and UCSD. We will be continuing developing and supporting SpecInfer and the underlying FlexFlow runtime system. The following paper describes design, implementation, and key optimizations of SpecInfer. + +* Xupeng Miao*, Gabriele Oliaro*, Zhihao Zhang*, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. [SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification](https://arxiv.org/abs/2305.09781). + +\* Denotes equal contribution + +### Citation +Please cite as: + +``` bibtex +@misc{miao2023specinfer, + title={SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification}, + author={Xupeng Miao and Gabriele Oliaro and Zhihao Zhang and Xinhao Cheng and Zeyu Wang and Rae Ying Yee Wong and Zhuoming Chen and Daiyaan Arfeen and Reyna Abhyankar and Zhihao Jia}, + year={2023}, + eprint={2305.09781}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +## License +Both SpecInfer and FlexFlow use Apache License 2.0. diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h index c4fed8ff58..2c92eeeb31 100644 --- a/include/flexflow/graph.h +++ b/include/flexflow/graph.h @@ -332,6 +332,8 @@ class Graph { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static GraphOptimalViewSerialized + graph_optimize_wrapper(FFModel * model); Node find_bottleneck_node(Node const &sink_node, Node const &source_node) const; void print_strategy_computation_graph( diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 15ba8ce951..5c49687712 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1882,11 +1882,11 @@ namespace { */ std::pair, std::unordered_map> try_one_lambda(std::pair &lambda, - Task const *task, + FFModel *model, std::shared_ptr &cached_simulator, bool perform_memory_search) { // Create a new fresh model - FFModel *model = *((FFModel **)task->args); + //FFModel *model = *((FFModel **)task->args); model->clear_graph_search_cache(); if (model->config.search_num_nodes.has_value()) { @@ -1900,6 +1900,9 @@ std::pair, std::unordered_map> model->config.workersPerNode, model->config.cpusPerNode, model->all_valid_views); + Runtime *runtime = model->config.lg_hlr; + Context ctx = model->config.lg_ctx; + const Task* task = runtime->get_current_task(ctx); Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) .best_affinity_to(task->target_proc) @@ -2045,12 +2048,20 @@ bool is_valid_strategy( * @param runtime Not used * @return GraphOptimalViewSerialized Serialized optimal PCG */ + GraphOptimalViewSerialized Graph::graph_optimize_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - auto model_config = (*((FFModel **)task->args))->config; + FFModel* model = *((FFModel **)task->args); + return Graph::graph_optimize_wrapper(model); +} + +/*static*/ 
+GraphOptimalViewSerialized + Graph::graph_optimize_wrapper(FFModel *model) { + auto model_config = model->config; bool perform_memory_search = model_config.perform_memory_search; float memory_threshold = model_config.device_mem; bool only_data_parallel = model_config.only_data_parallel; @@ -2066,7 +2077,7 @@ GraphOptimalViewSerialized // Be optimistic lambdas.emplace_back(std::make_pair(1.0, MemorySearchResult{})); auto try_result = try_one_lambda( - lambdas.back(), task, cached_simulator, perform_memory_search); + lambdas.back(), model, cached_simulator, perform_memory_search); best_graph = std::move(try_result.first); optimal_views = try_result.second; @@ -2082,7 +2093,7 @@ GraphOptimalViewSerialized // Not found the strategy; need to do binary search lambdas.emplace_back(std::make_pair(0.0, MemorySearchResult{})); try_result = try_one_lambda( - lambdas.back(), task, cached_simulator, perform_memory_search); + lambdas.back(), model, cached_simulator, perform_memory_search); best_graph = std::move(try_result.first); optimal_views = try_result.second; @@ -2109,7 +2120,7 @@ GraphOptimalViewSerialized lambdas.emplace_back(std::make_pair(mid, MemorySearchResult{})); try_result = try_one_lambda( - lambdas.back(), task, cached_simulator, perform_memory_search); + lambdas.back(), model, cached_simulator, perform_memory_search); if (!is_valid_strategy(lambdas, try_result.first.get(), diff --git a/src/runtime/model.cc b/src/runtime/model.cc index fc5dc7d740..8461668aaa 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2831,12 +2831,16 @@ void FFModel::compile(LossType loss_type, // Launch the graph optimize task { FFModel *model = this; - TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, - TaskArgument(&model, sizeof(FFModel *))); - Future future = runtime->execute_task(ctx, launcher); - - PCG::GraphOptimalViewSerialized ret = - future.get_result(); + PCG::GraphOptimalViewSerialized ret; + if (false) { + TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, + TaskArgument(&model, sizeof(FFModel *))); + Future future = runtime->execute_task(ctx, launcher); + ret = + future.get_result(); + } else { + ret = PCG::Graph::graph_optimize_wrapper(this); + } Deserializer dez(ret.data, ret.total_bytes); // Reconstruct operators PCG::Graph *best_graph = new PCG::Graph(this); From b5b0815d83fd9a73afb4bd3b99a2f9cc5ca977ea Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 14 Aug 2023 14:16:38 -0400 Subject: [PATCH 43/72] remove unnecessary file --- SPECINFER.md | 145 --------------------------------------------------- 1 file changed, 145 deletions(-) delete mode 100644 SPECINFER.md diff --git a/SPECINFER.md b/SPECINFER.md deleted file mode 100644 index 347b394db2..0000000000 --- a/SPECINFER.md +++ /dev/null @@ -1,145 +0,0 @@ -# FlexFlow Serve: Low-Latency, High-Performance LLM Serving -![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=master) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=master) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=master) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=master) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=master) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=master) [![Documentation 
Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest) - -

-[figure: A SpecInfer Demo]

- -## What is FlexFlow Serve - -

-[figure: An overview of SpecInfer]

- -The high computational and memory requirements of generative large language -models (LLMs) make it challenging to serve them quickly and cheaply. -FlexFlow Serve is an open-source compiler and distributed system for -__low latency__, __high performance__ LLM serving. FlexFlow Serve outperforms -existing systems by 1.3-2.0x for single-node, multi-GPU inference and by -1.4-2.4x for multi-node, multi-GPU inference. - -that accelerates generative LLM -inference with __speculative inference__ and __token tree verification__. A key insight -behind SpecInfer is to combine various collectively boost-tuned small speculative -models (SSMs) to jointly predict the LLM’s outputs; the predictions are organized as a -token tree, whose nodes each represent a candidate token sequence. The correctness -of all candidate token sequences represented by a token tree is verified against the -LLM’s output in parallel using a novel tree-based parallel decoding mechanism. -SpecInfer uses an LLM as a token tree verifier instead of an incremental decoder, -which largely reduces the end-to-end inference latency and computational requirement -for serving generative LLMs while provably preserving model quality. - -

-[figure: Performance comparison]

- -## Build/Install SpecInfer -SpecInfer is built on top of FlexFlow. You can build/install SpecInfer by building the inference branch of FlexFlow. Please read the [instructions](../INSTALL.md) for building/installing FlexFlow from source code. If you would like to quickly try SpecInfer, we also provide pre-built Docker packages ([specinfer-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/specinfer-cuda) with a CUDA backend, [specinfer-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/specinfer-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. - -## Run SpecInfer -The source code of the SpecInfer pipeline is available at [this folder](../inference/spec_infer/). The SpecInfer executable will be available at `/build_dir/inference/spec_infer/spec_infer` at compilation. You can use the following command-line arguments to run SpecInfer: - -* `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) -* `-ll:fsize`: size of device memory on each GPU in MB -* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. SpecInfer keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. -* `-llm-model`: the LLM model type as a case-insensitive string (e.g. "opt" or "llama") -* `-llm-weight`: path to the folder that stores the LLM weights -* `-llm-config`: path to the json file that stores the LLM model configs -* `-ssm-model`: the LLM model type as a case-insensitive string (e.g. "opt" or "llama"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. -* `-ssm-weight`: path to the folder that stores the small speculative models' weights. The number of `-ssm-weight`s must match the number of `-ssm-model`s and `-ssm-config`s. -* `-ssm-config`: path to the json file that stores the SSM model configs. The number of `-ssm-config`s must match the number of `-ssm-model`s and `-ssm-weight`s. -* `-tokenizer`: path to the tokenizer file (see [Tokenizers](#tokenizers) for preparing a tokenizer for SpecInfer). -* `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. -* `-prompt`: (optional) path to the prompt file. SpecInfer expects a json format file for prompts, all of which will be served by SpecInfer. In addition, users can also use the following API for registering requests: -* `-output-file`: (optional) filepath to use to save the output of the model, together with the generation latency - - -```c++ -class RequestManager { - RequestGuid register_new_request(std::string const &prompt, int max_sequence_length); -} -``` -For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-190M models for speculative inference. 
```bash
./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm1/config.json -ssm-model llama -ssm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json --use-full-precision -tensor-parallelism-degree 2 -pipeline-parallelism-degree 2
```

### Tokenizers
SpecInfer supports two tokenizers:

* The SentencePiece tokenizer is used to support the LLaMA model family (e.g., LLaMA-7B, LLaMA-13B, and LLaMA-160M in our demo). We use the pretrained SentencePiece tokenizer from LLaMA, which is also available on Hugging Face (model id: `decapoda-research/llama-7b-hf`). If you are using our LLaMA-160M weights for the demo, however, you should use the tokenizer from the [JackFram/llama-160m](https://huggingface.co/JackFram/llama-160m/resolve/main/tokenizer.model) HuggingFace repo.
* The GPT2 tokenizer is used to support the Open Pre-trained Transformer model family (e.g., OPT-13B and OPT-125M). To use it, download the [vocab](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json) and [merges](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt) files and pass the folder containing them as a parameter.

### Mixed-precision Support
SpecInfer supports both single-precision and half-precision floating point; half precision is the default. Add `--use-full-precision` to the command line to run the demo in single precision, and make sure to use the matching weight files listed below.

### CPU Offloading
SpecInfer offers offloading-based inference for running large models (e.g., LLaMA-7B) on a single GPU. CPU offloading keeps tensors in CPU memory and only copies them to the GPU when they are needed for computation. Currently, we selectively offload only the largest weight tensors (the weight tensors in the Linear and Attention operators). Since the SSMs occupy considerably less space and do not pose a bottleneck for GPU memory, and offloading adds extra memory traffic and computational cost, we only offload the LLM's weights. You can run the offloading example by adding the `-offload` and `-offload-reserve-space-size` flags.
#### Quantization
To reduce the data transferred between the CPU and GPU, SpecInfer provides int4 and int8 quantization. The compressed tensors are stored on the CPU side; once copied to the GPU, they are decompressed and converted back to their original precision. Please find the compressed weight files in our S3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from the [FlexGen](https://github.com/FMInference/FlexGen) project to do the compression manually. The quantization method can be selected using the `--4bit-quantization` and `--8bit-quantization` flags.

Below is an example command line to use offloading and quantization in SpecInfer.
```bash
./inference/spec_infer/spec_infer -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm1/config.json -ssm-model llama -ssm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json --use-full-precision -offload -offload-reserve-space-size 6000 --8bit-quantization
```

### LLM Weights
The weight files used in our demo are extracted from HuggingFace and stored in our AWS S3 bucket.

| Model | Model id on Hugging Face | Storage Location (single precision) | Storage Location (half precision) |
| :---- | :---- | :---- | :---- |
| LLaMA-7B | decapoda-research/llama-7b-hf | s3://specinfer/weights/llama_7B_weights.tar.gz | s3://specinfer/half_weights/llama_7B_weights.tar.gz |
| LLaMA-160M | JackFram/llama-160m | s3://specinfer/weights/llama_160M_weights.tar.gz | s3://specinfer/half_weights/llama_160M_weights.tar.gz |
| OPT-6.7B | facebook/opt-6.7b | s3://specinfer/weights/opt_6B_weights.tar.gz | s3://specinfer/half_weights/opt_6B_weights.tar.gz |
| OPT-125M | facebook/opt-125m | s3://specinfer/weights/opt_125M_weights.tar.gz | s3://specinfer/half_weights/opt_125M_weights.tar.gz |

You can use [this script](../inference/utils/download_llama_weights.py) to automatically download and convert the weights of a HuggingFace LLaMA LLM and a LLaMA SSM to the SpecInfer weight format. The script also downloads the LLaMA tokenizer. If you would like to try the OPT model instead, use [this script](../inference/utils/download_opt_weights.py) to download (and convert) the OPT weights and tokenizer.

### Prompt Datasets
We have evaluated SpecInfer on the following prompt datasets: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json).

### Script to run the demo
You can take a look at [this script](../tests/inference_tests.sh), which is run in CI for each new commit, for an example of how to run the demo.

## Difference between SpecInfer and HuggingFace Assistant Model

There are two major differences between the two systems.

* First, the HuggingFace assistant model produces a single candidate token sequence during speculation, while SpecInfer generates and verifies a speculated token tree, whose nodes each represent a candidate token sequence. To deal with the more complex verification task, SpecInfer includes a number of system and algorithmic optimizations to quickly and efficiently verify all tokens of a token tree in parallel.

* Second, instead of considering a single assistant model, SpecInfer combines a variety of collectively boost-tuned small speculative models (SSMs) to jointly predict the LLM's outputs. We observe that using multiple boost-tuned SSMs is critical for improving speculative performance.

## TODOs

SpecInfer is under active development. We currently focus on the following tasks and strongly welcome all contributions to SpecInfer, from bug fixes to new features and extensions.

* Low-precision and mixed-precision support.
The current version uses single-precision floating points for computing tree attention. We are actively working on support half-precision floating points, and int4 and int8 quantizations. -* Offloading-based generative LLM inference. Another promising avenue for future work is using speculative inference and token tree verification to reduce the end-to-end inference for offloading-based generative LLM inference. A potential application of this technique is enabling a single commodity GPU to serve LLMs for latency critical tasks. - -## Acknowledgements -This project is initiated by members from CMU, Stanford, and UCSD. We will be continuing developing and supporting SpecInfer and the underlying FlexFlow runtime system. The following paper describes design, implementation, and key optimizations of SpecInfer. - -* Xupeng Miao*, Gabriele Oliaro*, Zhihao Zhang*, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. [SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification](https://arxiv.org/abs/2305.09781). - -\* Denotes equal contribution - -### Citation -Please cite as: - -``` bibtex -@misc{miao2023specinfer, - title={SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification}, - author={Xupeng Miao and Gabriele Oliaro and Zhihao Zhang and Xinhao Cheng and Zeyu Wang and Rae Ying Yee Wong and Zhuoming Chen and Daiyaan Arfeen and Reyna Abhyankar and Zhihao Jia}, - year={2023}, - eprint={2305.09781}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} -``` - -## License -Both SpecInfer and FlexFlow use Apache License 2.0. From 94e35d96f33f7c7a8a7be1399e7294dc7f191852 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 15 Aug 2023 17:05:47 +0000 Subject: [PATCH 44/72] fix hip build --- include/flexflow/utils/hip_helper.h | 4 + src/loss_functions/loss_functions.cpp | 37 +++- src/metrics_functions/metrics_functions.cpp | 16 +- src/ops/element_unary.cpp | 5 +- src/ops/kernels/softmax.cpp | 37 +++- src/ops/layer_norm.cpp | 232 +++++++++++++++----- src/runtime/hip_helper.cpp | 51 +++++ 7 files changed, 309 insertions(+), 73 deletions(-) diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 2ea09770d6..709e78f517 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -137,6 +137,10 @@ miopenStatus_t cudnnSetTensorDescriptorFromDomain(miopenTensorDescriptor_t tensor, Legion::Domain domain); +miopenStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, + Legion::Domain domain); + hipblasDatatype_t ff_to_cuda_datatype(DataType type); miopenDataType_t ff_to_cudnn_datatype(DataType type); diff --git a/src/loss_functions/loss_functions.cpp b/src/loss_functions/loss_functions.cpp index a87aaade84..3453f3fbf6 100644 --- a/src/loss_functions/loss_functions.cpp +++ b/src/loss_functions/loss_functions.cpp @@ -20,6 +20,7 @@ namespace FlexFlow { using namespace Legion; +int const MASK_TOKEN = -100; __global__ void sparse_categorical_crossentropy_loss_backward(float *logit_grad, @@ -33,6 +34,25 @@ __global__ void } } +__global__ void + sparse_categorical_crossentropy_loss_backward_with_mask(float *logit_grad, + int const *label, + coord_t num_samples, + coord_t num_classes, + int const k, + float *num) { + CUDA_KERNEL_LOOP(i, num_samples * num_classes) { + int sample_id = i / num_classes; + int label_idx = label[i / (k * num_classes)]; + if (label_idx != MASK_TOKEN && (i 
== sample_id * num_classes + label_idx)) { + logit_grad[i] -= 1.0f; + atomicAdd(&num[0], 1.0f); + } else if (label_idx == MASK_TOKEN) { + logit_grad[i] = 0.0f; + } + } +} + __global__ void categorical_crossentropy_loss_backward(float *logit_grad, float const *logit, float const *label, @@ -75,8 +95,14 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_ptr, logit_volume * sizeof(float), hipMemcpyDeviceToDevice)); - hipLaunchKernelGGL(sparse_categorical_crossentropy_loss_backward, - GET_BLOCKS(num_samples), + + assert(scale_factor == 1.0f); + float *num; + checkCUDA(hipMalloc(&num, sizeof(float))); + float effective_tokens; + int parallelism = num_samples * num_classes; + hipLaunchKernelGGL(sparse_categorical_crossentropy_loss_backward_with_mask, + GET_BLOCKS(parallelism), CUDA_NUM_THREADS, 0, stream, @@ -84,7 +110,10 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( label_ptr, num_samples, num_classes, - k); + k, + num); + + hipMemcpy(&effective_tokens, num, sizeof(float), hipMemcpyDeviceToHost); // Scale logit gradients by op->scale_factor hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), @@ -94,7 +123,7 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, logit_grad_volume, 0, - scale_factor * k); + 1.0f / effective_tokens); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( diff --git a/src/metrics_functions/metrics_functions.cpp b/src/metrics_functions/metrics_functions.cpp index d30686be24..1c57bd6ba9 100644 --- a/src/metrics_functions/metrics_functions.cpp +++ b/src/metrics_functions/metrics_functions.cpp @@ -20,6 +20,7 @@ namespace FlexFlow { float const LOG_MIN_VALUE = 0.00000001f; +int const MASK_TOKEN = -100; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, @@ -39,14 +40,19 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, } } assert(my_label >= 0); - atomicAdd(&(perf->train_all), 1); - if (labels[b] == my_label) { - atomicAdd(&(perf->train_correct), 1); + if (labels[b] != MASK_TOKEN) { + atomicAdd(&(perf->train_all), 1); + if (labels[b] == my_label) { + atomicAdd(&(perf->train_correct), 1); + } } } if (metrics.measure_sparse_categorical_crossentropy) { - float my_logit = max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); - atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + if (labels[b] != MASK_TOKEN) { + float my_logit = + max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + } } if (metrics.measure_mean_squared_error || metrics.measure_root_mean_squared_error || diff --git a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index 43c84b0c41..38c6043297 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -189,8 +189,9 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * erfc(-input[i] * M_SQRT1_2) - - 0.5 * M_SQRT1_2 * input[i] * exp(-input[i] * input[i] * 0.5))); + (0.5 * erfc(-input[i] * M_SQRT1_2) + + 0.5 * M_SQRT1_2 * input[i] * + ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5f)))); break; } case OP_RSQRT: { diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index d63bd0edc5..6df6351bb0 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -27,8 +27,11 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, Domain const &input_domain) : OpMeta(handler) { 
checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); - checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); + // checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); + checkCUDNN( + cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); dim = softmax->dim; + last_layer = softmax->last_layer; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); } @@ -67,6 +70,7 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -78,7 +82,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, hipEventRecord(t_start, stream); } Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + m, input_grad_ptr, output_grad_ptr, output_ptr, num_elements, stream); if (m->profiling) { hipEventRecord(t_end, stream); checkCUDA(hipEventSynchronize(t_end)); @@ -114,15 +118,32 @@ void forward_kernel(SoftmaxMeta const *m, MIOPEN_SOFTMAX_MODE_CHANNEL)); } -void backward_kernel(float *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements, hipStream_t stream) { - checkCUDA(hipMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(float), - hipMemcpyDeviceToDevice, - stream)); + if (m->last_layer) { + checkCUDA(hipMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(float), + hipMemcpyDeviceToDevice, + stream)); + } else { + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(miopenSoftmaxBackward_V2(m->handle.dnn, + &alpha, + m->inputTensor, + output_ptr, + m->inputTensor, + output_grad_ptr, + &beta, + m->inputTensor, + input_grad_ptr, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + } } } // namespace Internal diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index c3030e20b4..e03da48935 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/layer_norm.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/utils/hip_helper.h" #include @@ -30,12 +31,26 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln) effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; eps = ln->eps; - checkCUDA(hipMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&db_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&db_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + + DataType data_type = ln->data_type; + checkCUDA( + hipMalloc(&mean_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&rstd_ptr, data_type_size(data_type) 
* effective_batch_size)); + checkCUDA( + hipMalloc(&ds_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&db_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&scale_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&bias_ptr, data_type_size(data_type) * effective_batch_size)); } template @@ -43,12 +58,10 @@ __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) { -#if 0 #ifndef __HIP_PLATFORM_HCC__ - return __shfl_down_sync(mask, value, delta, width); + return __shfl_down_sync(mask, value, delta, width); #else - return __shfl_down(value, delta, width); -#endif + return __shfl_down(value, delta, width); #endif } @@ -79,26 +92,26 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void - RowwiseMomentsCUDAKernel(int64_t N, T eps, T const *X, T *mean, T *rstd) { - __shared__ T m_shared[C10_WARP_SIZE]; - __shared__ T v_shared[C10_WARP_SIZE]; +__global__ void RowwiseMomentsCUDAKernel( + int64_t N, float eps, T const *X, T *mean, T *rstd) { + __shared__ float m_shared[C10_WARP_SIZE]; + __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - T sum1 = 0; - T sum2 = 0; + float sum1 = 0.0f; + float sum2 = 0.0f; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { - const T scale = T(1) / static_cast(N); + float const scale = float(1) / static_cast(N); sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, T(0)); - mean[i] = sum1; - rstd[i] = rsqrt(sum2 + static_cast(eps)); + sum2 = max(sum2 * scale - sum1 * sum1, float(0)); + mean[i] = static_cast(sum1); + rstd[i] = static_cast(rsqrt(sum2 + eps)); } } @@ -132,7 +145,7 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T *gamma_ptr, T *beta_ptr, hipStream_t stream) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), m->effective_batch_size, kCUDABlockReduceNumThreads, 0, @@ -140,33 +153,50 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, m->effective_num_elements, m->eps, in_ptr, - m->mean_ptr, - m->rstd_ptr); - hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormForwardCUDAKernel), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormForwardCUDAKernel), m->effective_batch_size, kCUDANumThreads, 0, stream, m->effective_num_elements, in_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_ptr, beta_ptr, out_ptr); } /*static*/ -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - T const *in_ptr, - T *out_ptr, - T *gamma_ptr, - T *beta_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorW &gamma, + GenericTensorAccessorW &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::forward_kernel( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream); + // LayerNorm::forward_kernel( + // m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream); + + if (m->input_type[0] == DT_FLOAT) 
{ + LayerNorm::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + gamma.get_float_ptr(), + beta.get_float_ptr(), + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel(m, + input.get_half_ptr(), + output.get_half_ptr(), + gamma.get_half_ptr(), + beta.get_half_ptr(), + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } } template @@ -346,6 +376,82 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, } } +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] = f_grad_input; + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, N, buf); +} + /*static*/ template void LayerNorm::backward_kernel(LayerNormMeta const *m, @@ -367,8 +473,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, output_grad_ptr, input_ptr, gamma_ptr, - m->ds_ptr, - m->db_ptr); + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), B, @@ -377,12 +483,29 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, stream, M, N, - m->mean_ptr, - m->rstd_ptr, - m->ds_ptr, - m->db_ptr, - m->scale_ptr, - m->bias_ptr); + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly @@ -396,8 +519,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_grad_ptr, beta_grad_ptr); } else { @@ -414,8 +537,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_grad_ptr, beta_grad_ptr); } @@ -443,11 +566,12 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, stream); } -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - float const *in_ptr, - float *out_ptr, - float *gamma_ptr, - float *beta_ptr); +// template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const +// *m, +// float const *in_ptr, +// float *out_ptr, +// float *gamma_ptr, +// float *beta_ptr); template void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, float const *output_grad_ptr, diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 375b4f3d53..ffdcf0dac1 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -298,6 +298,57 @@ miopenStatus_t return miopenStatusBadParm; } +miopenStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, + Domain domain) { + int dims[MAX_TENSOR_DIM]; + switch (domain.get_dim()) { + case 1: { + Rect<1> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + return miopenSet4dTensorDescriptor(tensor, miopenFloat, dims[0], 1, 1, 1); + } + case 2: { + Rect<2> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + return 
miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[1], dims[0], 1, 1); + } + case 3: { + Rect<3> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[2] * dims[1], dims[0], 1, 1); + } + case 4: { + Rect<4> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[3] * dims[2] * dims[1], dims[0], 1, 1); + } + case 5: { + Rect<5> rect = domain; + int leading_dim_size = rect.hi[4] - rect.lo[4] + 1; + assert(leading_dim_size == 1); + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[3], dims[2], dims[1], dims[0]); + } + default: + assert(false && "Unsupported dim number"); + } + return miopenStatusBadParm; +} + miopenDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { case DT_FLOAT: From ded175c67d043cec87fdfaa34164f2b566db22d7 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 18 Aug 2023 13:01:34 -0400 Subject: [PATCH 45/72] bypass simulator creation when only_data_parallel is specified --- src/runtime/graph.cc | 77 ++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 5c49687712..f17b4dd547 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1900,6 +1900,37 @@ std::pair, std::unordered_map> model->config.workersPerNode, model->config.cpusPerNode, model->all_valid_views); + if (model->config.only_data_parallel) { + Graph *graph = new Graph(model); + graph->print_dot(); + std::unordered_map op_to_node_map; + for (FlexFlow::Op const *dstOp : model->operators) { + Node dstNode; + dstNode.ptr = dstOp; + dstNode.guid = model->node_global_guid++; + op_to_node_map[dstOp] = dstNode; + for (int j = 0; j < dstOp->numInputs; j++) { + FlexFlow::Op const *srcOp = dstOp->inputs[j]->owner_op; + assert(op_to_node_map.find(srcOp) != op_to_node_map.end()); + Node srcNode = op_to_node_map[srcOp]; + graph->add_edge(srcNode, dstNode, dstOp->inputs[j]->owner_idx, j); + } + } + graph->print_dot(); + curr_best_graph = std::unique_ptr(graph); + MachineView data_parallel_view; + data_parallel_view.device_type = MachineView::GPU; + data_parallel_view.ndims = 1; + data_parallel_view.dim[0] = + model->config.numNodes * model->config.workersPerNode; + data_parallel_view.stride[0] = 1; + data_parallel_view.start_device_id = 0; + for (auto const &node : curr_best_graph->inEdges) { + curr_optimal_views[node.first] = data_parallel_view; + } + return std::make_pair(std::move(curr_best_graph), curr_optimal_views); + } + Runtime *runtime = model->config.lg_hlr; Context ctx = model->config.lg_ctx; const Task* task = runtime->get_current_task(ctx); @@ -1939,44 +1970,14 @@ std::pair, std::unordered_map> std::unique_ptr curr_best_graph; std::unordered_map curr_optimal_views; - if (model->config.only_data_parallel) { - Graph *graph = new Graph(model); - graph->print_dot(); - std::unordered_map op_to_node_map; - for (FlexFlow::Op const *dstOp : model->operators) { - Node dstNode; - dstNode.ptr = dstOp; - dstNode.guid = model->node_global_guid++; - op_to_node_map[dstOp] = dstNode; - for (int j 
= 0; j < dstOp->numInputs; j++) { - FlexFlow::Op const *srcOp = dstOp->inputs[j]->owner_op; - assert(op_to_node_map.find(srcOp) != op_to_node_map.end()); - Node srcNode = op_to_node_map[srcOp]; - graph->add_edge(srcNode, dstNode, dstOp->inputs[j]->owner_idx, j); - } - } - graph->print_dot(); - curr_best_graph = std::unique_ptr(graph); - MachineView data_parallel_view; - data_parallel_view.device_type = MachineView::GPU; - data_parallel_view.ndims = 1; - data_parallel_view.dim[0] = - model->config.numNodes * model->config.workersPerNode; - data_parallel_view.stride[0] = 1; - data_parallel_view.start_device_id = 0; - for (auto const &node : curr_best_graph->inEdges) { - curr_optimal_views[node.first] = data_parallel_view; - } - } else { - // Main step to optimize the PCG of an FFModel - model->graph_optimize(model->config.search_budget, - model->config.only_data_parallel, - curr_best_graph, - curr_optimal_views, - perform_memory_search, - MemoryOptimConfig{lambda.first}, - lambda.second); - } + // Main step to optimize the PCG of an FFModel + model->graph_optimize(model->config.search_budget, + model->config.only_data_parallel, + curr_best_graph, + curr_optimal_views, + perform_memory_search, + MemoryOptimConfig{lambda.first}, + lambda.second); // Return the best result of the current search return std::make_pair(std::move(curr_best_graph), curr_optimal_views); }; From 1f7e8b79ba3ae341fb0cd100dc42dbeb1683a334 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 18 Aug 2023 14:33:59 -0400 Subject: [PATCH 46/72] add nccl prints --- src/runtime/model.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 8461668aaa..c52d5e7f0c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -584,9 +584,10 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task, } } ncclComm_t ncclComm; + fprintf(stderr, "Before ncclCommInitRank\n"); checkNCCL(ncclCommInitRank(&ncclComm, allRanks, ncclId, myRank)); - // fprintf(stderr, "ncclComm(%p) allRanks(%d) myRank(%d) ncclId(%p)\n", - // ncclComm, allRanks, myRank, ncclId); + fprintf(stderr, "After ncclCommInitRank ncclComm(%p) allRanks(%d) myRank(%d) ncclId(%p)\n", + ncclComm, allRanks, myRank, ncclId); return ncclComm; } #endif From 3fb70f6be3c53c9338ad936e7a0ea63350fc17d7 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 21 Aug 2023 15:59:05 -0400 Subject: [PATCH 47/72] . 
--- src/runtime/graph.cc | 2 ++ src/runtime/parallel_tensor.cc | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index f17b4dd547..7447125197 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1917,6 +1917,8 @@ std::pair, std::unordered_map> } } graph->print_dot(); + std::unique_ptr curr_best_graph; + std::unordered_map curr_optimal_views; curr_best_graph = std::unique_ptr(graph); MachineView data_parallel_view; data_parallel_view.device_type = MachineView::GPU; diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 6cc0d0067e..18318db3ce 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -672,6 +672,20 @@ bool ParallelTensorBase::set_tensor(FFModel const *ff, for (size_t i = 0; i < dim_sizes.size(); i++) { volume = volume * dim_sizes[i]; } + // Debug prints + { + std::string tensor_name; + if (owner_op == nullptr) { + tensor_name = "No OwnerOp"; + } else { + tensor_name = std::string(owner_op->name); + } + std::ostringstream oss; + for (int i = 0; i < dim_sizes.size(); i++) + oss << dim_sizes[i] << ", "; + printf("%s num_replicas(%zu) volume(%zu) dims(%s)\n", tensor_name.c_str(), + num_replicas, volume, oss.str().c_str()); + } RegionRequirement req(region, READ_WRITE, EXCLUSIVE, region); req.add_field(FID_DATA); InlineLauncher launcher(req); From d652b6202e25570c23903375ba0d36c87b522c4d Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 29 Aug 2023 13:21:06 -0400 Subject: [PATCH 48/72] rccl --- CMakeLists.txt | 17 ++++++++++------- config/config.linux | 4 +--- include/flexflow/config.h | 4 +++- include/flexflow/machine_view.h | 4 +++- src/runtime/optimizer_kernel.cpp | 2 ++ 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e7504d7026..81845dd7b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -158,10 +158,6 @@ endif() # option for nccl option(FF_USE_NCCL "Run FlexFlow with NCCL" OFF) -if (FF_GPU_BACKEND STREQUAL "hip_rocm" AND FF_USE_NCCL STREQUAL "ON") - message(FATAL_ERROR "NCCL: ON for FF_GPU_BACKEND: hip_rocm. hip_rocm backend must have NCCL disabled.") -endif() - # option for avx2 option(FF_USE_AVX2 "Run FlexFlow with AVX2" OFF) @@ -224,7 +220,9 @@ endif() # NCCL if(FF_USE_NCCL) - include(nccl) + if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda") + include(nccl) + endif() list(APPEND FF_CC_FLAGS -DFF_USE_NCCL) list(APPEND FF_NVCC_FLAGS @@ -369,11 +367,13 @@ elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm") elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") find_package(hipblas REQUIRED) find_package(miopen REQUIRED) + if(FF_USE_NCCL) + find_package(rccl REQUIRED) + endif() # find_package(rocrand REQUIRED) find_library(HIP_RAND_LIBRARY hiprand REQUIRED) add_compile_definitions(FF_USE_HIP_ROCM) - # The hip cmake config module defines three targets, # hip::amdhip64, hip::host, and hip::device. 
# @@ -387,12 +387,15 @@ elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm") # Docs (outdated): # https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY}) + if(FF_USE_NCCL) + target_link_libraries(flexflow rccl) + endif() endif() else() message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}") endif() -if(FF_USE_NCCL) +if(FF_USE_NCCL AND (FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda")) add_dependencies(flexflow ${NCCL_NAME}) endif() diff --git a/config/config.linux b/config/config.linux index a5bb093584..d3729aea4c 100755 --- a/config/config.linux +++ b/config/config.linux @@ -70,11 +70,9 @@ FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid." exit 1 -elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" ]]; then +elif [["$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm"]]; then # enable NCCL FF_USE_NCCL=${FF_USE_NCCL:-ON} -else - FF_USE_NCCL=OFF fi function get_build_configs() { diff --git a/include/flexflow/config.h b/include/flexflow/config.h index d82b1377c7..d1fe6231da 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -28,8 +28,10 @@ #error "Unknown device" #endif #include "tl/optional.hpp" -#ifdef FF_USE_NCCL +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include +#else +#include #endif namespace FlexFlow { diff --git a/include/flexflow/machine_view.h b/include/flexflow/machine_view.h index 8843dc4d6a..b843555e06 100644 --- a/include/flexflow/machine_view.h +++ b/include/flexflow/machine_view.h @@ -3,8 +3,10 @@ #include "legion.h" #include -#ifdef FF_USE_NCCL +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include +#else +#include #endif #include "flexflow/config.h" diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index 232799e027..e71adc87a8 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -87,6 +87,7 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, #ifdef FF_USE_NCCL __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, + OpMeta const *meta, float const *w_grad_ptr, size_t size, float *w_ptr, @@ -208,6 +209,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, #ifdef FF_USE_NCCL __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, + OpMeta const *meta, float const *w_grad_ptr, size_t size, float *w_ptr, From b39528b635adee41e98db319466663d962461ff2 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 6 Oct 2023 16:15:59 +0000 Subject: [PATCH 49/72] fix fuse --- include/flexflow/ops/layer_norm.h | 8 +- src/ops/fused.cpp | 1181 +++++++++++++++++------------ src/ops/fused.cu | 171 ++++- src/ops/layer_norm.cc | 4 +- src/ops/layer_norm.cpp | 4 +- src/ops/layer_norm.cu | 8 +- 6 files changed, 882 insertions(+), 494 deletions(-) diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 552b9cf365..de5ed48df2 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -63,14 +63,14 @@ class LayerNorm : public Op { static void forward_kernel(LayerNormMeta const *m, T const *input_ptr, T *output_ptr, - T *gamma_ptr, - T *beta_ptr, + T const *gamma_ptr, + T const *beta_ptr, 
ffStream_t stream); static void forward_kernel_wrapper(LayerNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW &output, - GenericTensorAccessorW &gamma, - GenericTensorAccessorW &beta); + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index a602c5d6b1..348680b2fd 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -18,6 +18,7 @@ #include "flexflow/ops/batch_norm.h" #include "flexflow/ops/element_unary.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" +#include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/ops/kernels/dropout_kernels.h" @@ -26,7 +27,9 @@ #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" #include "flexflow/utils/hip_helper.h" #include @@ -281,8 +284,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::forward_kernel_wrapper( m, @@ -292,488 +295,730 @@ __host__ void FusedOp::forward_task(Task const *task, break; break; } - case OP_RELU: - case OP_SIGMOID: - case OP_TANH: - case OP_ELU: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; - } - case OP_POOL2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - break; - } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Flat::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; - } - case OP_RESHAPE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - 
my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; - } - case OP_TRANSPOSE: { + + case OP_EMBEDDING: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); - break; - } - default: { - fprintf(stderr, - "Fusion currently does not support type = %d\n", - fused->op_op_type[op]); - assert(false && "Fusion currently does not support type"); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + if (m->aggr == AGGR_MODE_NONE) { + // assert(kernel_domain.get_dim() == 2); + assert(my_input_accessor[0].domain.get_dim() + 1 == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i + 1]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i + 1]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } else { + assert(my_input_accessor[0].domain.get_dim() == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM); + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + assert(my_input_accessor[0].data_type == DT_INT32 || + my_input_accessor[0].data_type == DT_INT64); + Kernels::Embedding::forward_kernel_wrapper(m, + my_input_accessor[0], + my_output_accessor[0], + my_weight_accessor[0], + in_dim, + out_dim, + effective_batch_size); + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + 
my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_POOL2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } + case OP_FLAT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Flat::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (m->input_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } else { + assert(false); + } + break; + } + case OP_RESHAPE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_TRANSPOSE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_INT64) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_output_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + } else { + assert(false); + } + default: { + fprintf(stderr, + "Fusion currently does not support 
type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; } + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_ptr[i], output_domain[i].get_volume(), + // "[Fused:forward:output]"); } - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; - } - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_ptr[i], output_domain[i].get_volume(), - // "[Fused:forward:output]"); -} -/* - regions[...](I): input - regions[...](I): weight - regions[...](I): output - regions[...](I/O): input_grad - regions[...](I/O): weight_grad - regions[...](I/O): output_grad -*/ + /* + regions[...](I): input + regions[...](I): weight + regions[...](I): output + regions[...](I/O): input_grad + regions[...](I/O): weight_grad + regions[...](I/O): output_grad + */ -__host__ void FusedOp::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - // const FusedOp* fused = (FusedOp*) task->args; - FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); - FusedOp const *fused = metas->fused_op; + __host__ void FusedOp::backward_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; - assert(metas->numOperators == fused->numOperators); - assert(regions.size() == task->regions.size()); - { - int sum = fused->numInputs + fused->numWeights + fused->numOutputs; - assert(sum * 2 == (int)regions.size()); - } - GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR output_accessor[MAX_NUM_OUTPUTS]; - GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; - int roff = 0; - assert(fused->numInputs <= MAX_NUM_INPUTS); - for (int i = 0; i < fused->numInputs; i++) { - input_accessor[i] = - helperGetGenericTensorAccessorRO(fused->input_data_types[i], - regions[i], - task->regions[i], - FID_DATA, - ctx, - runtime); - } - roff += fused->numInputs; - assert(fused->numWeights <= MAX_NUM_WEIGHTS); - for (int i = 0; i < fused->numWeights; i++) { - weight_accessor[i] = - helperGetGenericTensorAccessorRO(fused->weight_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); - } - roff += fused->numWeights; - assert(fused->numOutputs <= MAX_NUM_OUTPUTS); - for (int i = 0; i < fused->numOutputs; i++) { - output_accessor[i] = - helperGetGenericTensorAccessorRO(fused->output_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); - } - roff += fused->numOutputs; - for (int i = 0; i < fused->numInputs; i++) { - input_grad_accessor[i] = - helperGetGenericTensorAccessorRW(fused->input_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); - assert(input_grad_accessor[i].domain == input_accessor[i].domain); - } - roff += fused->numInputs; - for (int i = 0; i < fused->numWeights; i++) { - weight_grad_accessor[i] = - helperGetGenericTensorAccessorRW(fused->weight_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, 
- runtime); - assert(weight_grad_accessor[i].domain.get_volume() == - weight_accessor[i].domain.get_volume()); - } - roff += fused->numWeights; - for (int i = 0; i < fused->numOutputs; i++) { - output_grad_accessor[i] = - helperGetGenericTensorAccessorRW(fused->output_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); - assert(output_grad_accessor[i].domain == output_accessor[i].domain); - } - roff += fused->numOutputs; - // Assert that all meta share the same dnn/blas handler - int start = 0; - for (start = 0; start < fused->numOperators; start++) { - if (metas->meta[start] != NULL) { - break; - } - } - for (int op = start + 1; op < fused->numOperators; op++) { - if (metas->meta[op] != NULL) { - assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); - assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); - } - } - - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - int ioff = 0, woff = 0, ooff = 0; - GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; - GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW my_weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; - // Do backpropagation in the reverse ordering - for (int op = 0; op < fused->numOperators; op++) { - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; - } - - for (int op = fused->numOperators - 1; op >= 0; op--) { - ioff -= fused->op_num_inputs[op]; - woff -= fused->op_num_weights[op]; - ooff -= fused->op_num_outputs[op]; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - my_input_accessor[i] = input_accessor[my_off]; - my_input_grad_accessor[i] = input_grad_accessor[my_off]; - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - my_input_accessor[i] = output_accessor[my_off]; - my_input_grad_accessor[i] = output_grad_accessor[my_off]; - assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); - } else { - assert(false); + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + { + int sum = fused->numInputs + fused->numWeights + fused->numOutputs; + assert(sum * 2 == (int)regions.size()); } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; - my_weight_grad_accessor[i] = - weight_grad_accessor[fused->op_weight_idx[i + woff]]; - assert(my_weight_grad_accessor[i].domain.get_volume() == - my_weight_accessor[i].domain.get_volume()); - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = output_accessor[fused->op_output_idx[i + ooff]]; - my_output_grad_accessor[i] = - output_grad_accessor[fused->op_output_idx[i + ooff]]; - assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); - } - switch (fused->op_op_type[op]) { - case OP_CONCAT: { - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - ConcatMeta *m = (ConcatMeta *)metas->meta[op]; - int num_inputs = fused->op_num_inputs[op]; - 
Kernels::Concat::backward_kernel_wrapper(m, - my_output_grad_accessor[0], - my_input_grad_accessor, - num_inputs, - m->legion_axis); - break; + GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW weight_grad_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorR output_accessor[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + int roff = 0; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + input_accessor[i] = + helperGetGenericTensorAccessorRO(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); } - case OP_CONV2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; - Kernels::Conv2D::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_grad_accessor[0].get_float_ptr(), - my_weight_grad_accessor[1].get_float_ptr()); - break; + roff += fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + weight_accessor[i] = + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); } - case OP_BATCHNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 2); - assert(my_weight_accessor[1].domain.get_dim() == 2); - assert(my_output_accessor[0].domain.get_dim() == 5); - BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; - BatchNorm::backward_kernel( - m, - (float const *)my_input_accessor[0].get_float_ptr(), - (float *)my_output_grad_accessor[0].get_float_ptr(), - (float const *)my_output_accessor[0].get_float_ptr(), - (float *)my_input_grad_accessor[0].get_float_ptr(), - (float const *)my_weight_accessor[0].get_float_ptr(), - (float *)my_weight_grad_accessor[0].get_float_ptr(), - (float *)my_weight_grad_accessor[1].get_float_ptr(), - my_output_accessor[0].domain.get_volume()); - break; + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + output_accessor[i] = + helperGetGenericTensorAccessorRO(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); } - case OP_DROPOUT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Kernels::Dropout::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr()); - break; - } - case OP_LINEAR: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - Domain kernel_domain = my_weight_accessor[0].domain; - int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; - int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; - int batch_size = my_input_accessor[0].domain.get_volume() / 
in_dim; - assert(my_output_accessor[0].domain.get_volume() == - out_dim * batch_size); - assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - float *bias_grad_ptr = nullptr; - if (fused->op_num_weights[op] == 2) { - assert(my_weight_accessor[1].domain.get_volume() == out_dim); - bias_grad_ptr = my_weight_grad_accessor[1].get_float_ptr(); - } else { - assert(fused->op_num_weights[op] == 1); - } - LinearMeta *m = (LinearMeta *)metas->meta[op]; - Kernels::Linear::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_grad_accessor[0].get_float_ptr(), - bias_grad_ptr, - in_dim, - out_dim, - batch_size); - break; - } - case OP_BATCHMATMUL: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - Domain out_domain = my_output_accessor[0].domain; - Domain a_domain = my_input_accessor[0].domain; - Domain b_domain = my_input_accessor[1].domain; - // check dims - int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; - assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); - int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; - assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); - int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; - assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); - assert(a_domain.get_dim() == b_domain.get_dim()); - assert(a_domain.get_dim() == out_domain.get_dim()); - int batch = 1; - for (int i = 2; i < a_domain.get_dim(); i++) { - int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; - assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); - assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); - batch *= dim_size; - } - BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; - Kernels::BatchMatmul::backward_kernel_wrapper( - meta, - (float const *)my_output_accessor[0].get_float_ptr(), - (float const *)my_output_grad_accessor[0].get_float_ptr(), - (float const *)my_input_accessor[0].get_float_ptr(), - (float *)my_input_grad_accessor[0].get_float_ptr(), - (float const *)my_input_accessor[1].get_float_ptr(), - (float *)my_input_grad_accessor[1].get_float_ptr(), - (float *)nullptr, - m, - n, - k, - batch); - break; + roff += fused->numOutputs; + for (int i = 0; i < fused->numInputs; i++) { + input_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->input_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + assert(input_grad_accessor[i].domain == input_accessor[i].domain); } - case OP_EW_ADD: - case OP_EW_SUB: - case OP_EW_MUL: - case OP_EW_DIV: - case OP_EW_MAX: - case OP_EW_MIN: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; - Kernels::ElementBinary::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_accessor[0].get_float_ptr(), - my_input_accessor[1].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[1].get_float_ptr()); - break; + roff += fused->numInputs; + for (int i = 0; i < fused->numWeights; i++) { + weight_grad_accessor[i] = + 
helperGetGenericTensorAccessorRW(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + assert(weight_grad_accessor[i].domain.get_volume() == + weight_accessor[i].domain.get_volume()); } - case OP_RELU: - case OP_SIGMOID: - case OP_TANH: - case OP_ELU: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; + roff += fused->numWeights; + for (int i = 0; i < fused->numOutputs; i++) { + output_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + assert(output_grad_accessor[i].domain == output_accessor[i].domain); } - case OP_POOL2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr()); - break; + roff += fused->numOutputs; + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain.get_volume() == - my_output_grad_accessor[0].domain.get_volume()); - Kernels::Flat::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); - break; + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + assert(metas->meta[start]->handle.blas == + metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } } - case OP_RESHAPE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain.get_volume() == - my_output_grad_accessor[0].domain.get_volume()); - Kernels::Reshape::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); - break; + + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int ioff = 0, woff = 0, ooff = 0; + GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorW my_weight_grad_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; + // Do backpropagation in the reverse ordering + 
for (int op = 0; op < fused->numOperators; op++) { + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; } - case OP_TRANSPOSE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain.get_volume() == - my_output_grad_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::backward_kernel_wrapper( - m, - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain, - my_output_grad_accessor[0].domain); - break; + + for (int op = fused->numOperators - 1; op >= 0; op--) { + ioff -= fused->op_num_inputs[op]; + woff -= fused->op_num_weights[op]; + ooff -= fused->op_num_outputs[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + my_input_accessor[i] = input_accessor[my_off]; + my_input_grad_accessor[i] = input_grad_accessor[my_off]; + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + my_input_accessor[i] = output_accessor[my_off]; + my_input_grad_accessor[i] = output_grad_accessor[my_off]; + assert(my_input_grad_accessor[i].domain == + my_input_accessor[i].domain); + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + my_weight_accessor[i] = + weight_accessor[fused->op_weight_idx[i + woff]]; + my_weight_grad_accessor[i] = + weight_grad_accessor[fused->op_weight_idx[i + woff]]; + assert(my_weight_grad_accessor[i].domain.get_volume() == + my_weight_accessor[i].domain.get_volume()); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + my_output_accessor[i] = + output_accessor[fused->op_output_idx[i + ooff]]; + my_output_grad_accessor[i] = + output_grad_accessor[fused->op_output_idx[i + ooff]]; + assert(my_output_grad_accessor[i].domain == + my_output_accessor[i].domain); + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + int num_inputs = fused->op_num_inputs[op]; + Kernels::Concat::backward_kernel_wrapper(m, + my_output_grad_accessor[0], + my_input_grad_accessor, + num_inputs, + m->legion_axis); + break; + } + case OP_CONV2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; + Kernels::Conv2D::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + 
assert(my_output_accessor[0].domain.get_dim() == 5); + BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + BatchNorm::backward_kernel( + m, + (float const *)my_input_accessor[0].get_float_ptr(), + (float *)my_output_grad_accessor[0].get_float_ptr(), + (float const *)my_output_accessor[0].get_float_ptr(), + (float *)my_input_grad_accessor[0].get_float_ptr(), + (float const *)my_weight_accessor[0].get_float_ptr(), + (float *)my_weight_grad_accessor[0].get_float_ptr(), + (float *)my_weight_grad_accessor[1].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + break; + } + case OP_DROPOUT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + DropoutMeta *m = (DropoutMeta *)metas->meta[op]; + Kernels::Dropout::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == + in_dim * batch_size); + float *bias_grad_ptr = nullptr; + if (fused->op_num_weights[op] == 2) { + assert(my_weight_accessor[1].domain.get_volume() == out_dim); + bias_grad_ptr = my_weight_grad_accessor[1].get_float_ptr(); + } else { + assert(fused->op_num_weights[op] == 1); + } + LinearMeta *m = (LinearMeta *)metas->meta[op]; + Kernels::Linear::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + bias_grad_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_accessor[0].domain; + Domain a_domain = my_input_accessor[0].domain; + Domain b_domain = my_input_accessor[1].domain; + // check dims + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + Kernels::BatchMatmul::backward_kernel_wrapper( + meta, + (float const *)my_output_accessor[0].get_float_ptr(), + (float const *)my_output_grad_accessor[0].get_float_ptr(), + (float const *)my_input_accessor[0].get_float_ptr(), + (float *)my_input_grad_accessor[0].get_float_ptr(), + (float const *)my_input_accessor[1].get_float_ptr(), + (float 
*)my_input_grad_accessor[1].get_float_ptr(), + (float *)nullptr, + m, + n, + k, + batch); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // assert(my_input_accessor[0].domain == + // my_input_accessor[1].domain); assert(my_input_accessor[0].domain + // == my_output_accessor[0].domain); + ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + Kernels::ElementBinary::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_accessor[1].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[1].get_float_ptr()); + break; + } + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + assert(my_input_accessor[0].data_type == DT_INT64 || + my_input_accessor[0].data_type == DT_INT32); + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_grad_accessor[0].domain.hi()[0] - + my_output_grad_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_grad_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_grad_accessor[0].domain.hi()[0] - + my_output_grad_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_grad_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + Kernels::Embedding::backward_kernel_wrapper( + m, + my_input_accessor[0], + my_output_grad_accessor[0], + my_weight_grad_accessor[0], + in_dim, + out_dim, + effective_batch_size); + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_POOL2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // assert(my_input_accessor[0].domain == + // my_output_accessor[0].domain); + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr()); + break; + } + case OP_FLAT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + 
my_output_grad_accessor[0].domain.get_volume()); + Kernels::Flat::backward_kernel_wrapper( + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + break; + } + case OP_RESHAPE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + if (my_input_grad_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_int64_ptr(), + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].get_int32_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_TRANSPOSE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain, + my_output_grad_accessor[0].domain); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + gamma.get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_INT64) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } 
else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + default: + assert(false && "Fusion currently does not support type"); + } + } + assert(ioff == 0); + assert(woff == 0); + assert(ooff == 0); + // for (int i = 0; i < fused->numWeights; i++) + // print_tensor(weight_grad_ptr[i], + // weight_grad_domain[i].get_volume(), + // "[Fused:backward:weight_grad]"); + // for (int i = 0; i < fused->numInputs; i++) + // print_tensor(input_grad_ptr[i], + // input_grad_domain[i].get_volume(), + // "[Fused:backward:input_grad]"); + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_grad_ptr[i], + // output_grad_domain[i].get_volume(), + // "[Fused:backward:output_grad]"); } - default: - assert(false && "Fusion currently does not support type"); - } - } - assert(ioff == 0); - assert(woff == 0); - assert(ooff == 0); - // for (int i = 0; i < fused->numWeights; i++) - // print_tensor(weight_grad_ptr[i], - // weight_grad_domain[i].get_volume(), "[Fused:backward:weight_grad]"); - // for (int i = 0; i < fused->numInputs; i++) - // print_tensor(input_grad_ptr[i], input_grad_domain[i].get_volume(), - // "[Fused:backward:input_grad]"); - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_grad_ptr[i], - // output_grad_domain[i].get_volume(), "[Fused:backward:output_grad]"); -} -}; // namespace FlexFlow + }; // namespace FlexFlow diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 15072513a7..62262c89af 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -21,6 +21,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/ops/fused.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" +#include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/ops/kernels/dropout_kernels.h" @@ -30,7 +31,9 @@ #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/layer_norm.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -294,8 +297,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::forward_kernel_wrapper( m, @@ -358,7 +361,8 @@ __host__ void FusedOp::forward_task(Task const *task, my_input_accessor[0].domain.get_volume()); } - assert(my_input_accessor[0].data_type == DT_INT64); + assert(my_input_accessor[0].data_type == DT_INT32 || + my_input_accessor[0].data_type == DT_INT64); Kernels::Embedding::forward_kernel_wrapper(m, my_input_accessor[0], my_output_accessor[0], @@ -368,10 +372,15 @@ __host__ void FusedOp::forward_task(Task const *task, effective_batch_size); break; } + case OP_GELU: case OP_RELU: case 
OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -428,14 +437,31 @@ __host__ void FusedOp::forward_task(Task const *task, my_input_accessor[0].domain.get_volume()); } else if (my_input_accessor[0].data_type == DT_FLOAT) { Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_int64_ptr(), - my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), my_input_accessor[0].domain.get_volume()); } else { assert(false && "Unsupported data type"); } break; } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } else { + assert(false); + } + break; + } case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); @@ -453,6 +479,43 @@ __host__ void FusedOp::forward_task(Task const *task, my_output_accessor[0].domain); break; } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_output_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + } else { + assert(false); + } + + break; + } default: { fprintf(stderr, "Fusion currently does not support type = %d\n", @@ -770,8 +833,8 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::backward_kernel_wrapper( m, @@ -787,7 +850,8 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); EmbeddingMeta *m = 
(EmbeddingMeta *)metas->meta[op]; - assert(my_input_accessor[0].data_type == DT_INT64); + assert(my_input_accessor[0].data_type == DT_INT64 || + my_input_accessor[0].data_type == DT_INT32); int in_dim, out_dim, effective_batch_size; if (m->aggr == AGGR_MODE_NONE) { in_dim = 1; @@ -848,10 +912,15 @@ __host__ void FusedOp::backward_task(Task const *task, batch_size); break; } + case OP_GELU: case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -898,10 +967,43 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_grad_accessor[0].domain.get_volume() == my_output_grad_accessor[0].domain.get_volume()); - Kernels::Reshape::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); + if (my_input_grad_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_int64_ptr(), + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].get_int32_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false); + } break; } case OP_TRANSPOSE: { @@ -919,6 +1021,47 @@ __host__ void FusedOp::backward_task(Task const *task, my_output_grad_accessor[0].domain); break; } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + gamma.get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + 
Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + + break; + } default: assert(false && "Fusion currently does not support type"); } diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 664b8c9f0b..a6130bb425 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -565,8 +565,8 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, // FIXME please add gamma_ptr and beta_ptr after finish the implementation float *gamma_ptr = NULL, *beta_ptr = NULL; - GenericTensorAccessorW gamma_acc; - GenericTensorAccessorW beta_acc; + GenericTensorAccessorR gamma_acc; + GenericTensorAccessorR beta_acc; bool out_of_memory = (in_ptr == NULL) || (out_ptr == NULL) || diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index e03da48935..75cf06b18b 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -173,8 +173,8 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW &output, - GenericTensorAccessorW &gamma, - GenericTensorAccessorW &beta) { + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); // LayerNorm::forward_kernel( diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index f0539f8405..736d122513 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -135,8 +135,8 @@ template void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *in_ptr, T *out_ptr, - T *gamma_ptr, - T *beta_ptr, + T const *gamma_ptr, + T const *beta_ptr, cudaStream_t stream) { RowwiseMomentsCUDAKernel <<effective_batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( @@ -160,8 +160,8 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW &output, - GenericTensorAccessorW &gamma, - GenericTensorAccessorW &beta) { + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); From 0cf3c8edde8e916b0963e802f3cc5b6ccfe35628 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 6 Oct 2023 12:37:46 -0400 Subject: [PATCH 50/72] fix hip --- src/ops/fused.cpp | 1268 ++++++++++++++++++++-------------------- src/ops/layer_norm.cpp | 4 +- 2 files changed, 634 insertions(+), 638 deletions(-) diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 348680b2fd..226184f1a2 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -13,16 +13,18 @@ * limitations under the License. 
*/ -#include "flexflow/ops/fused.h" #include "flexflow/model.h" #include "flexflow/ops/batch_norm.h" #include "flexflow/ops/element_unary.h" +#include "flexflow/ops/embedding.h" +#include "flexflow/ops/fused.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/ops/kernels/dropout_kernels.h" #include "flexflow/ops/kernels/element_binary_kernels.h" +#include "flexflow/ops/kernels/embedding_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" @@ -293,7 +295,6 @@ __host__ void FusedOp::forward_task(Task const *task, my_input_accessor[1].get_float_ptr(), my_output_accessor[0].get_float_ptr()); break; - break; } case OP_EMBEDDING: { @@ -358,667 +359,662 @@ __host__ void FusedOp::forward_task(Task const *task, in_dim, out_dim, effective_batch_size); - case OP_GELU: - case OP_RELU: - case OP_SIGMOID: - case OP_TANH: - case OP_ELU: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; - } - case OP_POOL2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::forward_kernel_wrapper( + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_POOL2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } + case OP_FLAT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Flat::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( m, my_input_accessor[0].get_float_ptr(), 
my_output_accessor[0].get_float_ptr()); - break; - } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Flat::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; - } - case OP_SOFTMAX: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - } else { - assert(false); - } - break; + } else { + assert(false); } - case OP_RESHAPE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; + break; + } + case OP_RESHAPE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_TRANSPOSE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; } - case OP_TRANSPOSE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::forward_kernel_wrapper( m, - my_input_accessor[0].get_float_ptr(), + 
my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_output_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); - break; - } - case OP_LAYERNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - beta = my_weight_accessor[1]; - } - LayerNorm::forward_kernel_wrapper( - m, my_input_accessor[0], my_output_accessor[0], gamma, beta); - break; - } - case OP_CAST: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - CastMeta const *m = (CastMeta *)metas->meta[op]; - if (m->input_data_type == DT_INT32 && - m->output_data_type == DT_INT64) { - Kernels::Cast::forward_kernel_wrapper( - m, - my_input_accessor[0].get_int32_ptr(), - my_output_accessor[0].get_int64_ptr(), - my_output_accessor[0].domain.get_volume()); - } else if (m->input_data_type == DT_INT32 && - m->output_data_type == DT_FLOAT) { - Kernels::Cast::forward_kernel_wrapper( - m, - my_input_accessor[0].get_int32_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_accessor[0].domain.get_volume()); - } else { - assert(false); - } - default: { - fprintf(stderr, - "Fusion currently does not support type = %d\n", - fused->op_op_type[op]); - assert(false && "Fusion currently does not support type"); - } + my_output_accessor[0].domain.get_volume()); + } else { + assert(false); } - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); } - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_ptr[i], output_domain[i].get_volume(), - // "[Fused:forward:output]"); } + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_ptr[i], output_domain[i].get_volume(), + // "[Fused:forward:output]"); +} - /* - regions[...](I): input - regions[...](I): weight - regions[...](I): output - regions[...](I/O): input_grad - regions[...](I/O): weight_grad - regions[...](I/O): output_grad - */ +/* + regions[...](I): input + regions[...](I): weight + regions[...](I): output + regions[...](I/O): input_grad + regions[...](I/O): weight_grad + regions[...](I/O): output_grad +*/ - __host__ void FusedOp::backward_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - // const FusedOp* fused = (FusedOp*) task->args; - FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); - FusedOp const *fused = metas->fused_op; +__host__ void FusedOp::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; - assert(metas->numOperators == fused->numOperators); - assert(regions.size() 
== task->regions.size()); - { - int sum = fused->numInputs + fused->numWeights + fused->numOutputs; - assert(sum * 2 == (int)regions.size()); - } - GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR output_accessor[MAX_NUM_OUTPUTS]; - GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; - int roff = 0; - assert(fused->numInputs <= MAX_NUM_INPUTS); - for (int i = 0; i < fused->numInputs; i++) { - input_accessor[i] = - helperGetGenericTensorAccessorRO(fused->input_data_types[i], - regions[i], - task->regions[i], - FID_DATA, - ctx, - runtime); + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + { + int sum = fused->numInputs + fused->numWeights + fused->numOutputs; + assert(sum * 2 == (int)regions.size()); + } + GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW weight_grad_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorR output_accessor[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + int roff = 0; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + input_accessor[i] = + helperGetGenericTensorAccessorRO(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); + } + roff += fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + weight_accessor[i] = + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + output_accessor[i] = + helperGetGenericTensorAccessorRO(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numOutputs; + for (int i = 0; i < fused->numInputs; i++) { + input_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->input_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + assert(input_grad_accessor[i].domain == input_accessor[i].domain); + } + roff += fused->numInputs; + for (int i = 0; i < fused->numWeights; i++) { + weight_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + assert(weight_grad_accessor[i].domain.get_volume() == + weight_accessor[i].domain.get_volume()); + } + roff += fused->numWeights; + for (int i = 0; i < fused->numOutputs; i++) { + output_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + assert(output_grad_accessor[i].domain == output_accessor[i].domain); + } + roff += fused->numOutputs; + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } + } + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + assert(metas->meta[start]->handle.blas == 
metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } + } + + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int ioff = 0, woff = 0, ooff = 0; + GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorW my_weight_grad_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; + // Do backpropagation in the reverse ordering + for (int op = 0; op < fused->numOperators; op++) { + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + + for (int op = fused->numOperators - 1; op >= 0; op--) { + ioff -= fused->op_num_inputs[op]; + woff -= fused->op_num_weights[op]; + ooff -= fused->op_num_outputs[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + my_input_accessor[i] = input_accessor[my_off]; + my_input_grad_accessor[i] = input_grad_accessor[my_off]; + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + my_input_accessor[i] = output_accessor[my_off]; + my_input_grad_accessor[i] = output_grad_accessor[my_off]; + assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); + } else { + assert(false); } - roff += fused->numInputs; - assert(fused->numWeights <= MAX_NUM_WEIGHTS); - for (int i = 0; i < fused->numWeights; i++) { - weight_accessor[i] = - helperGetGenericTensorAccessorRO(fused->weight_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; + my_weight_grad_accessor[i] = + weight_grad_accessor[fused->op_weight_idx[i + woff]]; + assert(my_weight_grad_accessor[i].domain.get_volume() == + my_weight_accessor[i].domain.get_volume()); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + my_output_accessor[i] = output_accessor[fused->op_output_idx[i + ooff]]; + my_output_grad_accessor[i] = + output_grad_accessor[fused->op_output_idx[i + ooff]]; + assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + int num_inputs = fused->op_num_inputs[op]; + Kernels::Concat::backward_kernel_wrapper(m, + my_output_grad_accessor[0], + my_input_grad_accessor, + num_inputs, + m->legion_axis); + break; } - roff += fused->numWeights; - assert(fused->numOutputs <= MAX_NUM_OUTPUTS); - for (int i = 0; i < fused->numOutputs; i++) { - output_accessor[i] = - helperGetGenericTensorAccessorRO(fused->output_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); + case OP_CONV2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + Conv2DMeta *m = (Conv2DMeta 
*)metas->meta[op]; + Kernels::Conv2D::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; } - roff += fused->numOutputs; - for (int i = 0; i < fused->numInputs; i++) { - input_grad_accessor[i] = - helperGetGenericTensorAccessorRW(fused->input_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); - assert(input_grad_accessor[i].domain == input_accessor[i].domain); + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + assert(my_output_accessor[0].domain.get_dim() == 5); + BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + BatchNorm::backward_kernel( + m, + (float const *)my_input_accessor[0].get_float_ptr(), + (float *)my_output_grad_accessor[0].get_float_ptr(), + (float const *)my_output_accessor[0].get_float_ptr(), + (float *)my_input_grad_accessor[0].get_float_ptr(), + (float const *)my_weight_accessor[0].get_float_ptr(), + (float *)my_weight_grad_accessor[0].get_float_ptr(), + (float *)my_weight_grad_accessor[1].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + break; } - roff += fused->numInputs; - for (int i = 0; i < fused->numWeights; i++) { - weight_grad_accessor[i] = - helperGetGenericTensorAccessorRW(fused->weight_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); - assert(weight_grad_accessor[i].domain.get_volume() == - weight_accessor[i].domain.get_volume()); + case OP_DROPOUT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + DropoutMeta *m = (DropoutMeta *)metas->meta[op]; + Kernels::Dropout::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr()); + break; } - roff += fused->numWeights; - for (int i = 0; i < fused->numOutputs; i++) { - output_grad_accessor[i] = - helperGetGenericTensorAccessorRW(fused->output_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); - assert(output_grad_accessor[i].domain == output_accessor[i].domain); + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + float *bias_grad_ptr = nullptr; + if (fused->op_num_weights[op] == 2) { + assert(my_weight_accessor[1].domain.get_volume() == out_dim); + bias_grad_ptr = my_weight_grad_accessor[1].get_float_ptr(); + } else { + assert(fused->op_num_weights[op] == 1); + } + LinearMeta *m = (LinearMeta *)metas->meta[op]; + Kernels::Linear::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + 
my_output_grad_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + bias_grad_ptr, + in_dim, + out_dim, + batch_size); + break; } - roff += fused->numOutputs; - // Assert that all meta share the same dnn/blas handler - int start = 0; - for (start = 0; start < fused->numOperators; start++) { - if (metas->meta[start] != NULL) { - break; + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_accessor[0].domain; + Domain a_domain = my_input_accessor[0].domain; + Domain b_domain = my_input_accessor[1].domain; + // check dims + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; } + BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + Kernels::BatchMatmul::backward_kernel_wrapper( + meta, + (float const *)my_output_accessor[0].get_float_ptr(), + (float const *)my_output_grad_accessor[0].get_float_ptr(), + (float const *)my_input_accessor[0].get_float_ptr(), + (float *)my_input_grad_accessor[0].get_float_ptr(), + (float const *)my_input_accessor[1].get_float_ptr(), + (float *)my_input_grad_accessor[1].get_float_ptr(), + (float *)nullptr, + m, + n, + k, + batch); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // assert(my_input_accessor[0].domain == + // my_input_accessor[1].domain); assert(my_input_accessor[0].domain + // == my_output_accessor[0].domain); + ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + Kernels::ElementBinary::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_accessor[1].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[1].get_float_ptr()); + break; } - for (int op = start + 1; op < fused->numOperators; op++) { - if (metas->meta[op] != NULL) { - assert(metas->meta[start]->handle.blas == - metas->meta[op]->handle.blas); - assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + assert(my_input_accessor[0].data_type == DT_INT64 || + my_input_accessor[0].data_type == DT_INT32); + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_grad_accessor[0].domain.hi()[0] - + my_output_grad_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_grad_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + 
my_input_accessor[0].domain.get_volume()); + } else { + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_grad_accessor[0].domain.hi()[0] - + my_output_grad_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_grad_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); } + Kernels::Embedding::backward_kernel_wrapper(m, + my_input_accessor[0], + my_output_grad_accessor[0], + my_weight_grad_accessor[0], + in_dim, + out_dim, + effective_batch_size); + break; } - - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - int ioff = 0, woff = 0, ooff = 0; - GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; - GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW my_weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; - // Do backpropagation in the reverse ordering - for (int op = 0; op < fused->numOperators; op++) { - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; } - - for (int op = fused->numOperators - 1; op >= 0; op--) { - ioff -= fused->op_num_inputs[op]; - woff -= fused->op_num_weights[op]; - ooff -= fused->op_num_outputs[op]; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - my_input_accessor[i] = input_accessor[my_off]; - my_input_grad_accessor[i] = input_grad_accessor[my_off]; - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - my_input_accessor[i] = output_accessor[my_off]; - my_input_grad_accessor[i] = output_grad_accessor[my_off]; - assert(my_input_grad_accessor[i].domain == - my_input_accessor[i].domain); - } else { - assert(false); - } + case OP_POOL2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // assert(my_input_accessor[0].domain == + // my_output_accessor[0].domain); + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr()); + break; + } + case OP_FLAT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + Kernels::Flat::backward_kernel_wrapper( + 
my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + break; + } + case OP_RESHAPE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + if (my_input_grad_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_int64_ptr(), + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].get_int32_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else { + assert(false); } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - my_weight_accessor[i] = - weight_accessor[fused->op_weight_idx[i + woff]]; - my_weight_grad_accessor[i] = - weight_grad_accessor[fused->op_weight_idx[i + woff]]; - assert(my_weight_grad_accessor[i].domain.get_volume() == - my_weight_accessor[i].domain.get_volume()); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false); } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = - output_accessor[fused->op_output_idx[i + ooff]]; - my_output_grad_accessor[i] = - output_grad_accessor[fused->op_output_idx[i + ooff]]; - assert(my_output_grad_accessor[i].domain == - my_output_accessor[i].domain); + break; + } + case OP_TRANSPOSE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain, + my_output_grad_accessor[0].domain); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; } - switch (fused->op_op_type[op]) { - case OP_CONCAT: { - 
assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - ConcatMeta *m = (ConcatMeta *)metas->meta[op]; - int num_inputs = fused->op_num_inputs[op]; - Kernels::Concat::backward_kernel_wrapper(m, - my_output_grad_accessor[0], - my_input_grad_accessor, - num_inputs, - m->legion_axis); - break; - } - case OP_CONV2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; - Kernels::Conv2D::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_grad_accessor[0].get_float_ptr(), - my_weight_grad_accessor[1].get_float_ptr()); - break; - } - case OP_BATCHNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 2); - assert(my_weight_accessor[1].domain.get_dim() == 2); - assert(my_output_accessor[0].domain.get_dim() == 5); - BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; - BatchNorm::backward_kernel( - m, - (float const *)my_input_accessor[0].get_float_ptr(), - (float *)my_output_grad_accessor[0].get_float_ptr(), - (float const *)my_output_accessor[0].get_float_ptr(), - (float *)my_input_grad_accessor[0].get_float_ptr(), - (float const *)my_weight_accessor[0].get_float_ptr(), - (float *)my_weight_grad_accessor[0].get_float_ptr(), - (float *)my_weight_grad_accessor[1].get_float_ptr(), - my_output_accessor[0].domain.get_volume()); - break; - } - case OP_DROPOUT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Kernels::Dropout::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr()); - break; - } - case OP_LINEAR: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - Domain kernel_domain = my_weight_accessor[0].domain; - int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; - int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; - int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; - assert(my_output_accessor[0].domain.get_volume() == - out_dim * batch_size); - assert(my_input_accessor[0].domain.get_volume() == - in_dim * batch_size); - float *bias_grad_ptr = nullptr; - if (fused->op_num_weights[op] == 2) { - assert(my_weight_accessor[1].domain.get_volume() == out_dim); - bias_grad_ptr = my_weight_grad_accessor[1].get_float_ptr(); - } else { - assert(fused->op_num_weights[op] == 1); - } - LinearMeta *m = (LinearMeta *)metas->meta[op]; - Kernels::Linear::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_grad_accessor[0].get_float_ptr(), - bias_grad_ptr, - in_dim, - out_dim, - batch_size); - break; - } - case OP_BATCHMATMUL: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - Domain out_domain = 
my_output_accessor[0].domain; - Domain a_domain = my_input_accessor[0].domain; - Domain b_domain = my_input_accessor[1].domain; - // check dims - int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; - assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); - int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; - assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); - int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; - assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); - assert(a_domain.get_dim() == b_domain.get_dim()); - assert(a_domain.get_dim() == out_domain.get_dim()); - int batch = 1; - for (int i = 2; i < a_domain.get_dim(); i++) { - int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; - assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); - assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); - batch *= dim_size; - } - BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; - Kernels::BatchMatmul::backward_kernel_wrapper( - meta, - (float const *)my_output_accessor[0].get_float_ptr(), - (float const *)my_output_grad_accessor[0].get_float_ptr(), - (float const *)my_input_accessor[0].get_float_ptr(), - (float *)my_input_grad_accessor[0].get_float_ptr(), - (float const *)my_input_accessor[1].get_float_ptr(), - (float *)my_input_grad_accessor[1].get_float_ptr(), - (float *)nullptr, - m, - n, - k, - batch); - break; - } - case OP_EW_ADD: - case OP_EW_SUB: - case OP_EW_MUL: - case OP_EW_DIV: - case OP_EW_MAX: - case OP_EW_MIN: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == - // my_input_accessor[1].domain); assert(my_input_accessor[0].domain - // == my_output_accessor[0].domain); - ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; - Kernels::ElementBinary::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_accessor[0].get_float_ptr(), - my_input_accessor[1].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[1].get_float_ptr()); - break; - } - case OP_EMBEDDING: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 1); - EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; - assert(my_input_accessor[0].data_type == DT_INT64 || - my_input_accessor[0].data_type == DT_INT32); - int in_dim, out_dim, effective_batch_size; - if (m->aggr == AGGR_MODE_NONE) { - in_dim = 1; - out_dim = my_output_grad_accessor[0].domain.hi()[0] - - my_output_grad_accessor[0].domain.lo()[0] + 1; - effective_batch_size = - my_output_grad_accessor[0].domain.get_volume() / out_dim; - assert(effective_batch_size * in_dim == - my_input_accessor[0].domain.get_volume()); - } else { - in_dim = my_input_accessor[0].domain.hi()[0] - - my_input_accessor[0].domain.lo()[0] + 1; - out_dim = my_output_grad_accessor[0].domain.hi()[0] - - my_output_grad_accessor[0].domain.lo()[0] + 1; - effective_batch_size = - my_output_grad_accessor[0].domain.get_volume() / out_dim; - assert(effective_batch_size * in_dim == - my_input_accessor[0].domain.get_volume()); - } - Kernels::Embedding::backward_kernel_wrapper( - m, - my_input_accessor[0], - my_output_grad_accessor[0], - my_weight_grad_accessor[0], - in_dim, - out_dim, - effective_batch_size); - break; - } - case OP_GELU: - case OP_RELU: - case OP_SIGMOID: - case OP_TANH: - case OP_ELU: - case OP_SCALAR_ADD: - case OP_SCALAR_MULTIPLY: - case OP_SCALAR_SUB: - case OP_SCALAR_TRUE_DIV: { - 
assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; - } - case OP_POOL2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == - // my_output_accessor[0].domain); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr()); - break; - } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain.get_volume() == - my_output_grad_accessor[0].domain.get_volume()); - Kernels::Flat::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); - break; - } - case OP_RESHAPE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain.get_volume() == - my_output_grad_accessor[0].domain.get_volume()); - if (my_input_grad_accessor[0].data_type == DT_INT64) { - Kernels::Reshape::backward_kernel_wrapper( - my_input_grad_accessor[0].get_int64_ptr(), - my_output_grad_accessor[0].get_int64_ptr(), - my_input_grad_accessor[0].domain.get_volume()); - } else if (my_input_grad_accessor[0].data_type == DT_INT32) { - Kernels::Reshape::forward_kernel_wrapper( - my_input_grad_accessor[0].get_int32_ptr(), - my_output_grad_accessor[0].get_int32_ptr(), - my_input_grad_accessor[0].domain.get_volume()); - } else if (my_input_grad_accessor[0].data_type == DT_FLOAT) { - Kernels::Reshape::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); - } else { - assert(false); - } - break; - } - case OP_SOFTMAX: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (my_input_accessor[0].data_type == DT_FLOAT) { - Kernels::Softmax::backward_kernel_wrapper( - m, - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - } else { - assert(false); - } - break; - } - case OP_TRANSPOSE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain.get_volume() == - my_output_grad_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::backward_kernel_wrapper( - m, - 
my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain, - my_output_grad_accessor[0].domain); - break; - } - case OP_LAYERNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - 2 * (int)(m->elementwise_affine)); - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - beta = my_weight_accessor[1]; - } - LayerNorm::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - gamma.get_float_ptr(), - my_weight_grad_accessor[0].get_float_ptr(), - my_weight_grad_accessor[1].get_float_ptr()); - break; - } - case OP_CAST: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - CastMeta const *m = (CastMeta *)metas->meta[op]; - if (m->input_data_type == DT_INT32 && - m->output_data_type == DT_INT64) { - Kernels::Cast::backward_kernel_wrapper( - my_output_grad_accessor[0].get_int64_ptr(), - my_input_grad_accessor[0].get_int32_ptr(), - my_output_grad_accessor[0].domain.get_volume()); - } else if (m->input_data_type == DT_INT32 && - m->output_data_type == DT_FLOAT) { - Kernels::Cast::backward_kernel_wrapper( - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_int32_ptr(), - my_output_grad_accessor[0].domain.get_volume()); - } else { - assert(false); - } - default: - assert(false && "Fusion currently does not support type"); - } + LayerNorm::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + gamma.get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else { + assert(false); } - assert(ioff == 0); - assert(woff == 0); - assert(ooff == 0); - // for (int i = 0; i < fused->numWeights; i++) - // print_tensor(weight_grad_ptr[i], - // weight_grad_domain[i].get_volume(), - // "[Fused:backward:weight_grad]"); - // for (int i = 0; i < fused->numInputs; i++) - // print_tensor(input_grad_ptr[i], - // input_grad_domain[i].get_volume(), - // "[Fused:backward:input_grad]"); - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_grad_ptr[i], - // output_grad_domain[i].get_volume(), - // "[Fused:backward:output_grad]"); + break; } + default: + assert(false && "Fusion currently does not support type"); + } + } + assert(ioff == 0); + assert(woff == 0); + assert(ooff == 0); + // for (int i = 0; i < fused->numWeights; i++) + // print_tensor(weight_grad_ptr[i], + // weight_grad_domain[i].get_volume(), + // "[Fused:backward:weight_grad]"); + // for (int 
i = 0; i < fused->numInputs; i++) + // print_tensor(input_grad_ptr[i], + // input_grad_domain[i].get_volume(), + // "[Fused:backward:input_grad]"); + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_grad_ptr[i], + // output_grad_domain[i].get_volume(), + // "[Fused:backward:output_grad]"); +} - }; // namespace FlexFlow +}; // namespace FlexFlow diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 75cf06b18b..8ea2ebba9a 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -142,8 +142,8 @@ template void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *in_ptr, T *out_ptr, - T *gamma_ptr, - T *beta_ptr, + T const *gamma_ptr, + T const *beta_ptr, hipStream_t stream) { hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), m->effective_batch_size, From 17a1c4e6bb6ae499f76c2fc49092e25dc2813993 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 6 Oct 2023 12:47:53 -0400 Subject: [PATCH 51/72] more fix to hip --- src/ops/fused.cpp | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 226184f1a2..cb2b81fbcf 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -13,11 +13,11 @@ * limitations under the License. */ +#include "flexflow/ops/fused.h" #include "flexflow/model.h" #include "flexflow/ops/batch_norm.h" #include "flexflow/ops/element_unary.h" #include "flexflow/ops/embedding.h" -#include "flexflow/ops/fused.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" @@ -365,7 +365,11 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -424,10 +428,24 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (my_input_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int64_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int32_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type"); + } break; } case OP_TRANSPOSE: { From f65044dcd63c91e0729a3c53a9f49cc3f5871278 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 3 Nov 2023 04:59:11 -0400 Subject: [PATCH 52/72] customized kernel for broadcasting add. 
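
The broadcast path handles OP_EW_ADD when the second operand does not carry the batch dimension, so its values are reused for every batch entry instead of going through miopenOpTensor. Below is a minimal, self-contained HIP sketch of a batch-broadcast add of this kind; the kernel name, tensor sizes, and single-stream launch are illustrative assumptions, not the operator's actual wiring through ElementBinaryMeta.

    #include <hip/hip_runtime.h>

    // Sketch only: out[b * rep + j] = a[b * rep + j] + c[j], where c holds one
    // value per non-batch element and is reused across the batch dimension.
    __global__ void broadcast_add_sketch(float const *a, float const *c,
                                         float *out, size_t volume,
                                         size_t replicate_size) {
      for (size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x; i < volume;
           i += (size_t)blockDim.x * gridDim.x) {
        out[i] = a[i] + c[i % replicate_size];
      }
    }

    int main() {
      size_t const batch = 4, rep = 8, volume = batch * rep;
      float *a, *c, *out;
      hipMalloc((void **)&a, volume * sizeof(float));
      hipMalloc((void **)&c, rep * sizeof(float));
      hipMalloc((void **)&out, volume * sizeof(float));
      hipMemset(a, 0, volume * sizeof(float));
      hipMemset(c, 0, rep * sizeof(float));
      hipLaunchKernelGGL(broadcast_add_sketch, dim3(1), dim3(256), 0, 0,
                         a, c, out, volume, rep);
      hipDeviceSynchronize();
      hipFree(a);
      hipFree(c);
      hipFree(out);
      return 0;
    }
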
--- include/flexflow/ops/element_binary.h | 1 + .../ops/kernels/element_binary_kernels.h | 2 ++ src/ops/element_binary.cc | 9 ++++++ src/ops/kernels/element_binary_kernels.cpp | 32 ++++++++++++++++--- 4 files changed, 40 insertions(+), 4 deletions(-) diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index cfacec50f7..f83ce20609 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -58,6 +58,7 @@ class ElementBinary : public Op { public: bool inplace_a, has_same_operands; bool broadcast_input1, broadcast_input2; + int batch_size; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/element_binary_kernels.h b/include/flexflow/ops/kernels/element_binary_kernels.h index 529859195e..50c7f2b80c 100644 --- a/include/flexflow/ops/kernels/element_binary_kernels.h +++ b/include/flexflow/ops/kernels/element_binary_kernels.h @@ -22,6 +22,8 @@ class ElementBinaryMeta : public OpMeta { OperatorType op_type; bool inplace_a, has_same_operands; bool broadcast_input1, broadcast_input2; + int batch_size; + size_t replicate_size; char op_name[MAX_OPNAME]; }; diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 0e0d8e7c31..12895cfd98 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -211,6 +211,9 @@ ElementBinary::ElementBinary(FFModel &model, numdim, dims, in1->data_type, this); broadcast_input1 = (inputs[0]->get_volume() != outputs[0]->get_volume()); broadcast_input2 = (inputs[1]->get_volume() != outputs[0]->get_volume()); + + batch_size = dims[numdim - 2].size; + } ElementBinary::ElementBinary( @@ -337,6 +340,8 @@ OpMeta *ElementBinary::init_task(Task const *task, m->has_same_operands = eb->has_same_operands; m->broadcast_input1 = eb->broadcast_input1; m->broadcast_input2 = eb->broadcast_input2; + m->batch_size = eb->batch_size; + std::strcpy(m->op_name, eb->name); Domain input1_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); @@ -368,6 +373,10 @@ OpMeta *ElementBinary::init_task(Task const *task, } else { output_domain = input1_domain; } + m->replicate_size = m->broadcast_input1 + ? 
(input1_domain.get_volume() / m->batch_size) + : (input2_domain.get_volume() / m->batch_size); + assert(task->regions.size() == regions.size()); assert(regions.size() == num_regions); init_kernel(m, input1_domain, input2_domain, output_domain); diff --git a/src/ops/kernels/element_binary_kernels.cpp b/src/ops/kernels/element_binary_kernels.cpp index 4cdc839b59..325edba6d0 100644 --- a/src/ops/kernels/element_binary_kernels.cpp +++ b/src/ops/kernels/element_binary_kernels.cpp @@ -72,15 +72,12 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, float *out_ptr) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - hipEvent_t t_start, t_end; if (m->profiling) { hipEventCreate(&t_start); hipEventCreate(&t_end); hipEventRecord(t_start, stream); } - // print_tensor(in1_ptr, in1_domain.get_volume(), "input1:"); - // print_tensor(in2_ptr, in2_domain.get_volume(), "input2:"); Internal::forward_kernel(m, in1_ptr, in2_ptr, out_ptr, stream); // print_tensor(out_ptr, in1_domain.get_volume(), "output:"); if (m->profiling) { @@ -199,6 +196,21 @@ __global__ void elewise_binary_forward_kernel(coord_t volume, } } +// for simplicity, assume the replicate dimension is the batchsize +__global__ void + elewise_binary_forward_kernel_broadcast2(float const *in1_ptr, + float const *in2_ptr, + float *output_ptr, + size_t volume, + size_t batch_size, + size_t replicate_size) { + CUDA_KERNEL_LOOP(i, volume) { + size_t batch = i / replicate_size; + output_ptr[i] = + in1_ptr[i] + in2_ptr[batch * replicate_size + i % replicate_size]; + } +} + __global__ void elewise_binary_backward_kernel(coord_t volume, float const alpha, float const beta, @@ -245,7 +257,6 @@ void forward_kernel(ElementBinaryMeta const *m, hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - float alpha1 = 1.0f, alpha2 = 1.0f, beta = 0.0f; switch (m->op_type) { case OP_EW_SUB: @@ -284,6 +295,19 @@ void forward_kernel(ElementBinaryMeta const *m, &alpha1, m->outputTensor, out_ptr)); + } else if (m->op_type == OP_EW_ADD && m->broadcast_input2) { + int parallelism = m->batch_size * m->replicate_size; + hipLaunchKernelGGL(elewise_binary_forward_kernel_broadcast2, + GET_BLOCKS(parallelism), + CUDA_NUM_THREADS, + 0, + stream, + in1_ptr, + in2_ptr, + out_ptr, + m->batch_size * m->replicate_size, + m->batch_size, + m->replicate_size); } else { checkCUDNN(miopenOpTensor(m->handle.dnn, m->opDesc, From bcab56ab4aea151647b3e157779c79e82bdb530c Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 20 Dec 2023 02:28:36 -0500 Subject: [PATCH 53/72] dropout --- include/flexflow/ops/dropout.h | 7 + .../flexflow/ops/kernels/dropout_kernels.h | 16 ++- src/ops/dropout.cc | 38 ++++-- src/ops/kernels/dropout_kernels.cpp | 126 ++++++++++++++---- src/ops/kernels/dropout_kernels.cu | 12 +- 5 files changed, 151 insertions(+), 48 deletions(-) diff --git a/include/flexflow/ops/dropout.h b/include/flexflow/ops/dropout.h index 37304bdada..b8033c98ba 100644 --- a/include/flexflow/ops/dropout.h +++ b/include/flexflow/ops/dropout.h @@ -5,6 +5,13 @@ #include "flexflow/node.h" #include "flexflow/operator.h" #include "flexflow/ops/dropout_params.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include +#include +#elif defined(FF_USE_HIP_ROCM) +#include +#include +#endif namespace FlexFlow { diff --git a/include/flexflow/ops/kernels/dropout_kernels.h b/include/flexflow/ops/kernels/dropout_kernels.h index 421974fbaa..b2201dd34e 100644 --- a/include/flexflow/ops/kernels/dropout_kernels.h +++ 
b/include/flexflow/ops/kernels/dropout_kernels.h @@ -5,6 +5,7 @@ #include "flexflow/fftype.h" #include "flexflow/op_meta.h" #include "flexflow/ops/dropout.h" +#include "flexflow/accessor.h" namespace FlexFlow { @@ -17,33 +18,40 @@ class DropoutMeta : public OpMeta { ~DropoutMeta(void); Realm::RegionInstance reserveInst; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + curandState *state; cudnnTensorDescriptor_t inputTensor, outputTensor; cudnnDropoutDescriptor_t dropoutDesc; #else miopenTensorDescriptor_t inputTensor, outputTensor; miopenDropoutDescriptor_t dropoutDesc; + hiprandState *state; #endif void *reserveSpace, *dropoutStates; size_t reserveSpaceSize, dropoutStateSize; + size_t num_elements; + long long seed; + float rate; }; namespace Kernels { namespace Dropout { void forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr); + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); void backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); namespace Internal { void forward_kernel(DropoutMeta *m, float const *input_ptr, float *output_ptr, + size_t num_elements, ffStream_t stream); void backward_kernel(DropoutMeta *m, float const *output_grad_ptr, float *input_grad_ptr, + size_t num_elements, ffStream_t stream); } // namespace Internal } // namespace Dropout diff --git a/src/ops/dropout.cc b/src/ops/dropout.cc index 55f6730827..2ebfaff539 100644 --- a/src/ops/dropout.cc +++ b/src/ops/dropout.cc @@ -28,7 +28,7 @@ using PCG::Node; using namespace FlexFlow::Kernels::Dropout; -Tensor FFModel::dropout(const Tensor input, +Tensor FFModel::dropout(Tensor const input, float rate, unsigned long long seed, char const *name) { @@ -86,7 +86,7 @@ bool operator==(DropoutParams const &lhs, DropoutParams const &rhs) { } Dropout::Dropout(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, float _rate, unsigned long long _seed, char const *name) @@ -111,12 +111,12 @@ Dropout::Dropout(FFModel &model, Dropout::Dropout(FFModel &model, Dropout const &other, - const ParallelTensor input) + ParallelTensor const input) : Dropout(model, input, other.rate, other.seed, other.name) {} Dropout::Dropout(FFModel &model, DropoutParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) : Dropout(model, input, params.rate, params.seed, name) {} @@ -210,12 +210,12 @@ void Dropout::forward_task(Task const *task, assert(task->regions.size() == 2); // const Dropout* dropout = (const Dropout*) task->args; DropoutMeta *m = *((DropoutMeta **)task->local_args); - float const *input_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *output_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - - forward_kernel_wrapper(m, input_ptr, output_ptr); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + forward_kernel_wrapper(m, input, output); } void Dropout::backward(FFModel const &ff) { @@ -264,7 +264,13 @@ void Dropout::backward_task(Task const *task, float const *output_grad_ptr = helperGetTensorPointerRO( regions[1], task->regions[1], FID_DATA, ctx, runtime); - 
backward_kernel_wrapper(m, output_grad_ptr, input_grad_ptr); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + backward_kernel_wrapper(m, output_grad, input_grad); } void Dropout::serialize(Legion::Serializer &sez) const { @@ -304,30 +310,36 @@ bool Dropout::measure_operator_cost(Simulator *sim, sim->free_all(); float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); assert(input_ptr != NULL); + + GenericTensorAccessorR input_acc(m->input_type[0], sub_input.get_domain(), input_ptr); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_ptr != NULL); + + GenericTensorAccessorW output_acc(m->output_type[0], sub_input.get_domain(), output_ptr); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); assert(m->profiling == false); std::function forward, backward; - forward = [&] { forward_kernel_wrapper(m, input_ptr, output_ptr); }; + forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); }; if (sim->computationMode == COMP_MODE_TRAINING) { float *input_grad_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); assert(input_grad_ptr != NULL); + GenericTensorAccessorW input_grad_acc(m->output_type[0], sub_input.get_domain(), input_grad_ptr); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_grad_ptr != NULL); + GenericTensorAccessorR output_grad_acc(m->output_type[0], sub_input.get_domain(), output_grad_ptr); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper(m, output_grad_ptr, input_grad_ptr); + backward_kernel_wrapper(m, output_grad_acc, input_grad_acc); }; } diff --git a/src/ops/kernels/dropout_kernels.cpp b/src/ops/kernels/dropout_kernels.cpp index b0dd4c644e..c0d5748464 100644 --- a/src/ops/kernels/dropout_kernels.cpp +++ b/src/ops/kernels/dropout_kernels.cpp @@ -30,6 +30,11 @@ DropoutMeta::DropoutMeta(FFHandler handler, Domain const &output_domain) : OpMeta(handler) { profiling = dropout->profiling; + rate = dropout->rate; + seed = dropout->seed; + input_type[0] = dropout->data_type; + output_type[0] = dropout->data_type; + checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreateDropoutDescriptor(&dropoutDesc)); @@ -78,20 +83,68 @@ DropoutMeta::~DropoutMeta(void) { namespace Kernels { namespace Dropout { +__global__ void dropout_forward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, i, 0, &state); + float rand = hiprand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + +__global__ void dropout_backward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, i, 0, &state); + 
float rand = hiprand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + void forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + + Internal::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + input.domain.get_volume(), + stream); + + // printf("dropout %d\n", input.domain.get_volume()); + // assert(false); } void backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::backward_kernel(m, output_grad_ptr, input_grad_ptr, stream); + Internal::backward_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); } namespace Internal { @@ -99,35 +152,58 @@ namespace Internal { void forward_kernel(DropoutMeta *m, float const *input_ptr, float *output_ptr, + size_t num_elements, hipStream_t stream) { checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + int parallelism = num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(dropout_forward_kernel), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->seed, + m->rate, + num_elements, + input_ptr, + output_ptr); - checkCUDNN(miopenDropoutForward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor /* not used */, - m->inputTensor, - input_ptr, - m->outputTensor, - output_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + // checkCUDNN(miopenDropoutForward(m->handle.dnn, + // m->dropoutDesc, + // m->inputTensor /* not used */, + // m->inputTensor, + // input_ptr, + // m->outputTensor, + // output_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } void backward_kernel(DropoutMeta *m, float const *output_grad_ptr, float *input_grad_ptr, + size_t num_elements, hipStream_t stream) { checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - - checkCUDNN(miopenDropoutBackward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor /* not used */, - m->outputTensor, - output_grad_ptr, - m->inputTensor, - input_grad_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + int parallelism = num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(dropout_backward_kernel), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->seed, + m->rate, + num_elements, + output_grad_ptr, + input_grad_ptr); + // checkCUDNN(miopenDropoutBackward(m->handle.dnn, + // m->dropoutDesc, + // m->inputTensor /* not used */, + // m->outputTensor, + // output_grad_ptr, + // m->inputTensor, + // input_grad_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } } // namespace Internal diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index 4a76301fd6..39037cb37f 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -75,19 +75,19 @@ namespace Kernels { namespace Dropout { void forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + 
Internal::forward_kernel(m, input.get_float_ptr(), output.get_float_ptr(), stream); } void backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::backward_kernel(m, output_grad_ptr, input_grad_ptr, stream); + Internal::backward_kernel(m, output_grad.get_float_ptr(), input_grad.get_float_ptr(), stream); } namespace Internal { From fa1fffcdc70425b099393a56bbf39c5c6a30e198 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 20 Dec 2023 16:37:54 -0500 Subject: [PATCH 54/72] optimizer --- include/flexflow/flexflow_c.h | 2 + include/flexflow/model.h | 2 + include/flexflow/optimizer.h | 23 ++++ python/flexflow/core/flexflow_cffi.py | 7 ++ src/c/flexflow_c.cc | 5 + src/runtime/model.cc | 19 +++ src/runtime/optimizer.cc | 167 ++++++++++++++++++++++++++ src/runtime/optimizer_kernel.cpp | 64 ++++++++++ 8 files changed, 289 insertions(+) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index bde40de31b..2ddc8549fa 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -95,6 +95,8 @@ void flexflow_model_compute_metrics(flexflow_model_t handle); void flexflow_model_update(flexflow_model_t handle); +void flexflow_model_unified_update(flexflow_model_t handle); + void flexflow_model_compile(flexflow_model_t handle, enum LossType loss_type, int *metrics, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 5e7ad8025c..82a29c8182 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -151,6 +151,7 @@ enum TaskIDs { // Optimizer with NCCL SGD_UPD_NCCL_TASK_ID, ADAM_UPD_NCCL_TASK_ID, + ADAM_UNIFY_UPD_NCCL_TASK_ID, // Initializer GLOROT_INIT_TASK_ID, ZERO_INIT_TASK_ID, @@ -777,6 +778,7 @@ class FFModel { void get_metrics(); void backward(int seq_length = -1); void update(); + void unified_update(); bool apply_fusion(std::vector const &operators, std::vector &new_operators); Op *get_final_operator() const; diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h index bab7e6e4ed..1101a35ae5 100644 --- a/include/flexflow/optimizer.h +++ b/include/flexflow/optimizer.h @@ -30,6 +30,7 @@ class Optimizer { virtual void init(void) = 0; virtual void next(void) = 0; virtual void update(const ParallelTensor p) = 0; + virtual void unified_update(std::vector parameters) = 0; FFModel const *model; }; @@ -43,6 +44,7 @@ class SGDOptimizer : public Optimizer { void init(void); void next(void); void update(const ParallelTensor p); + void unified_update(std::vector parameters); void set_weight_decay(double _weight_decay); static void ps_update_task(Legion::Task const *task, std::vector const ®ions, @@ -60,6 +62,11 @@ class SGDOptimizer : public Optimizer { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + nccl_unified_update_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void nccl_update_task_gpu(SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, @@ -85,6 +92,7 @@ class AdamOptimizer : public Optimizer { void init(void); void next(void); void update(const ParallelTensor p); + void unified_update(std::vector parameters); void set_weight_decay(double _weight_decay); static void ps_update_task(Legion::Task const *task, std::vector const ®ions, @@ -103,6 +111,11 @@ class AdamOptimizer : public Optimizer { 
std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + nccl_unified_update_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void nccl_update_task_gpu(AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, @@ -110,10 +123,20 @@ class AdamOptimizer : public Optimizer { float *w_ptr, float *v_ptr, float *m_ptr); + static void nccl_unified_update_task_gpu(AdamOptimizer const *op, + OpMeta const *meta, + float const *w_grad_ptr[], + size_t *size, + float *w_ptr[], + float *v_ptr[], + float *m_ptr[]); #endif double alpha, beta1, beta2, weight_decay, epsilon; double alpha_t, beta1_t, beta2_t; std::map v_values, m_values; + size_t reservedWorkSpaceSize; + int parameters_num; + int processed_parameters_num; }; }; // namespace FlexFlow diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index c05fb96661..49ad5b3dc8 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -2018,6 +2018,13 @@ def update(self): :returns: None -- no returns. """ ffc.flexflow_model_update(self.handle) + + def unified_update(self): + """Update weights and biases of all layers. + + :returns: None -- no returns. + """ + ffc.flexflow_model_unified_update(self.handle) def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): """Configure the model for trainting. FlexFlow uses lazy initialization, diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index e3c430fd92..1d8634f224 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -178,6 +178,11 @@ void flexflow_model_update(flexflow_model_t handle_) { handle->update(); } +void flexflow_model_unified_update(flexflow_model_t handle_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + handle->unified_update(); +} + void flexflow_model_compile(flexflow_model_t handle_, enum LossType loss_type, int *metrics, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index c52d5e7f0c..b16766291a 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2474,6 +2474,10 @@ void FFModel::update() { } } +void FFModel::unified_update() { + optimizer->unified_update(parameters); +} + Op *FFModel::get_final_operator() const { int idx = operators.size() - 1; while (operators[idx]->op_type == OP_INPUT || @@ -5302,6 +5306,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ADAM_UNIFY_UPD_NCCL_TASK_ID, "Adam unified NCCL Update"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Adam unified NCCL Update Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } #endif // Initializer { diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index c42a0c9aa6..42e227f022 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -333,6 +333,7 @@ void AdamOptimizer::init(void) { Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; Initializer *initializer = new ZeroInitializer(); + reservedWorkSpaceSize = 0; for (size_t i = 0; i < model->parameters.size(); i++) { ParallelTensor p = model->parameters[i]; Domain domain = @@ -381,6 +382,8 @@ void AdamOptimizer::update(const ParallelTensor p) { assert(v_values.find(p->region) != 
v_values.end()); assert(m_values.find(p->region) != m_values.end()); assert(p->owner_op != NULL); + reservedWorkSpaceSize += p->get_volume() * sizeof(float); + printf("update workspace: %d", reservedWorkSpaceSize); if (p->sync_type == ParameterSyncType::PS) { TaskLauncher launcher(ADAM_UPD_PS_TASK_ID, TaskArgument(this, sizeof(AdamOptimizer)), @@ -492,6 +495,105 @@ void AdamOptimizer::update(const ParallelTensor p) { } } +void AdamOptimizer::unified_update(std::vector parameters) { + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; + + + printf("update workspace: %d", reservedWorkSpaceSize); + + const ParallelTensor p0 = parameters.at(0); + ArgumentMap argmap; + Domain domain = runtime->get_index_space_domain(ctx, p0->parallel_is); + switch (domain.get_dim()) { +#define DIMFUNC(DIM) \ + case DIM: { \ + Rect rect = domain; \ + int idx = 0; \ + for (PointInRectIterator it(rect); it(); it++) { \ + OpMeta *mp = p0->owner_op->meta[idx++]; \ + argmap.set_point(*it, TaskArgument(&mp, sizeof(OpMeta *))); \ + } \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: + assert(false); + } + + int offset = 0; + + while(parameters_num < parameters.size()){ + for(int i = 0; i < parameters.size(); i++){ + const ParallelTensor p = parameters.at(i); + assert(v_values.find(p->region) != v_values.end()); + assert(m_values.find(p->region) != m_values.end()); + assert(p->owner_op != NULL); + if(reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= model->handlers->workSpaceSize){ + break; + } + reservedWorkSpaceSize += p->get_volume() * sizeof(float); + parameters_num += 1; + assert (p->sync_type == ParameterSyncType::NCCL); + assert(p->parallel_is != IndexSpace::NO_SPACE); + } + assert(parameters_num <= parameters.size()); + + //launch a unified task + for(int j = 0; j < parameters_num; j++){ + this->next(); + const ParallelTensor p = parameters.at(processed_parameters_num + j); + IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, + p0->parallel_is, + TaskArgument(this, sizeof(AdamOptimizer)), + argmap, + Predicate::TRUE_PRED, + false /*must_epoch*/, + 0 /*mapper_id*/, + p0->machine_view.hash()); + // regions[0]: region_grad + launcher.add_region_requirement(RegionRequirement(p->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + p->region_grad)); + launcher.add_field(offset, FID_DATA); + // regions[1]: region + launcher.add_region_requirement(RegionRequirement( + p->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, p->region)); + launcher.add_field(offset + 1, FID_DATA); + // regions[2]: w_region + launcher.add_region_requirement( + RegionRequirement(v_values[p->region]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + v_values[p->region]->region)); + launcher.add_field(offset + 2, FID_DATA); + // regions[3]: m_region + launcher.add_region_requirement( + RegionRequirement(m_values[p->region]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + m_values[p->region]->region)); + launcher.add_field(offset + 3, FID_DATA); + offset += 4; + launcher.concurrent = true; + FutureMap fm = runtime->execute_index_space(ctx, launcher); + // runtime->execute_must_epoch(ctx, must_epoch_launcher); + runtime->issue_execution_fence(ctx); + reservedWorkSpaceSize = 0; + offset = 0; + processed_parameters_num += parameters_num; + } + printf("offset: %d\n", offset); + + } + +} + void AdamOptimizer::ps_update_task(Task const *task, std::vector const ®ions, Context ctx, @@ -605,6 +707,71 @@ void AdamOptimizer::nccl_update_task(Task const *task, 
nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); } + + +void AdamOptimizer::nccl_unified_update_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // assert(regions.size() == 4); + // assert(task->regions.size() == 4); + AdamOptimizer const *op = (AdamOptimizer *)task->args; + OpMeta const *meta = *((OpMeta **)task->local_args); + // FFHandler handler = *((FFHandler*) task->local_args); + Domain domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + + printf("parameters_num: %d\n", op->parameters_num); + float const *w_grad_ptr = NULL; + float *w_ptr = NULL, *v_ptr = NULL, *m_ptr = NULL; + size_t size = 0; + + for(int i = 0; i < op->parameters_num; i++){ + + switch (domain.get_dim()) { +#define DIMFUNC(DIM) \ + case DIM: { \ + TensorAccessorR accWGrad( \ + regions[0], task->regions[0], FID_DATA, ctx, runtime); \ + TensorAccessorW accW(regions[1], \ + task->regions[1], \ + FID_DATA, \ + ctx, \ + runtime, \ + true /*readOutput*/); \ + TensorAccessorW accV(regions[2], \ + task->regions[2], \ + FID_DATA, \ + ctx, \ + runtime, \ + true /*readOutput*/); \ + TensorAccessorW accM(regions[3], \ + task->regions[3], \ + FID_DATA, \ + ctx, \ + runtime, \ + true /*readOutput*/); \ + size = accW.rect.volume(); \ + assert(accWGrad.rect == accW.rect); \ + assert(accWGrad.rect == accV.rect); \ + assert(accWGrad.rect == accM.rect); \ + w_grad_ptr = accWGrad.ptr; \ + w_ptr = accW.ptr; \ + v_ptr = accV.ptr; \ + m_ptr = accM.ptr; \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: { + // Unsupported dims + assert(false); + } + } + } + + nccl_unified_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); +} #endif }; // namespace FlexFlow diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index e71adc87a8..039b27c654 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -204,6 +204,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, m_ptr, v_ptr, w_ptr); + // checkCUDA(hipDeviceSynchronize()); } @@ -245,6 +246,69 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, w_ptr); // checkCUDA(hipDeviceSynchronize()); } + +__host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *op, + OpMeta const *meta, + float const *w_grad_ptr[], + size_t *size, + float *w_ptr[], + float *v_ptr[], + float *m_ptr[]) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); + + void *workSpace_ptr = static_cast(meta->handle.workSpace); + + for(int i = 0; i < op->parameters_num; i++){ + checkCUDA(hipMemcpyAsync(workSpace_ptr, + w_grad_ptr[i], + size[i] * sizeof(float), + hipMemcpyDeviceToDevice, + stream)); + workSpace_ptr += size[i] * sizeof(float); + } + + //do allreduce once + checkNCCL(ncclAllReduce(meta->handle.workSpace, + (float *)meta->handle.workSpace, + op->reservedWorkSpaceSize / sizeof(float), + ncclFloat, + ncclSum, + meta->handle.ncclComm, + stream)); + + workSpace_ptr = static_cast(meta->handle.workSpace); + float alpha_t = op->alpha_t; + float beta1_t = op->beta1_t; + float beta2_t = op->beta2_t; + for(int i = 0; i < op->parameters_num; i++){ + // update + hipLaunchKernelGGL(HIP_KERNEL_NAME(adam_update), + GET_BLOCKS(size), + CUDA_NUM_THREADS, + 0, + stream, + size[i], + alpha_t, + op->beta1, + op->beta2, + op->weight_decay, + op->epsilon, + static_cast(workSpace_ptr), + m_ptr[i], + 
v_ptr[i], + w_ptr[i]); + workSpace_ptr += size[i] * sizeof(float); + + //update + beta1_t *= beta1; + beta2_t *= beta2; + alpha_t = alpha * sqrt(1 - beta2_t) / (1 - beta1_t); + } + + // checkCUDA(hipDeviceSynchronize()); +} #endif }; // namespace FlexFlow \ No newline at end of file From 40d830c8b9c67e66f25bfe17ecc6f407ef179865 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 20 Dec 2023 19:09:18 -0500 Subject: [PATCH 55/72] opt --- include/flexflow/optimizer.h | 12 ++-- src/runtime/optimizer.cc | 111 +++++++++++++------------------ src/runtime/optimizer_kernel.cpp | 33 +++++---- 3 files changed, 72 insertions(+), 84 deletions(-) diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h index 1101a35ae5..201408868b 100644 --- a/include/flexflow/optimizer.h +++ b/include/flexflow/optimizer.h @@ -30,7 +30,7 @@ class Optimizer { virtual void init(void) = 0; virtual void next(void) = 0; virtual void update(const ParallelTensor p) = 0; - virtual void unified_update(std::vector parameters) = 0; + virtual void unified_update(std::vector const parameters) = 0; FFModel const *model; }; @@ -44,7 +44,7 @@ class SGDOptimizer : public Optimizer { void init(void); void next(void); void update(const ParallelTensor p); - void unified_update(std::vector parameters); + void unified_update(std::vector const parameters); void set_weight_decay(double _weight_decay); static void ps_update_task(Legion::Task const *task, std::vector const ®ions, @@ -92,7 +92,7 @@ class AdamOptimizer : public Optimizer { void init(void); void next(void); void update(const ParallelTensor p); - void unified_update(std::vector parameters); + void unified_update(std::vector const parameters); void set_weight_decay(double _weight_decay); static void ps_update_task(Legion::Task const *task, std::vector const ®ions, @@ -134,9 +134,9 @@ class AdamOptimizer : public Optimizer { double alpha, beta1, beta2, weight_decay, epsilon; double alpha_t, beta1_t, beta2_t; std::map v_values, m_values; - size_t reservedWorkSpaceSize; - int parameters_num; - int processed_parameters_num; + size_t reservedWorkSpaceSize = 0; + int parameters_num = 0; + int processed_parameters_num = 0; }; }; // namespace FlexFlow diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index 42e227f022..fc30ae12cc 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -383,7 +383,6 @@ void AdamOptimizer::update(const ParallelTensor p) { assert(m_values.find(p->region) != m_values.end()); assert(p->owner_op != NULL); reservedWorkSpaceSize += p->get_volume() * sizeof(float); - printf("update workspace: %d", reservedWorkSpaceSize); if (p->sync_type == ParameterSyncType::PS) { TaskLauncher launcher(ADAM_UPD_PS_TASK_ID, TaskArgument(this, sizeof(AdamOptimizer)), @@ -495,13 +494,13 @@ void AdamOptimizer::update(const ParallelTensor p) { } } -void AdamOptimizer::unified_update(std::vector parameters) { +void SGDOptimizer::unified_update(std::vector const parameters) { + //todo +} + +void AdamOptimizer::unified_update(std::vector const parameters) { Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; - - - printf("update workspace: %d", reservedWorkSpaceSize); - const ParallelTensor p0 = parameters.at(0); ArgumentMap argmap; Domain domain = runtime->get_index_space_domain(ctx, p0->parallel_is); @@ -523,8 +522,10 @@ void AdamOptimizer::unified_update(std::vector parameters) { } int offset = 0; + printf("param size: %d, %d\n", parameters.size(), parameters_num); - while(parameters_num < parameters.size()){ + 
while(processed_parameters_num < parameters.size()){ + for(int i = 0; i < parameters.size(); i++){ const ParallelTensor p = parameters.at(i); assert(v_values.find(p->region) != v_values.end()); @@ -538,13 +539,11 @@ void AdamOptimizer::unified_update(std::vector parameters) { assert (p->sync_type == ParameterSyncType::NCCL); assert(p->parallel_is != IndexSpace::NO_SPACE); } + + printf("parameters_num: %d %d, %d\n", parameters_num, reservedWorkSpaceSize, model->handlers->workSpaceSize); assert(parameters_num <= parameters.size()); - //launch a unified task - for(int j = 0; j < parameters_num; j++){ - this->next(); - const ParallelTensor p = parameters.at(processed_parameters_num + j); - IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, + IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, p0->parallel_is, TaskArgument(this, sizeof(AdamOptimizer)), argmap, @@ -552,6 +551,10 @@ void AdamOptimizer::unified_update(std::vector parameters) { false /*must_epoch*/, 0 /*mapper_id*/, p0->machine_view.hash()); + //launch a unified task + for(int j = 0; j < parameters_num; j++){ + const ParallelTensor p = parameters.at(processed_parameters_num + j); + // regions[0]: region_grad launcher.add_region_requirement(RegionRequirement(p->part_grad, 0 /*projection id*/, @@ -580,16 +583,19 @@ void AdamOptimizer::unified_update(std::vector parameters) { m_values[p->region]->region)); launcher.add_field(offset + 3, FID_DATA); offset += 4; - launcher.concurrent = true; - FutureMap fm = runtime->execute_index_space(ctx, launcher); - // runtime->execute_must_epoch(ctx, must_epoch_launcher); - runtime->issue_execution_fence(ctx); - reservedWorkSpaceSize = 0; - offset = 0; - processed_parameters_num += parameters_num; } - printf("offset: %d\n", offset); + //update alpha, beta + for(int i = 0; i < parameters_num; i++){ + this->next(); + } + launcher.concurrent = true; + FutureMap fm = runtime->execute_index_space(ctx, launcher); + // runtime->execute_must_epoch(ctx, must_epoch_launcher); + runtime->issue_execution_fence(ctx); + reservedWorkSpaceSize = 0; + offset = 0; + processed_parameters_num += parameters_num; } } @@ -721,55 +727,28 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task, Domain domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - printf("parameters_num: %d\n", op->parameters_num); - float const *w_grad_ptr = NULL; - float *w_ptr = NULL, *v_ptr = NULL, *m_ptr = NULL; - size_t size = 0; + float const *w_grad_ptr[op->parameters_num]; + float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], *m_ptr[op->parameters_num]; + size_t size[op->parameters_num]; + int offset = 0; + printf("parameters_num: %d\n", op->parameters_num); + for(int i = 0; i < op->parameters_num; i++){ - - switch (domain.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - TensorAccessorR accWGrad( \ - regions[0], task->regions[0], FID_DATA, ctx, runtime); \ - TensorAccessorW accW(regions[1], \ - task->regions[1], \ - FID_DATA, \ - ctx, \ - runtime, \ - true /*readOutput*/); \ - TensorAccessorW accV(regions[2], \ - task->regions[2], \ - FID_DATA, \ - ctx, \ - runtime, \ - true /*readOutput*/); \ - TensorAccessorW accM(regions[3], \ - task->regions[3], \ - FID_DATA, \ - ctx, \ - runtime, \ - true /*readOutput*/); \ - size = accW.rect.volume(); \ - assert(accWGrad.rect == accW.rect); \ - assert(accWGrad.rect == accV.rect); \ - assert(accWGrad.rect == accM.rect); \ - w_grad_ptr = accWGrad.ptr; \ - w_ptr = accW.ptr; \ - v_ptr = accV.ptr; \ - m_ptr = accM.ptr; \ - break; \ + 
GenericTensorAccessorR accWGrad = helperGetGenericTensorAccessorRO(DataType::DT_FLOAT, regions[offset], task->regions[offset], FID_DATA, ctx, runtime); + GenericTensorAccessorW accW = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+1], task->regions[offset+1], FID_DATA, ctx, runtime); + GenericTensorAccessorW accV = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+2], task->regions[offset+2], FID_DATA, ctx, runtime); + GenericTensorAccessorW accM = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+3], task->regions[offset+3], FID_DATA, ctx, runtime); + offset += 4; + + size[i] = accW.domain.get_volume(); + // assert(accWGrad.rect == accW.rect); + // assert(accWGrad.rect == accV.rect); + // assert(accWGrad.rect == accM.rect); + w_ptr[i] = accW.get_float_ptr(); + v_ptr[i] = accV.get_float_ptr(); + m_ptr[i] = accM.get_float_ptr(); } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: { - // Unsupported dims - assert(false); - } - } - } - nccl_unified_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); } #endif diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index 039b27c654..5e7e613bc7 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -254,19 +254,27 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *o float *w_ptr[], float *v_ptr[], float *m_ptr[]) { + hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); - void *workSpace_ptr = static_cast(meta->handle.workSpace); + void *workSpace_ptr = meta->handle.workSpace; for(int i = 0; i < op->parameters_num; i++){ - checkCUDA(hipMemcpyAsync(workSpace_ptr, - w_grad_ptr[i], - size[i] * sizeof(float), - hipMemcpyDeviceToDevice, - stream)); - workSpace_ptr += size[i] * sizeof(float); + // hipMemcpyAsync(static_cast(workSpace_ptr), + // w_grad_ptr[i], + // size[i] * sizeof(float), + // hipMemcpyDeviceToDevice, + // stream); + // hipError_t error = hipGetLastError(); + // if(error != hipSuccess) + // { + // // print the CUDA error message and exit + // printf("CUDA error: %s\n", hipGetErrorString(error)); + // } + + workSpace_ptr = static_cast(workSpace_ptr) + size[i] * sizeof(float); } //do allreduce once @@ -284,8 +292,9 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *o float beta2_t = op->beta2_t; for(int i = 0; i < op->parameters_num; i++){ // update + std::cout<<"update"<<"\n"; hipLaunchKernelGGL(HIP_KERNEL_NAME(adam_update), - GET_BLOCKS(size), + GET_BLOCKS(size[i]), CUDA_NUM_THREADS, 0, stream, @@ -299,12 +308,12 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *o m_ptr[i], v_ptr[i], w_ptr[i]); - workSpace_ptr += size[i] * sizeof(float); + workSpace_ptr = static_cast(workSpace_ptr) + size[i] * sizeof(float); //update - beta1_t *= beta1; - beta2_t *= beta2; - alpha_t = alpha * sqrt(1 - beta2_t) / (1 - beta1_t); + beta1_t *= op->beta1; + beta2_t *= op->beta2; + alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t); } // checkCUDA(hipDeviceSynchronize()); From e8255265235c041e477119fc68015c0213c90d77 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 20 Dec 2023 20:46:51 -0500 Subject: [PATCH 56/72] fix --- src/runtime/optimizer.cc | 14 ++++++++++---- src/runtime/optimizer_kernel.cpp | 19 ++++++------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index 
fc30ae12cc..cf23cc39a5 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -522,7 +522,7 @@ void AdamOptimizer::unified_update(std::vector const parameters) } int offset = 0; - printf("param size: %d, %d\n", parameters.size(), parameters_num); + // printf("param size: %d, %d\n", parameters.size(), parameters_num); while(processed_parameters_num < parameters.size()){ @@ -540,7 +540,7 @@ void AdamOptimizer::unified_update(std::vector const parameters) assert(p->parallel_is != IndexSpace::NO_SPACE); } - printf("parameters_num: %d %d, %d\n", parameters_num, reservedWorkSpaceSize, model->handlers->workSpaceSize); + // printf("parameters_num: %d %d, %d\n", parameters_num, reservedWorkSpaceSize, model->handlers->workSpaceSize); assert(parameters_num <= parameters.size()); IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, @@ -729,7 +729,12 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task, float const *w_grad_ptr[op->parameters_num]; float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], *m_ptr[op->parameters_num]; - size_t size[op->parameters_num]; + + hipMalloc(w_grad_ptr, sizeof(float*) * op->parameters_num); + hipMalloc(w_ptr, sizeof(float*) * op->parameters_num); + hipMalloc(v_ptr, sizeof(float*) * op->parameters_num); + hipMalloc(m_ptr, sizeof(float*) * op->parameters_num); + size_t *size = new size_t[op->parameters_num]; int offset = 0; printf("parameters_num: %d\n", op->parameters_num); @@ -741,10 +746,11 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task, GenericTensorAccessorW accM = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+3], task->regions[offset+3], FID_DATA, ctx, runtime); offset += 4; - size[i] = accW.domain.get_volume(); + size[i] = accWGrad.domain.get_volume(); // assert(accWGrad.rect == accW.rect); // assert(accWGrad.rect == accV.rect); // assert(accWGrad.rect == accM.rect); + w_grad_ptr[i] = accWGrad.get_float_ptr(); w_ptr[i] = accW.get_float_ptr(); v_ptr[i] = accV.get_float_ptr(); m_ptr[i] = accM.get_float_ptr(); diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index 5e7e613bc7..a0e3180185 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -262,18 +262,11 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *o void *workSpace_ptr = meta->handle.workSpace; for(int i = 0; i < op->parameters_num; i++){ - // hipMemcpyAsync(static_cast(workSpace_ptr), - // w_grad_ptr[i], - // size[i] * sizeof(float), - // hipMemcpyDeviceToDevice, - // stream); - // hipError_t error = hipGetLastError(); - // if(error != hipSuccess) - // { - // // print the CUDA error message and exit - // printf("CUDA error: %s\n", hipGetErrorString(error)); - // } - + hipMemcpyAsync(workSpace_ptr, + w_grad_ptr[i], + size[i] * sizeof(float), + hipMemcpyDeviceToDevice, + stream); workSpace_ptr = static_cast(workSpace_ptr) + size[i] * sizeof(float); } @@ -292,7 +285,7 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *o float beta2_t = op->beta2_t; for(int i = 0; i < op->parameters_num; i++){ // update - std::cout<<"update"<<"\n"; + // printf("update %d\n", i); hipLaunchKernelGGL(HIP_KERNEL_NAME(adam_update), GET_BLOCKS(size[i]), CUDA_NUM_THREADS, From d2bdb15a4f360c7e72e6d42b1a29e07fba24dcad Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 11 Jan 2024 23:38:06 -0500 Subject: [PATCH 57/72] fix --- include/flexflow/optimizer.h | 9 +-- src/ops/fused.cu | 8 +-- src/ops/kernels/dropout_kernels.cu | 97 
++++++++++++++++++++++++------ src/runtime/optimizer.cc | 41 +++++++------ src/runtime/optimizer_kernel.cpp | 93 ++++++++++++++-------------- src/runtime/optimizer_kernel.cu | 62 +++++++++++++++++++ 6 files changed, 216 insertions(+), 94 deletions(-) diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h index 201408868b..2b710d1e7d 100644 --- a/include/flexflow/optimizer.h +++ b/include/flexflow/optimizer.h @@ -18,6 +18,7 @@ #include "flexflow/parallel_tensor.h" #include "legion.h" +#include "accessor.h" namespace FlexFlow { @@ -125,11 +126,11 @@ class AdamOptimizer : public Optimizer { float *m_ptr); static void nccl_unified_update_task_gpu(AdamOptimizer const *op, OpMeta const *meta, - float const *w_grad_ptr[], + GenericTensorAccessorR *accWGrads, size_t *size, - float *w_ptr[], - float *v_ptr[], - float *m_ptr[]); + GenericTensorAccessorW *accWs, + GenericTensorAccessorW *accVs, + GenericTensorAccessorW *accMs); #endif double alpha, beta1, beta2, weight_decay, epsilon; double alpha_t, beta1_t, beta2_t; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 62262c89af..e43ce51f9c 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -216,9 +216,7 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + m, my_input_accessor[0], my_output_accessor[0]); break; } case OP_LINEAR: { @@ -819,9 +817,7 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; Kernels::Dropout::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr()); + m, my_output_grad_accessor[0], my_input_grad_accessor[0]); break; } case OP_EW_ADD: diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index 39037cb37f..80ea2de941 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -74,12 +74,52 @@ DropoutMeta::~DropoutMeta(void) { namespace Kernels { namespace Dropout { +__global__ void dropout_forward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + curandStatePhilox4_32_10_t state; + curand_init(seed, i, 0, &state); + float rand = curand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + +__global__ void dropout_backward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + curandStatePhilox4_32_10_t state; + curand_init(seed, i, 0, &state); + float rand = curand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + void forward_kernel_wrapper(DropoutMeta *m, - GenericTensorAccessorR const &input, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::forward_kernel(m, input.get_float_ptr(), output.get_float_ptr(), stream); + Internal::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + input.domain.get_volume(), + stream); } void backward_kernel_wrapper(DropoutMeta *m, @@ 
-87,7 +127,11 @@ void backward_kernel_wrapper(DropoutMeta *m, GenericTensorAccessorW const &input_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::backward_kernel(m, output_grad.get_float_ptr(), input_grad.get_float_ptr(), stream); + Internal::backward_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); } namespace Internal { @@ -95,33 +139,48 @@ namespace Internal { void forward_kernel(DropoutMeta *m, float const *input_ptr, float *output_ptr, + size_t num_elements, cudaStream_t stream) { checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - checkCUDNN(cudnnDropoutForward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor, - input_ptr, - m->outputTensor, - output_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + int parallelism = num_elements; + dropout_forward_kernel<<>>( + m->seed, m->rate, num_elements, input_ptr, output_ptr); + + // checkCUDNN(cudnnDropoutForward(m->handle.dnn, + // m->dropoutDesc, + // m->inputTensor, + // input_ptr, + // m->outputTensor, + // output_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } void backward_kernel(DropoutMeta *m, float const *output_grad_ptr, float *input_grad_ptr, + size_t num_elements, cudaStream_t stream) { checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + int parallelism = num_elements; + dropout_backward_kernel<<>>( + m->seed, m->rate, num_elements, output_grad_ptr, input_grad_ptr); - checkCUDNN(cudnnDropoutBackward(m->handle.dnn, - m->dropoutDesc, - m->outputTensor, - output_grad_ptr, - m->inputTensor, - input_grad_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + // checkCUDNN(cudnnDropoutBackward(m->handle.dnn, + // m->dropoutDesc, + // m->outputTensor, + // output_grad_ptr, + // m->inputTensor, + // input_grad_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } } // namespace Internal diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index cf23cc39a5..3bca298b01 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -727,35 +727,36 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task, Domain domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - float const *w_grad_ptr[op->parameters_num]; - float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], *m_ptr[op->parameters_num]; - - hipMalloc(w_grad_ptr, sizeof(float*) * op->parameters_num); - hipMalloc(w_ptr, sizeof(float*) * op->parameters_num); - hipMalloc(v_ptr, sizeof(float*) * op->parameters_num); - hipMalloc(m_ptr, sizeof(float*) * op->parameters_num); + // float const *w_grad_ptr[op->parameters_num]; + // float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], *m_ptr[op->parameters_num]; + + // hipMalloc(w_grad_ptr, sizeof(float*) * op->parameters_num); + // hipMalloc(w_ptr, sizeof(float*) * op->parameters_num); + // hipMalloc(v_ptr, sizeof(float*) * op->parameters_num); + // hipMalloc(m_ptr, sizeof(float*) * op->parameters_num); + GenericTensorAccessorR accWGrads[op->parameters_num]; + GenericTensorAccessorW accWs[op->parameters_num]; + GenericTensorAccessorW accVs[op->parameters_num]; + GenericTensorAccessorW accMs[op->parameters_num]; size_t *size = new size_t[op->parameters_num]; int offset = 0; printf("parameters_num: %d\n", op->parameters_num); for(int i = 0; i < op->parameters_num; i++){ - GenericTensorAccessorR accWGrad = helperGetGenericTensorAccessorRO(DataType::DT_FLOAT, regions[offset], task->regions[offset], FID_DATA, ctx, runtime); - GenericTensorAccessorW accW = 
helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+1], task->regions[offset+1], FID_DATA, ctx, runtime); - GenericTensorAccessorW accV = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+2], task->regions[offset+2], FID_DATA, ctx, runtime); - GenericTensorAccessorW accM = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+3], task->regions[offset+3], FID_DATA, ctx, runtime); + accWGrads[i] = helperGetGenericTensorAccessorRO(DataType::DT_FLOAT, regions[offset], task->regions[offset], FID_DATA, ctx, runtime); + accWs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+1], task->regions[offset+1], FID_DATA, ctx, runtime); + accVs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+2], task->regions[offset+2], FID_DATA, ctx, runtime); + accMs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+3], task->regions[offset+3], FID_DATA, ctx, runtime); offset += 4; - size[i] = accWGrad.domain.get_volume(); - // assert(accWGrad.rect == accW.rect); - // assert(accWGrad.rect == accV.rect); - // assert(accWGrad.rect == accM.rect); - w_grad_ptr[i] = accWGrad.get_float_ptr(); - w_ptr[i] = accW.get_float_ptr(); - v_ptr[i] = accV.get_float_ptr(); - m_ptr[i] = accM.get_float_ptr(); + size[i] = accWGrads[i].domain.get_volume(); + // w_grad_ptr[i] = accWGrad.get_float_ptr(); + // w_ptr[i] = accW.get_float_ptr(); + // v_ptr[i] = accV.get_float_ptr(); + // m_ptr[i] = accM.get_float_ptr(); } - nccl_unified_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); + nccl_unified_update_task_gpu(op, meta, accWGrads, size, accWs, accVs, accMs); } #endif diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index a0e3180185..0b7b459d9b 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -204,7 +204,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, m_ptr, v_ptr, w_ptr); - + // checkCUDA(hipDeviceSynchronize()); } @@ -247,68 +247,71 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // checkCUDA(hipDeviceSynchronize()); } -__host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *op, - OpMeta const *meta, - float const *w_grad_ptr[], - size_t *size, - float *w_ptr[], - float *v_ptr[], - float *m_ptr[]) { +__host__ void AdamOptimizer::nccl_unified_update_task_gpu( + AdamOptimizer const *op, + OpMeta const *meta, + GenericTensorAccessorR *accWGrads, + size_t *size, + GenericTensorAccessorW *accWs, + GenericTensorAccessorW *accVs, + GenericTensorAccessorW *accMs) { hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); + checkCUDA(get_legion_stream(&stream)); assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); void *workSpace_ptr = meta->handle.workSpace; - for(int i = 0; i < op->parameters_num; i++){ + for (int i = 0; i < op->parameters_num; i++) { hipMemcpyAsync(workSpace_ptr, - w_grad_ptr[i], - size[i] * sizeof(float), - hipMemcpyDeviceToDevice, - stream); - workSpace_ptr = static_cast(workSpace_ptr) + size[i] * sizeof(float); + accWGrads[i].get_float_ptr(), + size[i] * sizeof(float), + hipMemcpyDeviceToDevice, + stream); + workSpace_ptr = + static_cast(workSpace_ptr) + size[i] * sizeof(float); } - //do allreduce once + // do allreduce once checkNCCL(ncclAllReduce(meta->handle.workSpace, - (float *)meta->handle.workSpace, - op->reservedWorkSpaceSize / sizeof(float), - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); + (float 
*)meta->handle.workSpace, + op->reservedWorkSpaceSize / sizeof(float), + ncclFloat, + ncclSum, + meta->handle.ncclComm, + stream)); workSpace_ptr = static_cast(meta->handle.workSpace); float alpha_t = op->alpha_t; float beta1_t = op->beta1_t; float beta2_t = op->beta2_t; - for(int i = 0; i < op->parameters_num; i++){ + for (int i = 0; i < op->parameters_num; i++) { // update // printf("update %d\n", i); - hipLaunchKernelGGL(HIP_KERNEL_NAME(adam_update), - GET_BLOCKS(size[i]), - CUDA_NUM_THREADS, - 0, - stream, - size[i], - alpha_t, - op->beta1, - op->beta2, - op->weight_decay, - op->epsilon, - static_cast(workSpace_ptr), - m_ptr[i], - v_ptr[i], - w_ptr[i]); - workSpace_ptr = static_cast(workSpace_ptr) + size[i] * sizeof(float); + hipLaunchKernelGGL(HIP_KERNEL_NAME(adam_update), + GET_BLOCKS(size[i]), + CUDA_NUM_THREADS, + 0, + stream, + size[i], + alpha_t, + op->beta1, + op->beta2, + op->weight_decay, + op->epsilon, + static_cast(workSpace_ptr), + accMs[i].get_float_ptr(), + accVs[i].get_float_ptr(), + accWs[i].get_float_ptr()); + workSpace_ptr = + static_cast(workSpace_ptr) + size[i] * sizeof(float); + + // update + beta1_t *= op->beta1; + beta2_t *= op->beta2; + alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t); + } - //update - beta1_t *= op->beta1; - beta2_t *= op->beta2; - alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t); - } - // checkCUDA(hipDeviceSynchronize()); } #endif diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index 5f654fbb5b..d613ae76a4 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -216,6 +216,68 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, w_ptr); // checkCUDA(cudaDeviceSynchronize()); } + +__host__ void AdamOptimizer::nccl_unified_update_task_gpu( + AdamOptimizer const *op, + OpMeta const *meta, + GenericTensorAccessorR *accWGrads, + size_t *size, + GenericTensorAccessorW *accWs, + GenericTensorAccessorW *accVs, + GenericTensorAccessorW *accMs) { + + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); + + void *workSpace_ptr = meta->handle.workSpace; + + for (int i = 0; i < op->parameters_num; i++) { + cudaMemcpyAsync(workSpace_ptr, + accWGrads[i].get_float_ptr(), + size[i] * sizeof(float), + cudaMemcpyDeviceToDevice, + stream); + workSpace_ptr = + static_cast(workSpace_ptr) + size[i] * sizeof(float); + } + + // do allreduce once + checkNCCL(ncclAllReduce(meta->handle.workSpace, + (float *)meta->handle.workSpace, + op->reservedWorkSpaceSize / sizeof(float), + ncclFloat, + ncclSum, + meta->handle.ncclComm, + stream)); + + workSpace_ptr = static_cast(meta->handle.workSpace); + float alpha_t = op->alpha_t; + float beta1_t = op->beta1_t; + float beta2_t = op->beta2_t; + for (int i = 0; i < op->parameters_num; i++) { + // update + // printf("update %d\n", i); + adam_update<<>>( + size[i], + alpha_t, + op->beta1, + op->beta2, + op->weight_decay, + op->epsilon, + static_cast(workSpace_ptr), + accMs[i].get_float_ptr(), + accVs[i].get_float_ptr(), + accWs[i].get_float_ptr()); + workSpace_ptr = + static_cast(workSpace_ptr) + size[i] * sizeof(float); + + // update + beta1_t *= op->beta1; + beta2_t *= op->beta2; + alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t); + } +} #endif }; // namespace FlexFlow From 3b9e1c6a6391acd178dda095ee59f586cd757287 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 12 Jan 2024 12:25:15 -0500 Subject: [PATCH 58/72] . 
--- include/flexflow/optimizer.h | 1 - src/runtime/optimizer.cc | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h index 2b710d1e7d..401fffb351 100644 --- a/include/flexflow/optimizer.h +++ b/include/flexflow/optimizer.h @@ -137,7 +137,6 @@ class AdamOptimizer : public Optimizer { std::map v_values, m_values; size_t reservedWorkSpaceSize = 0; int parameters_num = 0; - int processed_parameters_num = 0; }; }; // namespace FlexFlow diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index 3bca298b01..b5e475c466 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -522,6 +522,8 @@ void AdamOptimizer::unified_update(std::vector const parameters) } int offset = 0; + int processed_parameters_num = 0; + parameters_num = 0; // printf("param size: %d, %d\n", parameters.size(), parameters_num); while(processed_parameters_num < parameters.size()){ From 9f8bb9ee1e613689cb3675ab3def2930348f638c Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 12 Jan 2024 13:17:47 -0500 Subject: [PATCH 59/72] fix --- src/ops/kernels/dropout_kernels.cu | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index 80ea2de941..c5b1a384df 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -29,6 +29,10 @@ DropoutMeta::DropoutMeta(FFHandler handler, Domain const &output_domain) : OpMeta(handler) { profiling = dropout->profiling; + rate = dropout->rate; + seed = dropout->seed; + input_type[0] = dropout->data_type; + output_type[0] = dropout->data_type; checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreateDropoutDescriptor(&dropoutDesc)); @@ -115,11 +119,28 @@ void forward_kernel_wrapper(DropoutMeta *m, GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + Internal::forward_kernel(m, input.get_float_ptr(), output.get_float_ptr(), input.domain.get_volume(), stream); + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf(" [dropout] forward time = %.2lfms\n", elapsed); + } } void backward_kernel_wrapper(DropoutMeta *m, @@ -127,11 +148,27 @@ void backward_kernel_wrapper(DropoutMeta *m, GenericTensorAccessorW const &input_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } Internal::backward_kernel(m, output_grad.get_float_ptr(), input_grad.get_float_ptr(), output_grad.domain.get_volume(), stream); + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf(" [dropout] backward time = %.2lfms\n", elapsed); + } } namespace Internal { From fb91122170c2e81c3580fc582193f3e3afa19590 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 12 Jan 2024 13:19:18 -0500 Subject: [PATCH 60/72] remove 
print --- src/runtime/optimizer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index b5e475c466..03d26b6a64 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -743,7 +743,7 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task, size_t *size = new size_t[op->parameters_num]; int offset = 0; - printf("parameters_num: %d\n", op->parameters_num); + // printf("parameters_num: %d\n", op->parameters_num); for(int i = 0; i < op->parameters_num; i++){ accWGrads[i] = helperGetGenericTensorAccessorRO(DataType::DT_FLOAT, regions[offset], task->regions[offset], FID_DATA, ctx, runtime); From c162d4cb03d03978fec5090fe8075d2fa402c432 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 18 Jan 2024 21:21:58 -0500 Subject: [PATCH 61/72] fix hip --- src/ops/fused.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index cb2b81fbcf..d1de787fb4 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -205,9 +205,7 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + m, my_input_accessor[0], my_output_accessor[0]); break; } case OP_LINEAR: { From ea793172725e6b7f7a5df10d037764bec95aa98c Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 19 Jan 2024 09:55:14 -0500 Subject: [PATCH 62/72] fix multinodes --- src/runtime/optimizer.cc | 145 +++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 61 deletions(-) diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index 03d26b6a64..74d665a592 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -523,27 +523,29 @@ void AdamOptimizer::unified_update(std::vector const parameters) int offset = 0; int processed_parameters_num = 0; - parameters_num = 0; // printf("param size: %d, %d\n", parameters.size(), parameters_num); while(processed_parameters_num < parameters.size()){ + parameters_num = 0; for(int i = 0; i < parameters.size(); i++){ const ParallelTensor p = parameters.at(i); assert(v_values.find(p->region) != v_values.end()); assert(m_values.find(p->region) != m_values.end()); assert(p->owner_op != NULL); - if(reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= model->handlers->workSpaceSize){ - break; + if (reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= + model->handlers->workSpaceSize) { + break; } reservedWorkSpaceSize += p->get_volume() * sizeof(float); parameters_num += 1; - assert (p->sync_type == ParameterSyncType::NCCL); + assert(p->sync_type == ParameterSyncType::NCCL); assert(p->parallel_is != IndexSpace::NO_SPACE); } - // printf("parameters_num: %d %d, %d\n", parameters_num, reservedWorkSpaceSize, model->handlers->workSpaceSize); - assert(parameters_num <= parameters.size()); + // printf("parameters_num: %d %d, %d\n", parameters_num, + // reservedWorkSpaceSize, model->handlers->workSpaceSize); + assert(processed_parameters_num <= parameters.size()); IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, p0->parallel_is, @@ -553,42 +555,42 @@ void AdamOptimizer::unified_update(std::vector const parameters) false /*must_epoch*/, 0 /*mapper_id*/, p0->machine_view.hash()); - //launch a unified task - for(int j = 0; j < parameters_num; j++){ + // launch a unified task + for (int j = 0; j < parameters_num; j++) { const 
ParallelTensor p = parameters.at(processed_parameters_num + j); - // regions[0]: region_grad - launcher.add_region_requirement(RegionRequirement(p->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - p->region_grad)); - launcher.add_field(offset, FID_DATA); - // regions[1]: region - launcher.add_region_requirement(RegionRequirement( - p->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, p->region)); - launcher.add_field(offset + 1, FID_DATA); - // regions[2]: w_region - launcher.add_region_requirement( - RegionRequirement(v_values[p->region]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - v_values[p->region]->region)); - launcher.add_field(offset + 2, FID_DATA); - // regions[3]: m_region - launcher.add_region_requirement( - RegionRequirement(m_values[p->region]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - m_values[p->region]->region)); - launcher.add_field(offset + 3, FID_DATA); - offset += 4; + // regions[0]: region_grad + launcher.add_region_requirement(RegionRequirement(p->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + p->region_grad)); + launcher.add_field(offset, FID_DATA); + // regions[1]: region + launcher.add_region_requirement(RegionRequirement( + p->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, p->region)); + launcher.add_field(offset + 1, FID_DATA); + // regions[2]: w_region + launcher.add_region_requirement( + RegionRequirement(v_values[p->region]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + v_values[p->region]->region)); + launcher.add_field(offset + 2, FID_DATA); + // regions[3]: m_region + launcher.add_region_requirement( + RegionRequirement(m_values[p->region]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + m_values[p->region]->region)); + launcher.add_field(offset + 3, FID_DATA); + offset += 4; } - - //update alpha, beta - for(int i = 0; i < parameters_num; i++){ + + // update alpha, beta + for (int i = 0; i < parameters_num; i++) { this->next(); } launcher.concurrent = true; @@ -597,9 +599,9 @@ void AdamOptimizer::unified_update(std::vector const parameters) runtime->issue_execution_fence(ctx); reservedWorkSpaceSize = 0; offset = 0; - processed_parameters_num += parameters_num; + processed_parameters_num += parameters_num; } - + parameters_num = 0; } void AdamOptimizer::ps_update_task(Task const *task, @@ -716,11 +718,11 @@ void AdamOptimizer::nccl_update_task(Task const *task, nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); } - -void AdamOptimizer::nccl_unified_update_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +void AdamOptimizer::nccl_unified_update_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // assert(regions.size() == 4); // assert(task->regions.size() == 4); AdamOptimizer const *op = (AdamOptimizer *)task->args; @@ -730,7 +732,8 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task, ctx, task->regions[1].region.get_index_space()); // float const *w_grad_ptr[op->parameters_num]; - // float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], *m_ptr[op->parameters_num]; + // float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], + // *m_ptr[op->parameters_num]; // hipMalloc(w_grad_ptr, sizeof(float*) * op->parameters_num); // hipMalloc(w_ptr, sizeof(float*) * op->parameters_num); @@ -743,20 +746,40 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task, size_t *size = new size_t[op->parameters_num]; int offset = 0; - // printf("parameters_num: %d\n", 
op->parameters_num); - - for(int i = 0; i < op->parameters_num; i++){ - accWGrads[i] = helperGetGenericTensorAccessorRO(DataType::DT_FLOAT, regions[offset], task->regions[offset], FID_DATA, ctx, runtime); - accWs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+1], task->regions[offset+1], FID_DATA, ctx, runtime); - accVs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+2], task->regions[offset+2], FID_DATA, ctx, runtime); - accMs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+3], task->regions[offset+3], FID_DATA, ctx, runtime); - offset += 4; - - size[i] = accWGrads[i].domain.get_volume(); - // w_grad_ptr[i] = accWGrad.get_float_ptr(); - // w_ptr[i] = accW.get_float_ptr(); - // v_ptr[i] = accV.get_float_ptr(); - // m_ptr[i] = accM.get_float_ptr(); + // printf("parameters_num: %d\n", op->parameters_num); + + for (int i = 0; i < op->parameters_num; i++) { + accWGrads[i] = helperGetGenericTensorAccessorRO(DataType::DT_FLOAT, + regions[offset], + task->regions[offset], + FID_DATA, + ctx, + runtime); + accWs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, + regions[offset + 1], + task->regions[offset + 1], + FID_DATA, + ctx, + runtime); + accVs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, + regions[offset + 2], + task->regions[offset + 2], + FID_DATA, + ctx, + runtime); + accMs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, + regions[offset + 3], + task->regions[offset + 3], + FID_DATA, + ctx, + runtime); + offset += 4; + + size[i] = accWGrads[i].domain.get_volume(); + // w_grad_ptr[i] = accWGrad.get_float_ptr(); + // w_ptr[i] = accW.get_float_ptr(); + // v_ptr[i] = accV.get_float_ptr(); + // m_ptr[i] = accM.get_float_ptr(); } nccl_unified_update_task_gpu(op, meta, accWGrads, size, accWs, accVs, accMs); } From 58d84eda1395e6d8cb56bfcaadde0e59278f04aa Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 19 Jan 2024 12:50:37 -0500 Subject: [PATCH 63/72] fix --- src/runtime/optimizer.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index 74d665a592..c2e433d2e5 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -523,12 +523,12 @@ void AdamOptimizer::unified_update(std::vector const parameters) int offset = 0; int processed_parameters_num = 0; - // printf("param size: %d, %d\n", parameters.size(), parameters_num); + // printf("param size: %d\n", parameters.size()); while(processed_parameters_num < parameters.size()){ parameters_num = 0; - for(int i = 0; i < parameters.size(); i++){ + for(int i = processed_parameters_num; i < parameters.size(); i++){ const ParallelTensor p = parameters.at(i); assert(v_values.find(p->region) != v_values.end()); assert(m_values.find(p->region) != m_values.end()); @@ -543,8 +543,8 @@ void AdamOptimizer::unified_update(std::vector const parameters) assert(p->parallel_is != IndexSpace::NO_SPACE); } - // printf("parameters_num: %d %d, %d\n", parameters_num, - // reservedWorkSpaceSize, model->handlers->workSpaceSize); + printf("parameters_num: %d %zu, %zu, %d\n", parameters_num, + reservedWorkSpaceSize, model->handlers->workSpaceSize, parameters.size()); assert(processed_parameters_num <= parameters.size()); IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, From 01c9d4c560bf57188811156fefc1db0b2f013644 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 25 Jan 2024 18:27:36 -0500 Subject: [PATCH 64/72] fix --- src/runtime/optimizer.cc | 14 +++++++++----- 1 file changed, 
9 insertions(+), 5 deletions(-) diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index c2e433d2e5..a683d234e3 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -525,16 +525,19 @@ void AdamOptimizer::unified_update(std::vector const parameters) int processed_parameters_num = 0; // printf("param size: %d\n", parameters.size()); - while(processed_parameters_num < parameters.size()){ + size_t workSpaceSize = model->handlers->workSpaceSize * + model->config.workersPerNode * model->config.numNodes; + + while (processed_parameters_num < parameters.size()) { parameters_num = 0; - + for(int i = processed_parameters_num; i < parameters.size(); i++){ const ParallelTensor p = parameters.at(i); assert(v_values.find(p->region) != v_values.end()); assert(m_values.find(p->region) != m_values.end()); assert(p->owner_op != NULL); if (reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= - model->handlers->workSpaceSize) { + workSpaceSize) { break; } reservedWorkSpaceSize += p->get_volume() * sizeof(float); @@ -543,8 +546,9 @@ void AdamOptimizer::unified_update(std::vector const parameters) assert(p->parallel_is != IndexSpace::NO_SPACE); } - printf("parameters_num: %d %zu, %zu, %d\n", parameters_num, - reservedWorkSpaceSize, model->handlers->workSpaceSize, parameters.size()); + // printf("parameters_num: %d %zu, %zu, %d\n", parameters_num, + // reservedWorkSpaceSize, model->handlers->workSpaceSize, + // parameters.size()); assert(processed_parameters_num <= parameters.size()); IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, From a31f8e9bbd27e799abab712d73b202dac87ed2b3 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 25 Jan 2024 23:39:37 -0500 Subject: [PATCH 65/72] fix --- src/runtime/optimizer_kernel.cpp | 4 ++-- src/runtime/optimizer_kernel.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index 0b7b459d9b..373eb3fe7a 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -258,7 +258,7 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); + // assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); void *workSpace_ptr = meta->handle.workSpace; @@ -275,7 +275,7 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( // do allreduce once checkNCCL(ncclAllReduce(meta->handle.workSpace, (float *)meta->handle.workSpace, - op->reservedWorkSpaceSize / sizeof(float), + meta->handle.workSpaceSize, ncclFloat, ncclSum, meta->handle.ncclComm, diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index d613ae76a4..a57281a5ec 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -228,7 +228,7 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); + // assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); void *workSpace_ptr = meta->handle.workSpace; @@ -245,7 +245,7 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( // do allreduce once checkNCCL(ncclAllReduce(meta->handle.workSpace, (float *)meta->handle.workSpace, - op->reservedWorkSpaceSize / sizeof(float), + meta->handle.workSpaceSize, ncclFloat, ncclSum, meta->handle.ncclComm, From 9141c46a64617de893d986b694f1202841e86b27 Mon Sep 17 00:00:00 
2001 From: xinhaoc Date: Thu, 1 Feb 2024 23:23:14 -0500 Subject: [PATCH 66/72] tp --- examples/python/pytorch/mt5/mt5_ff.py | 1 + include/flexflow/ffconst.h | 2 + include/flexflow/model.h | 4 + include/flexflow/operator_params.h | 2 + include/flexflow/parallel_ops/allreduce.h | 57 ++++ .../flexflow/parallel_ops/allreduce_params.h | 22 ++ .../parallel_ops/kernels/allreduce_kernels.h | 31 ++ python/flexflow/core/flexflow_cffi.py | 4 +- src/ops/fused.cc | 2 +- src/ops/fused.cpp | 17 ++ src/ops/fused.cu | 17 ++ src/parallel_ops/allreduce.cc | 279 ++++++++++++++++++ .../kernels/allreduce_kernels.cpp | 58 ++++ src/parallel_ops/kernels/allreduce_kernels.cu | 56 ++++ src/runtime/ffconst_utils.cc | 2 + src/runtime/graph.cc | 19 ++ src/runtime/model.cc | 60 +++- src/runtime/operator_params.cc | 3 + src/runtime/optimizer.cc | 4 +- src/runtime/substitution.cc | 7 + 20 files changed, 635 insertions(+), 12 deletions(-) create mode 100644 include/flexflow/parallel_ops/allreduce.h create mode 100644 include/flexflow/parallel_ops/allreduce_params.h create mode 100644 include/flexflow/parallel_ops/kernels/allreduce_kernels.h create mode 100644 src/parallel_ops/allreduce.cc create mode 100644 src/parallel_ops/kernels/allreduce_kernels.cpp create mode 100644 src/parallel_ops/kernels/allreduce_kernels.cu diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index c2868e9d1e..5dff7415d3 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -122,6 +122,7 @@ def top_level_task(): input_names = ["input_ids", "attention_mask"] print("Tracing the model...") + print(batch_size) hf_model = PyTorchModel( model, is_hf_model=True, input_names=input_names, batch_size=batch_size, seq_length=seq_length, diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 5658e2923d..060983b020 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -157,6 +157,7 @@ enum OperatorType { OP_REPLICATE, OP_REDUCTION, OP_PIPELINE, + OP_ALLREDUCE, OP_FUSED_PARALLEL, OP_INVALID, }; @@ -189,6 +190,7 @@ enum PMParameter { PM_COMBINE_DEGREE, // Combine PM_REDUCTION_DIM, // Reduction PM_REDUCTION_DEGREE, // Reduction + PM_ALLREDUCE_DIM, // AllReduce PM_SOFTMAX_DIM, // Softmax PM_NUM_HEADS, // MultiHeadAttention PM_INVALID, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 82a29c8182..3f45de3d79 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -191,6 +191,10 @@ enum TaskIDs { PIPELINE_INIT_TASK_ID, PIPELINE_FWD_TASK_ID, PIPELINE_BWD_TASK_ID, + ALLREDUCE_INIT_TASK_ID, + ALLREDUCE_INF_TASK_ID, + ALLREDUCE_FWD_TASK_ID, + ALLREDUCE_BWD_TASK_ID, FUSED_PARALLELOP_INIT_TASK_ID, FUSED_PARALLELOP_FWD_TASK_ID, FUSED_PARALLELOP_BWD_TASK_ID, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 24c84a85ed..84653ac9ca 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -7,6 +7,7 @@ #include "flexflow/ops/batch_matmul_params.h" #include "flexflow/ops/cast_params.h" #include "flexflow/ops/concat_params.h" +#include "flexflow/parallel_ops/allreduce_params.h" #include "flexflow/ops/conv_2d_params.h" #include "flexflow/ops/dropout_params.h" #include "flexflow/ops/element_binary_params.h" @@ -62,6 +63,7 @@ using OperatorParameters = mp::variant; tl::optional get_op_parameters(Op const *op); diff --git a/include/flexflow/parallel_ops/allreduce.h b/include/flexflow/parallel_ops/allreduce.h new file mode 100644 index 
0000000000..a28d4cef9e --- /dev/null +++ b/include/flexflow/parallel_ops/allreduce.h @@ -0,0 +1,57 @@ +#ifndef _FLEXFLOW_ALLREDUCE_H +#define _FLEXFLOW_ALLREDUCE_H + +#include "flexflow/layer.h" +#include "flexflow/node.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/parallel_ops/allreduce_params.h" +#include "parallel_op.h" + +namespace FlexFlow { + +class AllReduce : public ParallelOp { +public: + using Params = AllReduceParams; + using Input = ParallelTensor; + + AllReduce(FFModel &model, + const ParallelTensor input, + int allreduce_legion_dim, + char const *name = NULL); + AllReduce(FFModel &model, + Params const ¶ms, + Input const input, + char const *name = nullptr); + void create_input_partition(FFModel &model) override; + void init(FFModel const &) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + bool get_int_parameter(PMParameter, int *) const override; + bool append_parallel_op_info( + std::vector ¶llel_ops) const override; + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + + Params get_params() const; + +public: + int allreduce_dim; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_ALLREDUCE_H diff --git a/include/flexflow/parallel_ops/allreduce_params.h b/include/flexflow/parallel_ops/allreduce_params.h new file mode 100644 index 0000000000..a0daac8f9a --- /dev/null +++ b/include/flexflow/parallel_ops/allreduce_params.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_ALLREDUCE_PARAMS_H +#define _FLEXFLOW_ALLREDUCE_PARAMS_H + +namespace FlexFlow { + +struct AllReduceParams { + int allreduce_legion_dim; + char name[MAX_OPNAME]; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(AllReduceParams const &, AllReduceParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::AllReduceParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_ALLREDUCE_PARAMS_H diff --git a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h new file mode 100644 index 0000000000..02a5026fcf --- /dev/null +++ b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H + +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/parallel_ops/allreduce.h" + +namespace FlexFlow { + +class AllReduceMeta : public OpMeta { +public: + AllReduceMeta(FFHandler handle, AllReduce const *reduct); +}; + +namespace Kernels { +namespace AllReduce { + +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +} // namespace AllReduce +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H diff 
--git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 49ad5b3dc8..c39b58e4b8 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -2124,9 +2124,9 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1): for d in dataloaders: d.next_batch(self) self.forward() - self.zero_gradients() + # self.zero_gradients() self.backward() - self.update() + self.unified_update() self._ffconfig.end_trace(self._tracing_id) def eval(self, x=None, y=None, batch_size=None): diff --git a/src/ops/fused.cc b/src/ops/fused.cc index d0c895c53d..b241ff1587 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -129,7 +129,7 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { // op->name, op_config)); // Cannot fuse parallel operators since they have different paralel_is // in forward and backward - assert(!op->is_parallel_op()); + assert(!op->is_parallel_op() || op->op_type == OP_ALLREDUCE); // Currently don't consider nested fusion if (op->op_type == OP_FUSED) { return false; diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index d1de787fb4..9da93f0c65 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -31,6 +31,7 @@ #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" #include "flexflow/utils/hip_helper.h" @@ -420,6 +421,14 @@ __host__ void FusedOp::forward_task(Task const *task, } break; } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); + break; + } case OP_RESHAPE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); @@ -957,6 +966,14 @@ __host__ void FusedOp::backward_task(Task const *task, } break; } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::backward_kernel_wrapper( + m, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index e43ce51f9c..b78447ba41 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -33,6 +33,7 @@ #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/utils/cuda_helper.h" @@ -460,6 +461,14 @@ __host__ void FusedOp::forward_task(Task const *task, } break; } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); + break; + } case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); @@ -1002,6 +1011,14 @@ __host__ void FusedOp::backward_task(Task const *task, } break; } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + 
assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::backward_kernel_wrapper( + m, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc new file mode 100644 index 0000000000..1bbc43ca0d --- /dev/null +++ b/src/parallel_ops/allreduce.cc @@ -0,0 +1,279 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/parallel_ops/allreduce.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/model.h" +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/utils/hash_utils.h" + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::LogicalPartition; +using Legion::LogicalRegion; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::AllReduce; + +/* Params */ +bool operator==(AllReduceParams const &lhs, AllReduceParams const &rhs) { + return lhs.allreduce_legion_dim == rhs.allreduce_legion_dim; +} + +bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { + return input.is_valid(); +} + +AllReduceParams AllReduce::get_params() const { + AllReduceParams params; + params.allreduce_legion_dim = this->allreduce_dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } + return params; +} + +AllReduce::AllReduce(FFModel &model, + const ParallelTensor _input, + int _allreduce_legion_dim, + char const *name) + : ParallelOp(model, OP_ALLREDUCE, name, _input), + allreduce_dim(_allreduce_legion_dim) { + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + assert(dims[allreduce_dim].degree > 1); + // ParallelTensorBase::update_parallel_ids(numdim, dims); + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, _input->data_type, this); +} + +AllReduce::AllReduce(FFModel &model, + AllReduceParams const ¶ms, + ParallelTensor const input, + char const *name) + : AllReduce(model, input, params.allreduce_legion_dim, params.name) {} + +void AllReduce::create_input_partition(FFModel &ff) { + // Do nothing + return; +} + + +OpMeta *AllReduce::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + AllReduce *ar = (AllReduce *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + AllReduceMeta *meta = new 
AllReduceMeta(handle, ar); + meta->input_type[0] = ar->inputs[0]->data_type; + meta->output_type[0] = ar->outputs[0]->data_type; + assert(meta->input_type[0] == meta->output_type[0]); + return meta; +} + +void AllReduce::init(FFModel const &ff) { + ArgumentMap argmap; + parallel_is = outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(ALLREDUCE_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(AllReduce)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +void AllReduce::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, + outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +void AllReduce::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, + inputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + inputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +bool AllReduce::measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const { + cost_metrics = CostMetrics(); + cost_metrics.forward_time = 0.0f; + cost_metrics.backward_time = 0.0f; + + cost_metrics.sync_time = 0; + cost_metrics.inputs_memory = 0; + cost_metrics.outputs_memory = 0; + cost_metrics.weights_memory = 0; + return true; +} + +bool AllReduce::get_int_parameter(PMParameter para, int *value) const { + switch (para) { + case PM_ALLREDUCE_DIM: + *value = allreduce_dim; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +bool AllReduce::append_parallel_op_info( + std::vector 
¶llel_ops) const { + ParallelOpInfo ret; + ret.op_type = op_type; + ret.parallel_dim = allreduce_dim; + ret.parallel_degree = -1; // AllReduce does not affect parallel degree + parallel_ops.push_back(ret); + return true; +} + +/*static*/ +void AllReduce::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + forward_kernel_wrapper(m, input, output); +} + +void AllReduce::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + backward_kernel_wrapper(m, input_grad, output_grad); +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::AllReduceParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.allreduce_legion_dim); + return key; +} + +} // namespace std diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp b/src/parallel_ops/kernels/allreduce_kernels.cpp new file mode 100644 index 0000000000..0aea27107d --- /dev/null +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -0,0 +1,58 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) + : OpMeta(handle) {} + +namespace Kernels { +namespace AllReduce { + +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; +#ifdef FF_USE_NCCL + // ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + checkNCCL(ncclAllReduce(input.ptr, + output.ptr, + input.domain.get_volume(), + ncclFloat, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif +} + +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +} // namespace AllReduce +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu new file mode 100644 index 0000000000..bce970cbdc --- /dev/null +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -0,0 +1,56 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) + : OpMeta(handle) {} + +namespace Kernels { +namespace AllReduce { + +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); +#ifdef FF_USE_NCCL + // ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + checkNCCL(ncclAllReduce(input.ptr, + output.ptr, + input.domain.get_volume(), + ncclFloat, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif +} + +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +} // namespace AllReduce +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 7ab9201113..e2debfa2d5 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -168,6 +168,8 @@ std::string get_operator_type_name(OperatorType type) { return "Replicate"; case OP_REDUCTION: return "Reduction"; + case OP_ALLREDUCE: + return "AllReduce"; case OP_PIPELINE: return "Pipeline"; case OP_FUSED_PARALLEL: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 7447125197..8be3877b48 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -39,6 +39,7 @@ #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" #include "flexflow/parallel_ops/combine.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" @@ -2295,6 +2296,13 @@ GraphOptimalViewSerialized sez.serialize(reduction->reduction_dim); sez.serialize(reduction->reduction_degree); break; + } + case OP_ALLREDUCE: { + AllReduce *allreduce = (AllReduce *)op; + sez.serialize(allreduce->allreduce_dim); + sez.serialize(strlen(allreduce->name)); + sez.serialize(allreduce->name, strlen(allreduce->name)); + break; } case OP_COMBINE: { Combine *combine = (Combine *)op; @@ -2704,6 +2712,17 @@ void FFModel::deserialize_graph_optimal_view( {reduction_dim, reduction_degree}); break; } + case OP_ALLREDUCE: { + assert(num_inputs == 1); + int allreduce_dim; + dez.deserialize(allreduce_dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); + node = get_or_create_node(inputs[0], {allreduce_dim}); + break; + } case OP_FUSED_PARALLEL: { assert(num_inputs == 1); std::vector parallel_ops; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index b16766291a..3807e85391 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -53,6 +53,7 @@ #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" #include "flexflow/substitution.h" @@ -2508,9 +2509,10 @@ bool FFModel::apply_fusion(std::vector const &operators, operators[l]->op_type == OP_WEIGHT) { continue; } - // don't 
fuse parallel op since they have different parallel_is in - // forward/backward - if (operators[l]->is_parallel_op()) { + // don't fuse parallel op except allReduce since they have different + // parallel_is in forward/backward + if (operators[l]->is_parallel_op() && + operators[l]->op_type != OP_ALLREDUCE) { continue; } size_t start = 0; @@ -2553,9 +2555,10 @@ bool FFModel::apply_fusion(std::vector const &operators, operators[i]->op_type == OP_WEIGHT) { continue; } - // don't fuse parallel op since they have different parallel_is in - // forward/backward - if (operators[i]->is_parallel_op()) { + // don't fuse parallel op except allReduce since they have different + // parallel_is in forward/backward + if (operators[i]->is_parallel_op() && + operators[i]->op_type != OP_ALLREDUCE) { continue; } fused_op = new FusedOp(*this, operators[i]); @@ -3491,7 +3494,7 @@ struct DefaultConfig { const static bool profiling = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; - const static size_t workSpaceSize = (size_t)1 * 1024 * 1024 * 1024; // 2GB + const static size_t workSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2GB const static int numNodes = 1; const static int workersPerNode = 0; const static int cpusPerNode = 0; @@ -5214,6 +5217,49 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + // AllReduce + { + TaskVariantRegistrar registrar(ALLREDUCE_INIT_TASK_ID, "AllReduce Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // FusedParallelOp { TaskVariantRegistrar registrar(FUSED_PARALLELOP_FWD_TASK_ID, diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 41dd37dec7..322d7840fb 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -28,6 +28,7 @@ #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" #include "flexflow/parallel_ops/combine.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" @@ -94,6 +95,8 @@ tl::optional get_op_parameters(Op const *op) { return ((Reduction *)op)->get_params(); case OP_COMBINE: return ((Combine *)op)->get_params(); + case OP_ALLREDUCE: + return ((AllReduce *)op)->get_params(); case OP_FUSED_PARALLEL: return ((FusedParallelOp 
*)op)->get_params(); case OP_TRANSPOSE: diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index a683d234e3..2b90bc3f44 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -536,10 +536,10 @@ void AdamOptimizer::unified_update(std::vector const parameters) assert(v_values.find(p->region) != v_values.end()); assert(m_values.find(p->region) != m_values.end()); assert(p->owner_op != NULL); - if (reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= - workSpaceSize) { + if (reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= workSpaceSize) { break; } + printf("continueee\n"); reservedWorkSpaceSize += p->get_volume() * sizeof(float); parameters_num += 1; assert(p->sync_type == ParameterSyncType::NCCL); diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index e3adfec5b7..4f44a3a574 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -34,6 +34,7 @@ #include "flexflow/ops/split.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -3743,6 +3744,12 @@ bool FFModel::convert_graph_to_operators( reduction->reduction_degree); break; } + case OP_ALLREDUCE: { + assert(inList.size() == 1); + AllReduce *allreduce = (AllReduce *)node.ptr; + new_op = new AllReduce(*this, inputs[0], allreduce->allreduce_dim); + break; + } case OP_FUSED_PARALLEL: { assert(inList.size() == 1); FusedParallelOp *fused = (FusedParallelOp *)node.ptr; From 81852893af7b866e42b1a834670535f459309758 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 15 Feb 2024 18:09:44 -0500 Subject: [PATCH 67/72] timer --- src/runtime/optimizer_kernel.cu | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index a57281a5ec..ee8fc8dcd4 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -230,6 +230,15 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( checkCUDA(get_legion_stream(&stream)); // assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); + cudaEvent_t t_start, t_start1, t_start2, t_end; + cudaEventCreate(&t_start); + cudaEventCreate(&t_start1); + cudaEventCreate(&t_start2); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + cudaEventRecord(t_start1, stream); + cudaEventRecord(t_start2, stream); + void *workSpace_ptr = meta->handle.workSpace; for (int i = 0; i < op->parameters_num; i++) { @@ -242,6 +251,13 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( static_cast(workSpace_ptr) + size[i] * sizeof(float); } + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start1, t_end)); + cudaEventDestroy(t_start1); + printf("[optimizer] data copy time = %.2lfms\n", elapsed); + // do allreduce once checkNCCL(ncclAllReduce(meta->handle.workSpace, (float *)meta->handle.workSpace, @@ -250,6 +266,12 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( ncclSum, meta->handle.ncclComm, stream)); + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start2, t_end)); + cudaEventDestroy(t_start2); + printf("[optimizer] allreduce time = %.2lfms\n", elapsed); workSpace_ptr = 
static_cast(meta->handle.workSpace); float alpha_t = op->alpha_t; @@ -277,6 +299,14 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( beta2_t *= op->beta2; alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t); } + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[optimizer] total time = %.2lfms\n", elapsed); + assert(false); } #endif From d958805a310d83084a2d7c5221dddf011f03cd87 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 16 Feb 2024 00:58:27 -0500 Subject: [PATCH 68/72] rmv --- src/runtime/optimizer_kernel.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index ee8fc8dcd4..37c6e55ede 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -306,7 +306,6 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( cudaEventDestroy(t_start); cudaEventDestroy(t_end); printf("[optimizer] total time = %.2lfms\n", elapsed); - assert(false); } #endif From 38dfd87af234227f2e00604acff908c96e755ef4 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 22 Feb 2024 13:42:09 -0500 Subject: [PATCH 69/72] fix tp --- include/flexflow/config.h | 3 + include/flexflow/model.h | 6 + include/flexflow/ops/element_binary.h | 5 + include/flexflow/ops/element_binary_params.h | 1 + include/flexflow/utils/cuda_helper.h | 3 + python/flexflow/core/flexflow_cffi.py | 6 +- src/dataloader/dataloader.cc | 2 +- src/dataloader/dataloader.cu | 15 +- src/ops/element_binary.cc | 22 ++ src/ops/embedding.cc | 12 +- src/ops/layer_norm.cc | 4 +- src/ops/linear.cc | 20 +- src/ops/reshape.cc | 82 ++++- src/parallel_ops/allreduce.cc | 1 + src/parallel_ops/kernels/allreduce_kernels.cu | 19 +- src/runtime/cuda_helper.cu | 17 + src/runtime/graph.cc | 32 +- src/runtime/model.cc | 322 +++++++++++++----- src/runtime/optimizer.cc | 1 - src/runtime/optimizer_kernel.cu | 1 - src/runtime/parallel_tensor.cc | 18 +- 21 files changed, 484 insertions(+), 108 deletions(-) diff --git a/include/flexflow/config.h b/include/flexflow/config.h index d1fe6231da..b6a27a4f2a 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -124,6 +124,7 @@ class FFConfig { size_t workSpaceSize; Legion::Context lg_ctx; Legion::Runtime *lg_hlr; + Legion::IndexSpaceT<1> all_gpu_task_is; Legion::FieldSpace field_space; bool syntheticInput, profiling, perform_fusion; size_t simulator_work_space_size; @@ -137,6 +138,8 @@ class FFConfig { bool enable_parameter_parallel; bool enable_attribute_parallel; bool enable_inplace_optimizations; + int data_parallelism_degree; + int tensor_parallelism_degree; // Control Tensor Op Math Conversion bool allow_tensor_op_math_conversion; std::string dataset_path; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 3f45de3d79..fe73e6a0e3 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -278,6 +278,7 @@ class Split; class TopK; class Transpose; class Combine; +class AllReduce; class Repartition; class Reduction; class Replicate; @@ -834,6 +835,8 @@ class FFModel { Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; + bool is_transformer_block(int layer_idx) const; + bool is_mlp_block(int layer_idx) const; void create_operators_from_layers(); Op 
*create_operator_from_layer(Layer *layer, std::vector const &inputs); @@ -860,6 +863,7 @@ class FFModel { int metrics_input; ParallelTensor parallel_label_tensor; Tensor label_tensor; + int num_inputs = 0; std::vector layers; std::vector operators; @@ -929,6 +933,8 @@ class FFModel { Replicate *>, std::unordered_map, Reduction *>, + std::unordered_map, + AllReduce *>, std::unordered_map, Combine *>, std::unordered_map, diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index f83ce20609..677ff23ce2 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -53,6 +53,11 @@ class ElementBinary : public Op { bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); Params get_params() const; public: diff --git a/include/flexflow/ops/element_binary_params.h b/include/flexflow/ops/element_binary_params.h index 5aa20e25a5..c70e1b597a 100644 --- a/include/flexflow/ops/element_binary_params.h +++ b/include/flexflow/ops/element_binary_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct ElementBinaryParams { OperatorType type; + bool inplace_a; bool is_valid( std::pair const &) const; diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index a4b2be0a66..c56ab473c0 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -82,6 +82,9 @@ __global__ void assign_kernel(DT *ptr, Legion::coord_t size, DT value); template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); +template +__global__ void copy_kernel_with_replicate(DT *dst, const DT *src, Legion::coord_t origin_size, Legion::coord_t size); + template __global__ void add_kernel(T *data_ptr, T const *grad_ptr, size_t size); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index c39b58e4b8..750838d829 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -2079,11 +2079,11 @@ def load_bert_pretrained(self, checkpoint=None): layer = self._layers[i] if (layer.name + "_weight") in weights_dict: print('weight: ' + layer.name) - weight = layer.get_parameter_by_id(0); + weight = layer.get_parameter_by_id(0) weight.set_tensor(self, weights_dict[layer.name + "_weight"]) if (layer.name + "_bias") in weights_dict: print('bias: ' + layer.name) - bias = layer.get_parameter_by_id(1); + bias = layer.get_parameter_by_id(1) bias.set_tensor(self, weights_dict[layer.name + "_bias"]) def fit(self, x=None, y=None, batch_size=None, epochs=1): """Trains the model for a fixed number of epochs (iterations on a dataset). 
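# [Editor's sketch, not part of the patch] Minimal illustration of the
# per-iteration sequence that FFModel.fit() runs after the hunk below, which
# swaps unified_update() back to update(). The loop structure and the helper
# names (dataloaders, num_samples, tracing_id) are illustrative assumptions;
# only forward()/backward()/update() and the trace calls are taken from the
# surrounding diff.
def run_one_epoch(ffmodel, ffconfig, dataloaders, num_samples, tracing_id):
    batch_size = ffconfig.batch_size
    for d in dataloaders:
        d.reset()
    ffmodel.reset_metrics()
    for _ in range(num_samples // batch_size):
        for d in dataloaders:
            d.next_batch(ffmodel)
        ffconfig.begin_trace(tracing_id)
        ffmodel.forward()
        ffmodel.backward()
        ffmodel.update()  # was unified_update() before this patch
        ffconfig.end_trace(tracing_id)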
@@ -2126,7 +2126,7 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1): self.forward() # self.zero_gradients() self.backward() - self.unified_update() + self.update() self._ffconfig.end_trace(self._tracing_id) def eval(self, x=None, y=None, batch_size=None): diff --git a/src/dataloader/dataloader.cc b/src/dataloader/dataloader.cc index 441a088194..614482e8b1 100644 --- a/src/dataloader/dataloader.cc +++ b/src/dataloader/dataloader.cc @@ -97,7 +97,7 @@ SingleDataLoader::SingleDataLoader(FFModel &ff, datatype = datatype_; // Currently assume that the leading dim of input is a replica dim of degree 1 assert(input->dims[input->num_dims - 1].is_replica_dim); - assert(input->dims[input->num_dims - 1].size == 1); + // assert(input->dims[input->num_dims - 1].size == 1); batch_input = input; ParallelDim dims[MAX_TENSOR_DIM]; diff --git a/src/dataloader/dataloader.cu b/src/dataloader/dataloader.cu index c2994d00a2..5462532d76 100644 --- a/src/dataloader/dataloader.cu +++ b/src/dataloader/dataloader.cu @@ -40,10 +40,13 @@ void SingleDataLoader::load_input(Task const *task, int num_dims = full_input_domain.get_dim(); assert(num_dims + 1 == batch_input_domain.get_dim()); // assert the leading replica dim has a degree of one - assert(batch_input_domain.hi()[num_dims] == - batch_input_domain.lo()[num_dims]); + // assert(batch_input_domain.hi()[num_dims] == + // batch_input_domain.lo()[num_dims]); coord_t batch_size = batch_input_domain.hi()[num_dims - 1] - batch_input_domain.lo()[num_dims - 1] + 1; + + coord_t replicate_num = + batch_input_domain.hi()[num_dims] - batch_input_domain.lo()[num_dims] + 1; coord_t num_elements_per_batch = batch_input_domain.get_volume() / batch_size; // FIXME: currently assume continous indices assert(batch_size == meta->num_samples); @@ -60,11 +63,15 @@ void SingleDataLoader::load_input(Task const *task, // printf("ptr(%p, %p), idx0 %d nb_elements_per_batch %d, batch_size %d, // %d\n", acc_full_input.ptr, input_zc, start_idx, num_elements_per_batch, // batch_size, start_idx * num_elements_per_batch); - copy_kernel
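// [Editor's sketch, not part of the patch] Self-contained version of what the
// copy_kernel_with_replicate launch below computes: the batch tensor now
// carries a replica dimension, so destination element i is filled from
// src[i % origin_size], replicating the original buffer across replicas. The
// kernel added to cuda_helper.cu later in this patch uses FlexFlow's
// CUDA_KERNEL_LOOP and GET_BLOCKS helpers; the grid-stride loop and launch
// math are spelled out here so the snippet stands alone.
template <typename DT>
__global__ void copy_with_replicate_sketch(DT *dst, DT const *src,
                                           size_t origin_size, size_t size) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
       i += (size_t)gridDim.x * blockDim.x) {
    dst[i] = src[i % origin_size];
  }
}
// Example launch, assuming batch_input_ptr, input_zc, volume, replicate_num
// and stream are defined as in load_input():
//   int threads = 256;
//   int blocks = (int)((volume + threads - 1) / threads);
//   copy_with_replicate_sketch<float><<<blocks, threads, 0, stream>>>(
//       batch_input_ptr, input_zc, volume / replicate_num, volume);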
+  assert(batch_input_domain.get_volume() % replicate_num == 0); +  copy_kernel_with_replicate<DT>
<<>>(batch_input_ptr, input_zc, batch_input_domain.get_volume()); + stream>>>(batch_input_ptr, + input_zc, + batch_input_domain.get_volume() / replicate_num, + batch_input_domain.get_volume()); checkCUDA(cudaDeviceSynchronize()); } diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 12895cfd98..84c3f8ba93 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -802,10 +802,32 @@ bool ElementBinary::measure_operator_cost(Simulator *sim, return true; } +void ElementBinary::serialize(Legion::Serializer &sez) const { + sez.serialize(this->op_type); + sez.serialize(this->inplace_a); +} + +using PCG::Node; +/*static*/ +Node ElementBinary::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 2); + OperatorType op_type; + bool inplace_a; + dez.deserialize(op_type); + dez.deserialize(inplace_a); + ElementBinaryParams params; + params.type = op_type; + params.inplace_a = inplace_a; + return ff.get_or_create_node({inputs[0], inputs[1]}, params); +} ElementBinaryParams ElementBinary::get_params() const { ElementBinaryParams params; params.type = this->op_type; + params.inplace_a = this->inplace_a; return params; } diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 18bf7e324e..8df6324460 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -148,23 +148,27 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { int const OUT_CHANNELS = Output::OUT_CHANNELS; if (aggr == AGGR_MODE_NONE) { int num_dims = input->num_dims + 1; - for (int i = 1; i < num_dims; i++) { + for (int i = 1; i < num_dims - 1; i++) { output_dims[i] = input->dims[i - 1]; } assert(OUT_CHANNELS == 0); output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; + // Copy replica dim + output_dims[num_dims - 1] = input->dims[input->num_dims - 1]; return num_dims; } else { int num_dims = input->num_dims; - for (int i = 1; i < num_dims; i++) { + for (int i = 1; i < num_dims - 1; i++) { output_dims[i] = input->dims[i]; } assert(OUT_CHANNELS == 0); output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; + // Copy replica dim + output_dims[num_dims - 1] = input->dims[input->num_dims - 1]; return num_dims; } // const int REPLICA = this->output_vocab_size_replica_dim(); @@ -179,13 +183,13 @@ int Embedding::weight_size(ParallelDim weight_dims[MAX_TENSOR_DIM]) { weight_dims[Weight::VOCAB_SIZE].size = this->num_entries; weight_dims[Weight::VOCAB_SIZE].degree = 1; weight_dims[Weight::VOCAB_SIZE].parallel_idx = -1; - for (int i = 2; i < input->num_dims; i++) { + for (int i = 2; i < input->num_dims + 1; i++) { weight_dims[i].size = input->dims[i - 1].degree; weight_dims[i].degree = weight_dims[i].size; weight_dims[i].parallel_idx = input->dims[i - 1].parallel_idx; weight_dims[i].is_replica_dim = true; } - return input->num_dims; + return input->num_dims + 1; } void Embedding::register_output_mappings() { diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index a6130bb425..ccbd7c2dd6 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -226,10 +226,10 @@ LayerNorm::LayerNorm(FFModel &model, dims[i] = inputs[0]->dims[i]; } assert(numInputs == 1); - dims[num_dims].degree = inputs[0]->dims[inputs[0]->num_dims - 2].degree; + dims[num_dims].degree = inputs[0]->dims[inputs[0]->num_dims - 1].degree; dims[num_dims].size = 
dims[num_dims].degree; dims[num_dims].parallel_idx = - inputs[0]->dims[inputs[0]->num_dims - 2].parallel_idx; + inputs[0]->dims[inputs[0]->num_dims - 1].parallel_idx; dims[num_dims].is_replica_dim = true; num_dims += 1; diff --git a/src/ops/linear.cc b/src/ops/linear.cc index d5257b9c3e..20cc193107 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -190,6 +190,23 @@ Linear::Linear(FFModel &model, params.construct_mappings(*this->parallel_dims_mapping, input_shape); params.solve_dims(input_shape, output_shape, kernel_shape, bias_shape); + kernel_shape.dims[0].size = this->in_channels; + bias_shape.dims[0].degree = _input->dims[_input->num_dims - 1].degree; + bias_shape.dims[0].parallel_idx = + _input->dims[_input->num_dims - 1].parallel_idx; + bias_shape.dims[1].size = bias_shape.dims[1].degree = 1; + bias_shape.dims[1].parallel_idx = -1; + bias_shape.dims[bias_shape.num_dims - 1].size = + bias_shape.dims[bias_shape.num_dims - 1].degree = 1; + for (int i = 0; i < input_shape.num_dims - 1; i++) { + if (_input->dims[i].degree > 1) { + bias_shape.dims[bias_shape.num_dims - 1].size *= _input->dims[i].degree; + bias_shape.dims[bias_shape.num_dims - 1].degree *= _input->dims[i].degree; + bias_shape.dims[bias_shape.num_dims - 1].parallel_idx = + _input->dims[i].parallel_idx; + } + } + if (allocate_weights) { Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); @@ -220,7 +237,6 @@ Linear::Linear(FFModel &model, outputs[0] = model.create_parallel_tensor_legion_ordering( output_shape.num_dims, output_shape.dims, _data_type, this); - assert(check_output_input_weight_parallel_dims(allocate_weights)); } void Linear::init(FFModel const &ff) { @@ -433,7 +449,7 @@ void Linear::forward_task_with_dim(Task const *task, int out_dim = acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1; int batch_size = acc_output.rect.volume() / out_dim; assert(acc_output.rect.volume() == static_cast(out_dim * batch_size)); - assert(acc_input.rect.volume() == static_cast(in_dim * batch_size)); + // assert(acc_input.rect.volume() == static_cast(in_dim * batch_size)); assert(acc_kernel.rect.volume() == static_cast(in_dim * out_dim)); float const *acc_bias_ptr = NULL; if (m->use_bias) { diff --git a/src/ops/reshape.cc b/src/ops/reshape.cc index 2b8a60bf21..07797bd223 100644 --- a/src/ops/reshape.cc +++ b/src/ops/reshape.cc @@ -80,9 +80,14 @@ Op *Reshape::create_operator_from_layer( return new Reshape(model, layer->layer_guid, inputs[0], shape, layer->name); } +bool match_pattern(std::vector const &_shape) { + return (_shape.size() == 4 && _shape[1] == 1 && _shape[2] == 1 && + _shape[3] == 512); +} + Reshape::Reshape(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor input, + ParallelTensor const input, std::vector const &_shape, char const *name) : Op(model, @@ -106,19 +111,64 @@ Reshape::Reshape(FFModel &model, if (input->dims[i].is_replica_dim) { num_replica_dims++; } + // std::cout << "reshape input size: " << input->dims[i].size + // << ", parallelidx: " << input->dims[i].parallel_idx << ". 
degree: " << input->dims[i].degree + // << "is replicate dim: " << input->dims[i].is_replica_dim << + // "\n"; } + + // assert(false); // assert that all replica dims are leading dims for (int i = 0; i < num_replica_dims; i++) { assert(input->dims[input->num_dims - 1 - i].is_replica_dim); } int numdim = (int)_shape.size(); ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdim; i++) { - dims[i].size = _shape[numdim - 1 - i]; - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = false; - } + + + bool expanded = numdim >= input->num_dims; + bool aggregation = numdim < input->num_dims - 1; + + for (int i = 0; i < numdim; i++) { + if (expanded && i < numdim - 1 && + _shape[i] * _shape[i + 1] == input->dims[numdim - i - 2].size) { + dims[numdim - i - 1].size = _shape[i]; + dims[numdim - i - 1].degree = input->dims[numdim - i - 2].degree; + dims[numdim - i - 1].parallel_idx = + input->dims[numdim - i - 2].parallel_idx; + dims[numdim - i - 1].is_replica_dim = + input->dims[numdim - i - 2].is_replica_dim; + std::cout << "expand dim i:" << i << ", " << dims[numdim - i - 1].degree + << ", " << dims[numdim - i - 1].size << "\n"; + } else if (aggregation && + (_shape[i] == input->dims[input->num_dims - 2 - i].size * + input->dims[input->num_dims - 3 - i].size)) { + // inherit + dims[numdim - i - 1].size = _shape[i]; + dims[numdim - i - 1].degree = + input->dims[input->num_dims - 2 - i].degree; + dims[numdim - i - 1].parallel_idx = + input->dims[input->num_dims - 2 - i].parallel_idx; + dims[numdim - i - 1].is_replica_dim = + input->dims[input->num_dims - 2 - i].is_replica_dim; + // std::cout << "agree i: " << i <<", " << _shape[i] << "\n"; + } else { + dims[numdim - i - 1].size = _shape[i]; + dims[numdim - i - 1].degree = 1; + dims[numdim - i - 1].parallel_idx = -1; + dims[numdim - i - 1].is_replica_dim = false; + } + } + + + + + // for (int i = 0; i < numdim; i++) { + // dims[i].size = _shape[numdim - 1 - i]; + // dims[i].degree = 1; + // dims[i].parallel_idx = -1; + // dims[i].is_replica_dim = false; + // } // copy all replica dims for (int i = 0; i < num_replica_dims; i++) { dims[i + numdim] = input->dims[input->num_dims - 1 - i]; @@ -131,6 +181,24 @@ Reshape::Reshape(FFModel &model, } dims[numdim - 1 - i] = input->dims[input->num_dims - 1 - i]; } + + //TODO temporary fix for input to attention QK, fix it after fuse the attention block + if(match_pattern(_shape) && model.config.tensor_parallelism_degree > 1){ + //number of heads + + dims[2].size = 12; + dims[2].degree = model.config.tensor_parallelism_degree; + dims[2].parallel_idx = 0; + dims[2].is_replica_dim = true; + + dims[4].size = 1; + dims[4].degree = 1; + dims[4].parallel_idx = -1; + dims[4].is_replica_dim = false; + + } + + outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, input->data_type, this); assert(outputs[0]->get_volume() == inputs[0]->get_volume()); diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 1bbc43ca0d..7052bb3ed5 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -173,6 +173,7 @@ void AllReduce::backward(FFModel const &ff) { Runtime *runtime = ff.config.lg_hlr; assert(numOutputs == 1); assert(numInputs == 1); + set_argumentmap_for_backward(ff, argmap); IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, inputs[0]->parallel_is, TaskArgument(NULL, 0), diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index bce970cbdc..1e932d2b12 100644 --- 
a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -48,7 +48,24 @@ void forward_kernel_wrapper(AllReduceMeta const *m, void backward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - assert(false && "To be implemented"); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); +#ifdef FF_USE_NCCL + // ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + // std::cout <<"input volume: " << input.domain.get_volume() << "\n"; + // print_tensor((float*)input.ptr, 32, "input ptr"); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + output_grad.domain.get_volume(), + ncclFloat, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif } } // namespace AllReduce diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 3236545cb9..a4a58e60fc 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -61,6 +61,15 @@ __global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { dst[i] = src[i]; } } +template +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + coord_t origin_size, + coord_t size) { + CUDA_KERNEL_LOOP(i, size) { + dst[i] = src[i % origin_size]; + } +} template __global__ void reluBackward(DT *grad_ptr, const DT *output, size_t n) { @@ -410,6 +419,14 @@ template __global__ void template __global__ void copy_kernel(float *dst, float const *src, coord_t size); +template __global__ void copy_kernel_with_replicate(float *dst, + float const *src, + coord_t origin_size, + coord_t size); +template __global__ void copy_kernel_with_replicate( + int32_t *dst, int32_t const *src, coord_t origin_size, coord_t size); +template __global__ void copy_kernel_with_replicate( + int64_t *dst, int64_t const *src, coord_t origin_size, coord_t size); template __global__ void copy_kernel(int32_t *dst, int32_t const *src, coord_t size); template __global__ void diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 8be3877b48..762c5911d6 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1928,9 +1928,37 @@ std::pair, std::unordered_map> model->config.numNodes * model->config.workersPerNode; data_parallel_view.stride[0] = 1; data_parallel_view.start_device_id = 0; + // Currently assume a 1D machine view is needed + assert(model->config.data_parallelism_degree == 1 || + model->config.tensor_parallelism_degree == 1); + int degree = model->config.data_parallelism_degree * + model->config.tensor_parallelism_degree; for (auto const &node : curr_best_graph->inEdges) { - curr_optimal_views[node.first] = data_parallel_view; - } + Op const *op = node.first.ptr; + MachineView mv; + mv.device_type = MachineView::GPU; + mv.ndims = 1; + int total_parallel_degree = 1; + for (int i = 0; i < op->outputs[0]->num_dims; i++) { + total_parallel_degree *= op->outputs[0]->dims[i].degree; + } + mv.dim[0] = total_parallel_degree; + mv.stride[0] = 1; + mv.start_device_id = 0; + // std::cout << mv.start_device_id + degree - 1 << "\n"; + // std::cout << model->config.numNodes << "\n"; + // std::cout << model->config.workersPerNode << "\n"; + // assert(false); + assert(mv.start_device_id + degree - 1 < + model->config.numNodes * model->config.workersPerNode); + curr_optimal_views[node.first] = mv; + for (int i = 
0; i < node.first.ptr->numOutputs; i++) { + assert(node.first.ptr->outputs[i]->is_valid_machine_view(mv)); + } + } + // for (auto const &node : curr_best_graph->inEdges) { + // curr_optimal_views[node.first] = data_parallel_view; + // } return std::make_pair(std::move(curr_best_graph), curr_optimal_views); } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 3807e85391..6feddcd03c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -50,10 +50,10 @@ #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" -#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" #include "flexflow/substitution.h" @@ -79,10 +79,10 @@ Op::Op(FFModel &model, int numWeights, bool allocate_weights, int numOutputs, - const ParallelTensor input1, - const ParallelTensor input2, - const ParallelTensor input3, - const ParallelTensor input4) + ParallelTensor const input1, + ParallelTensor const input2, + ParallelTensor const input3, + ParallelTensor const input4) : Op(model, otype, dtype, @@ -102,10 +102,10 @@ Op::Op(FFModel &model, int _numInputs, int _numWeights, int _numOutputs, - const ParallelTensor _input1, - const ParallelTensor _input2, - const ParallelTensor _input3, - const ParallelTensor _input4) + ParallelTensor const _input1, + ParallelTensor const _input2, + ParallelTensor const _input3, + ParallelTensor const _input4) : op_type(_otype), data_type(_dtype), op_guid(model.op_global_guid++), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs), profiling(model.config.profiling) { @@ -587,8 +587,13 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task, ncclComm_t ncclComm; fprintf(stderr, "Before ncclCommInitRank\n"); checkNCCL(ncclCommInitRank(&ncclComm, allRanks, ncclId, myRank)); - fprintf(stderr, "After ncclCommInitRank ncclComm(%p) allRanks(%d) myRank(%d) ncclId(%p)\n", - ncclComm, allRanks, myRank, ncclId); + fprintf(stderr, + "After ncclCommInitRank ncclComm(%p) allRanks(%d) myRank(%d) " + "ncclId(%p)\n", + ncclComm, + allRanks, + myRank, + ncclId); return ncclComm; } #endif @@ -783,9 +788,9 @@ void Op::register_output_parallel_dims( operation); } -int Op::get_output_to_input_dim_mapping(const ParallelTensor output, +int Op::get_output_to_input_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor input) { + ParallelTensor const input) { int output_idx = -1, input_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -818,9 +823,9 @@ int Op::get_output_to_input_dim_mapping(const ParallelTensor output, return -1; } -int Op::get_output_to_weight_dim_mapping(const ParallelTensor output, +int Op::get_output_to_weight_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor weight) { + ParallelTensor const weight) { int output_idx = -1, weight_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -886,6 +891,9 @@ bool Op::check_output_input_weight_parallel_dims(bool allocate_weights) const { break; } + printf("other dim degree: %d, input dim degree %d\n", + other_dim.degree, + input_dim.degree); assert(other_dim.degree == input_dim.degree); assert(other_dim.parallel_idx == input_dim.parallel_idx); } @@ -895,18 +903,25 @@ bool 
Op::check_output_input_weight_parallel_dims(bool allocate_weights) const { bool Op::check_output_input_weight_same_parallel_is() const { assert(numOutputs > 0); IndexSpace parallel_is = outputs[0]->parallel_is; + std::cout << "output space: " + << ", " << parallel_is << "\n"; for (int i = 0; i < numOutputs; i++) { if (outputs[i]->parallel_is != parallel_is) { + std::cout << "output mismatch" + << "\n"; return false; } } for (int i = 0; i < numInputs; i++) { + std::cout << "input space: " << i << ", " << inputs[i]->parallel_is << "\n"; if (inputs[i]->parallel_is != parallel_is) { return false; } } for (int i = 0; i < numWeights; i++) { if (weights[i]->parallel_is != parallel_is) { + std::cout << "weight mismatch" + << "\n"; return false; } } @@ -948,7 +963,7 @@ void Op::set_argumentmap_for_init(FFModel const &ff, ArgumentMap &argmap) { for (PointInRectIterator it(rect); it(); it++) { \ FFHandler handle = ff.handlers[view.get_device_id(*it)]; \ if (ff.config.computationMode == COMP_MODE_TRAINING && \ - op_type == OP_WEIGHT) { \ + (op_type == OP_WEIGHT || op_type == OP_ALLREDUCE)) { \ ncclComm_t *nccl_comms = ff.find_nccl_comms(view); \ handle.ncclComm = nccl_comms[idx++]; \ } \ @@ -1192,9 +1207,11 @@ FFModel::FFModel(FFConfig &_config) //} ArgumentMap argmap; - Rect<1> task_rect(Point<1>(0), - Point<1>(config.workersPerNode * config.numNodes - 1)); - IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); + // Rect<1> task_rect(Point<1>(0), + // Point<1>(config.workersPerNode * config.numNodes - 1)); + // IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); + Domain domain = runtime->get_index_space_domain(ctx, config.all_gpu_task_is); + Rect<1> task_rect = domain; // int rank = 0; for (PointInRectIterator<1> it(task_rect); it(); it++) { @@ -1208,7 +1225,7 @@ FFModel::FFModel(FFConfig &_config) // Init CUDA library on each worker IndexLauncher initLauncher(FF_INIT_TASK_ID, - task_is, + config.all_gpu_task_is, TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, @@ -1301,7 +1318,7 @@ Tensor FFModel::create_tensor(int numdim, } ParallelTensor FFModel::create_parallel_tensor(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1334,7 +1351,7 @@ Tensor FFModel::create_tensor_legion_ordering(int numdim, ParallelTensor FFModel::create_parallel_tensor_legion_ordering(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1384,7 +1401,7 @@ Tensor FFModel::create_tensor(int const dims[], } template -ParallelTensor FFModel::create_parallel_tensor(const ParallelDim dims[], +ParallelTensor FFModel::create_parallel_tensor(ParallelDim const dims[], DataType data_type, Op const *owner_op, int owner_idx, @@ -1465,7 +1482,7 @@ Parameter FFModel::create_weight(int numdim, } template -ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], +ParallelParameter FFModel::create_parallel_weight(ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1495,7 +1512,7 @@ ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], } ParallelParameter FFModel::create_parallel_weight(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1515,7 +1532,7 @@ ParallelParameter FFModel::create_parallel_weight(int numdim, ParallelParameter FFModel::create_parallel_weight_legion_ordering( int numdim, - const ParallelDim dims[], + 
ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1720,7 +1737,7 @@ void FFModel::map_weight_with_dim(ParallelTensor weight, } bool FFModel::get_parallel_tensor_from_tensor( - const Tensor tensor, ParallelTensor ¶llel_tensor) const { + Tensor const tensor, ParallelTensor ¶llel_tensor) const { // check if tensor->parallel_tensor is already set if (tensor->parallel_tensor != nullptr) { parallel_tensor = tensor->parallel_tensor; @@ -1757,7 +1774,7 @@ bool FFModel::get_parallel_tensor_from_tensor( } void FFModel::create_disjoint_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], IndexSpace const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -1780,7 +1797,7 @@ void FFModel::create_disjoint_partition(int num_dims, template void FFModel::create_disjoint_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], IndexSpaceT const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -1813,7 +1830,7 @@ void FFModel::create_disjoint_partition_with_dim2( } void FFModel::create_aliased_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpace const &part_is, LogicalRegion const ®ion, @@ -1837,7 +1854,7 @@ void FFModel::create_aliased_partition(int num_dims, template void FFModel::create_aliased_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpaceT const &part_is, LogicalRegion const ®ion, @@ -1874,7 +1891,7 @@ void FFModel::create_aliased_partition_with_dim2( } template -void FFModel::create_disjoint_partition(const ParallelTensor tensor, +void FFModel::create_disjoint_partition(ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -1922,7 +1939,7 @@ void FFModel::create_disjoint_partition(const ParallelTensor tensor, template void FFModel::create_data_parallel_partition_with_diff_dims( - const ParallelTensor tensor, + ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2304,7 +2321,7 @@ IndexSpace FFModel::get_task_is(ParallelConfig const &pc) const { return get_task_is(view); } -IndexSpace FFModel::get_or_create_task_is(const ParallelTensor tensor) { +IndexSpace FFModel::get_or_create_task_is(ParallelTensor const tensor) { MachineView view; view.ndims = 0; for (int i = 0; i < tensor->num_dims; i++) { @@ -2630,6 +2647,18 @@ Op *FFModel::create_operator_from_layer( dims[num_dims].degree = 1; dims[num_dims].parallel_idx = -1; dims[num_dims].is_replica_dim = true; + if (config.tensor_parallelism_degree > 1 && num_inputs != 1) { + dims[num_dims].size *= config.tensor_parallelism_degree; + dims[num_dims].degree *= config.tensor_parallelism_degree; + dims[num_dims].parallel_idx = 0; + } + //TODO temporary fix for input to attention QK, fix it after fuse the attention block + else if(config.tensor_parallelism_degree > 1){ + //n heads + dims[num_dims].size *= 12; + dims[num_dims].degree *= config.tensor_parallelism_degree; + dims[num_dims].parallel_idx = 0; + } // create_parallel_tensor adds an NoOp into operators ParallelTensor pt = create_parallel_tensor_legion_ordering(num_dims + 1, @@ -2643,19 +2672,14 @@ Op *FFModel::create_operator_from_layer( assert(tensor->parallel_tensor == nullptr); tensor->parallel_tensor = pt; // start from data parllel tensor - if (config.only_data_parallel && - config.numNodes * config.workersPerNode > 1) { - if (pt->dims[num_dims - 
1].size == 1) { - Replicate *repl = new Replicate( - *this, pt, num_dims, config.numNodes * config.workersPerNode); - repl->outputs[0]->dims[num_dims].is_replica_dim = true; - operators.push_back(repl); - } else { - Repartition *part = new Repartition( - *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); - operators.push_back(part); - } - } + // if (config.only_data_parallel && + // config.computationMode == COMP_MODE_TRAINING) { + // Repartition *part = new Repartition( + // *this, pt, num_dims - 1, config.numNodes * + // config.workersPerNode); + // operators.push_back(part); + // } + num_inputs++; return operators[operators.size() - 1]; } case OP_MULTIHEAD_ATTENTION: { @@ -2798,9 +2822,42 @@ Op *FFModel::create_operator_from_layer( } } +bool FFModel::is_transformer_block(int layer_idx) const { + auto const &l = layers[layer_idx]; + if (l->op_type == OP_DROPOUT && layer_idx >= 4 && + layers[layer_idx - 1]->op_type == OP_LINEAR && + layers[layer_idx - 2]->op_type == OP_RESHAPE && + layers[layer_idx - 3]->op_type == OP_TRANSPOSE && + layers[layer_idx - 4]->op_type == OP_BATCHMATMUL) { + return true; + } + return false; +} +bool FFModel::is_mlp_block(int layer_idx) const { + auto const &l = layers[layer_idx]; + // standard opt relu + if (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_RELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) { + return true; + } + // mlp layer with relu embedded in first dense layer + if (l->op_type == OP_LINEAR && layer_idx >= 1 && + layers[layer_idx - 1]->op_type == OP_LINEAR) { + long long value; + layers[layer_idx - 1]->get_int_property("activation", value); + ActiMode activation = (ActiMode)value; + if (activation == AC_MODE_RELU) { + return true; + } + } + return false; +} + void FFModel::create_operators_from_layers() { - std::map tensors_to_parallel_tensors; - for (auto const &l : layers) { + std::map tensors_to_parallel_tensors; + for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { + auto const &l = layers[layer_idx]; std::vector inputs; for (int i = 0; i < l->numInputs; i++) { // create new input tensors @@ -2808,7 +2865,50 @@ void FFModel::create_operators_from_layers() { tensors_to_parallel_tensors.end()); inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); } - Op *op = create_operator_from_layer(l, inputs); + // Op *op = create_operator_from_layer(l, inputs); + Op *op = nullptr; + if (config.tensor_parallelism_degree > 1 && l->op_type == OP_LAYERNORM && + layer_idx == layers.size() - 6) { + std::vector partitioned_inputs; + Combine *comb = new Combine(*this, + inputs[0], + 3 /*inner most dim*/, + config.tensor_parallelism_degree); + partitioned_inputs.push_back(comb->outputs[0]); + operators.push_back(comb); + op = create_operator_from_layer(l, partitioned_inputs); + } else { + op = create_operator_from_layer(l, inputs); + } + + // add replicate operators after op if needed + if (config.tensor_parallelism_degree > 1 && l->op_type == OP_EMBEDDING) { + // assert(op->numOutputs == 1); + // Replicate *repl = new Replicate(*this, + // op->outputs[0], + // op->outputs[0]->num_dims - 1, + // config.tensor_parallelism_degree); + // operators.push_back(repl); + // op = repl; + } else if (config.tensor_parallelism_degree > 1 && + (is_transformer_block(layer_idx) || is_mlp_block(layer_idx) || + // llama mlp layer + (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_GELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) || + // LLAMA without 
element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 5 && + layers[layer_idx - 1]->op_type == OP_EW_MUL && + layers[layer_idx - 2]->op_type == OP_EW_MUL && + layers[layer_idx - 3]->op_type == OP_SIGMOID && + layers[layer_idx - 4]->op_type == OP_LINEAR && + layers[layer_idx - 5]->op_type == OP_LINEAR))) { + assert(op->numOutputs == 1); + AllReduce *allreduce = + new AllReduce(*this, op->outputs[0], op->outputs[0]->num_dims - 1); + operators.push_back(allreduce); + op = allreduce; + } assert(op->numOutputs == l->numOutputs); for (int i = 0; i < op->numOutputs; i++) { tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; @@ -2844,8 +2944,7 @@ void FFModel::compile(LossType loss_type, TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, TaskArgument(&model, sizeof(FFModel *))); Future future = runtime->execute_task(ctx, launcher); - ret = - future.get_result(); + ret = future.get_result(); } else { ret = PCG::Graph::graph_optimize_wrapper(this); } @@ -2981,6 +3080,60 @@ void FFModel::compile(LossType loss_type, } } + int degree = + config.data_parallelism_degree * config.tensor_parallelism_degree; + + for (int op_idx = 0; op_idx < operators.size(); op_idx++) { + Op const *op = operators[op_idx]; + // Skip weight operators + if (op->op_type == OP_WEIGHT) { + continue; + } + // Get machine views + std::vector machine_views; + for (int j = 0; j < config.data_parallelism_degree; j++) { + MachineView mv; + mv.device_type = MachineView::GPU; + mv.ndims = 1; + // mv.start_device_id = 0; + mv.stride[0] = 1; + int parallel_degree = 1; + for (int k = 0; k < op->outputs[0]->num_dims; k++) { + parallel_degree *= op->outputs[0]->dims[k].degree; + } + mv.dim[0] = parallel_degree; + mv.start_device_id = 0; + assert(mv == op->outputs[0]->machine_view); + machine_views.push_back(mv); + } + for (int i = 0; i < op->numOutputs; i++) { + ParallelTensor pt_base = op->outputs[i]; + + if (op->op_type == OP_REPLICATE) { + assert(op->numInputs == 1 && op->numOutputs == 1); + } + std::vector list; + bool found_parallel_tensor = false; + if (!found_parallel_tensor) { + for (int j = 0; j < config.data_parallelism_degree; j++) { + // Copy the metadata from pt_base to pt + ParallelTensor pt = new ParallelTensorBase(*pt_base); + pt->region = + runtime->create_logical_region(ctx, + pt_base->region.get_index_space(), + pt_base->region.get_field_space()); + pt->part = runtime->get_logical_partition( + ctx, pt->region, pt_base->part.get_index_partition()); + pt->machine_view = machine_views[j]; + Domain part_domain = + runtime->get_index_space_domain(ctx, pt_base->parallel_is); + assert(pt->machine_view.get_domain() == part_domain); + list.push_back(pt); + } + } + } + } + // Perform fusion optimizations if (config.perform_fusion) { fprintf(stderr, "Applying fusion optimizations during compilation...\n"); @@ -3488,34 +3641,34 @@ void FFIterationConfig::reset() { // Default Config Parameters struct DefaultConfig { - const static int epochs = 1; + static int const epochs = 1; // const static int iterations = 1; - const static int batchSize = 64; - const static bool profiling = false; + static int const batchSize = 64; + static bool const profiling = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; - const static size_t workSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2GB - const static int numNodes = 1; - const static int workersPerNode = 0; - const static int cpusPerNode = 0; - const static size_t searchBudget = -1; - const static size_t simulatorWorkSpaceSize = 
+ static size_t const workSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2GB + static int const numNodes = 1; + static int const workersPerNode = 0; + static int const cpusPerNode = 0; + static size_t const searchBudget = -1; + static size_t const simulatorWorkSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2GB constexpr static float searchAlpha = 1.2f; - const static bool searchOverlapBackwardUpdate = false; - const static bool onlyDataParallel = false; - const static bool enableSampleParallel = true; - const static bool enableParameterParallel = false; - const static bool enableAttributeParallel = false; - const static bool enableInplaceOptimizations = false; - const static bool allowTensorOpMathConversion = false; - const static int machine_model_version = 0; - const static int simulator_segment_size = 16777216; // 16 MB - const static int simulator_max_num_segments = 1; - const static int base_optimize_threshold = 10; - const static bool enable_control_replication = true; + static bool const searchOverlapBackwardUpdate = false; + static bool const onlyDataParallel = false; + static bool const enableSampleParallel = true; + static bool const enableParameterParallel = false; + static bool const enableAttributeParallel = false; + static bool const enableInplaceOptimizations = false; + static bool const allowTensorOpMathConversion = false; + static int const machine_model_version = 0; + static int const simulator_segment_size = 16777216; // 16 MB + static int const simulator_max_num_segments = 1; + static int const base_optimize_threshold = 10; + static bool const enable_control_replication = true; // The default python data loader type is 2 to enable control replication - const static int python_data_loader_type = 2; + static int const python_data_loader_type = 2; }; FFConfig::FFConfig() { @@ -3535,6 +3688,8 @@ FFConfig::FFConfig() { search_overlap_backward_update = DefaultConfig::searchOverlapBackwardUpdate; computationMode = COMP_MODE_TRAINING; only_data_parallel = DefaultConfig::onlyDataParallel; + data_parallelism_degree = 1; + tensor_parallelism_degree = 1; enable_sample_parallel = DefaultConfig::enableSampleParallel; enable_parameter_parallel = DefaultConfig::enableParameterParallel; enable_attribute_parallel = DefaultConfig::enableAttributeParallel; @@ -3580,6 +3735,9 @@ FFConfig::FFConfig() { Runtime *runtime = Runtime::get_runtime(); lg_hlr = runtime; lg_ctx = Runtime::get_context(); + Rect<1> task_rect(Point<1>(0), Point<1>(workersPerNode * numNodes - 1)); + // Create an index space for tasks running on all GPUs + all_gpu_task_is = runtime->create_index_space(lg_ctx, task_rect); field_space = runtime->create_field_space(lg_ctx); } @@ -3640,6 +3798,16 @@ void FFConfig::parse_args(char **argv, int argc) { only_data_parallel = true; continue; } + // data parallelism degree + if (!strcmp(argv[i], "-data-parallelism-degree")) { + data_parallelism_degree = std::stoi(argv[++i]); + continue; + } + // tensor parallelism degree + if (!strcmp(argv[i], "-tensor-parallelism-degree")) { + tensor_parallelism_degree = std::stoi(argv[++i]); + continue; + } if ((!strcmp(argv[i], "--enable-parameter-parallel"))) { enable_parameter_parallel = true; continue; @@ -5353,11 +5521,13 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(ADAM_UNIFY_UPD_NCCL_TASK_ID, "Adam unified NCCL Update"); + TaskVariantRegistrar registrar(ADAM_UNIFY_UPD_NCCL_TASK_ID, + "Adam unified NCCL Update"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); 
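// [Editor's note, not part of the patch] The angle-bracketed template
// arguments of the (pre)register_task_variant calls in this hunk were lost in
// this rendering. A hedged reconstruction of the pattern, shown for the new
// AllReduce tasks (the init task returns an OpMeta*; forward/backward return
// void) -- the exact argument lists are an assumption based on the other
// registrations in model.cc:
//
//   TaskVariantRegistrar registrar(ALLREDUCE_INIT_TASK_ID, "AllReduce Init");
//   registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
//   registrar.set_leaf();
//   if (pre_register) {
//     Runtime::preregister_task_variant<OpMeta *, AllReduce::init_task>(
//         registrar, "AllReduce init Task");
//   } else {
//     if (enable_control_replication) {
//       registrar.global_registration = false;
//     }
//     runtime->register_task_variant<OpMeta *, AllReduce::init_task>(registrar);
//   }
//
// The forward/backward variants follow the same shape with
// Runtime::preregister_task_variant<AllReduce::forward_task> and
// <AllReduce::backward_task>, and the Adam unified NCCL update task registered
// here uses Runtime::preregister_task_variant<AdamOptimizer::nccl_unified_update_task>.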
registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( + Runtime::preregister_task_variant< + AdamOptimizer::nccl_unified_update_task>( registrar, "Adam unified NCCL Update Task"); } else { if (enable_control_replication) { diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index 2b90bc3f44..91a16e8db7 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -539,7 +539,6 @@ void AdamOptimizer::unified_update(std::vector const parameters) if (reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= workSpaceSize) { break; } - printf("continueee\n"); reservedWorkSpaceSize += p->get_volume() * sizeof(float); parameters_num += 1; assert(p->sync_type == ParameterSyncType::NCCL); diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index 37c6e55ede..bfa57e9a8e 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -225,7 +225,6 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( GenericTensorAccessorW *accWs, GenericTensorAccessorW *accVs, GenericTensorAccessorW *accMs) { - cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); // assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 18318db3ce..b9f3dc89f7 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -660,14 +660,24 @@ bool ParallelTensorBase::set_tensor(FFModel const *ff, // TODO: check data type matches // TODO: Currently we use a task launch, change to index launch for NCCL // parameter - size_t volume = 1, num_replicas = 0; + size_t volume = 1, num_replicas = 1; if (sync_type == ParameterSyncType::NCCL) { - Domain domain = runtime->get_index_space_domain(ctx, parallel_is); - num_replicas = domain.get_volume(); + // Domain domain = runtime->get_index_space_domain(ctx, parallel_is); + // num_replicas = domain.get_volume(); + for (int i = 0; i < this->num_dims; i++) { + if (this->dims[i].is_replica_dim) { + num_replicas *= this->dims[i].size; + } + } } else if (sync_type == ParameterSyncType::PS) { num_replicas = 1; } else { - num_replicas = 1; + for (int i = 0; i < this->num_dims; i++) { + if (this->dims[i].is_replica_dim) { + num_replicas *= this->dims[i].size; + } + } + // num_replicas = 1; } for (size_t i = 0; i < dim_sizes.size(); i++) { volume = volume * dim_sizes[i]; From 355d4b493e926ab026f1c8f17b7a67dcaf224ab4 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 28 Feb 2024 23:46:35 -0500 Subject: [PATCH 70/72] try a fix --- src/runtime/optimizer_kernel.cu | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index bfa57e9a8e..17adce94b8 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -238,7 +238,12 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( cudaEventRecord(t_start1, stream); cudaEventRecord(t_start2, stream); - void *workSpace_ptr = meta->handle.workSpace; + void *allocate_ptr; + // = meta->handle.workSpace; + checkCUDA( + cudaMalloc(&allocate_ptr,meta->handle.workSpaceSize)); + + void *workSpace_ptr = allocate_ptr; for (int i = 0; i < op->parameters_num; i++) { cudaMemcpyAsync(workSpace_ptr, @@ -272,7 +277,8 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( cudaEventDestroy(t_start2); printf("[optimizer] allreduce time = %.2lfms\n", elapsed); - workSpace_ptr = static_cast(meta->handle.workSpace); + // workSpace_ptr = 
static_cast(meta->handle.workSpace); + workSpace_ptr = static_cast(allocate_ptr); float alpha_t = op->alpha_t; float beta1_t = op->beta1_t; float beta2_t = op->beta2_t; @@ -304,6 +310,7 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); cudaEventDestroy(t_start); cudaEventDestroy(t_end); + checkCUDA(cudaFree(allocate_ptr)); printf("[optimizer] total time = %.2lfms\n", elapsed); } #endif From 8488ba023b21210d7e32e1760d6fbbc18afd3f8f Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 13 Sep 2024 10:14:15 -0400 Subject: [PATCH 71/72] fix hip --- include/flexflow/utils/cuda_helper.h | 5 ++++- include/flexflow/utils/hip_helper.h | 6 ++++++ src/dataloader/dataloader.cpp | 10 +++++++--- src/runtime/hip_helper.cpp | 20 ++++++++++++++++++++ 4 files changed, 37 insertions(+), 4 deletions(-) diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index c56ab473c0..d077995884 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -83,7 +83,10 @@ template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); template -__global__ void copy_kernel_with_replicate(DT *dst, const DT *src, Legion::coord_t origin_size, Legion::coord_t size); +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + Legion::coord_t origin_size, + Legion::coord_t size); template __global__ void add_kernel(T *data_ptr, T const *grad_ptr, size_t size); diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 709e78f517..8c589305c2 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -82,6 +82,12 @@ __global__ void assign_kernel(DT *ptr, Legion::coord_t size, DT value); template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); +template +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + Legion::coord_t origin_size, + Legion::coord_t size); + template __global__ void add_kernel(T *data_ptr, T const *grad_ptr, size_t size); diff --git a/src/dataloader/dataloader.cpp b/src/dataloader/dataloader.cpp index 7d9ffc02b1..97668d705d 100644 --- a/src/dataloader/dataloader.cpp +++ b/src/dataloader/dataloader.cpp @@ -41,10 +41,12 @@ void SingleDataLoader::load_input(Task const *task, int num_dims = full_input_domain.get_dim(); assert(num_dims + 1 == batch_input_domain.get_dim()); // assert the leading replica dim has a degree of one - assert(batch_input_domain.hi()[num_dims] == - batch_input_domain.lo()[num_dims]); + // assert(batch_input_domain.hi()[num_dims] == + // batch_input_domain.lo()[num_dims]); coord_t batch_size = batch_input_domain.hi()[num_dims - 1] - batch_input_domain.lo()[num_dims - 1] + 1; + coord_t replicate_num = + batch_input_domain.hi()[num_dims] - batch_input_domain.lo()[num_dims] + 1; coord_t num_elements_per_batch = batch_input_domain.get_volume() / batch_size; // FIXME: currently assume continous indices assert(batch_size == meta->num_samples); @@ -61,13 +63,15 @@ void SingleDataLoader::load_input(Task const *task, // printf("ptr(%p, %p), idx0 %d nb_elements_per_batch %d, batch_size %d, // %d\n", acc_full_input.ptr, input_zc, start_idx, num_elements_per_batch, // batch_size, start_idx * num_elements_per_batch); - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel
<DT>),
+  assert(batch_input_domain.get_volume() % replicate_num == 0);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel_with_replicate<DT>
), GET_BLOCKS(batch_input_domain.get_volume()), CUDA_NUM_THREADS, 0, stream, batch_input_ptr, input_zc, + batch_input_domain.get_volume() / replicate_num, batch_input_domain.get_volume()); checkCUDA(hipDeviceSynchronize()); } diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index ffdcf0dac1..8617cb2ef3 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -55,6 +55,16 @@ __global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { } } +template +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + coord_t origin_size, + coord_t size) { + CUDA_KERNEL_LOOP(i, size) { + dst[i] = src[i % origin_size]; + } +} + template __global__ void reluBackward(DT *grad_ptr, const DT *output, size_t n) { CUDA_KERNEL_LOOP(i, n) { @@ -404,6 +414,16 @@ template __global__ void template __global__ void copy_kernel(float *dst, float const *src, coord_t size); + +template __global__ void copy_kernel_with_replicate(float *dst, + float const *src, + coord_t origin_size, + coord_t size); +template __global__ void copy_kernel_with_replicate( + int32_t *dst, int32_t const *src, coord_t origin_size, coord_t size); +template __global__ void copy_kernel_with_replicate( + int64_t *dst, int64_t const *src, coord_t origin_size, coord_t size); + template __global__ void copy_kernel(int32_t *dst, int32_t const *src, coord_t size); template __global__ void From d5496e9bc10d59670d52052d84a5091beba8d197 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 26 Sep 2024 13:18:01 -0400 Subject: [PATCH 72/72] update submodule --- deps/legion | 2 +- deps/tokenizers-cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 160000 deps/tokenizers-cpp diff --git a/deps/legion b/deps/legion index 626b55689c..0d32b35542 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c +Subproject commit 0d32b35542bc0e9aba5950e485b8fc3413ae664b diff --git a/deps/tokenizers-cpp b/deps/tokenizers-cpp new file mode 160000 index 0000000000..c0fab1e14a --- /dev/null +++ b/deps/tokenizers-cpp @@ -0,0 +1 @@ +Subproject commit c0fab1e14a9421c1501acee5b7703e5dafa60479
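For reference, a minimal standalone sketch of what the last two functional patches implement together: parallel_tensor.cc counts num_replicas as the product of the sizes of all dimensions flagged as replica dims, and the HIP dataloader fills every replica of the batch tensor from one source copy using a modulo-indexed kernel (dst[i] = src[i % origin_size]). The sketch below is plain CUDA, not FlexFlow code; the kernel name copy_with_replicate, the CUDA_CHECK macro, and the example sizes are illustrative assumptions.

// Minimal sketch (not FlexFlow code): fill a replicated batch tensor from a
// single source copy with a modulo-indexed kernel, the same pattern as the
// copy_kernel_with_replicate addition above.
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                                       \
  do {                                                                         \
    cudaError_t err_ = (call);                                                 \
    if (err_ != cudaSuccess) {                                                 \
      std::fprintf(stderr, "CUDA error %s at %s:%d\n",                         \
                   cudaGetErrorString(err_), __FILE__, __LINE__);              \
      std::exit(1);                                                            \
    }                                                                          \
  } while (0)

// dst has size elements, src has origin_size elements; dst[i] = src[i % origin_size]
// repeats the source size / origin_size times, i.e. once per replica.
template <typename DT>
__global__ void copy_with_replicate(DT *dst, DT const *src,
                                    size_t origin_size, size_t size) {
  for (size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x; i < size;
       i += (size_t)gridDim.x * blockDim.x) {
    dst[i] = src[i % origin_size];
  }
}

int main() {
  // Mirror the replica counting in the parallel_tensor.cc change: num_replicas
  // is the product of the sizes of all dims marked as replica dims. Here we
  // simply assume a 4x8 batch with one replica dim of degree 2.
  size_t const origin_size = 4 * 8;
  size_t const num_replicas = 2;
  size_t const size = origin_size * num_replicas;

  std::vector<float> h_src(origin_size);
  for (size_t i = 0; i < origin_size; i++) {
    h_src[i] = (float)i;
  }

  float *d_src = nullptr, *d_dst = nullptr;
  CUDA_CHECK(cudaMalloc(&d_src, origin_size * sizeof(float)));
  CUDA_CHECK(cudaMalloc(&d_dst, size * sizeof(float)));
  CUDA_CHECK(cudaMemcpy(d_src, h_src.data(), origin_size * sizeof(float),
                        cudaMemcpyHostToDevice));

  int const threads = 256;
  int const blocks = (int)((size + threads - 1) / threads);
  copy_with_replicate<float><<<blocks, threads>>>(d_dst, d_src, origin_size,
                                                  size);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaDeviceSynchronize());

  std::vector<float> h_dst(size);
  CUDA_CHECK(cudaMemcpy(h_dst.data(), d_dst, size * sizeof(float),
                        cudaMemcpyDeviceToHost));
  // Every replica holds the same values as the source copy.
  std::printf("dst[0]=%.0f dst[origin_size]=%.0f (expect equal)\n", h_dst[0],
              h_dst[origin_size]);

  CUDA_CHECK(cudaFree(d_src));
  CUDA_CHECK(cudaFree(d_dst));
  return 0;
}

Indexing with i % origin_size lets one kernel launch populate all replicas at once, which is why the dataloader change only needs to pass batch_input_domain.get_volume() / replicate_num as the source size rather than issuing a separate copy per replica.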