From 243615200b0a737e85d4c90fe4d96707780d66ad Mon Sep 17 00:00:00 2001 From: Teresa Noyola Date: Mon, 17 Apr 2023 17:46:19 -0400 Subject: [PATCH 01/52] push changes to print dot graph --- config/config.linux | 6 +- examples/python/pytorch/mt5/mt5_ff.py | 89 ++++++++++++---------- examples/python/pytorch/mt5/mt5_torch.py | 4 +- include/flexflow/graph.h | 2 +- include/flexflow/ops/split.h | 2 + python/flexflow/torch/model.py | 49 ++++++++++-- src/metrics_functions/metrics_functions.cu | 2 +- src/ops/element_unary.cc | 3 +- src/ops/linear.cc | 4 +- src/ops/split.cc | 11 +++ src/runtime/ffconst_utils.cc | 2 + src/runtime/graph.cc | 1 + src/runtime/model.cc | 6 ++ src/runtime/substitution.cc | 4 + 14 files changed, 130 insertions(+), 55 deletions(-) diff --git a/config/config.linux b/config/config.linux index 017243408b..72fe466a7f 100755 --- a/config/config.linux +++ b/config/config.linux @@ -14,7 +14,7 @@ #INSTALL_DIR= # set build type -BUILD_TYPE=${BUILD_TYPE:-Release} +BUILD_TYPE=Debug # set CUDA Arch to the desired GPU architecture(s) to target (e.g. pass "FF_CUDA_ARCH=60" for Pascal). # To pass more than one value, separate architecture numbers with a comma (e.g. FF_CUDA_ARCH=70,75). @@ -23,10 +23,10 @@ BUILD_TYPE=${BUILD_TYPE:-Release} FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"} # set CUDNN dir in case cmake cannot autodetect a path -CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} +CUDNN_DIR=${CUDNN_DIR:-"/sw/summit/cuda/11.0.3"} #/usr/local/cuda # set CUDA dir in case cmake cannot autodetect a path -CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"} +CUDA_DIR=${CUDA_DIR:-"/sw/summit/cuda/11.0.3"} # enable Python FF_USE_PYTHON=${FF_USE_PYTHON:-ON} diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index 41b84a269e..dccd9ba48c 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -5,14 +5,14 @@ import numpy as np from flexflow.core import * from flexflow.torch.model import PyTorchModel -from transformers import MT5ForConditionalGeneration, T5Tokenizer - +#from transformers import MT5ForConditionalGeneration, T5Tokenizer +from transformers import BertForMaskedLM, BertTokenizer sys.path.append("./examples/python/pytorch/mt5") from mt5_torch import DataPreparer, get_dataloaders, set_seed BASE_DIR = "examples/python/pytorch/mt5" DATA_DIR = os.path.join(BASE_DIR, "data") -NUMPY_DIR = os.path.join(DATA_DIR, "numpy") +NUMPY_DIR = os.path.join(DATA_DIR, "numpy_candle") def data_to_numpy() -> None: @@ -28,7 +28,8 @@ def data_to_numpy() -> None: """ model_params = { "SEED": 42, - "MODEL": "google/mt5-small", + #"MODEL": "google/mt5-small", + "MODEL": "bert-base-uncased", "TRAIN_BATCH_SIZE": None, # use the full dataset as one batch "EVAL_BATCH_SIZE": None, # use the full dataset as one batch "TRAIN_EPOCHS": 1, # unused @@ -36,7 +37,8 @@ def data_to_numpy() -> None: "MAX_TARGET_TEXT_LENGTH": 48, } set_seed(model_params) - tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"]) + #tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"]) + tokenizer = BertTokenizer.from_pretrained(model_params["MODEL"]) print("Getting dataloaders...") train_loader, eval_loader = get_dataloaders(tokenizer, model_params) assert len(train_loader) == 1 @@ -81,28 +83,31 @@ def preprocess_train() -> None: def top_level_task(): ffconfig = FFConfig() ffmodel = FFModel(ffconfig) - model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") - + #model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") + model = 
BertForMaskedLM.from_pretrained("bert-base-uncased") + #model = BertModel.from_pretrained("bert-base-uncased") # Load train data as numpy arrays print("Loading data...") - ids = np.load(os.path.join(NUMPY_DIR, "train_source_ids.npy")) - mask = np.load(os.path.join(NUMPY_DIR, "train_source_mask.npy")) - y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) - lm_labels = np.load(os.path.join(NUMPY_DIR, "train_lm_labels.npy")) + ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")) + mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")) + #y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) + lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")) batch_size = ffconfig.batch_size input_ids_shape = (batch_size, ids.shape[1]) attention_mask_shape = (batch_size, mask.shape[1]) - decoder_input_ids_shape = (batch_size, y_ids.shape[1]) + #decoder_input_ids_shape = (batch_size, y_ids.shape[1]) input_tensors = [ ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask - ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids + #ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids ] encoder_seq_length = ids.shape[1] - decoder_seq_length = y_ids.shape[1] - seq_length = (encoder_seq_length, decoder_seq_length) - input_names = ["input_ids", "attention_mask", "decoder_input_ids"] + #decoder_seq_length = y_ids.shape[1] + #seq_length = (encoder_seq_length, decoder_seq_length) + seq_length = encoder_seq_length + #input_names = ["input_ids", "attention_mask", "decoder_input_ids"] + input_names = ["input_ids", "attention_mask"] print("Tracing the model...") hf_model = PyTorchModel( @@ -110,6 +115,8 @@ def top_level_task(): batch_size=batch_size, seq_length=seq_length, ) output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True) + #from flexflow.torch.model import file_to_ff + #file_to_ff("mt5.ff", ffmodel, input_tensors) ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) print("Compiling the model...") @@ -123,9 +130,12 @@ def top_level_task(): ) print("Creating data loaders...") + print('id_dtype', ids.dtype) + print('mask_dtype', mask.dtype) + print('labels_dtype', lm_labels.dtype) input_ids_dl = ffmodel.create_data_loader(input_tensors[0], ids) attention_mask_dl = ffmodel.create_data_loader(input_tensors[1], mask) - decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) + #decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) # NOTE: We cast down the label tensor data to 32-bit to accommodate the # label tensor's required dtype labels_dl = ffmodel.create_data_loader( @@ -138,31 +148,32 @@ def top_level_task(): print("Training...") epochs = ffconfig.epochs ffmodel.fit( - x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], + #x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], + x=[input_ids_dl, attention_mask_dl], y=labels_dl, batch_size=batch_size, epochs=epochs, ) if __name__ == "__main__": - # Generate the .tsv files if needed - if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \ - not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")): - DataPreparer.data_to_tsv() - # Convert the .tsv files to .npy if needed - if not os.path.exists(NUMPY_DIR): - os.mkdir(NUMPY_DIR) - prefixes = ["train_", "eval_"] - suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"] - npy_filenames = [ - pre + suf for pre, suf in 
itertools.product(prefixes, suffixes) - ] - if any( - not os.path.exists(os.path.join(NUMPY_DIR, filename)) - for filename in npy_filenames - ): - data_to_numpy() - # Preprocess the training data if needed - if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \ - not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")): - preprocess_train() + ## Generate the .tsv files if needed + #if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \ + # not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")): + # DataPreparer.data_to_tsv() + ## Convert the .tsv files to .npy if needed + #if not os.path.exists(NUMPY_DIR): + # os.mkdir(NUMPY_DIR) + #prefixes = ["train_", "eval_"] + #suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"] + #npy_filenames = [ + # pre + suf for pre, suf in itertools.product(prefixes, suffixes) + #] + #if any( + # not os.path.exists(os.path.join(NUMPY_DIR, filename)) + # for filename in npy_filenames + #): + # data_to_numpy() + ## Preprocess the training data if needed + #if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \ + # not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")): + # preprocess_train() top_level_task() diff --git a/examples/python/pytorch/mt5/mt5_torch.py b/examples/python/pytorch/mt5/mt5_torch.py index 78886eed6c..4d741c44a5 100644 --- a/examples/python/pytorch/mt5/mt5_torch.py +++ b/examples/python/pytorch/mt5/mt5_torch.py @@ -7,7 +7,7 @@ import os import numpy as np -import pandas as pd +#import pandas as pd import torch from torch.utils.data import DataLoader, Dataset from transformers import MT5ForConditionalGeneration, T5Tokenizer @@ -311,5 +311,5 @@ def TorchMT5Trainer( "MAX_TARGET_TEXT_LENGTH": 48, "LEARNING_RATE": 1e-4, } - device = torch.device(0) + device = torch.device('cpu') TorchMT5Trainer(model_params, device) diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h index e52911853e..800f750f42 100644 --- a/include/flexflow/graph.h +++ b/include/flexflow/graph.h @@ -90,7 +90,7 @@ struct NodeCompare { struct GraphOptimalViewSerialized { #ifdef LEGION_MAX_RETURN_SIZE - static const size_t buffer_size = LEGION_MAX_RETURN_SIZE - 8; + static const size_t buffer_size = 4*LEGION_MAX_RETURN_SIZE - 8; #else static const size_t buffer_size = 1024 * 1024 - 8; #endif diff --git a/include/flexflow/ops/split.h b/include/flexflow/ops/split.h index 633268ffbf..6c0736a76f 100644 --- a/include/flexflow/ops/split.h +++ b/include/flexflow/ops/split.h @@ -50,6 +50,8 @@ class Split : public Op { Params get_params() const; + tl::optional as_dot() const override; + public: int legion_axis; std::vector splits; diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 65b1669e99..2e6a2a23f3 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -1,4 +1,4 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# Copyright 2020 Stanford University, Los Alamos National Laboratory # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,7 +16,6 @@ from collections import OrderedDict from enum import Enum from typing import List -import copy import numpy as np from flexflow.core.flexflow_cffi import Tensor, NormInitializer @@ -26,6 +25,7 @@ try: import torch + print(torch.__version__) from torch.fx.immutable_collections import immutable_dict except: pass @@ -931,7 +931,7 @@ def construct_node(node): elif name.find("contiguous") >= 0: return ContiguousNode(node) elif name.find("tanh") >= 0: return TanhFNode(node) elif name.find("gelu") >= 0: return GeluFNode(node) - assert 0, f"Unknown function or method: {name}" + assert 0, f"Unknown function or method: {name} {node}" @staticmethod def is_right_scalar_op(node): @@ -1424,7 +1424,7 @@ def is_truncate(slice_elem, old_size): stop = old_size if slice_elem.stop == None else slice_elem.stop new_size = stop - start return new_size < old_size - + def is_single_element(slice_elem): return isinstance(slice_elem, int) @@ -1441,6 +1441,7 @@ def is_single_element(slice_elem): # Match dimensions from right to left new_shape = [] # append then reverse j = len(shape) - 1 + import copy curr_tensor = copy.copy(tensor) for slice_elem in reversed(slices): @@ -1482,7 +1483,38 @@ def is_single_element(slice_elem): new_shape.reverse() return ffmodel.reshape(input=curr_tensor, shape=new_shape, name=name,) - + + + +# """Returns a reshaped tensor based on the given slices.""" +# def is_colon(slice_elem): +# """Returns if the slice is equivalent to `:`.""" +# return slice_elem == slice(None, None, None) +# +# def is_unsqueeze(slice_elem): +# """Returns if the slice is equivalent to unsqueezing that +# dimension.""" +# return slice_elem is None +# shape = tensor.dims +# # Match dimensions from right to left +# new_shape = [] # append then reverse +# j = len(shape) - 1 +# for slice_elem in reversed(slices): +# if is_colon(slice_elem): +# assert j >= 0 +# new_shape.append(shape[j]) +# j -= 1 +# elif is_unsqueeze(slice_elem): +# new_shape.append(1) +# else: +# assert 0, f"Unsupported slice element: {slice_elem}" +# new_shape.reverse() +# return ffmodel.reshape( +# input=tensor, shape=new_shape, name=name, +# ) + + + @staticmethod def strings_to_slices(strings: List[str]): # Extract slice elements @@ -1751,7 +1783,7 @@ def __init__(self, node): def parse(self): s = [self.name] scalar = self.innodes[1] - if type(scalar) is not int or type(scalar) is not float: + if not isinstance(scalar, [int, float]): assert 0, "FlexFlow does not support tensor floor division" innodes = (self.innodes[0],) s.append(self.parse_inoutnodes(innodes)) @@ -2440,6 +2472,11 @@ def _trace_model(self): batch_size=self.batch_size, sequence_length=self.seq_length, ) + + #import pickle + #with open('symbolic_trace', 'rb') as f: + #traced = pickle.load(f) + else: traced = torch.fx.symbolic_trace(self.model) diff --git a/src/metrics_functions/metrics_functions.cu b/src/metrics_functions/metrics_functions.cu index 2e037eb472..b68b10d873 100644 --- a/src/metrics_functions/metrics_functions.cu +++ b/src/metrics_functions/metrics_functions.cu @@ -29,7 +29,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = -1; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 60112bfdc9..b50f54bbf0 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ 
-125,7 +125,8 @@ Tensor FFModel::tanh(const Tensor x, char const *name) { } Tensor FFModel::identity(const Tensor x, char const *name) { - return this->unary(OP_IDENTITY, x, false /*inplace*/, name); + //return this->unary(OP_IDENTITY, x, false /*inplace*/, name); + return x; } Tensor FFModel::gelu(const Tensor x, char const *name) { diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 668c8d070a..4a37f7c02e 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -418,7 +418,7 @@ void Linear::forward_task_with_dim(Task const *task, assert(acc_kernel.rect.volume() == static_cast(in_dim * out_dim)); float const *acc_bias_ptr = NULL; if (m->use_bias) { - TensorAccessorR acc_bias( + TensorAccessorR acc_bias( regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(acc_bias.rect.volume() == static_cast(out_dim)); acc_bias_ptr = acc_bias.ptr; @@ -604,7 +604,7 @@ void Linear::backward_task_with_dim(Task const *task, static_cast(in_dim * out_dim)); float *acc_bias_grad_ptr = NULL; if (m->use_bias) { - TensorAccessorW acc_bias_grad(regions[rid], + TensorAccessorW acc_bias_grad(regions[rid], task->regions[rid], FID_DATA, ctx, diff --git a/src/ops/split.cc b/src/ops/split.cc index 4f60cb96f0..3517852942 100644 --- a/src/ops/split.cc +++ b/src/ops/split.cc @@ -330,6 +330,17 @@ void Split::backward_task(Task const *task, split->numOutputs); } +tl::optional Split::as_dot() const { + RecordFormatter rr; + RecordFormatter r; + + r << this->inputs[0]->get_shape().as_dot(); + r << this->outputs[0]->get_shape().as_dot(); + rr << r; + + return rr; +} + bool Split::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index d8f4e6e179..7ab9201113 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -172,6 +172,8 @@ std::string get_operator_type_name(OperatorType type) { return "Pipeline"; case OP_FUSED_PARALLEL: return "FusedParallelOp"; + case OP_GELU: + return "Gelu"; default: throw std::runtime_error("Operator type unsupported: " + std::to_string(type)); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index ad298f5c93..77bb29cf47 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1588,6 +1588,7 @@ GraphOptimalViewSerialized std::unordered_map optimal_views; if (model->config.only_data_parallel) { Graph *graph = new Graph(model); + graph->print_dot(); std::unordered_map op_to_node_map; for (FlexFlow::Op const *dstOp : model->operators) { Node dstNode; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index ca1ab33343..5318684f53 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2631,10 +2631,16 @@ Op *FFModel::create_operator_from_layer( tensor->parallel_tensor = pt; // start from data parllel tensor if (config.only_data_parallel) { + if (pt->dims[num_dims-1].size == 1) { + Replicate *repl = new Replicate( + *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); + operators.push_back(repl); + } else { Repartition *part = new Repartition( *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); operators.push_back(part); } + } return operators[operators.size() - 1]; } case OP_MULTIHEAD_ATTENTION: { diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 9d3d6057f1..2925eb7555 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -1892,6 +1892,7 @@ void GraphSearchHelper::graph_optimize( this->logger->debug() << "Starting graph optimization"; Graph 
*graph = this->construct_graph(); + graph->print_dot(); graph->duplicate_input_nodes(); std::unordered_map empty_strategy; if (!this->config.export_strategy_computation_graph_file.empty()) { @@ -1929,6 +1930,8 @@ void GraphSearchHelper::graph_optimize( } } best_graph->print_strategy_computation_graph(optimal.views); + //std::cout << "PCG:" << std::endl; + //best_graph->print_dot(); optimal_views = real_optimal_views; } @@ -3117,6 +3120,7 @@ void FFModel::graph_optimize( std::unordered_map &optimal_views) { this->graph_search->graph_optimize( budget, only_data_parallel, best_graph, optimal_views); + best_graph->print_dot(); } bool FFModel::convert_graph_to_operators( From 7030ae09677f4157118bb79dbeeab3a89354cc79 Mon Sep 17 00:00:00 2001 From: Teresa Noyola Date: Fri, 21 Apr 2023 16:45:26 -0400 Subject: [PATCH 02/52] padded input to 512, added isExact to slice_tensor --- examples/python/pytorch/mt5/mt5_ff.py | 9 +++++++++ python/flexflow/torch/model.py | 14 +++++++++++++- src/runtime/model.cc | 3 ++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index dccd9ba48c..fa0801a517 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -3,6 +3,7 @@ import sys import numpy as np +import torch from flexflow.core import * from flexflow.torch.model import PyTorchModel #from transformers import MT5ForConditionalGeneration, T5Tokenizer @@ -89,9 +90,17 @@ def top_level_task(): # Load train data as numpy arrays print("Loading data...") ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")) + ids = np.pad(ids, ((0,0), (0,17)), 'constant') + #ids = np.random.randint(0, 5, (1000, 512)) + #print('ids_shape', ids.shape) + #print('ids', ids) mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")) + mask = np.pad(mask, ((0,0), (0,17)), 'constant') + #mask = np.random.randint(0, 2, (1000, 512)) #y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")) + lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant') + #lm_labels = np.random.randint(-1, 5, (1000, 512)) batch_size = ffconfig.batch_size input_ids_shape = (batch_size, ids.shape[1]) diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 2e6a2a23f3..ecd455a292 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -1409,6 +1409,7 @@ def to_ff(self, ffmodel, node_to_output): @staticmethod def slice_tensor(ffmodel, tensor, slices, name): + """Returns a reshaped tensor based on the given slices.""" def is_colon(slice_elem): """Returns if the slice is equivalent to `:`.""" @@ -1428,6 +1429,12 @@ def is_truncate(slice_elem, old_size): def is_single_element(slice_elem): return isinstance(slice_elem, int) + def is_exact(slice_elem, old_size): + start = 0 if slice_elem.start == None else slice_elem.start + stop = old_size if slice_elem.stop == None else slice_elem.stop + new_size = stop - start + return new_size == old_size + shape = tensor.dims # Fewer slices than input dimensions @@ -1457,6 +1464,8 @@ def is_single_element(slice_elem): curr_tensor = ffmodel.split(input=curr_tensor, sizes=splits, axis=j, name=name)[0] new_shape.append(1) j -= 1 + elif is_exact(slice_elem, shape[j]): + pass elif is_truncate(slice_elem, shape[j]): assert j >= 0 start = 0 if slice_elem.start == None else slice_elem.start @@ -1482,7 +1491,10 @@ def is_single_element(slice_elem): assert 0, f"Unsupported slice 
element: {slice_elem}" new_shape.reverse() - return ffmodel.reshape(input=curr_tensor, shape=new_shape, name=name,) + if len(new_shape) == 0: + return curr_tensor + else: + return ffmodel.reshape(input=curr_tensor, shape=new_shape, name=name,) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5318684f53..55e76d5071 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2634,7 +2634,8 @@ Op *FFModel::create_operator_from_layer( if (pt->dims[num_dims-1].size == 1) { Replicate *repl = new Replicate( *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); - operators.push_back(repl); + repl->outputs[0]->dims[num_dims-1].is_replica_dim = true; + operators.push_back(repl); } else { Repartition *part = new Repartition( *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); From 280d29a47e5366fcfd968769aa7b64499f7fa7e4 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Sun, 7 May 2023 16:52:58 -0700 Subject: [PATCH 03/52] Add multiprecision support to replicate --- include/flexflow/parallel_ops/replicate.h | 14 ++++++ src/parallel_ops/replicate.cc | 59 ++++++++++++++++++++--- 2 files changed, 65 insertions(+), 8 deletions(-) diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index 381f690cdc..44ab0c20a5 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -39,6 +39,20 @@ class Replicate : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + + template + static void + forward_task_with_type(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + + template + static void backward_task_with_type(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 031166e63e..16919bb739 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -143,7 +143,7 @@ void Replicate::forward(FFModel const &ff) { assert(numInputs == 1); IndexLauncher launcher(REPLICATE_FWD_TASK_ID, outputs[0]->parallel_is, - TaskArgument(NULL, 0), + TaskArgument(&data_type, sizeof(DataType)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -169,7 +169,7 @@ void Replicate::backward(FFModel const &ff) { assert(numInputs == 1); IndexLauncher launcher(REPLICATE_BWD_TASK_ID, inputs[0]->parallel_is, - TaskArgument(NULL, 0), + TaskArgument(&data_type, sizeof(DataType)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -222,7 +222,29 @@ bool Replicate::append_parallel_op_info( return true; } +/*static*/ void Replicate::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + DataType data_type = *((DataType *)task->args); + if (data_type == DT_FLOAT) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_DOUBLE) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT32) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT64) { + forward_task_with_type(task, regions, ctx, runtime); + } else { + assert(false && "Unsupported data type in Replicate forward"); + } +} + +template +void Replicate::forward_task_with_type(Task const *task, std::vector 
const ®ions, Context ctx, Runtime *runtime) { @@ -238,15 +260,36 @@ void Replicate::forward_task(Task const *task, assert(output_domain.hi()[i] == input_domain.hi()[i]); } assert(input_domain.get_volume() == output_domain.get_volume()); - float const *input_ptr = helperGetTensorPointerRO( + T const *input_ptr = helperGetTensorPointerRO( regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *output_ptr = helperGetTensorPointerRW( + T *output_ptr = helperGetTensorPointerRW( regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel(input_ptr, output_ptr, input_domain.get_volume()); + forward_kernel(input_ptr, output_ptr, input_domain.get_volume()); } void Replicate::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + DataType data_type = *((DataType *)task->args); + if (data_type == DT_FLOAT) { + backward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_DOUBLE) { + backward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT32) { + backward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT64) { + backward_task_with_type(task, regions, ctx, runtime); + } else { + assert(false && "Unsupported data type in Embedding forward"); + } +} + +template +void Replicate::backward_task_with_type(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { @@ -263,12 +306,12 @@ void Replicate::backward_task(Task const *task, } size_t num_elements = input_grad_domain.get_volume(); size_t num_replicas = output_grad_domain.get_volume() / num_elements; - float const *output_grad_ptr = helperGetTensorPointerRO( + T const *output_grad_ptr = helperGetTensorPointerRO( regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *input_grad_ptr = helperGetTensorPointerRW( + T *input_grad_ptr = helperGetTensorPointerRW( regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel( + backward_kernel( output_grad_ptr, input_grad_ptr, num_elements, num_replicas); } From 8e84401cf5e7b623f6a39ab968c6563a06ec060d Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Sun, 7 May 2023 16:56:46 -0700 Subject: [PATCH 04/52] Add explicit template instantiations for replicate kernels --- src/parallel_ops/kernels/replicate_kernels.cu | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index de208d2aed..0055bbaea0 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -68,6 +68,42 @@ template void backward_kernel(float const *output_grad_ptr, float *input_grad_ptr, size_t num_elements, size_t num_replicas); +template void forward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(double const *output_grad_ptr, + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); +template void forward_kernel(float const *input_ptr, + int32_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(float const *input_ptr, + int32_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(float const *output_grad_ptr, + int32_t *input_grad_ptr, + size_t 
num_elements, + size_t num_replicas); +template void forward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(double const *output_grad_ptr, + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); } // namespace Replicate } // namespace Kernels From a2ddb91c3b94d11f2446326c29b34cd5638f382b Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Sun, 7 May 2023 17:00:37 -0700 Subject: [PATCH 05/52] Fix incorrect instantiations --- src/parallel_ops/kernels/replicate_kernels.cu | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index 0055bbaea0..2b8ff55eac 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -80,28 +80,28 @@ template void backward_kernel(double const *output_grad_ptr, double *input_grad_ptr, size_t num_elements, size_t num_replicas); -template void forward_kernel(float const *input_ptr, +template void forward_kernel(int32_t const *input_ptr, int32_t *output_ptr, size_t num_elements); template __global__ void - replicate_backward_kernel(float const *input_ptr, + replicate_backward_kernel(int32_t const *input_ptr, int32_t *output_ptr, size_t num_elements, size_t num_replicas); -template void backward_kernel(float const *output_grad_ptr, +template void backward_kernel(int32_t const *output_grad_ptr, int32_t *input_grad_ptr, size_t num_elements, size_t num_replicas); -template void forward_kernel(double const *input_ptr, - double *output_ptr, +template void forward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, size_t num_elements); template __global__ void - replicate_backward_kernel(double const *input_ptr, - double *output_ptr, + replicate_backward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, size_t num_elements, size_t num_replicas); -template void backward_kernel(double const *output_grad_ptr, - double *input_grad_ptr, +template void backward_kernel(int64_t const *output_grad_ptr, + int64_t *input_grad_ptr, size_t num_elements, size_t num_replicas); From 52cc8e8b1231c5a4bc5c14f9c7c0012a8f23b464 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Mon, 8 May 2023 15:22:10 -0700 Subject: [PATCH 06/52] Add nop init_task for replicate --- include/flexflow/parallel_ops/replicate.h | 4 ++++ src/parallel_ops/replicate.cc | 9 ++++++++- src/runtime/model.cc | 7 +++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index 44ab0c20a5..9514d055f3 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -31,6 +31,10 @@ class Replicate : public ParallelOp { bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( std::vector ¶llel_ops) const override; + static void init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 16919bb739..dec5b3acbe 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -115,7 +115,7 @@ void 
Replicate::init(FFModel const &ff) { Runtime *runtime = ff.config.lg_hlr; assert(numOutputs == 1); assert(numInputs == 1); - IndexLauncher launcher(REPLICATE_FWD_TASK_ID, + IndexLauncher launcher(REPLICATE_INIT_TASK_ID, outputs[0]->parallel_is, TaskArgument(NULL, 0), argmap, @@ -141,6 +141,7 @@ void Replicate::forward(FFModel const &ff) { Runtime *runtime = ff.config.lg_hlr; assert(numOutputs == 1); assert(numInputs == 1); + DataType data_type = inputs[0]->data_type; IndexLauncher launcher(REPLICATE_FWD_TASK_ID, outputs[0]->parallel_is, TaskArgument(&data_type, sizeof(DataType)), @@ -222,6 +223,12 @@ bool Replicate::append_parallel_op_info( return true; } +void Replicate::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { +} + /*static*/ void Replicate::forward_task(Task const *task, std::vector const ®ions, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 55e76d5071..1e95823b57 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4446,6 +4446,13 @@ void register_flexflow_internal_tasks() { registrar, "Combine Backward Task"); } // Replicate + { + TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Replicate Init Task"); + } { TaskVariantRegistrar registrar(REPLICATE_FWD_TASK_ID, "Replicate Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); From 9eb530a3fb22aa9058ed381bb3bdefd17a6534d5 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Mon, 8 May 2023 15:31:25 -0700 Subject: [PATCH 07/52] Fix replicate init_task registration --- src/runtime/model.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 1e95823b57..2e41c6af0d 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4450,7 +4450,7 @@ void register_flexflow_internal_tasks() { TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( + Runtime::preregister_task_variant( registrar, "Replicate Init Task"); } { From d7a219ca4e6af68b544d7747d04408e4b15f440b Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Wed, 10 May 2023 11:33:04 -0700 Subject: [PATCH 08/52] Hopefully print hip errors --- include/flexflow/utils/hip_helper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 6970832231..2ea09770d6 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -19,7 +19,7 @@ do { \ std::stringstream _error; \ if (status != miopenStatusSuccess) { \ - _error << "CUDNN failure: " << status; \ + _error << "CUDNN failure: " << miopenGetErrorString(status); \ FatalError(_error.str()); \ } \ } while (0) From f323a6e75c4909aba86d30a85386d533739f9413 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Wed, 10 May 2023 11:38:26 -0700 Subject: [PATCH 09/52] Instantiate extra hip replicate kernels --- .../kernels/replicate_kernels.cpp | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index 29f1d30d1f..ff9751ee34 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -76,6 +76,45 @@ 
template void backward_kernel(float const *output_grad_ptr, size_t num_elements, size_t num_replicas); +template void forward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(double const *output_grad_ptr, + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + +template void forward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int64_t const *output_grad_ptr, + int64_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + +template void forward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int32_t const *output_grad_ptr, + int32_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + } // namespace Replicate } // namespace Kernels } // namespace FlexFlow From 9a75f248d31f0d98f52610176d1a6e94142f9009 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 10 May 2023 18:57:28 +0000 Subject: [PATCH 10/52] fix --- src/parallel_ops/replicate.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index dec5b3acbe..6b24d98abb 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -75,7 +75,7 @@ Replicate::Replicate(FFModel &model, dims[replicate_dim].degree *= replicate_degree; ParallelTensorBase::update_parallel_ids(numdim, dims); outputs[0] = model.create_parallel_tensor_legion_ordering( - numdim, dims, DT_FLOAT, this); + numdim, dims, _input->data_type, this); // inputs[0]->print("Replicate::input"); // outputs[0]->print("Replicate::output"); } From c9277f34b912f3b6ed2445d35b2be725d6d4dd50 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 10 May 2023 19:04:51 +0000 Subject: [PATCH 11/52] debug changs --- src/ops/element_binary.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index b90e85588b..6adea1480c 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -381,7 +381,7 @@ void ElementBinary::forward(FFModel const &ff) { set_argumentmap_for_forward(ff, argmap); IndexLauncher launcher(ELEMENTBINARY_FWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(ElementBinary)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -448,7 +448,7 @@ __host__ void std::vector const ®ions, Context ctx, Runtime *runtime) { - // const ElementBinary* ele = (const ElementBinary*) task->args; + const ElementBinary* ele = (const ElementBinary*) task->args; ElementBinaryMeta const *m = *((ElementBinaryMeta **)task->local_args); Domain in1_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); From fe66561263fb5bf2f0e95b416b5e922cdfea567d Mon Sep 17 00:00:00 2001 From: Teresa Noyola Date: Fri, 12 May 2023 17:01:30 -0400 Subject: [PATCH 12/52] Add slice_tensor fix --- python/flexflow/torch/model.py | 18 ++++++++++++++++-- src/runtime/graph.cc | 1 + 2 files changed, 17 insertions(+), 2 
deletions(-) diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index ecd455a292..aa80758843 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -1410,6 +1410,9 @@ def to_ff(self, ffmodel, node_to_output): @staticmethod def slice_tensor(ffmodel, tensor, slices, name): + print('slices', slices) + old_shape = tensor.dims + print('old_shape', tensor.dims) """Returns a reshaped tensor based on the given slices.""" def is_colon(slice_elem): """Returns if the slice is equivalent to `:`.""" @@ -1430,12 +1433,15 @@ def is_single_element(slice_elem): return isinstance(slice_elem, int) def is_exact(slice_elem, old_size): + if slice_elem is None: + return False start = 0 if slice_elem.start == None else slice_elem.start stop = old_size if slice_elem.stop == None else slice_elem.stop new_size = stop - start return new_size == old_size shape = tensor.dims + print('input dims', tensor.dims) # Fewer slices than input dimensions diff = len(shape) - len(slices) @@ -1452,9 +1458,14 @@ def is_exact(slice_elem, old_size): curr_tensor = copy.copy(tensor) for slice_elem in reversed(slices): - if is_colon(slice_elem): + print('slice_elem', slice_elem) + if is_colon(slice_elem) or is_exact(slice_elem, shape[j]): + print('shape', shape) assert j >= 0 + print('j', j) + print('new_shape_bef', new_shape) new_shape.append(shape[j]) + print('new_shape_aft', new_shape) j -= 1 elif is_unsqueeze(slice_elem): new_shape.append(1) @@ -1465,7 +1476,7 @@ def is_exact(slice_elem, old_size): new_shape.append(1) j -= 1 elif is_exact(slice_elem, shape[j]): - pass + print('exact') elif is_truncate(slice_elem, shape[j]): assert j >= 0 start = 0 if slice_elem.start == None else slice_elem.start @@ -1494,6 +1505,9 @@ def is_exact(slice_elem, old_size): if len(new_shape) == 0: return curr_tensor else: + print('new_shape', new_shape) + if old_shape == new_shape: + return curr_tensor return ffmodel.reshape(input=curr_tensor, shape=new_shape, name=name,) diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 77bb29cf47..4b55a39104 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1602,6 +1602,7 @@ GraphOptimalViewSerialized graph->add_edge(srcNode, dstNode, dstOp->inputs[j]->owner_idx, j); } } + graph->print_dot(); best_graph = std::unique_ptr(graph); MachineView data_parallel_view; data_parallel_view.device_type = MachineView::GPU; From 11777485a15c88f6c981a0b2a5116ec3a48d5671 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 12 May 2023 14:14:06 -0700 Subject: [PATCH 13/52] Add logging for metrics --- src/metrics_functions/metrics_functions.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/metrics_functions/metrics_functions.cc b/src/metrics_functions/metrics_functions.cc index e8ccbfe2e4..7244b06925 100644 --- a/src/metrics_functions/metrics_functions.cc +++ b/src/metrics_functions/metrics_functions.cc @@ -15,6 +15,7 @@ #include "flexflow/metrics_functions.h" #include "flexflow/model.h" +#include namespace FlexFlow { @@ -90,6 +91,8 @@ void Metrics::compute(FFModel *model, false /*must*/, 0 /*mapper_id*/, logit->machine_view.hash()); + std::cout << "logit shape: " << logit->get_shape() << std::endl; + std::cout << "label shape: " << label->get_shape() << std::endl; launcher.add_region_requirement(RegionRequirement( logit->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, logit->region)); launcher.add_field(0, FID_DATA); @@ -154,6 +157,7 @@ PerfMetrics assert(acc_label.rect.lo[0] == acc_label.rect.hi[0]); // Cannot measure 
categorical_crossentropy w/ sparse labels // Use measure_sparse_categorical_crossentropy instead + std::cout << "num_classes: " << num_classes << std::endl; assert(!me->measure_categorical_crossentropy); Metrics::update_metrics_sparse_label_kernel_wrapper(acc_logit.ptr, acc_label.ptr, From 5b7cacea6c72bf779f26f8298e054e4bbd2d1eaf Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 12 May 2023 14:21:01 -0700 Subject: [PATCH 14/52] Add the cuda metrics hack to hip kernel as well --- src/metrics_functions/metrics_functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metrics_functions/metrics_functions.cpp b/src/metrics_functions/metrics_functions.cpp index 90d727b9b1..d30686be24 100644 --- a/src/metrics_functions/metrics_functions.cpp +++ b/src/metrics_functions/metrics_functions.cpp @@ -30,7 +30,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = -1; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { From e798a91d49a267550ee902206f0ade158ba7eb47 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 12 May 2023 14:44:05 -0700 Subject: [PATCH 15/52] Add parallel dim pretty printing --- gdb/pretty_print.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/gdb/pretty_print.py b/gdb/pretty_print.py index 4cccc9b76b..aac8d56444 100644 --- a/gdb/pretty_print.py +++ b/gdb/pretty_print.py @@ -80,6 +80,17 @@ def to_string(self): toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]') return f'ParallelTensorBase<{" ".join(toks)}>' +class ParallelDimPrinter: + def __init__(self, val): + self.val = val + + def to_string(self): + size = self.val['size'] + degree = self.val['degree'] + parallel_idx = self.val['parallel_idx'] + return f'ParallelDim' + + def build_pretty_printer(): pp = gdb.printing.RegexpCollectionPrettyPrinter( "flexflow") @@ -89,6 +100,7 @@ def build_pretty_printer(): pp.add_printer('Domain', '^Legion::Domain$', DomainPrinter) pp.add_printer('ParallelTensorShape', '^FlexFlow::ParallelTensorShape$', TensorShapePrinter) pp.add_printer('ParallelTensorBase', '^FlexFlow::ParallelTensorBase$', ParallelTensorBasePrinter) + pp.add_printer('ParallelDim', '^FlexFlow::ParallelDim$', ParallelDimPrinter) return pp gdb.printing.register_pretty_printer( From 90541cfa8f03a9f8d5b4da8c6ed2d9d163d65b68 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 12 May 2023 22:21:14 +0000 Subject: [PATCH 16/52] [Embedding] bug fix --- src/ops/embedding.cc | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 3b53213b91..18bf7e324e 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -148,33 +148,23 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { int const OUT_CHANNELS = Output::OUT_CHANNELS; if (aggr == AGGR_MODE_NONE) { int num_dims = input->num_dims + 1; - for (int i = 1; i < num_dims - 1; i++) { + for (int i = 1; i < num_dims; i++) { output_dims[i] = input->dims[i - 1]; } assert(OUT_CHANNELS == 0); output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; - // Currently do not support parallelizing over the replica dim - output_dims[num_dims - 1].size = 1; - output_dims[num_dims - 1].degree = 1; - output_dims[num_dims - 1].parallel_idx = -1; - output_dims[num_dims - 
1].is_replica_dim = true; return num_dims; } else { int num_dims = input->num_dims; - for (int i = 1; i < num_dims - 1; i++) { + for (int i = 1; i < num_dims; i++) { output_dims[i] = input->dims[i]; } assert(OUT_CHANNELS == 0); output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; - // Currently do not support parallelizing over the replica dim - output_dims[num_dims - 1].size = 1; - output_dims[num_dims - 1].degree = 1; - output_dims[num_dims - 1].parallel_idx = -1; - output_dims[num_dims - 1].is_replica_dim = true; return num_dims; } // const int REPLICA = this->output_vocab_size_replica_dim(); From 786214379ac1c45a74197e154860435c89db4e0c Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 12 May 2023 15:40:42 -0700 Subject: [PATCH 17/52] Add replica dim to pretty print --- gdb/pretty_print.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/gdb/pretty_print.py b/gdb/pretty_print.py index aac8d56444..ec551002db 100644 --- a/gdb/pretty_print.py +++ b/gdb/pretty_print.py @@ -77,7 +77,13 @@ def to_string(self): size = dim['size'] degree = dim['degree'] parallel_idx = dim['parallel_idx'] - toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]') + tok = f'{i}=[s={size} d={degree} pi={parallel_idx} ' + if dim['is_replica_dim']: + tok += 'r=t' + else: + tok += 'r=f' + tok += ']' + toks.append() return f'ParallelTensorBase<{" ".join(toks)}>' class ParallelDimPrinter: @@ -88,7 +94,12 @@ def to_string(self): size = self.val['size'] degree = self.val['degree'] parallel_idx = self.val['parallel_idx'] - return f'ParallelDim' + tok = f's={size} d={degree} pi={parallel_idx} ' + if dim['is_replica_dim']: + tok += 'r=t' + else: + tok += 'r=f' + return f'ParallelDim<{tok}>' def build_pretty_printer(): From ef43c36343e6c0191bb790546e4fd441aabf1cf7 Mon Sep 17 00:00:00 2001 From: Teresa Noyola Date: Fri, 12 May 2023 16:29:28 -0700 Subject: [PATCH 18/52] Fix replicate issue with python hack --- gdb/pretty_print.py | 2 +- python/flexflow/torch/model.py | 5 +++++ src/runtime/model.cc | 22 +++++++++++----------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/gdb/pretty_print.py b/gdb/pretty_print.py index ec551002db..0c384884f0 100644 --- a/gdb/pretty_print.py +++ b/gdb/pretty_print.py @@ -83,7 +83,7 @@ def to_string(self): else: tok += 'r=f' tok += ']' - toks.append() + toks.append(tok) return f'ParallelTensorBase<{" ".join(toks)}>' class ParallelDimPrinter: diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index aa80758843..2d0c388e3e 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -2353,6 +2353,9 @@ def to_ff(self, ffmodel, node_to_output): def attr_to_ff_tensor(self, ffmodel): torch_tensor = self.attr + assert (torch_tensor.shape[0] == 1) + batch_size = ffmodel._ffconfig.batch_size + torch_tensor = np.repeat(torch_tensor, batch_size, axis=0) ff_dtype = Node.torch_to_ff_dtype(torch_tensor.dtype) requires_grad = torch_tensor.requires_grad @@ -2367,6 +2370,8 @@ def attr_to_ff_tensor(self, ffmodel): ff_dtype = DataType.DT_FLOAT np_tensor = np_tensor.astype(np.float32) + print('attr: ', torch_tensor.shape) + assert (torch_tensor.shape[0] == batch_size) ff_tensor = ffmodel.create_tensor( torch_tensor.shape, ff_dtype, requires_grad, ) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2e41c6af0d..d9ee74ff4a 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2630,17 +2630,17 @@ Op 
*FFModel::create_operator_from_layer( assert(tensor->parallel_tensor == nullptr); tensor->parallel_tensor = pt; // start from data parllel tensor - if (config.only_data_parallel) { - if (pt->dims[num_dims-1].size == 1) { - Replicate *repl = new Replicate( - *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); - repl->outputs[0]->dims[num_dims-1].is_replica_dim = true; - operators.push_back(repl); - } else { - Repartition *part = new Repartition( - *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); - operators.push_back(part); - } + if (config.only_data_parallel && config.numNodes * config.workersPerNode > 1) { + if (pt->dims[num_dims-1].size == 1) { + Replicate *repl = new Replicate( + *this, pt, num_dims, config.numNodes * config.workersPerNode); + repl->outputs[0]->dims[num_dims].is_replica_dim = true; + operators.push_back(repl); + } else { + Repartition *part = new Repartition( + *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); + operators.push_back(part); + } } return operators[operators.size() - 1]; } From dd8090e2c3ea2d8c6e6de328fdc929f495d0173e Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Sat, 20 May 2023 16:18:27 -0700 Subject: [PATCH 19/52] Use local json submodule --- cmake/json.cmake | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cmake/json.cmake b/cmake/json.cmake index 63ac50b203..3cf57a7864 100644 --- a/cmake/json.cmake +++ b/cmake/json.cmake @@ -1,4 +1 @@ -include(FetchContent) - -FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz) -FetchContent_MakeAvailable(json) \ No newline at end of file +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/json) From 0dc618772d52a934c5b6c3b254d7bf3c44b043c2 Mon Sep 17 00:00:00 2001 From: Teresa Noyola Date: Sat, 20 May 2023 19:30:32 -0400 Subject: [PATCH 20/52] ofi conduit-related fixes --- config/config.inc | 4 +++- config/config.linux | 19 +++++-------------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/config/config.inc b/config/config.inc index da043b2880..0ec6a4696f 100644 --- a/config/config.inc +++ b/config/config.inc @@ -76,6 +76,8 @@ if [ "$FF_USE_GASNET" = "ON" ]; then SET_GASNET+=" -DFF_GASNET_CONDUIT=mpi" elif [ "$FF_GASNET_CONDUIT" = "udp" ]; then SET_GASNET+=" -DFF_GASNET_CONDUIT=udp" + elif [ "$FF_GASNET_CONDUIT" = "ofi" ]; then + SET_GASNET+=" -DFF_GASNET_CONDUIT=ofi" fi elif [ "$FF_USE_GASNET" = "OFF" ]; then SET_GASNET="-DFF_USE_GASNET=OFF" @@ -176,7 +178,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then chmod +x "$(pwd)/nvidia_hipcc" SET_CXX="-DCMAKE_CXX_COMPILER=$(pwd)/nvidia_hipcc -DCMAKE_CXX_LINKER=$(pwd)/nvidia_hipcc" else - SET_CXX="-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_CXX_LINKER=/opt/rocm/bin/hipcc" + SET_CXX="-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc -DCMAKE_CXX_LINKER=$ROCM_PATH/bin/hipcc -DHIP_PATH=$ROCM_PATH/hip" fi fi fi diff --git a/config/config.linux b/config/config.linux index 72fe466a7f..ccbe78cefc 100755 --- a/config/config.linux +++ b/config/config.linux @@ -14,7 +14,7 @@ #INSTALL_DIR= # set build type -BUILD_TYPE=Debug +BUILD_TYPE=${BUILD_TYPE:-Release} # set CUDA Arch to the desired GPU architecture(s) to target (e.g. pass "FF_CUDA_ARCH=60" for Pascal). # To pass more than one value, separate architecture numbers with a comma (e.g. FF_CUDA_ARCH=70,75). 
@@ -32,10 +32,10 @@ CUDA_DIR=${CUDA_DIR:-"/sw/summit/cuda/11.0.3"} FF_USE_PYTHON=${FF_USE_PYTHON:-ON} # enable GASNet -FF_USE_GASNET=${FF_USE_GASNET:-OFF} +FF_USE_GASNET=${FF_USE_GASNET:-ON} # select GASNET conduit -FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} +FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ofi} # build C++ examples FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF} @@ -76,14 +76,5 @@ function get_build_configs() { BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} FF_USE_GASNET=${FF_USE_GASNET} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}" } -if [ -n "$1" ]; then - if [ "$1" != "get-docker-configs" ]; then - . $(dirname $0)/config.inc - # You can pass the name of the variable you want to print out as $1. This - # is used in the python setup script to get the cmake config - echo "${!1}" - fi -else - . $(dirname $0)/config.inc - run_cmake $* -fi +. $(dirname $0)/config.inc +run_cmake $* From 0950ac71d7a64bb4f0b6f6b096b24ef0155e6dd9 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Mon, 22 May 2023 13:22:11 -0700 Subject: [PATCH 21/52] Add mpi flags for hip --- config/config.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/config.inc b/config/config.inc index 0ec6a4696f..940e741934 100644 --- a/config/config.inc +++ b/config/config.inc @@ -178,7 +178,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then chmod +x "$(pwd)/nvidia_hipcc" SET_CXX="-DCMAKE_CXX_COMPILER=$(pwd)/nvidia_hipcc -DCMAKE_CXX_LINKER=$(pwd)/nvidia_hipcc" else - SET_CXX="-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc -DCMAKE_CXX_LINKER=$ROCM_PATH/bin/hipcc -DHIP_PATH=$ROCM_PATH/hip" + SET_CXX="-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc -DCMAKE_CXX_LINKER=$ROCM_PATH/bin/hipcc -DHIP_PATH=$ROCM_PATH/hip -DCMAKE_CXX_FLAGS='-I${MPICH_DIR}/include' -DCMAKE_EXE_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi' -DCMAKE_SHARED_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi'" fi fi fi From 4b06040b3f8cc9bb1d116037e2e5bdbac7dd9e35 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 24 May 2023 18:32:26 +0000 Subject: [PATCH 22/52] fix fusion bug --- src/ops/fused.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 618d957169..e3de838021 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -110,7 +110,9 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { // in forward and backward assert(!op->is_parallel_op()); // Currently don't consider nested fusion - assert(op->op_type != OP_FUSED); + if (op->op_type == OP_FUSED) { + return false; + } MachineView my_view = outputs[0]->machine_view; MachineView op_view = op->outputs[0]->machine_view; if (my_view == op_view) { From 99e9f959fbe069d865ceccb9d324c2956f663085 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 24 May 2023 18:37:07 +0000 Subject: [PATCH 23/52] increase the max number of regions in a ZeroInitMeta from 64 to 128 --- include/flexflow/initializer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/flexflow/initializer.h b/include/flexflow/initializer.h index 062530a655..3c44d1184a 100644 --- a/include/flexflow/initializer.h +++ b/include/flexflow/initializer.h @@ -46,7 +46,7 @@ 
class GlorotUniform : public Initializer { class Op; struct ZeroInitMeta { - static int const MAX_NUM_REGIONS = 64; + static int const MAX_NUM_REGIONS = 128; int num_regions; Op *op_ptr; DataType data_types[MAX_NUM_REGIONS]; From 282c44a865abcf81b88d062704252760c5295909 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 24 May 2023 18:48:49 +0000 Subject: [PATCH 24/52] support mixed precision --- src/ops/fused.cu | 54 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index ca2a331984..3633c68505 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -414,10 +414,25 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + assert(my_input_accessor[0].data_type == my_output_accessor[0].data_type); + if (my_input_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int64_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int32_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int64_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type"); + } break; } case OP_TRANSPOSE: { @@ -427,12 +442,31 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); + assert(my_input_accessor[0].data_type == my_output_accessor[0].data_type); + if (my_input_accessor[0].data_type == DT_INT64) { + Kernels::Transpose::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int64_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); + } else if (my_input_accessor[0].data_type == DT_INT32) { + Kernels::Transpose::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int32_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); + } else if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Transpose::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); + } else { + assert(false && "Unsupported data type"); + } break; } default: { From 992dcb925a9f832b062c09039baf381f0e0250fc Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 24 May 2023 18:53:19 +0000 Subject: [PATCH 25/52] undo changes to Fused::Transpose --- config/config.linux | 4 ++-- src/ops/fused.cu | 30 ++++++------------------------ 2 files changed, 8 insertions(+), 26 deletions(-) diff --git 
a/config/config.linux b/config/config.linux index ccbe78cefc..c630462034 100755 --- a/config/config.linux +++ b/config/config.linux @@ -23,10 +23,10 @@ BUILD_TYPE=${BUILD_TYPE:-Release} FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"} # set CUDNN dir in case cmake cannot autodetect a path -CUDNN_DIR=${CUDNN_DIR:-"/sw/summit/cuda/11.0.3"} #/usr/local/cuda +CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} #/usr/local/cuda # set CUDA dir in case cmake cannot autodetect a path -CUDA_DIR=${CUDA_DIR:-"/sw/summit/cuda/11.0.3"} +CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"} # enable Python FF_USE_PYTHON=${FF_USE_PYTHON:-ON} diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 3633c68505..720f1ed693 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -443,30 +443,12 @@ __host__ void FusedOp::forward_task(Task const *task, my_output_accessor[0].domain.get_volume()); TransposeMeta *m = (TransposeMeta *)metas->meta[op]; assert(my_input_accessor[0].data_type == my_output_accessor[0].data_type); - if (my_input_accessor[0].data_type == DT_INT64) { - Kernels::Transpose::forward_kernel_wrapper( - m, - my_input_accessor[0].get_int64_ptr(), - my_output_accessor[0].get_int64_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); - } else if (my_input_accessor[0].data_type == DT_INT32) { - Kernels::Transpose::forward_kernel_wrapper( - m, - my_input_accessor[0].get_int32_ptr(), - my_output_accessor[0].get_int32_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); - } else if (my_input_accessor[0].data_type == DT_FLOAT) { - Kernels::Transpose::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); - } else { - assert(false && "Unsupported data type"); - } + Kernels::Transpose::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); break; } default: { From f52877415e1cee78f90171d9589fb12963ebb34d Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 24 May 2023 18:54:38 +0000 Subject: [PATCH 26/52] undo changes to config.linux --- config/config.linux | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/config.linux b/config/config.linux index c630462034..ccbe78cefc 100755 --- a/config/config.linux +++ b/config/config.linux @@ -23,10 +23,10 @@ BUILD_TYPE=${BUILD_TYPE:-Release} FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"} # set CUDNN dir in case cmake cannot autodetect a path -CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} #/usr/local/cuda +CUDNN_DIR=${CUDNN_DIR:-"/sw/summit/cuda/11.0.3"} #/usr/local/cuda # set CUDA dir in case cmake cannot autodetect a path -CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"} +CUDA_DIR=${CUDA_DIR:-"/sw/summit/cuda/11.0.3"} # enable Python FF_USE_PYTHON=${FF_USE_PYTHON:-ON} From a68150db548c52e85943c68f9410b23e26a4eb86 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 2 Jun 2023 22:16:15 +0000 Subject: [PATCH 27/52] try to fix layernorm --- python/flexflow/torch/model.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 2d0c388e3e..5ad5ebea88 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -653,14 +653,25 @@ def string_to_ff(string, ffmodel, node_to_output): data = Node.StringData(string) name = data.name input_tensor = node_to_output[data.innodes[0]] - return 
ffmodel.identity(input=input_tensor, name=name) - # TODO: Change to ffmodel.layernorm() once supported + axes = [len(input_tensor.dims) - 1] + return ffmodel.layer_norm( + input=input_tensor, + axes=axes, + elementwise_affine=True, + eps=1e-6, + name=name, + ) def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] - return ffmodel.identity(input=input_tensor, name=self.name) - # TODO: Change to ffmodel.layernorm() once supported - + axes = [len(input_tensor.dims) - 1] + return ffmodel.layer_norm( + input=input_tensor, + axes=axes, + elementwise_affine=True, + eps=1e-6, + name=name, + ) class T5LayerNormNode(Node): """ From 2bf9afc43d8f81fcaf44f158200622f97bc0a661 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 2 Jun 2023 22:21:27 +0000 Subject: [PATCH 28/52] fix typo --- python/flexflow/torch/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 5ad5ebea88..11e0c16e48 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -670,7 +670,7 @@ def to_ff(self, ffmodel, node_to_output): axes=axes, elementwise_affine=True, eps=1e-6, - name=name, + name=self.name, ) class T5LayerNormNode(Node): From f6f7a327f268daa434471ace50025fb940123c89 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 2 Jun 2023 19:20:42 -0700 Subject: [PATCH 29/52] Add possible layernorm fix --- src/ops/layer_norm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 5d7fff3410..a8c429cd82 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -182,7 +182,7 @@ LayerNorm::LayerNorm(FFModel &model, M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; } effective_num_elements = M; - effective_batch_size = inputs[0]->get_volume() / M; + effective_batch_size = inputs[0]->get_shape().get_piece_size() / M; if (numWeights > 0 && allocate_weights) { int kernel_dims = 2; assert(false); From 5e03b0a536b44723a72fd8ba60be27eca652234b Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 2 Jun 2023 19:33:03 -0700 Subject: [PATCH 30/52] Fix additional layernorm bug due to get_piece_size return size in bytes --- include/flexflow/parallel_tensor.h | 1 + src/runtime/parallel_tensor.cc | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/flexflow/parallel_tensor.h b/include/flexflow/parallel_tensor.h index aed99c8204..c8791275dd 100644 --- a/include/flexflow/parallel_tensor.h +++ b/include/flexflow/parallel_tensor.h @@ -100,6 +100,7 @@ struct ParallelTensorShape { RecordFormatter as_dot() const; size_t get_piece_size() const; + size_t get_piece_num_elements() const; bool is_valid() const; int get_num_replica_dims() const; diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 963ad8af73..479bde3898 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -135,12 +135,18 @@ bool ParallelTensorShape::operator!=(ParallelTensorShape const &other) const { size_t ParallelTensorShape::get_piece_size() const { size_t piece_size = data_type_size(this->data_type); + return piece_size * this->get_piece_num_elements(); +} + +size_t ParallelTensorShape::get_piece_num_elements() const { + size_t piece_num_elements = 1; for (int i = 0; i < this->num_dims; i++) { - piece_size *= this->dims[i].size / this->dims[i].degree; + piece_num_elements *= this->dims[i].size / this->dims[i].degree; } - return piece_size; + return 
piece_num_elements; } + RecordFormatter ParallelTensorShape::as_dot() const { RecordFormatter r; for (int i = 0; i < this->num_dims; i++) { From 53fb8bd042b244df65a14770cc9f0606c4e0e6cd Mon Sep 17 00:00:00 2001 From: Teresa Noyola Date: Fri, 2 Jun 2023 20:00:24 -0700 Subject: [PATCH 31/52] Bugfixes --- gdb/pretty_print.py | 6 +++++- src/ops/layer_norm.cc | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/gdb/pretty_print.py b/gdb/pretty_print.py index 0c384884f0..e6fbe298ce 100644 --- a/gdb/pretty_print.py +++ b/gdb/pretty_print.py @@ -61,7 +61,11 @@ def to_string(self): size = dim['size'] degree = dim['degree'] parallel_idx = dim['parallel_idx'] - toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]') + if dim['is_replica_dim']: + is_replica = 'r=t' + else: + is_replica = 'r=f' + toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx} {is_replica}]') return f'TensorShape<{" ".join(toks)}>' class ParallelTensorBasePrinter: diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index a8c429cd82..e8c65b4b03 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -182,7 +182,7 @@ LayerNorm::LayerNorm(FFModel &model, M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; } effective_num_elements = M; - effective_batch_size = inputs[0]->get_shape().get_piece_size() / M; + effective_batch_size = inputs[0]->get_shape().get_piece_num_elements() / M; if (numWeights > 0 && allocate_weights) { int kernel_dims = 2; assert(false); From 449a14c2369e6e886b7d54503b76f2da97899135 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 2 Jun 2023 20:01:22 -0700 Subject: [PATCH 32/52] Actually check elementwise_affine --- src/ops/layer_norm.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index e8c65b4b03..07584cc5b1 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -63,6 +63,7 @@ Tensor FFModel::layer_norm(const Tensor input, float eps, char const *name) { // FIXME: currently disable elementwise_affine + assert(!elementwise_affine); elementwise_affine = false; // axes must be the last axes.size() dimensions for (int i = 0; i < axes.size(); i++) { From c737be64070e276c9276adb72b4d413e2a19652a Mon Sep 17 00:00:00 2001 From: Teresa Noyola Date: Mon, 5 Jun 2023 16:58:59 -0700 Subject: [PATCH 33/52] Revert "Actually check elementwise_affine" This reverts commit 449a14c2369e6e886b7d54503b76f2da97899135. 
--- src/ops/layer_norm.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 07584cc5b1..e8c65b4b03 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -63,7 +63,6 @@ Tensor FFModel::layer_norm(const Tensor input, float eps, char const *name) { // FIXME: currently disable elementwise_affine - assert(!elementwise_affine); elementwise_affine = false; // axes must be the last axes.size() dimensions for (int i = 0; i < axes.size(); i++) { From a98e09d71ab03683e846b92bd52e442603895b59 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Mon, 5 Jun 2023 21:15:34 -0700 Subject: [PATCH 34/52] Change optimizer to adam with correct hyperparams --- examples/python/pytorch/mt5/mt5_ff.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index fa0801a517..08af8d88a7 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -126,7 +126,8 @@ def top_level_task(): output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True) #from flexflow.torch.model import file_to_ff #file_to_ff("mt5.ff", ffmodel, input_tensors) - ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) + ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, adam_beta1=0.9, adam_beta2=0.98, weight_decay=0.0, adam_epsilon=2e-8) + # ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) print("Compiling the model...") ffmodel.compile( From 4bec811060862a7f882d2e6e81fbcd9c70e2d9b6 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 4 Jul 2023 23:26:34 +0000 Subject: [PATCH 35/52] fix training bert model. --- examples/python/pytorch/mt5/mt5_ff.py | 26 +- include/flexflow/model.h | 6 +- .../flexflow/ops/kernels/softmax_kernels.h | 6 +- include/flexflow/ops/layer_norm.h | 11 +- include/flexflow/ops/softmax.h | 2 + include/flexflow/ops/softmax_params.h | 1 + include/flexflow/utils/cuda_helper.h | 6 + python/flexflow/core/flexflow_cffi.py | 23 +- python/flexflow/torch/model.py | 51 ++-- python/flexflow_c.cc | 7 +- python/flexflow_c.h | 1 + src/loss_functions/loss_functions.cc | 11 +- src/loss_functions/loss_functions.cu | 59 ++-- src/metrics_functions/metrics_functions.cc | 6 +- src/metrics_functions/metrics_functions.cu | 18 +- src/ops/element_unary.cu | 5 +- src/ops/kernels/softmax.cu | 37 ++- src/ops/layer_norm.cc | 254 +++++++++++++++--- src/ops/layer_norm.cu | 248 ++++++++++++----- src/ops/softmax.cc | 46 +++- src/runtime/cuda_helper.cu | 77 +++++- src/runtime/graph.cc | 6 +- src/runtime/substitution.cc | 9 +- 23 files changed, 720 insertions(+), 196 deletions(-) diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index 08af8d88a7..c2868e9d1e 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -64,8 +64,8 @@ def preprocess_train() -> None: y_shape = y.shape assert len(y.shape) == 2, \ "`y` should have shape (num examples, sequence length)" - y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) - lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) + y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) + lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) y_ids[:, :] = y[:, :-1] lm_labels[:, :] = y[:, 1:] @@ -89,26 +89,29 @@ def top_level_task(): #model = BertModel.from_pretrained("bert-base-uncased") # Load train data as numpy arrays print("Loading data...") - ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")) + ids = 
np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")).astype('int32') ids = np.pad(ids, ((0,0), (0,17)), 'constant') #ids = np.random.randint(0, 5, (1000, 512)) #print('ids_shape', ids.shape) #print('ids', ids) - mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")) + mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")).astype('int32') mask = np.pad(mask, ((0,0), (0,17)), 'constant') #mask = np.random.randint(0, 2, (1000, 512)) #y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) - lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")) + lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")).astype('int32') lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant') #lm_labels = np.random.randint(-1, 5, (1000, 512)) + position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() + token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() + batch_size = ffconfig.batch_size input_ids_shape = (batch_size, ids.shape[1]) attention_mask_shape = (batch_size, mask.shape[1]) #decoder_input_ids_shape = (batch_size, y_ids.shape[1]) input_tensors = [ - ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids - ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask + ffmodel.create_tensor(input_ids_shape, DataType.DT_INT32), # input_ids + ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT32), # attention_mask #ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids ] encoder_seq_length = ids.shape[1] @@ -126,7 +129,7 @@ def top_level_task(): output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True) #from flexflow.torch.model import file_to_ff #file_to_ff("mt5.ff", ffmodel, input_tensors) - ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, adam_beta1=0.9, adam_beta2=0.98, weight_decay=0.0, adam_epsilon=2e-8) + ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, beta1=0.9, beta2=0.98, weight_decay=0.0, epsilon=2e-8) # ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) print("Compiling the model...") @@ -138,6 +141,9 @@ def top_level_task(): MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, ], ) + + # load weights here + ffmodel.load_bert_pretrained(checkpoint=model) print("Creating data loaders...") print('id_dtype', ids.dtype) @@ -148,6 +154,8 @@ def top_level_task(): #decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) # NOTE: We cast down the label tensor data to 32-bit to accommodate the # label tensor's required dtype + token_type_ids_dl = ffmodel.create_data_loader(input_tensors[2], token_type_ids) + position_id_dl = ffmodel.create_data_loader(input_tensors[3], position_id) labels_dl = ffmodel.create_data_loader( ffmodel.label_tensor, lm_labels.astype("int32") ) @@ -159,7 +167,7 @@ def top_level_task(): epochs = ffconfig.epochs ffmodel.fit( #x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], - x=[input_ids_dl, attention_mask_dl], + x=[input_ids_dl, attention_mask_dl, position_id_dl, token_type_ids_dl], y=labels_dl, batch_size=batch_size, epochs=epochs, ) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index c6bc6929ad..0496d5fa8f 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -472,6 +472,7 @@ class FFModel { std::vector const &axes, bool elementwise_affine, float eps, + DataType data_type = DT_NONE, char const *name = NULL); // Add a batch_norm layer Tensor @@ -518,7 +519,10 @@ class FFModel { // Add a flat layer Tensor flat(const 
Tensor input, char const *name = NULL); // Add a softmax layer - Tensor softmax(const Tensor input, int dim = -1, char const *name = NULL); + Tensor softmax(const Tensor input, + int dim = -1, + bool last_layer = false, + char const *name = NULL); // Create input tensors and constants Tensor transpose(const Tensor input, std::vector const &perm, diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 81b34d8558..9aec9f57c9 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -20,6 +20,7 @@ class SoftmaxMeta : public OpMeta { #endif bool profiling; int dim; + bool last_layer; char op_name[MAX_OPNAME]; }; @@ -33,6 +34,7 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements); namespace Internal { @@ -40,8 +42,10 @@ void forward_kernel(SoftmaxMeta const *m, float const *input_ptr, float *output_ptr, ffStream_t stream); -void backward_kernel(float *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements, ffStream_t stream); } // namespace Internal diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 8273b9ab52..552b9cf365 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -66,12 +66,11 @@ class LayerNorm : public Op { T *gamma_ptr, T *beta_ptr, ffStream_t stream); - template static void forward_kernel_wrapper(LayerNormMeta const *m, - T const *input_ptr, - T *output_ptr, - T *gamma_ptr, - T *beta_ptr); + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorW &gamma, + GenericTensorAccessorW &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, @@ -105,7 +104,7 @@ class LayerNormMeta : public OpMeta { bool elementwise_affine; int64_t effective_batch_size, effective_num_elements; float eps; - float *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; + void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; char op_name[MAX_OPNAME]; }; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 25a20315bd..2616294a3a 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -15,6 +15,7 @@ class Softmax : public Op { Softmax(FFModel &model, const ParallelTensor logit, int dim, + bool _last_layer, char const *name); Softmax(FFModel &model, Params const ¶ms, @@ -64,6 +65,7 @@ class Softmax : public Op { public: int dim; + bool last_layer; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/softmax_params.h b/include/flexflow/ops/softmax_params.h index d805d9966d..545e3a5cb9 100644 --- a/include/flexflow/ops/softmax_params.h +++ b/include/flexflow/ops/softmax_params.h @@ -7,6 +7,7 @@ namespace FlexFlow { struct SoftmaxParams { int dim; + bool last_layer; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(SoftmaxParams const &, SoftmaxParams const &); diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 46e323b186..a4b2be0a66 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -132,9 +132,15 @@ __host__ void updateGAS(float *para_ptr, template void print_tensor(T const *ptr, size_t num_elements, 
char const *prefix); +template +void save_tensor(T const *ptr, size_t num_elements, char const *file_name); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain); +cudnnStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(cudnnTensorDescriptor_t tensor, + Legion::Domain domain, + DataType data_type = DT_FLOAT); cudaDataType_t ff_to_cuda_datatype(DataType type); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 42339d781c..4c01057109 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1595,7 +1595,7 @@ def flat(self, input, name=None): self.add_layer(OpType.FLAT, name) return Tensor(handle, owner_op_type=OpType.FLAT) - def softmax(self, input, axis=-1, name=None): + def softmax(self, input, axis=-1, last_layer=False, name=None): """Softmax activation function. :param input: the input Tensor. @@ -1607,7 +1607,7 @@ def softmax(self, input, axis=-1, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, c_name) + handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, last_layer, c_name) self.add_layer(OpType.SOFTMAX, name) return Tensor(handle, owner_op_type=OpType.SOFTMAX) @@ -2041,6 +2041,25 @@ def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): ff_tensor.set_tensor(self, np_tensor) print("Compiled ffmodel!") + def load_bert_pretrained(self, checkpoint=None): + # store weights in dict + weights_dict = {} + for name, params in checkpoint.named_parameters(): + weights_dict[name.replace("LayerNorm", "layer_norm").replace(".", "_")] = params.detach().cpu().numpy() + print(name.replace("LayerNorm", "layer_norm").replace(".", "_")) + # some weights not in params + weights_dict['cls_predictions_decoder_weight'] = checkpoint.cls.predictions.decoder.weight.detach().cpu().numpy() + weights_dict['cls_predictions_decoder_bias'] = checkpoint.cls.predictions.decoder.bias.detach().cpu().numpy() + for i in range (self._nb_layers): + layer = self._layers[i] + if (layer.name + "_weight") in weights_dict: + print('weight: ' + layer.name) + weight = layer.get_parameter_by_id(0); + weight.set_tensor(self, weights_dict[layer.name + "_weight"]) + if (layer.name + "_bias") in weights_dict: + print('bias: ' + layer.name) + bias = layer.get_parameter_by_id(1); + bias.set_tensor(self, weights_dict[layer.name + "_bias"]) def fit(self, x=None, y=None, batch_size=None, epochs=1): """Trains the model for a fixed number of epochs (iterations on a dataset). 
diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 11e0c16e48..8ebac2146c 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -664,12 +664,13 @@ def string_to_ff(string, ffmodel, node_to_output): def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] - axes = [len(input_tensor.dims) - 1] + axes = [0] + eps = self.module.eps return ffmodel.layer_norm( input=input_tensor, axes=axes, elementwise_affine=True, - eps=1e-6, + eps=eps, name=self.name, ) @@ -1197,16 +1198,24 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) - return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, name=self.name, - ) - + if self.scalar_pos == FunctionNode.ScalarPosition.RIGHT: + return ffmodel.scalar_sub( + input=input_tensor, scalar=scalar, inplace=False, name=self.name, + ) + else: + negative_input = ffmodel.scalar_multiply( + input=input_tensor, scalar=-1, inplace=False, name=self.name + '_negative', + ) + return ffmodel.scalar_sub( + input=negative_input, scalar=-scalar, inplace=False, name=self.name, + ) + class ScalarTrueDivNode(FunctionNode): def __init__(self, node): @@ -1231,15 +1240,16 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] scalar = self.innodes[1] assert type(scalar) is float + return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, name=self.name, + input=input_tensor, scalar=scalar, inplace=False, name=self.name, ) @@ -1652,14 +1662,14 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, name=self.name, + input=input_tensor, scalar=scalar, inplace=False, name=self.name, ) @@ -2359,11 +2369,13 @@ def string_to_ff(string, ffmodel, node_to_output): "since attributes require access to the PyTorch model" ) - def to_ff(self, ffmodel, node_to_output): - return self.attr_to_ff_tensor(ffmodel) + def to_ff(self, ffmodel, node_to_output, input_tensors): + return self.attr_to_ff_tensor(ffmodel, input_tensors) + + def attr_to_ff_tensor(self, ffmodel, input_tensors): + - def attr_to_ff_tensor(self, ffmodel): - torch_tensor = self.attr + torch_tensor = self.attr assert (torch_tensor.shape[0] == 1) batch_size = ffmodel._ffconfig.batch_size torch_tensor = np.repeat(torch_tensor, batch_size, axis=0) @@ -2382,15 +2394,16 @@ def attr_to_ff_tensor(self, ffmodel): np_tensor = np_tensor.astype(np.float32) print('attr: ', torch_tensor.shape) - assert (torch_tensor.shape[0] == batch_size) + assert (torch_tensor.shape[0] == batch_size) 
ff_tensor = ffmodel.create_tensor( - torch_tensor.shape, ff_dtype, requires_grad, + torch_tensor.shape, ff_dtype, True, ) # delay set_tensor, add to ffmodel ffmodel.attr_tensors[ff_tensor] = np_tensor # ff_tensor.set_tensor( # ffmodel, np_tensor # ) + input_tensors.append(ff_tensor) return ff_tensor @@ -2472,7 +2485,7 @@ def to_ff(self, ffmodel, node_to_output, output_tensors): # `CrossEntropyLoss()` implementation logits = node_to_output[other["logits"].name] softmax_logits = ffmodel.softmax( - input=logits, name=self.name, + input=logits, last_layer=True, name=self.name, ) output_tensors[:] += [softmax_logits] else: @@ -2606,6 +2619,8 @@ def torch_to_ff(self, ffmodel, input_tensors, verbose=False): elif isinstance(node, OutputNode): node.to_ff(ffmodel, node_to_output, output_tensors) node_output = None + elif isinstance(node, AttributeNode): + node_output = node.to_ff(ffmodel, node_to_output, input_tensors) else: node_output = node.to_ff(ffmodel, node_to_output) diff --git a/python/flexflow_c.cc b/python/flexflow_c.cc index 74a5da6ce1..fd688c0c6a 100644 --- a/python/flexflow_c.cc +++ b/python/flexflow_c.cc @@ -568,8 +568,8 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, for (int i = 0; i < n; i++) { axes_vec.push_back(axes[i]); } - Tensor tensor = - handle->layer_norm(input, axes_vec, elementwise_affine, eps, name); + Tensor tensor = handle->layer_norm( + input, axes_vec, elementwise_affine, eps, input->data_type, name); DEBUG_PRINT("[LayerNorm] new Tensor %p, input %p, elementwise_affine %d, eps " "%f, name %s", tensor, @@ -730,10 +730,11 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_, const flexflow_tensor_t input_, int dim, + bool last_layer, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor tensor = handle->softmax(input, dim, name); + Tensor tensor = handle->softmax(input, dim, last_layer, name); DEBUG_PRINT( "[Softmax] new Tensor %p, input %p, name %s", tensor, input, name); return FFCObjectWrapper::wrap(tensor); diff --git a/python/flexflow_c.h b/python/flexflow_c.h index fb64c78fd2..5409002e5e 100644 --- a/python/flexflow_c.h +++ b/python/flexflow_c.h @@ -276,6 +276,7 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle, const flexflow_tensor_t input, int dim, + bool last_layer, char const *name); flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle, diff --git a/src/loss_functions/loss_functions.cc b/src/loss_functions/loss_functions.cc index ae89c3d469..d887ee9243 100644 --- a/src/loss_functions/loss_functions.cc +++ b/src/loss_functions/loss_functions.cc @@ -49,6 +49,8 @@ void Loss::backward(FFModel *model, if (loss_type == LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE) { assert(logit->get_volume() == label->get_volume()); scale_factor = 2.0f / logit->get_volume(); + } else if (loss_type == LOSS_SPARSE_CATEGORICAL_CROSSENTROPY) { + scale_factor = 1.0f; } else { scale_factor = 1.0f / model->config.batchSize; } @@ -131,9 +133,12 @@ void Loss::backward_task_with_dim(Task const *task, regions[2], task->regions[2], FID_DATA, ctx, runtime); // assertion the outter-most dim is replica dim and replica degree is 1 assert(acc_logit.rect.hi[NDIM - 1] == acc_logit.rect.lo[NDIM - 1]); - int num_samples = - acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; - int num_classes = 
acc_logit.rect.volume() / num_samples; + + int num_classes = acc_logit.rect.hi[0] - acc_logit.rect.lo[0] + 1; + int num_samples = acc_logit.rect.volume() / num_classes; + // int num_samples = + // acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; + // int num_classes = acc_logit.rect.volume() / num_samples; assert(acc_logit_grad.rect == acc_logit.rect); int k = 1; if (loss->repl_labels) { diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index 01766347b0..edd8f03fa4 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -18,6 +18,7 @@ namespace FlexFlow { +int const MASK_TOKEN = -100; using namespace Legion; __global__ void @@ -32,6 +33,25 @@ __global__ void } } +__global__ void + sparse_categorical_crossentropy_loss_backward_with_mask(float *logit_grad, + int const *label, + coord_t num_samples, + coord_t num_classes, + int const k, + float *num) { + CUDA_KERNEL_LOOP(i, num_samples * num_classes) { + int sample_id = i / num_classes; + int label_idx = label[i / (k * num_classes)]; + if (label_idx != MASK_TOKEN && (i == sample_id * num_classes + label_idx)) { + logit_grad[i] -= 1.0f; + atomicAdd(&num[0], 1.0f); + } else if (label_idx == MASK_TOKEN) { + logit_grad[i] = 0.0f; + } + } +} + __global__ void categorical_crossentropy_loss_backward(float *logit_grad, float const *logit, float const *label, @@ -74,14 +94,25 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_ptr, logit_volume * sizeof(float), cudaMemcpyDeviceToDevice)); - sparse_categorical_crossentropy_loss_backward<<>>( - logit_grad_ptr, label_ptr, num_samples, num_classes, k); - // Scale logit gradients by op->scale_factor + // calculate the scale factor inside kernel; + assert(scale_factor == 1.0f); + float *num; + checkCUDA(cudaMalloc(&num, sizeof(float))); + float effective_tokens; + int parallelism = num_samples * num_classes; + // sparse_categorical_crossentropy_loss_backward<<>>( + // logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); + sparse_categorical_crossentropy_loss_backward_with_mask<<< + GET_BLOCKS(parallelism), + CUDA_NUM_THREADS, + 0, + stream>>>(logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); + cudaMemcpy(&effective_tokens, num, sizeof(float), cudaMemcpyDeviceToHost); scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); + logit_grad_ptr, logit_grad_volume, 0, 1.0f / effective_tokens); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( @@ -122,19 +153,17 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( logit_grad_ptr, logit_grad_volume, 0, scale_factor); } -void Loss::identity_loss_backward_kernel_wrapper( - float *loss_grad_ptr, - float const *loss_ptr, - size_t loss_volume, - size_t loss_grad_volume, - float scale_factor) { +void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, + float const *loss_ptr, + size_t loss_volume, + size_t loss_grad_volume, + float scale_factor) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); identity_loss_backward<<>>( - loss_grad_ptr, loss_ptr, loss_volume); + stream>>>(loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( loss_grad_ptr, loss_grad_volume, 0, scale_factor); diff --git a/src/metrics_functions/metrics_functions.cc b/src/metrics_functions/metrics_functions.cc index 7244b06925..8c7e23ad8a 100644 --- a/src/metrics_functions/metrics_functions.cc +++ 
b/src/metrics_functions/metrics_functions.cc @@ -91,8 +91,8 @@ void Metrics::compute(FFModel *model, false /*must*/, 0 /*mapper_id*/, logit->machine_view.hash()); - std::cout << "logit shape: " << logit->get_shape() << std::endl; - std::cout << "label shape: " << label->get_shape() << std::endl; + // std::cout << "logit shape: " << logit->get_shape() << std::endl; + // std::cout << "label shape: " << label->get_shape() << std::endl; launcher.add_region_requirement(RegionRequirement( logit->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, logit->region)); launcher.add_field(0, FID_DATA); @@ -157,7 +157,7 @@ PerfMetrics assert(acc_label.rect.lo[0] == acc_label.rect.hi[0]); // Cannot measure categorical_crossentropy w/ sparse labels // Use measure_sparse_categorical_crossentropy instead - std::cout << "num_classes: " << num_classes << std::endl; + // std::cout << "num_classes: " << num_classes << std::endl; assert(!me->measure_categorical_crossentropy); Metrics::update_metrics_sparse_label_kernel_wrapper(acc_logit.ptr, acc_label.ptr, diff --git a/src/metrics_functions/metrics_functions.cu b/src/metrics_functions/metrics_functions.cu index b68b10d873..8c584c397c 100644 --- a/src/metrics_functions/metrics_functions.cu +++ b/src/metrics_functions/metrics_functions.cu @@ -19,6 +19,7 @@ namespace FlexFlow { float const LOG_MIN_VALUE = 0.00000001f; +int const MASK_TOKEN = -100; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, @@ -29,7 +30,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = 0; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { @@ -38,14 +39,19 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, } } assert(my_label >= 0); - atomicAdd(&(perf->train_all), 1); - if (labels[b] == my_label) { - atomicAdd(&(perf->train_correct), 1); + if (labels[b] != MASK_TOKEN) { + atomicAdd(&(perf->train_all), 1); + if (labels[b] == my_label) { + atomicAdd(&(perf->train_correct), 1); + } } } if (metrics.measure_sparse_categorical_crossentropy) { - float my_logit = max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); - atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + if (labels[b] != MASK_TOKEN) { + float my_logit = + max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + } } if (metrics.measure_mean_squared_error || metrics.measure_root_mean_squared_error || diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index d6e5bcfdc3..187e60282f 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -202,8 +202,9 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * erfc(-input[i] * M_SQRT1_2) - - 0.5 * M_SQRT1_2 * input[i] * exp(-input[i] * input[i] * 0.5))); + (0.5 * erfc(-input[i] * M_SQRT1_2) + + 0.5 * M_SQRT1_2 * input[i] * + ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5f)))); break; } case OP_RSQRT: { diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index d83d9952c9..e163c9a0c7 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -26,8 +26,10 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, Domain const &input_domain) : OpMeta(handler) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); - 
checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + inputTensor, input_domain, softmax->data_type)); dim = softmax->dim; + last_layer = softmax->last_layer; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); } @@ -66,6 +68,7 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -77,7 +80,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, cudaEventRecord(t_start, stream); } Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + m, input_grad_ptr, output_grad_ptr, output_ptr, num_elements, stream); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -113,15 +116,33 @@ void forward_kernel(SoftmaxMeta const *m, output_ptr)); } -void backward_kernel(float *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements, cudaStream_t stream) { - checkCUDA(cudaMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(float), - cudaMemcpyDeviceToDevice, - stream)); + + if (m->last_layer) { + checkCUDA(cudaMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); + } else { + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->inputTensor, + output_ptr, + m->inputTensor, + output_grad_ptr, + &beta, + m->inputTensor, + input_grad_ptr)); + } } } // namespace Internal diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index e8c65b4b03..915041d2bb 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -61,10 +61,27 @@ Tensor FFModel::layer_norm(const Tensor input, std::vector const &axes, bool elementwise_affine, float eps, + DataType data_type, char const *name) { - // FIXME: currently disable elementwise_affine - elementwise_affine = false; - // axes must be the last axes.size() dimensions + // In PyTorch, axes must be the sizes of the last axes.size() dimensions of + // the input tensor. However, since the tensor dimensions are reversed in + // FlexFlow (batch size is the last dimension), we require that axes must be + // the sizes of the FIRST axes.size() dimensions of the input tensor. + + // Another difference is that in PyTorch, the axes vector should contain the + // sizes of the dimensions with respect to which you want to compute the + // layernorm. In FlexFlow, instead, axes should contain the INDICES of the + // dimensions in question. We do this because the size of a dimension might be + // different when splitting a tensor in model parallelism. 
+ assert( + axes.size() <= input->num_dims && + "number of axes must be less than tensor dimensions"); // input does not + // have replica + // dimension here + for (int i = 0; i < axes.size(); i++) { + assert(axes[i] == i && "axes must be the first axes.size() dimensions"); + } +#ifdef DEADCODE for (int i = 0; i < axes.size(); i++) { bool found = false; for (int j = 0; j < axes.size(); j++) { @@ -76,15 +93,33 @@ Tensor FFModel::layer_norm(const Tensor input, assert(false && "axes must be the last axes.size() dimensions"); } } +#endif + if (data_type == DT_NONE) { + data_type = input->data_type; + } int num_weights = elementwise_affine ? 2 : 0; - Layer *ln = new Layer(this, - OP_LAYERNORM, - DT_FLOAT, - name, - 1 /*inputs*/, - num_weights, - 1 /*outputs*/, - input); + Layer *ln = nullptr; + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for layer_norm"); + ln = new Layer(this, + OP_LAYERNORM, + data_type, + name, + 1 /*inputs*/, + num_weights, + 1 /*outputs*/, + casted_input); + } else { + ln = new Layer(this, + OP_LAYERNORM, + data_type, + name, + 1 /*inputs*/, + num_weights, + 1 /*outputs*/, + input); + } + ln->outputs[0] = create_tensor_legion_ordering(input->num_dims, input->dims, input->data_type, @@ -92,19 +127,19 @@ Tensor FFModel::layer_norm(const Tensor input, 0, true /*create_grad*/); if (num_weights == 2) { - int M = 1; - for (int i = 0; i < axes.size(); i++) { - M *= input->dims[input->num_dims - 1 - axes[i]]; + int numdims = axes.size(); + int dims[numdims]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[axes[i]]; } - int dims[1] = {M}; - ln->weights[0] = create_weight_legion_ordering(1, + ln->weights[0] = create_weight_legion_ordering(numdims, dims, input->data_type, ln, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); - ln->weights[1] = create_weight_legion_ordering(1, + ln->weights[1] = create_weight_legion_ordering(numdims, dims, input->data_type, ln, @@ -179,19 +214,36 @@ LayerNorm::LayerNorm(FFModel &model, ParallelDim output_dims[MAX_TENSOR_DIM]; int M = 1; for (int i = 0; i < axes.size(); i++) { - M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; + M *= inputs[0]->dims[axes[i]].size; } effective_num_elements = M; - effective_batch_size = inputs[0]->get_shape().get_piece_num_elements() / M; + effective_batch_size = inputs[0]->get_volume() / M; + assert(elementwise_affine == (numWeights == 2)); if (numWeights > 0 && allocate_weights) { - int kernel_dims = 2; - assert(false); - // weights[0] = model.create_parallel_weight_legion_ordering( - // kernel_dims, - } else { - // do nothing + ParallelDim dims[axes.size()]; + for (int i = 0; i < axes.size(); i++) { + dims[i] = inputs[0]->dims[i]; + } + int seed = std::rand(); + Initializer *gamma_initializer = new UniformInitializer(seed, 0.0f, 1.0f); + Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 1.0f); + weights[0] = + model.create_parallel_weight_legion_ordering(axes.size(), + dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + gamma_initializer, + CHOSEN_SYNC_TYPE); + weights[1] = + model.create_parallel_weight_legion_ordering(axes.size(), + dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + beta_initializer, + CHOSEN_SYNC_TYPE); } - return; } void LayerNorm::init(FFModel const &ff) { @@ -221,6 +273,20 @@ void LayerNorm::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(1, FID_DATA); + if (elementwise_affine) { + 
launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -233,6 +299,8 @@ OpMeta *LayerNorm::init_task(Task const *task, LayerNorm *ln = (LayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); LayerNormMeta *meta = new LayerNormMeta(handle, ln); + meta->input_type[0] = ln->inputs[0]->data_type; + meta->output_type[0] = ln->outputs[0]->data_type; return meta; } @@ -292,14 +360,21 @@ void LayerNorm::forward_task(Task const *task, assert(task->regions.size() == regions.size()); float const *in_ptr = NULL; float *out_ptr = NULL, *gamma_ptr = NULL, *beta_ptr = NULL; + GenericTensorAccessorR in; + GenericTensorAccessorW out, gamma, beta; + Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); + // in_ptr = helperGetTensorPointerRO( + // regions[0], task->regions[0], FID_DATA, ctx, runtime); + in = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); + // out_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + out = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(in_domain == out_domain); assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); @@ -307,20 +382,28 @@ void LayerNorm::forward_task(Task const *task, assert(regions.size() == 4); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - gamma_ptr = helperGetTensorPointerRW( - regions[2], task->regions[2], FID_DATA, ctx, runtime); + // gamma_ptr = helperGetTensorPointerRW( + // regions[2], task->regions[2], FID_DATA, ctx, runtime); + gamma = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); Domain beta_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - beta_ptr = helperGetTensorPointerRW( - regions[3], task->regions[3], FID_DATA, ctx, runtime); + // beta_ptr = helperGetTensorPointerRW( + // regions[3], task->regions[3], FID_DATA, ctx, runtime); + beta = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(gamma_domain == beta_domain); assert(gamma_domain.get_volume() == m->effective_num_elements); + int numdims = gamma_domain.get_dim(); + for (int i = 0; i < numdims; i++) { + int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; + int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; + assert(g_d == in_d); + } } else { assert(regions.size() == 2); } - - LayerNorm::forward_kernel_wrapper( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr); + LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); } void LayerNorm::backward(FFModel const &ff) { @@ -447,7 +530,100 
@@ void LayerNorm::backward_task(Task const *task, bool LayerNorm::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { - return false; + ParallelTensorBase sub_output, sub_input; + if (!outputs[0]->get_sub_tensor(mv, sub_output)) { + return false; + } + if (!inputs[0]->get_sub_tensor(mv, sub_input)) { + return false; + } + Domain input_domain = sub_input.get_domain(); + Domain output_domain = sub_output.get_domain(); + LayerNormMeta *m = new LayerNormMeta(sim->handler, this); + + sim->free_all(); + float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + assert(in_ptr != NULL); + GenericTensorAccessorR input1_acc(inputs[0]->data_type, input_domain, in_ptr); + cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + float *out_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + assert(out_ptr != NULL); + GenericTensorAccessorW output_acc( + outputs[0]->data_type, output_domain, out_ptr); + cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + // FIXME please add gamma_ptr and beta_ptr after finish the implementation + float *gamma_ptr = NULL, *beta_ptr = NULL; + GenericTensorAccessorW gamma_acc; + GenericTensorAccessorW beta_acc; + + bool out_of_memory = + (in_ptr == NULL) || (out_ptr == NULL) || + (((gamma_ptr == NULL) || (beta_ptr == NULL)) && (m->elementwise_affine)); + if (out_of_memory) { + cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + return true; + } + + std::function forward, backward; + forward = [&] { + forward_kernel_wrapper(m, input1_acc, output_acc, gamma_acc, beta_acc); + }; + + if (sim->computationMode == COMP_MODE_TRAINING) { + float *in_grad_ptr = + (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + assert(in_grad_ptr != NULL); + cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + float *out_grad_ptr = NULL; + out_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + assert(out_grad_ptr != NULL); + cost_metrics.outputs_memory += + cost_metrics.total_mem_diff_from(sim->offset); + + float *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + + out_of_memory = (in_grad_ptr == NULL) || (out_grad_ptr == NULL) || + (((gamma_grad_ptr == NULL) || (beta_grad_ptr == NULL)) && + (m->elementwise_affine)); + if (out_of_memory) { + cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + return true; + } + + backward = [&] { + backward_kernel_wrapper(m, + out_grad_ptr, + in_ptr, + in_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr); + }; + } + + inner_measure_operator_cost(sim, forward, backward, cost_metrics); + + if (sim->computationMode == COMP_MODE_TRAINING) { + log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) " + "forward_time(%.4lf) backward_time(%.4lf)\n", + name, + sub_output.get_volume(), + cost_metrics.forward_time, + cost_metrics.backward_time); + } else { + log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) " + "forward_time(%.4lf)\n", + name, + sub_output.get_volume(), + cost_metrics.forward_time); + } + + return true; } void LayerNorm::serialize(Legion::Serializer &sez) const { @@ -512,4 +688,4 @@ size_t hash::operator()( hash_combine(key, params.elementwise_affine); return key; } -}; // namespace std +}; // namespace std \ No newline at end of file diff --git a/src/ops/layer_norm.cu 
b/src/ops/layer_norm.cu
index ac477ba2ad..f0539f8405 100644
--- a/src/ops/layer_norm.cu
+++ b/src/ops/layer_norm.cu
@@ -13,6 +13,7 @@
  * limitations under the License.
  */
 
+#include "flexflow/ffconst_utils.h"
 #include "flexflow/ops/layer_norm.h"
 #include "flexflow/utils/cuda_helper.h"
 
@@ -30,12 +31,19 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln)
   effective_num_elements = ln->effective_num_elements;
   profiling = ln->profiling;
   eps = ln->eps;
-  checkCUDA(cudaMalloc(&mean_ptr, sizeof(float) * effective_batch_size));
-  checkCUDA(cudaMalloc(&rstd_ptr, sizeof(float) * effective_batch_size));
-  checkCUDA(cudaMalloc(&ds_ptr, sizeof(float) * effective_batch_size));
-  checkCUDA(cudaMalloc(&db_ptr, sizeof(float) * effective_batch_size));
-  checkCUDA(cudaMalloc(&scale_ptr, sizeof(float) * effective_batch_size));
-  checkCUDA(cudaMalloc(&bias_ptr, sizeof(float) * effective_batch_size));
+  DataType data_type = ln->data_type;
+  checkCUDA(
+      cudaMalloc(&mean_ptr, data_type_size(data_type) * effective_batch_size));
+  checkCUDA(
+      cudaMalloc(&rstd_ptr, data_type_size(data_type) * effective_batch_size));
+  checkCUDA(
+      cudaMalloc(&ds_ptr, data_type_size(data_type) * effective_batch_size));
+  checkCUDA(
+      cudaMalloc(&db_ptr, data_type_size(data_type) * effective_batch_size));
+  checkCUDA(
+      cudaMalloc(&scale_ptr, data_type_size(data_type) * effective_batch_size));
+  checkCUDA(
+      cudaMalloc(&bias_ptr, data_type_size(data_type) * effective_batch_size));
 }
 
 template <typename T>
@@ -77,26 +85,26 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) {
 }
 
 template <typename T>
-__global__ void
-    RowwiseMomentsCUDAKernel(int64_t N, T eps, T const *X, T *mean, T *rstd) {
-  __shared__ T m_shared[C10_WARP_SIZE];
-  __shared__ T v_shared[C10_WARP_SIZE];
+__global__ void RowwiseMomentsCUDAKernel(
+    int64_t N, float eps, T const *X, T *mean, T *rstd) {
+  __shared__ float m_shared[C10_WARP_SIZE];
+  __shared__ float v_shared[C10_WARP_SIZE];
   const int64_t i = blockIdx.x;
-  T sum1 = 0;
-  T sum2 = 0;
+  float sum1 = 0.0f;
+  float sum2 = 0.0f;
   for (int64_t j = threadIdx.x; j < N; j += blockDim.x) {
     const int64_t index = i * N + j;
-    sum1 += static_cast<T>(X[index]);
-    sum2 += static_cast<T>(X[index]) * static_cast<T>(X[index]);
+    sum1 += static_cast<float>(X[index]);
+    sum2 += static_cast<float>(X[index]) * static_cast<float>(X[index]);
   }
-  sum1 = BlockReduceSum<T>(sum1, m_shared);
-  sum2 = BlockReduceSum<T>(sum2, v_shared);
+  sum1 = BlockReduceSum<float>(sum1, m_shared);
+  sum2 = BlockReduceSum<float>(sum2, v_shared);
   if (threadIdx.x == 0) {
-    const T scale = T(1) / static_cast<T>(N);
+    float const scale = float(1) / static_cast<float>(N);
     sum1 *= scale;
-    sum2 = max(sum2 * scale - sum1 * sum1, T(0));
-    mean[i] = sum1;
-    rstd[i] = rsqrt(sum2 + static_cast<T>(eps));
+    sum2 = max(sum2 * scale - sum1 * sum1, float(0));
+    mean[i] = static_cast<T>(sum1);
+    rstd[i] = static_cast<T>(rsqrt(sum2 + eps));
   }
 }
 
@@ -130,27 +138,30 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m,
                                T *gamma_ptr,
                                T *beta_ptr,
                                cudaStream_t stream) {
-  RowwiseMomentsCUDAKernel
+  RowwiseMomentsCUDAKernel
       <<<m->effective_batch_size, kCUDABlockReduceNumThreads, 0, stream>>>(
-          m->effective_num_elements, m->eps, in_ptr, m->mean_ptr, m->rstd_ptr);
-  LayerNormForwardCUDAKernel
+          m->effective_num_elements,
+          m->eps,
+          in_ptr,
+          static_cast<T *>(m->mean_ptr),
+          static_cast<T *>(m->rstd_ptr));
+  LayerNormForwardCUDAKernel
       <<<m->effective_batch_size, kCUDANumThreads, 0, stream>>>(
           m->effective_num_elements,
           in_ptr,
-          m->mean_ptr,
-          m->rstd_ptr,
+          static_cast<T *>(m->mean_ptr),
+          static_cast<T *>(m->rstd_ptr),
           gamma_ptr,
           beta_ptr,
           out_ptr);
 }
 
 /*static*/
-template <typename T>
 void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m,
-                                       T const *in_ptr,
-                                       T *out_ptr,
-                                       T *gamma_ptr,
-                                       T *beta_ptr) {
+                                       GenericTensorAccessorR const &input,
+                                       GenericTensorAccessorW &output,
+                                       GenericTensorAccessorW &gamma,
+                                       GenericTensorAccessorW &beta) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
 
@@ -160,8 +171,24 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m,
     cudaEventCreate(&t_end);
     cudaEventRecord(t_start, stream);
   }
-  LayerNorm::forward_kernel(
-      m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream);
+  if (m->input_type[0] == DT_FLOAT) {
+    LayerNorm::forward_kernel(m,
+                              input.get_float_ptr(),
+                              output.get_float_ptr(),
+                              gamma.get_float_ptr(),
+                              beta.get_float_ptr(),
+                              stream);
+  } else if (m->input_type[0] == DT_HALF) {
+    LayerNorm::forward_kernel(m,
+                              input.get_half_ptr(),
+                              output.get_half_ptr(),
+                              gamma.get_half_ptr(),
+                              beta.get_half_ptr(),
+                              stream);
+  } else {
+    assert(false && "unsupport datatype in layernorm");
+  }
+
   if (m->profiling) {
     cudaEventRecord(t_end, stream);
     checkCUDA(cudaEventSynchronize(t_end));
@@ -170,8 +197,8 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m,
     cudaEventDestroy(t_start);
     cudaEventDestroy(t_end);
     printf("[LayerNorm] forward time (CF) = %.2fms\n", elapsed);
-    print_tensor<float>(in_ptr, 32, "[LayerNorm:forward:input]");
-    print_tensor<float>(out_ptr, 32, "[LayerNorm:forward:output]");
+    // print_tensor<float>(in_ptr, 32, "[LayerNorm:forward:input]");
+    // print_tensor<float>(out_ptr, 32, "[LayerNorm:forward:output]");
   }
 }
 
@@ -352,6 +379,82 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M,
   }
 }
 
+template <typename T>
+__device__ __inline__ void compute_gI(T const *__restrict__ dY,
+                                      T const *__restrict__ X,
+                                      T const *__restrict__ mean,
+                                      T const *__restrict__ rstd,
+                                      T const *__restrict__ gamma,
+                                      T *dX,
+                                      int const N,
+                                      T *buf) {
+  auto const i1 = blockIdx.x;
+  const T mean_val = mean[i1];
+  const T rstd_val = rstd[i1];
+  T stats_x1{0}, stats_x2{0};
+  constexpr int unroll = 4;
+  auto l = unroll * threadIdx.x;
+  T const *X_i = X + i1 * N;
+  T const *dY_i = dY + i1 * N;
+  T *dX_i = dX + i1 * N;
+  // vectorized reads don't improve perf, so use regular unrolling
+
+  for (; l + unroll - 1 < N; l += blockDim.x * unroll) {
+#pragma unroll
+    for (int k = 0; k < unroll; k++) {
+      T gamma_val = (gamma != nullptr) ? static_cast<T>(gamma[l + k]) : T(1);
+      const T c_h = static_cast<T>(X_i[l + k]);
+      const T c_loss = static_cast<T>(dY_i[l + k]);
+      stats_x1 += c_loss * gamma_val;
+      stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val;
+    }
+  }
+  for (; l < N; l++) {
+    T gamma_val = (gamma != nullptr) ? static_cast<T>(gamma[l]) : T(1);
+    const T c_h = static_cast<T>(X_i[l]);
+    const T c_loss = static_cast<T>(dY_i[l]);
+    stats_x1 += c_loss * gamma_val;
+    stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val;
+  }
+
+  stats_x1 = BlockReduceSum<T>(stats_x1, buf);
+  stats_x2 = BlockReduceSum<T>(stats_x2, buf);
+  if (threadIdx.x == 0) {
+    buf[0] = stats_x1;
+    buf[1] = stats_x2;
+  }
+  __syncthreads();
+  stats_x1 = buf[0];
+  stats_x2 = buf[1];
+  T fH = N;
+  T term1 = (T(1) / fH) * rstd_val;
+
+  for (int l = threadIdx.x; l < N; l += blockDim.x) {
+    const T x = X_i[l];
+    const T dy = dY_i[l];
+    T gamma_val = (gamma != nullptr) ? static_cast<T>(gamma[l]) : T(1);
+    T f_grad_input = fH * gamma_val * dy;
+    f_grad_input -= (x - mean_val) * rstd_val * stats_x2;
+    f_grad_input -= stats_x1;
+    f_grad_input *= term1;
+    dX_i[l] = f_grad_input;
+  }
+}
+
+template <typename T>
+__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY,
+                                             T const *__restrict__ X,
+                                             T const *__restrict__ mean,
+                                             T const *__restrict__ rstd,
+                                             T const *__restrict__ gamma,
+                                             T *dX,
+                                             int const N) {
+  alignas(sizeof(double)) extern __shared__ char s_data1[];
+  T *buf = reinterpret_cast<T *>(&s_data1);
+
+  compute_gI(dY, X, mean, rstd, gamma, dX, N, buf);
+}
+
 /*static*/
 template <typename T>
 void LayerNorm::backward_kernel(LayerNormMeta const *m,
@@ -366,17 +469,34 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m,
   const int64_t N = m->effective_num_elements;
   ComputeInternalGradientsCUDAKernel
       <<<M, kCUDABlockReduceNumThreads, 0, stream>>>(
-          N, output_grad_ptr, input_ptr, gamma_ptr, m->ds_ptr, m->db_ptr);
+          N,
+          output_grad_ptr,
+          input_ptr,
+          gamma_ptr,
+          static_cast<T *>(m->ds_ptr),
+          static_cast<T *>(m->db_ptr));
   const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads;
   ComputeGradientFusedParamsCUDAKernel
       <<<B, kCUDANumThreads, 0, stream>>>(M,
                                           N,
-                                          m->mean_ptr,
-                                          m->rstd_ptr,
-                                          m->ds_ptr,
-                                          m->db_ptr,
-                                          m->scale_ptr,
-                                          m->bias_ptr);
+                                          static_cast<T *>(m->mean_ptr),
+                                          static_cast<T *>(m->rstd_ptr),
+                                          static_cast<T *>(m->ds_ptr),
+                                          static_cast<T *>(m->db_ptr),
+                                          static_cast<T *>(m->scale_ptr),
+                                          static_cast<T *>(m->bias_ptr));
+  int const warp_size = C10_WARP_SIZE;
+  int const num_threads = 128;
+  const dim3 blocks(M);
+  int nshared = (num_threads / warp_size) * sizeof(T);
+  layer_norm_grad_input_kernel<<<blocks, num_threads, nshared, stream>>>(
+      output_grad_ptr,
+      input_ptr,
+      static_cast<T *>(m->mean_ptr),
+      static_cast<T *>(m->rstd_ptr),
+      gamma_ptr,
+      input_grad_ptr,
+      N);
   if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) {
     if (M < 512) {
       // For small batch size, do colwise reduce directly
@@ -386,8 +506,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m,
               N,
               output_grad_ptr,
               input_ptr,
-              m->mean_ptr,
-              m->rstd_ptr,
+              static_cast<T *>(m->mean_ptr),
+              static_cast<T *>(m->rstd_ptr),
               gamma_grad_ptr,
               beta_grad_ptr);
     } else {
@@ -396,14 +516,15 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m,
       constexpr int kThreadX = kColwiseReduceTileSize;
       constexpr int kThreadY = kColwiseReduceTileSize / 2;
       GammaBetaBackwardCUDAKernel
-          <<<B, dim3(kThreadX, kThreadY), 0, stream>>>(M,
-                                                       N,
-                                                       output_grad_ptr,
-                                                       input_ptr,
-                                                       m->mean_ptr,
-                                                       m->rstd_ptr,
-                                                       gamma_grad_ptr,
-                                                       beta_grad_ptr);
+          <<<B, dim3(kThreadX, kThreadY), 0, stream>>>(
+              M,
+              N,
+              output_grad_ptr,
+              input_ptr,
+              static_cast<T *>(m->mean_ptr),
+              static_cast<T *>(m->rstd_ptr),
+              gamma_grad_ptr,
+              beta_grad_ptr);
     }
   }
 }
@@ -419,21 +540,18 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m,
                                         T *beta_grad_ptr) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
-  LayerNorm::backward_kernel(m,
-                             output_grad_ptr,
-                             input_ptr,
-                             input_grad_ptr,
-                             gamma_ptr,
-                             gamma_grad_ptr,
-                             beta_grad_ptr,
-                             stream);
+  if (m->output_type[0] == DT_FLOAT) {
+    LayerNorm::backward_kernel(m,
+                               output_grad_ptr,
+                               input_ptr,
+                               input_grad_ptr,
+                               gamma_ptr,
+                               gamma_grad_ptr,
+                               beta_grad_ptr,
+                               stream);
+  }
 }
 
-template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m,
-                                                float const *in_ptr,
-                                                float *out_ptr,
-                                                float *gamma_ptr,
-                                                float *beta_ptr);
 template void
     LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m,
                                        float const *output_grad_ptr,
@@ -443,4 +561,4 @@ template void
                                        float *gamma_grad_ptr,
                                        float *beta_grad_ptr);
 
-}; // namespace FlexFlow
+}; // namespace FlexFlow
\ No newline at end of file
diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc
index 029b20afd1..ab65db542e 100644
--- a/src/ops/softmax.cc
+++ b/src/ops/softmax.cc
b/src/ops/softmax.cc @@ -52,7 +52,10 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(const Tensor _input, int dim, char const *name) { +Tensor FFModel::softmax(const Tensor _input, + int dim, + bool last_layer, + char const *name) { Layer *sm = new Layer(this, OP_SOFTMAX, DT_FLOAT, @@ -69,6 +72,8 @@ Tensor FFModel::softmax(const Tensor _input, int dim, char const *name) { sm->outputs[0] = create_tensor_legion_ordering( numdims, dims, DT_FLOAT, sm, 0, true /*create_grad*/); sm->add_int_property("softmax_dim", dim); + + sm->add_int_property("last_layer", last_layer); layers.push_back(sm); return sm->outputs[0]; } @@ -80,15 +85,19 @@ Op *Softmax::create_operator_from_layer( long long value; layer->get_int_property("softmax_dim", value); int dim = (int)value; + layer->get_int_property("last_layer", value); + bool last_layer = (bool)value; return new Softmax(model, inputs[0], (inputs[0]->num_dims - 1 - dim) % inputs[0]->num_dims, + last_layer, layer->name); } Softmax::Softmax(FFModel &model, const ParallelTensor _input, int _dim, + bool _last_layer, char const *name) : Op(model, OP_SOFTMAX, @@ -98,7 +107,7 @@ Softmax::Softmax(FFModel &model, 0 /*weights*/, 1 /*outputs*/, _input), - dim(_dim) { + dim(_dim), last_layer(_last_layer) { // Currently assume we always perform softmax along the inner most dim assert(dim == 0); ParallelDim dims[MAX_TENSOR_DIM]; @@ -113,7 +122,7 @@ Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, const ParallelTensor input, char const *name) - : Softmax(model, input, params.dim, name) {} + : Softmax(model, input, params.dim, params.last_layer, name) {} void Softmax::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -283,6 +292,13 @@ void Softmax::backward(FFModel const &ff) { EXCLUSIVE, outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); + + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); runtime->execute_index_space(ctx, launcher); } @@ -315,8 +331,8 @@ void Softmax::backward_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); + assert(regions.size() == 3); + assert(task->regions.size() == 3); // const Softmax* softmax = (Softmax*) task->args; SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); TensorAccessorW acc_input_grad(regions[0], @@ -327,11 +343,16 @@ void Softmax::backward_task_with_dim(Task const *task, true /*readOutput*/); TensorAccessorR acc_output_grad( regions[1], task->regions[1], FID_DATA, ctx, runtime); + TensorAccessorR acc_output( + regions[2], task->regions[1], FID_DATA, ctx, runtime); // make sure the image indices match! 
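// The extra output region wired into Softmax::backward above exists because
// the input gradient of softmax needs the forward output y in addition to the
// output gradient dy (cudnnSoftmaxBackward consumes both). A minimal C++
// sketch of that Jacobian-vector product for a single row of num_classes
// entries; the function name is illustrative only.
void softmax_backward_reference(int num_classes, float const *y,
                                float const *dy, float *dx) {
  float dot = 0.0f;
  for (int k = 0; k < num_classes; k++) {
    dot += dy[k] * y[k];               // sum_k dy_k * y_k
  }
  for (int j = 0; j < num_classes; j++) {
    dx[j] = y[j] * (dy[j] - dot);      // dx_j = y_j * (dy_j - sum_k dy_k * y_k)
  }
}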
assert(acc_input_grad.rect == acc_output_grad.rect); - backward_kernel_wrapper( - m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); + backward_kernel_wrapper(m, + acc_input_grad.ptr, + acc_output_grad.ptr, + acc_output.ptr, + acc_input_grad.rect.volume()); } bool Softmax::get_int_parameter(PMParameter para, int *value) const { @@ -377,11 +398,17 @@ bool Softmax::measure_operator_cost(Simulator *sim, float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_grad_ptr != NULL); + float *output_ptr = + (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper( - m, input_grad_ptr, output_grad_ptr, sub_output.get_volume()); + backward_kernel_wrapper(m, + input_grad_ptr, + output_grad_ptr, + output_ptr, + sub_output.get_volume()); }; } @@ -413,6 +440,7 @@ size_t hash::operator()( FlexFlow::SoftmaxParams const ¶ms) const { size_t key = 0; hash_combine(key, params.dim); + hash_combine(key, params.last_layer); return key; } }; // namespace std diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 53e61b90d9..b6004af14a 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -215,7 +215,7 @@ __host__ void int idx = 0; printf("%s", prefix); for (idx = 0; idx < num_elements; idx++) { - printf(" %.4lf", (float)host_ptr[idx]); + printf(" %.10lf", (float)host_ptr[idx]); if (idx >= 16) { break; } @@ -224,6 +224,76 @@ __host__ void checkCUDA(cudaFreeHost(host_ptr)); } +template +__host__ void + save_tensor(T const *ptr, size_t num_elements, char const *file_name) { + T *host_ptr; + checkCUDA(cudaHostAlloc(&host_ptr, + sizeof(T) * num_elements, + cudaHostAllocPortable | cudaHostAllocMapped)); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost)); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%.8f, ", (float)host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(cudaFreeHost(host_ptr)); +} + +cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( + cudnnTensorDescriptor_t tensor, Domain domain, DataType data_type) { + int dims[MAX_TENSOR_DIM]; + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(data_type); + switch (domain.get_dim()) { + case 1: { + Rect<1> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + return cudnnSetTensor4dDescriptor( + tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[0], 1, 1, 1); + } + case 2: { + Rect<2> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + return cudnnSetTensor4dDescriptor( + tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[1], dims[0], 1, 1); + } + case 3: { + Rect<3> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + return cudnnSetTensor4dDescriptor(tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + dims[2] * dims[1], + dims[0], + 1, + 1); + } + case 4: { + Rect<4> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return cudnnSetTensor4dDescriptor(tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + dims[3] * dims[2] * dims[1], + dims[0], + 1, + 1); + } + default: + assert(false && "Unsupported dim number"); + } + return 
CUDNN_STATUS_BAD_PARAM; +} + cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Domain domain) { int dims[MAX_TENSOR_DIM]; @@ -370,3 +440,8 @@ template __host__ void print_tensor(int32_t const *ptr, size_t rect, char const *prefix); template __host__ void print_tensor(int64_t const *ptr, size_t rect, char const *prefix); +template __host__ void + save_tensor(float const *ptr, size_t rect, char const *file_name); +template __host__ void save_tensor(int32_t const *ptr, + size_t rect, + char const *file_name); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 4b55a39104..5310588477 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1718,6 +1718,7 @@ GraphOptimalViewSerialized case OP_SOFTMAX: { Softmax *softmax = (Softmax *)op; sez.serialize(softmax->dim); + sez.serialize(softmax->last_layer); break; } case OP_REPARTITION: { @@ -2098,8 +2099,11 @@ void FFModel::deserialize_graph_optimal_view( case OP_SOFTMAX: { assert(num_inputs == 1); int softmax_dim; + bool last_layer; dez.deserialize(softmax_dim); - node = get_or_create_node(inputs[0], {softmax_dim}); + dez.deserialize(last_layer); + node = + get_or_create_node(inputs[0], {softmax_dim, last_layer}); break; } case OP_TRANSPOSE: { diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 2925eb7555..6d52e135cd 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -1930,8 +1930,8 @@ void GraphSearchHelper::graph_optimize( } } best_graph->print_strategy_computation_graph(optimal.views); - //std::cout << "PCG:" << std::endl; - //best_graph->print_dot(); + // std::cout << "PCG:" << std::endl; + // best_graph->print_dot(); optimal_views = real_optimal_views; } @@ -3120,7 +3120,7 @@ void FFModel::graph_optimize( std::unordered_map &optimal_views) { this->graph_search->graph_optimize( budget, only_data_parallel, best_graph, optimal_views); - best_graph->print_dot(); + best_graph->print_dot(); } bool FFModel::convert_graph_to_operators( @@ -3221,7 +3221,8 @@ bool FFModel::convert_graph_to_operators( case OP_SOFTMAX: { assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; - new_op = new Softmax(*this, inputs[0], softmax->dim, NULL); + new_op = new Softmax( + *this, inputs[0], softmax->dim, softmax->last_layer, NULL); break; } case OP_COMBINE: { From 2d28c15d92461bc8d74c0397f2aaf887952e4b92 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 4 Jul 2023 19:43:48 -0400 Subject: [PATCH 36/52] revert changes --- examples/python/pytorch/mt5/mt5_ff.py | 26 +- include/flexflow/model.h | 6 +- .../flexflow/ops/kernels/softmax_kernels.h | 6 +- include/flexflow/ops/layer_norm.h | 11 +- include/flexflow/ops/softmax.h | 2 - include/flexflow/ops/softmax_params.h | 1 - include/flexflow/utils/cuda_helper.h | 6 - python/flexflow/core/flexflow_cffi.py | 23 +- python/flexflow/torch/model.py | 51 ++-- python/flexflow_c.cc | 7 +- python/flexflow_c.h | 1 - src/loss_functions/loss_functions.cc | 11 +- src/loss_functions/loss_functions.cu | 59 ++-- src/metrics_functions/metrics_functions.cc | 6 +- src/metrics_functions/metrics_functions.cu | 18 +- src/ops/element_unary.cu | 5 +- src/ops/kernels/softmax.cu | 37 +-- src/ops/layer_norm.cc | 254 +++--------------- src/ops/layer_norm.cu | 248 +++++------------ src/ops/softmax.cc | 46 +--- src/runtime/cuda_helper.cu | 77 +----- src/runtime/graph.cc | 6 +- src/runtime/substitution.cc | 9 +- 23 files changed, 196 insertions(+), 720 deletions(-) diff --git a/examples/python/pytorch/mt5/mt5_ff.py 
b/examples/python/pytorch/mt5/mt5_ff.py index c2868e9d1e..08af8d88a7 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -64,8 +64,8 @@ def preprocess_train() -> None: y_shape = y.shape assert len(y.shape) == 2, \ "`y` should have shape (num examples, sequence length)" - y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) - lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) + y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) + lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) y_ids[:, :] = y[:, :-1] lm_labels[:, :] = y[:, 1:] @@ -89,29 +89,26 @@ def top_level_task(): #model = BertModel.from_pretrained("bert-base-uncased") # Load train data as numpy arrays print("Loading data...") - ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")).astype('int32') + ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")) ids = np.pad(ids, ((0,0), (0,17)), 'constant') #ids = np.random.randint(0, 5, (1000, 512)) #print('ids_shape', ids.shape) #print('ids', ids) - mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")).astype('int32') + mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")) mask = np.pad(mask, ((0,0), (0,17)), 'constant') #mask = np.random.randint(0, 2, (1000, 512)) #y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) - lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")).astype('int32') + lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")) lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant') #lm_labels = np.random.randint(-1, 5, (1000, 512)) - position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() - token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() - batch_size = ffconfig.batch_size input_ids_shape = (batch_size, ids.shape[1]) attention_mask_shape = (batch_size, mask.shape[1]) #decoder_input_ids_shape = (batch_size, y_ids.shape[1]) input_tensors = [ - ffmodel.create_tensor(input_ids_shape, DataType.DT_INT32), # input_ids - ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT32), # attention_mask + ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids + ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask #ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids ] encoder_seq_length = ids.shape[1] @@ -129,7 +126,7 @@ def top_level_task(): output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True) #from flexflow.torch.model import file_to_ff #file_to_ff("mt5.ff", ffmodel, input_tensors) - ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, beta1=0.9, beta2=0.98, weight_decay=0.0, epsilon=2e-8) + ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, adam_beta1=0.9, adam_beta2=0.98, weight_decay=0.0, adam_epsilon=2e-8) # ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) print("Compiling the model...") @@ -141,9 +138,6 @@ def top_level_task(): MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, ], ) - - # load weights here - ffmodel.load_bert_pretrained(checkpoint=model) print("Creating data loaders...") print('id_dtype', ids.dtype) @@ -154,8 +148,6 @@ def top_level_task(): #decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) # NOTE: We cast down the label tensor data to 32-bit to accommodate the # label tensor's required dtype - token_type_ids_dl = ffmodel.create_data_loader(input_tensors[2], token_type_ids) - position_id_dl = 
ffmodel.create_data_loader(input_tensors[3], position_id) labels_dl = ffmodel.create_data_loader( ffmodel.label_tensor, lm_labels.astype("int32") ) @@ -167,7 +159,7 @@ def top_level_task(): epochs = ffconfig.epochs ffmodel.fit( #x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], - x=[input_ids_dl, attention_mask_dl, position_id_dl, token_type_ids_dl], + x=[input_ids_dl, attention_mask_dl], y=labels_dl, batch_size=batch_size, epochs=epochs, ) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 0496d5fa8f..c6bc6929ad 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -472,7 +472,6 @@ class FFModel { std::vector const &axes, bool elementwise_affine, float eps, - DataType data_type = DT_NONE, char const *name = NULL); // Add a batch_norm layer Tensor @@ -519,10 +518,7 @@ class FFModel { // Add a flat layer Tensor flat(const Tensor input, char const *name = NULL); // Add a softmax layer - Tensor softmax(const Tensor input, - int dim = -1, - bool last_layer = false, - char const *name = NULL); + Tensor softmax(const Tensor input, int dim = -1, char const *name = NULL); // Create input tensors and constants Tensor transpose(const Tensor input, std::vector const &perm, diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 9aec9f57c9..81b34d8558 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -20,7 +20,6 @@ class SoftmaxMeta : public OpMeta { #endif bool profiling; int dim; - bool last_layer; char op_name[MAX_OPNAME]; }; @@ -34,7 +33,6 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, - float const *output_ptr, size_t num_elements); namespace Internal { @@ -42,10 +40,8 @@ void forward_kernel(SoftmaxMeta const *m, float const *input_ptr, float *output_ptr, ffStream_t stream); -void backward_kernel(SoftmaxMeta const *m, - float *input_grad_ptr, +void backward_kernel(float *input_grad_ptr, float const *output_grad_ptr, - float const *output_ptr, size_t num_elements, ffStream_t stream); } // namespace Internal diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 552b9cf365..8273b9ab52 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -66,11 +66,12 @@ class LayerNorm : public Op { T *gamma_ptr, T *beta_ptr, ffStream_t stream); + template static void forward_kernel_wrapper(LayerNormMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, - GenericTensorAccessorW &gamma, - GenericTensorAccessorW &beta); + T const *input_ptr, + T *output_ptr, + T *gamma_ptr, + T *beta_ptr); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, @@ -104,7 +105,7 @@ class LayerNormMeta : public OpMeta { bool elementwise_affine; int64_t effective_batch_size, effective_num_elements; float eps; - void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; + float *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; char op_name[MAX_OPNAME]; }; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 2616294a3a..25a20315bd 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -15,7 +15,6 @@ class Softmax : public Op { Softmax(FFModel &model, const ParallelTensor logit, int dim, - bool _last_layer, char const *name); Softmax(FFModel &model, 
Params const ¶ms, @@ -65,7 +64,6 @@ class Softmax : public Op { public: int dim; - bool last_layer; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/softmax_params.h b/include/flexflow/ops/softmax_params.h index 545e3a5cb9..d805d9966d 100644 --- a/include/flexflow/ops/softmax_params.h +++ b/include/flexflow/ops/softmax_params.h @@ -7,7 +7,6 @@ namespace FlexFlow { struct SoftmaxParams { int dim; - bool last_layer; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(SoftmaxParams const &, SoftmaxParams const &); diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index a4b2be0a66..46e323b186 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -132,15 +132,9 @@ __host__ void updateGAS(float *para_ptr, template void print_tensor(T const *ptr, size_t num_elements, char const *prefix); -template -void save_tensor(T const *ptr, size_t num_elements, char const *file_name); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain); -cudnnStatus_t - cudnnSetTensorDescriptorFromDomain4SoftMax(cudnnTensorDescriptor_t tensor, - Legion::Domain domain, - DataType data_type = DT_FLOAT); cudaDataType_t ff_to_cuda_datatype(DataType type); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 4c01057109..42339d781c 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1595,7 +1595,7 @@ def flat(self, input, name=None): self.add_layer(OpType.FLAT, name) return Tensor(handle, owner_op_type=OpType.FLAT) - def softmax(self, input, axis=-1, last_layer=False, name=None): + def softmax(self, input, axis=-1, name=None): """Softmax activation function. :param input: the input Tensor. @@ -1607,7 +1607,7 @@ def softmax(self, input, axis=-1, last_layer=False, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, last_layer, c_name) + handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, c_name) self.add_layer(OpType.SOFTMAX, name) return Tensor(handle, owner_op_type=OpType.SOFTMAX) @@ -2041,25 +2041,6 @@ def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): ff_tensor.set_tensor(self, np_tensor) print("Compiled ffmodel!") - def load_bert_pretrained(self, checkpoint=None): - # store weights in dict - weights_dict = {} - for name, params in checkpoint.named_parameters(): - weights_dict[name.replace("LayerNorm", "layer_norm").replace(".", "_")] = params.detach().cpu().numpy() - print(name.replace("LayerNorm", "layer_norm").replace(".", "_")) - # some weights not in params - weights_dict['cls_predictions_decoder_weight'] = checkpoint.cls.predictions.decoder.weight.detach().cpu().numpy() - weights_dict['cls_predictions_decoder_bias'] = checkpoint.cls.predictions.decoder.bias.detach().cpu().numpy() - for i in range (self._nb_layers): - layer = self._layers[i] - if (layer.name + "_weight") in weights_dict: - print('weight: ' + layer.name) - weight = layer.get_parameter_by_id(0); - weight.set_tensor(self, weights_dict[layer.name + "_weight"]) - if (layer.name + "_bias") in weights_dict: - print('bias: ' + layer.name) - bias = layer.get_parameter_by_id(1); - bias.set_tensor(self, weights_dict[layer.name + "_bias"]) def fit(self, x=None, y=None, batch_size=None, epochs=1): """Trains the model for a fixed number of epochs (iterations on a dataset). diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 8ebac2146c..11e0c16e48 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -664,13 +664,12 @@ def string_to_ff(string, ffmodel, node_to_output): def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] - axes = [0] - eps = self.module.eps + axes = [len(input_tensor.dims) - 1] return ffmodel.layer_norm( input=input_tensor, axes=axes, elementwise_affine=True, - eps=eps, + eps=1e-6, name=self.name, ) @@ -1198,24 +1197,16 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, inplace=False, name=name, + input=input_tensor, scalar=scalar, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) - if self.scalar_pos == FunctionNode.ScalarPosition.RIGHT: - return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, inplace=False, name=self.name, - ) - else: - negative_input = ffmodel.scalar_multiply( - input=input_tensor, scalar=-1, inplace=False, name=self.name + '_negative', - ) - return ffmodel.scalar_sub( - input=negative_input, scalar=-scalar, inplace=False, name=self.name, - ) - + return ffmodel.scalar_sub( + input=input_tensor, scalar=scalar, name=self.name, + ) + class ScalarTrueDivNode(FunctionNode): def __init__(self, node): @@ -1240,16 +1231,15 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, inplace=False, name=name, + input=input_tensor, scalar=scalar, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] scalar = self.innodes[1] assert 
type(scalar) is float - return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, inplace=False, name=self.name, + input=input_tensor, scalar=scalar, name=self.name, ) @@ -1662,14 +1652,14 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, inplace=False, name=name, + input=input_tensor, scalar=scalar, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, inplace=False, name=self.name, + input=input_tensor, scalar=scalar, name=self.name, ) @@ -2369,13 +2359,11 @@ def string_to_ff(string, ffmodel, node_to_output): "since attributes require access to the PyTorch model" ) - def to_ff(self, ffmodel, node_to_output, input_tensors): - return self.attr_to_ff_tensor(ffmodel, input_tensors) - - def attr_to_ff_tensor(self, ffmodel, input_tensors): - + def to_ff(self, ffmodel, node_to_output): + return self.attr_to_ff_tensor(ffmodel) - torch_tensor = self.attr + def attr_to_ff_tensor(self, ffmodel): + torch_tensor = self.attr assert (torch_tensor.shape[0] == 1) batch_size = ffmodel._ffconfig.batch_size torch_tensor = np.repeat(torch_tensor, batch_size, axis=0) @@ -2394,16 +2382,15 @@ def attr_to_ff_tensor(self, ffmodel, input_tensors): np_tensor = np_tensor.astype(np.float32) print('attr: ', torch_tensor.shape) - assert (torch_tensor.shape[0] == batch_size) + assert (torch_tensor.shape[0] == batch_size) ff_tensor = ffmodel.create_tensor( - torch_tensor.shape, ff_dtype, True, + torch_tensor.shape, ff_dtype, requires_grad, ) # delay set_tensor, add to ffmodel ffmodel.attr_tensors[ff_tensor] = np_tensor # ff_tensor.set_tensor( # ffmodel, np_tensor # ) - input_tensors.append(ff_tensor) return ff_tensor @@ -2485,7 +2472,7 @@ def to_ff(self, ffmodel, node_to_output, output_tensors): # `CrossEntropyLoss()` implementation logits = node_to_output[other["logits"].name] softmax_logits = ffmodel.softmax( - input=logits, last_layer=True, name=self.name, + input=logits, name=self.name, ) output_tensors[:] += [softmax_logits] else: @@ -2619,8 +2606,6 @@ def torch_to_ff(self, ffmodel, input_tensors, verbose=False): elif isinstance(node, OutputNode): node.to_ff(ffmodel, node_to_output, output_tensors) node_output = None - elif isinstance(node, AttributeNode): - node_output = node.to_ff(ffmodel, node_to_output, input_tensors) else: node_output = node.to_ff(ffmodel, node_to_output) diff --git a/python/flexflow_c.cc b/python/flexflow_c.cc index fd688c0c6a..74a5da6ce1 100644 --- a/python/flexflow_c.cc +++ b/python/flexflow_c.cc @@ -568,8 +568,8 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, for (int i = 0; i < n; i++) { axes_vec.push_back(axes[i]); } - Tensor tensor = handle->layer_norm( - input, axes_vec, elementwise_affine, eps, input->data_type, name); + Tensor tensor = + handle->layer_norm(input, axes_vec, elementwise_affine, eps, name); DEBUG_PRINT("[LayerNorm] new Tensor %p, input %p, elementwise_affine %d, eps " "%f, name %s", tensor, @@ -730,11 +730,10 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_, const flexflow_tensor_t input_, int dim, - bool last_layer, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor 
tensor = handle->softmax(input, dim, last_layer, name); + Tensor tensor = handle->softmax(input, dim, name); DEBUG_PRINT( "[Softmax] new Tensor %p, input %p, name %s", tensor, input, name); return FFCObjectWrapper::wrap(tensor); diff --git a/python/flexflow_c.h b/python/flexflow_c.h index 5409002e5e..fb64c78fd2 100644 --- a/python/flexflow_c.h +++ b/python/flexflow_c.h @@ -276,7 +276,6 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle, const flexflow_tensor_t input, int dim, - bool last_layer, char const *name); flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle, diff --git a/src/loss_functions/loss_functions.cc b/src/loss_functions/loss_functions.cc index d887ee9243..ae89c3d469 100644 --- a/src/loss_functions/loss_functions.cc +++ b/src/loss_functions/loss_functions.cc @@ -49,8 +49,6 @@ void Loss::backward(FFModel *model, if (loss_type == LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE) { assert(logit->get_volume() == label->get_volume()); scale_factor = 2.0f / logit->get_volume(); - } else if (loss_type == LOSS_SPARSE_CATEGORICAL_CROSSENTROPY) { - scale_factor = 1.0f; } else { scale_factor = 1.0f / model->config.batchSize; } @@ -133,12 +131,9 @@ void Loss::backward_task_with_dim(Task const *task, regions[2], task->regions[2], FID_DATA, ctx, runtime); // assertion the outter-most dim is replica dim and replica degree is 1 assert(acc_logit.rect.hi[NDIM - 1] == acc_logit.rect.lo[NDIM - 1]); - - int num_classes = acc_logit.rect.hi[0] - acc_logit.rect.lo[0] + 1; - int num_samples = acc_logit.rect.volume() / num_classes; - // int num_samples = - // acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; - // int num_classes = acc_logit.rect.volume() / num_samples; + int num_samples = + acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; + int num_classes = acc_logit.rect.volume() / num_samples; assert(acc_logit_grad.rect == acc_logit.rect); int k = 1; if (loss->repl_labels) { diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index edd8f03fa4..01766347b0 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -18,7 +18,6 @@ namespace FlexFlow { -int const MASK_TOKEN = -100; using namespace Legion; __global__ void @@ -33,25 +32,6 @@ __global__ void } } -__global__ void - sparse_categorical_crossentropy_loss_backward_with_mask(float *logit_grad, - int const *label, - coord_t num_samples, - coord_t num_classes, - int const k, - float *num) { - CUDA_KERNEL_LOOP(i, num_samples * num_classes) { - int sample_id = i / num_classes; - int label_idx = label[i / (k * num_classes)]; - if (label_idx != MASK_TOKEN && (i == sample_id * num_classes + label_idx)) { - logit_grad[i] -= 1.0f; - atomicAdd(&num[0], 1.0f); - } else if (label_idx == MASK_TOKEN) { - logit_grad[i] = 0.0f; - } - } -} - __global__ void categorical_crossentropy_loss_backward(float *logit_grad, float const *logit, float const *label, @@ -94,25 +74,14 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_ptr, logit_volume * sizeof(float), cudaMemcpyDeviceToDevice)); - // calculate the scale factor inside kernel; - assert(scale_factor == 1.0f); - float *num; - checkCUDA(cudaMalloc(&num, sizeof(float))); - float effective_tokens; - int parallelism = num_samples * num_classes; - // sparse_categorical_crossentropy_loss_backward<<>>( - // logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); - 
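// A host-side C++ sketch of what the masked backward pass being removed in
// this hunk computed, assuming k == 1, that logit_grad already holds the
// softmax probabilities, and that at least one label is unmasked. Positions
// whose label equals MASK_TOKEN (-100) get a zero gradient, and the remaining
// gradients are averaged over the effective (unmasked) tokens rather than the
// batch size; names below are illustrative only.
void masked_sparse_ce_backward_reference(int num_samples, int num_classes,
                                         int const *label, float *logit_grad) {
  int const MASK = -100;
  float effective = 0.0f;
  for (int s = 0; s < num_samples; s++) {
    if (label[s] == MASK) {
      for (int c = 0; c < num_classes; c++) {
        logit_grad[s * num_classes + c] = 0.0f;          // padded token: no gradient
      }
    } else {
      logit_grad[s * num_classes + label[s]] -= 1.0f;    // softmax - one_hot(label)
      effective += 1.0f;
    }
  }
  for (int i = 0; i < num_samples * num_classes; i++) {
    logit_grad[i] /= effective;                          // average over unmasked tokens
  }
}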
sparse_categorical_crossentropy_loss_backward_with_mask<<< - GET_BLOCKS(parallelism), - CUDA_NUM_THREADS, - 0, - stream>>>(logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); - cudaMemcpy(&effective_tokens, num, sizeof(float), cudaMemcpyDeviceToHost); + sparse_categorical_crossentropy_loss_backward<<>>( + logit_grad_ptr, label_ptr, num_samples, num_classes, k); + // Scale logit gradients by op->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, 1.0f / effective_tokens); + logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( @@ -153,17 +122,19 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( logit_grad_ptr, logit_grad_volume, 0, scale_factor); } -void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, - float const *loss_ptr, - size_t loss_volume, - size_t loss_grad_volume, - float scale_factor) { +void Loss::identity_loss_backward_kernel_wrapper( + float *loss_grad_ptr, + float const *loss_ptr, + size_t loss_volume, + size_t loss_grad_volume, + float scale_factor) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); identity_loss_backward<<>>(loss_grad_ptr, loss_ptr, loss_volume); + stream>>>( + loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( loss_grad_ptr, loss_grad_volume, 0, scale_factor); diff --git a/src/metrics_functions/metrics_functions.cc b/src/metrics_functions/metrics_functions.cc index 8c7e23ad8a..7244b06925 100644 --- a/src/metrics_functions/metrics_functions.cc +++ b/src/metrics_functions/metrics_functions.cc @@ -91,8 +91,8 @@ void Metrics::compute(FFModel *model, false /*must*/, 0 /*mapper_id*/, logit->machine_view.hash()); - // std::cout << "logit shape: " << logit->get_shape() << std::endl; - // std::cout << "label shape: " << label->get_shape() << std::endl; + std::cout << "logit shape: " << logit->get_shape() << std::endl; + std::cout << "label shape: " << label->get_shape() << std::endl; launcher.add_region_requirement(RegionRequirement( logit->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, logit->region)); launcher.add_field(0, FID_DATA); @@ -157,7 +157,7 @@ PerfMetrics assert(acc_label.rect.lo[0] == acc_label.rect.hi[0]); // Cannot measure categorical_crossentropy w/ sparse labels // Use measure_sparse_categorical_crossentropy instead - // std::cout << "num_classes: " << num_classes << std::endl; + std::cout << "num_classes: " << num_classes << std::endl; assert(!me->measure_categorical_crossentropy); Metrics::update_metrics_sparse_label_kernel_wrapper(acc_logit.ptr, acc_label.ptr, diff --git a/src/metrics_functions/metrics_functions.cu b/src/metrics_functions/metrics_functions.cu index 8c584c397c..b68b10d873 100644 --- a/src/metrics_functions/metrics_functions.cu +++ b/src/metrics_functions/metrics_functions.cu @@ -19,7 +19,6 @@ namespace FlexFlow { float const LOG_MIN_VALUE = 0.00000001f; -int const MASK_TOKEN = -100; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, @@ -30,7 +29,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = 0; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { @@ -39,19 +38,14 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, } } assert(my_label >= 0); - if (labels[b] != 
MASK_TOKEN) { - atomicAdd(&(perf->train_all), 1); - if (labels[b] == my_label) { - atomicAdd(&(perf->train_correct), 1); - } + atomicAdd(&(perf->train_all), 1); + if (labels[b] == my_label) { + atomicAdd(&(perf->train_correct), 1); } } if (metrics.measure_sparse_categorical_crossentropy) { - if (labels[b] != MASK_TOKEN) { - float my_logit = - max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); - atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); - } + float my_logit = max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); } if (metrics.measure_mean_squared_error || metrics.measure_root_mean_squared_error || diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index 187e60282f..d6e5bcfdc3 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -202,9 +202,8 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * erfc(-input[i] * M_SQRT1_2) + - 0.5 * M_SQRT1_2 * input[i] * - ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5f)))); + (0.5 * erfc(-input[i] * M_SQRT1_2) - + 0.5 * M_SQRT1_2 * input[i] * exp(-input[i] * input[i] * 0.5))); break; } case OP_RSQRT: { diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index e163c9a0c7..d83d9952c9 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -26,10 +26,8 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, Domain const &input_domain) : OpMeta(handler) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); - checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( - inputTensor, input_domain, softmax->data_type)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); dim = softmax->dim; - last_layer = softmax->last_layer; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); } @@ -68,7 +66,6 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, - float const *output_ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -80,7 +77,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, cudaEventRecord(t_start, stream); } Internal::backward_kernel( - m, input_grad_ptr, output_grad_ptr, output_ptr, num_elements, stream); + input_grad_ptr, output_grad_ptr, num_elements, stream); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -116,33 +113,15 @@ void forward_kernel(SoftmaxMeta const *m, output_ptr)); } -void backward_kernel(SoftmaxMeta const *m, - float *input_grad_ptr, +void backward_kernel(float *input_grad_ptr, float const *output_grad_ptr, - float const *output_ptr, size_t num_elements, cudaStream_t stream) { - - if (m->last_layer) { - checkCUDA(cudaMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(float), - cudaMemcpyDeviceToDevice, - stream)); - } else { - float alpha = 1.0f, beta = 0.0f; - checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - m->inputTensor, - output_ptr, - m->inputTensor, - output_grad_ptr, - &beta, - m->inputTensor, - input_grad_ptr)); - } + checkCUDA(cudaMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); } } // namespace Internal diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 915041d2bb..e8c65b4b03 100644 --- a/src/ops/layer_norm.cc +++ 
b/src/ops/layer_norm.cc @@ -61,27 +61,10 @@ Tensor FFModel::layer_norm(const Tensor input, std::vector const &axes, bool elementwise_affine, float eps, - DataType data_type, char const *name) { - // In PyTorch, axes must be the sizes of the last axes.size() dimensions of - // the input tensor. However, since the tensor dimensions are reversed in - // FlexFlow (batch size is the last dimension), we require that axes must be - // the sizes of the FIRST axes.size() dimensions of the input tensor. - - // Another difference is that in PyTorch, the axes vector should contain the - // sizes of the dimensions with respect to which you want to compute the - // layernorm. In FlexFlow, instead, axes should contain the INDICES of the - // dimensions in question. We do this because the size of a dimension might be - // different when splitting a tensor in model parallelism. - assert( - axes.size() <= input->num_dims && - "number of axes must be less than tensor dimensions"); // input does not - // have replica - // dimension here - for (int i = 0; i < axes.size(); i++) { - assert(axes[i] == i && "axes must be the first axes.size() dimensions"); - } -#ifdef DEADCODE + // FIXME: currently disable elementwise_affine + elementwise_affine = false; + // axes must be the last axes.size() dimensions for (int i = 0; i < axes.size(); i++) { bool found = false; for (int j = 0; j < axes.size(); j++) { @@ -93,33 +76,15 @@ Tensor FFModel::layer_norm(const Tensor input, assert(false && "axes must be the last axes.size() dimensions"); } } -#endif - if (data_type == DT_NONE) { - data_type = input->data_type; - } int num_weights = elementwise_affine ? 2 : 0; - Layer *ln = nullptr; - if (data_type != input->data_type) { - Tensor casted_input = cast(input, data_type, "type cast for layer_norm"); - ln = new Layer(this, - OP_LAYERNORM, - data_type, - name, - 1 /*inputs*/, - num_weights, - 1 /*outputs*/, - casted_input); - } else { - ln = new Layer(this, - OP_LAYERNORM, - data_type, - name, - 1 /*inputs*/, - num_weights, - 1 /*outputs*/, - input); - } - + Layer *ln = new Layer(this, + OP_LAYERNORM, + DT_FLOAT, + name, + 1 /*inputs*/, + num_weights, + 1 /*outputs*/, + input); ln->outputs[0] = create_tensor_legion_ordering(input->num_dims, input->dims, input->data_type, @@ -127,19 +92,19 @@ Tensor FFModel::layer_norm(const Tensor input, 0, true /*create_grad*/); if (num_weights == 2) { - int numdims = axes.size(); - int dims[numdims]; - for (int i = 0; i < numdims; i++) { - dims[i] = input->dims[axes[i]]; + int M = 1; + for (int i = 0; i < axes.size(); i++) { + M *= input->dims[input->num_dims - 1 - axes[i]]; } - ln->weights[0] = create_weight_legion_ordering(numdims, + int dims[1] = {M}; + ln->weights[0] = create_weight_legion_ordering(1, dims, input->data_type, ln, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); - ln->weights[1] = create_weight_legion_ordering(numdims, + ln->weights[1] = create_weight_legion_ordering(1, dims, input->data_type, ln, @@ -214,36 +179,19 @@ LayerNorm::LayerNorm(FFModel &model, ParallelDim output_dims[MAX_TENSOR_DIM]; int M = 1; for (int i = 0; i < axes.size(); i++) { - M *= inputs[0]->dims[axes[i]].size; + M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; } effective_num_elements = M; - effective_batch_size = inputs[0]->get_volume() / M; - assert(elementwise_affine == (numWeights == 2)); + effective_batch_size = inputs[0]->get_shape().get_piece_num_elements() / M; if (numWeights > 0 && allocate_weights) { - ParallelDim dims[axes.size()]; - for (int i = 0; i < axes.size(); i++) { - 
dims[i] = inputs[0]->dims[i]; - } - int seed = std::rand(); - Initializer *gamma_initializer = new UniformInitializer(seed, 0.0f, 1.0f); - Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 1.0f); - weights[0] = - model.create_parallel_weight_legion_ordering(axes.size(), - dims, - _input->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - gamma_initializer, - CHOSEN_SYNC_TYPE); - weights[1] = - model.create_parallel_weight_legion_ordering(axes.size(), - dims, - _input->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - beta_initializer, - CHOSEN_SYNC_TYPE); + int kernel_dims = 2; + assert(false); + // weights[0] = model.create_parallel_weight_legion_ordering( + // kernel_dims, + } else { + // do nothing } + return; } void LayerNorm::init(FFModel const &ff) { @@ -273,20 +221,6 @@ void LayerNorm::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(1, FID_DATA); - if (elementwise_affine) { - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(3, FID_DATA); - } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -299,8 +233,6 @@ OpMeta *LayerNorm::init_task(Task const *task, LayerNorm *ln = (LayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); LayerNormMeta *meta = new LayerNormMeta(handle, ln); - meta->input_type[0] = ln->inputs[0]->data_type; - meta->output_type[0] = ln->outputs[0]->data_type; return meta; } @@ -360,21 +292,14 @@ void LayerNorm::forward_task(Task const *task, assert(task->regions.size() == regions.size()); float const *in_ptr = NULL; float *out_ptr = NULL, *gamma_ptr = NULL, *beta_ptr = NULL; - GenericTensorAccessorR in; - GenericTensorAccessorW out, gamma, beta; - Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - // in_ptr = helperGetTensorPointerRO( - // regions[0], task->regions[0], FID_DATA, ctx, runtime); - in = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + in_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - // out_ptr = helperGetTensorPointerWO( - // regions[1], task->regions[1], FID_DATA, ctx, runtime); - out = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + out_ptr = helperGetTensorPointerWO( + regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(in_domain == out_domain); assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); @@ -382,28 +307,20 @@ void LayerNorm::forward_task(Task const *task, assert(regions.size() == 4); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - // gamma_ptr = helperGetTensorPointerRW( - // regions[2], task->regions[2], FID_DATA, ctx, runtime); - gamma = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + gamma_ptr = helperGetTensorPointerRW( + regions[2], task->regions[2], FID_DATA, ctx, runtime); Domain beta_domain = 
runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - // beta_ptr = helperGetTensorPointerRW( - // regions[3], task->regions[3], FID_DATA, ctx, runtime); - beta = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + beta_ptr = helperGetTensorPointerRW( + regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(gamma_domain == beta_domain); assert(gamma_domain.get_volume() == m->effective_num_elements); - int numdims = gamma_domain.get_dim(); - for (int i = 0; i < numdims; i++) { - int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; - int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; - assert(g_d == in_d); - } } else { assert(regions.size() == 2); } - LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); + + LayerNorm::forward_kernel_wrapper( + m, in_ptr, out_ptr, gamma_ptr, beta_ptr); } void LayerNorm::backward(FFModel const &ff) { @@ -530,100 +447,7 @@ void LayerNorm::backward_task(Task const *task, bool LayerNorm::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { - ParallelTensorBase sub_output, sub_input; - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - Domain input_domain = sub_input.get_domain(); - Domain output_domain = sub_output.get_domain(); - LayerNormMeta *m = new LayerNormMeta(sim->handler, this); - - sim->free_all(); - float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(in_ptr != NULL); - GenericTensorAccessorR input1_acc(inputs[0]->data_type, input_domain, in_ptr); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - float *out_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(out_ptr != NULL); - GenericTensorAccessorW output_acc( - outputs[0]->data_type, output_domain, out_ptr); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - // FIXME please add gamma_ptr and beta_ptr after finish the implementation - float *gamma_ptr = NULL, *beta_ptr = NULL; - GenericTensorAccessorW gamma_acc; - GenericTensorAccessorW beta_acc; - - bool out_of_memory = - (in_ptr == NULL) || (out_ptr == NULL) || - (((gamma_ptr == NULL) || (beta_ptr == NULL)) && (m->elementwise_affine)); - if (out_of_memory) { - cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - return true; - } - - std::function forward, backward; - forward = [&] { - forward_kernel_wrapper(m, input1_acc, output_acc, gamma_acc, beta_acc); - }; - - if (sim->computationMode == COMP_MODE_TRAINING) { - float *in_grad_ptr = - (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(in_grad_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - float *out_grad_ptr = NULL; - out_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(out_grad_ptr != NULL); - cost_metrics.outputs_memory += - cost_metrics.total_mem_diff_from(sim->offset); - - float *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; - - out_of_memory = (in_grad_ptr == NULL) || (out_grad_ptr == NULL) || - (((gamma_grad_ptr == NULL) || (beta_grad_ptr == NULL)) && - (m->elementwise_affine)); - if (out_of_memory) { - cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - return true; - } - - backward = 
[&] { - backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); - }; - } - - inner_measure_operator_cost(sim, forward, backward, cost_metrics); - - if (sim->computationMode == COMP_MODE_TRAINING) { - log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) " - "forward_time(%.4lf) backward_time(%.4lf)\n", - name, - sub_output.get_volume(), - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) " - "forward_time(%.4lf)\n", - name, - sub_output.get_volume(), - cost_metrics.forward_time); - } - - return true; + return false; } void LayerNorm::serialize(Legion::Serializer &sez) const { @@ -688,4 +512,4 @@ size_t hash::operator()( hash_combine(key, params.elementwise_affine); return key; } -}; // namespace std \ No newline at end of file +}; // namespace std diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index f0539f8405..ac477ba2ad 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -13,7 +13,6 @@ * limitations under the License. */ -#include "flexflow/ffconst_utils.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/utils/cuda_helper.h" @@ -31,19 +30,12 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln) effective_num_elements = ln->effective_num_elements; profiling = ln->profiling; eps = ln->eps; - DataType data_type = ln->data_type; - checkCUDA( - cudaMalloc(&mean_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&rstd_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&ds_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&db_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&scale_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&bias_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA(cudaMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); + checkCUDA(cudaMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); + checkCUDA(cudaMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); + checkCUDA(cudaMalloc(&db_ptr, sizeof(float) * effective_batch_size)); + checkCUDA(cudaMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); + checkCUDA(cudaMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); } template @@ -85,26 +77,26 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void RowwiseMomentsCUDAKernel( - int64_t N, float eps, T const *X, T *mean, T *rstd) { - __shared__ float m_shared[C10_WARP_SIZE]; - __shared__ float v_shared[C10_WARP_SIZE]; +__global__ void + RowwiseMomentsCUDAKernel(int64_t N, T eps, T const *X, T *mean, T *rstd) { + __shared__ T m_shared[C10_WARP_SIZE]; + __shared__ T v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - float sum1 = 0.0f; - float sum2 = 0.0f; + T sum1 = 0; + T sum2 = 0; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { - float const scale = float(1) / static_cast(N); + const T scale = T(1) / static_cast(N); sum1 *= 
scale; - sum2 = max(sum2 * scale - sum1 * sum1, float(0)); - mean[i] = static_cast(sum1); - rstd[i] = static_cast(rsqrt(sum2 + eps)); + sum2 = max(sum2 * scale - sum1 * sum1, T(0)); + mean[i] = sum1; + rstd[i] = rsqrt(sum2 + static_cast(eps)); } } @@ -138,30 +130,27 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T *gamma_ptr, T *beta_ptr, cudaStream_t stream) { - RowwiseMomentsCUDAKernel + RowwiseMomentsCUDAKernel <<effective_batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( - m->effective_num_elements, - m->eps, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr)); - LayerNormForwardCUDAKernel + m->effective_num_elements, m->eps, in_ptr, m->mean_ptr, m->rstd_ptr); + LayerNormForwardCUDAKernel <<effective_batch_size, kCUDANumThreads, 0, stream>>>( m->effective_num_elements, in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), + m->mean_ptr, + m->rstd_ptr, gamma_ptr, beta_ptr, out_ptr); } /*static*/ +template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, - GenericTensorAccessorW &gamma, - GenericTensorAccessorW &beta) { + T const *in_ptr, + T *out_ptr, + T *gamma_ptr, + T *beta_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -171,24 +160,8 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - if (m->input_type[0] == DT_FLOAT) { - LayerNorm::forward_kernel(m, - input.get_float_ptr(), - output.get_float_ptr(), - gamma.get_float_ptr(), - beta.get_float_ptr(), - stream); - } else if (m->input_type[0] == DT_HALF) { - LayerNorm::forward_kernel(m, - input.get_half_ptr(), - output.get_half_ptr(), - gamma.get_half_ptr(), - beta.get_half_ptr(), - stream); - } else { - assert(false && "unsupport datatype in layernorm"); - } - + LayerNorm::forward_kernel( + m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -197,8 +170,8 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventDestroy(t_start); cudaEventDestroy(t_end); printf("[LayerNorm] forward time (CF) = %.2fms\n", elapsed); - // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); - // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); } } @@ -379,82 +352,6 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, } } -template -__device__ __inline__ void compute_gI(T const *__restrict__ dY, - T const *__restrict__ X, - T const *__restrict__ mean, - T const *__restrict__ rstd, - T const *__restrict__ gamma, - T *dX, - int const N, - T *buf) { - auto const i1 = blockIdx.x; - const T mean_val = mean[i1]; - const T rstd_val = rstd[i1]; - T stats_x1{0}, stats_x2{0}; - constexpr int unroll = 4; - auto l = unroll * threadIdx.x; - T const *X_i = X + i1 * N; - T const *dY_i = dY + i1 * N; - T *dX_i = dX + i1 * N; - // vectorized reads don't improve perf, so use regular unrolling - - for (; l + unroll - 1 < N; l += blockDim.x * unroll) { -#pragma unroll - for (int k = 0; k < unroll; k++) { - T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l + k]) : T(1); - const T c_h = static_cast(X_i[l + k]); - const T c_loss = static_cast(dY_i[l + k]); - stats_x1 += c_loss * gamma_val; - stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; - } - } - for (; l < N; l++) { - T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); - const T c_h = static_cast(X_i[l]); - const T c_loss = static_cast(dY_i[l]); - stats_x1 += c_loss * gamma_val; - stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; - } - - stats_x1 = BlockReduceSum(stats_x1, buf); - stats_x2 = BlockReduceSum(stats_x2, buf); - if (threadIdx.x == 0) { - buf[0] = stats_x1; - buf[1] = stats_x2; - } - __syncthreads(); - stats_x1 = buf[0]; - stats_x2 = buf[1]; - T fH = N; - T term1 = (T(1) / fH) * rstd_val; - - for (int l = threadIdx.x; l < N; l += blockDim.x) { - const T x = X_i[l]; - const T dy = dY_i[l]; - T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); - T f_grad_input = fH * gamma_val * dy; - f_grad_input -= (x - mean_val) * rstd_val * stats_x2; - f_grad_input -= stats_x1; - f_grad_input *= term1; - dX_i[l] = f_grad_input; - } -} - -template -__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, - T const *__restrict__ X, - T const *__restrict__ mean, - T const *__restrict__ rstd, - T const *__restrict__ gamma, - T *dX, - int const N) { - alignas(sizeof(double)) extern __shared__ char s_data1[]; - T *buf = reinterpret_cast(&s_data1); - - compute_gI(dY, X, mean, rstd, gamma, dX, N, buf); -} - /*static*/ template void LayerNorm::backward_kernel(LayerNormMeta const *m, @@ -469,34 +366,17 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, const int64_t N = m->effective_num_elements; ComputeInternalGradientsCUDAKernel <<>>( - N, - output_grad_ptr, - input_ptr, - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); + N, output_grad_ptr, input_ptr, gamma_ptr, m->ds_ptr, m->db_ptr); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; ComputeGradientFusedParamsCUDAKernel <<>>(M, N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); - int const warp_size = C10_WARP_SIZE; - int const num_threads = 128; - const dim3 blocks(M); - int nshared = (num_threads / warp_size) * sizeof(T); - layer_norm_grad_input_kernel<<>>( - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - input_grad_ptr, - N); + m->mean_ptr, + m->rstd_ptr, + m->ds_ptr, + m->db_ptr, + m->scale_ptr, + m->bias_ptr); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly @@ -506,8 +386,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), + m->mean_ptr, + m->rstd_ptr, gamma_grad_ptr, beta_grad_ptr); } else { @@ -516,15 +396,14 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, constexpr int kThreadX = kColwiseReduceTileSize; constexpr int kThreadY = kColwiseReduceTileSize / 2; GammaBetaBackwardCUDAKernel - <<>>( - M, - N, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_grad_ptr, - beta_grad_ptr); + <<>>(M, + N, + output_grad_ptr, + input_ptr, + m->mean_ptr, + m->rstd_ptr, + gamma_grad_ptr, + beta_grad_ptr); } } } @@ -540,18 +419,21 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, T *beta_grad_ptr) { cudaStream_t stream; 
checkCUDA(get_legion_stream(&stream)); - if (m->output_type[0] == DT_FLOAT) { - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); - } + LayerNorm::backward_kernel(m, + output_grad_ptr, + input_ptr, + input_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr, + stream); } +template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, + float const *in_ptr, + float *out_ptr, + float *gamma_ptr, + float *beta_ptr); template void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, float const *output_grad_ptr, @@ -561,4 +443,4 @@ template void float *gamma_grad_ptr, float *beta_grad_ptr); -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index ab65db542e..029b20afd1 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -52,10 +52,7 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(const Tensor _input, - int dim, - bool last_layer, - char const *name) { +Tensor FFModel::softmax(const Tensor _input, int dim, char const *name) { Layer *sm = new Layer(this, OP_SOFTMAX, DT_FLOAT, @@ -72,8 +69,6 @@ Tensor FFModel::softmax(const Tensor _input, sm->outputs[0] = create_tensor_legion_ordering( numdims, dims, DT_FLOAT, sm, 0, true /*create_grad*/); sm->add_int_property("softmax_dim", dim); - - sm->add_int_property("last_layer", last_layer); layers.push_back(sm); return sm->outputs[0]; } @@ -85,19 +80,15 @@ Op *Softmax::create_operator_from_layer( long long value; layer->get_int_property("softmax_dim", value); int dim = (int)value; - layer->get_int_property("last_layer", value); - bool last_layer = (bool)value; return new Softmax(model, inputs[0], (inputs[0]->num_dims - 1 - dim) % inputs[0]->num_dims, - last_layer, layer->name); } Softmax::Softmax(FFModel &model, const ParallelTensor _input, int _dim, - bool _last_layer, char const *name) : Op(model, OP_SOFTMAX, @@ -107,7 +98,7 @@ Softmax::Softmax(FFModel &model, 0 /*weights*/, 1 /*outputs*/, _input), - dim(_dim), last_layer(_last_layer) { + dim(_dim) { // Currently assume we always perform softmax along the inner most dim assert(dim == 0); ParallelDim dims[MAX_TENSOR_DIM]; @@ -122,7 +113,7 @@ Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, const ParallelTensor input, char const *name) - : Softmax(model, input, params.dim, params.last_layer, name) {} + : Softmax(model, input, params.dim, name) {} void Softmax::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -292,13 +283,6 @@ void Softmax::backward(FFModel const &ff) { EXCLUSIVE, outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); - - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); runtime->execute_index_space(ctx, launcher); } @@ -331,8 +315,8 @@ void Softmax::backward_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); + assert(regions.size() == 2); + assert(task->regions.size() == 2); // const Softmax* softmax = (Softmax*) task->args; SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); TensorAccessorW acc_input_grad(regions[0], @@ -343,16 +327,11 @@ void Softmax::backward_task_with_dim(Task const *task, true /*readOutput*/); TensorAccessorR acc_output_grad( regions[1], 
task->regions[1], FID_DATA, ctx, runtime); - TensorAccessorR acc_output( - regions[2], task->regions[1], FID_DATA, ctx, runtime); // make sure the image indices match! assert(acc_input_grad.rect == acc_output_grad.rect); - backward_kernel_wrapper(m, - acc_input_grad.ptr, - acc_output_grad.ptr, - acc_output.ptr, - acc_input_grad.rect.volume()); + backward_kernel_wrapper( + m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); } bool Softmax::get_int_parameter(PMParameter para, int *value) const { @@ -398,17 +377,11 @@ bool Softmax::measure_operator_cost(Simulator *sim, float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_grad_ptr != NULL); - float *output_ptr = - (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper(m, - input_grad_ptr, - output_grad_ptr, - output_ptr, - sub_output.get_volume()); + backward_kernel_wrapper( + m, input_grad_ptr, output_grad_ptr, sub_output.get_volume()); }; } @@ -440,7 +413,6 @@ size_t hash::operator()( FlexFlow::SoftmaxParams const ¶ms) const { size_t key = 0; hash_combine(key, params.dim); - hash_combine(key, params.last_layer); return key; } }; // namespace std diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index b6004af14a..53e61b90d9 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -215,7 +215,7 @@ __host__ void int idx = 0; printf("%s", prefix); for (idx = 0; idx < num_elements; idx++) { - printf(" %.10lf", (float)host_ptr[idx]); + printf(" %.4lf", (float)host_ptr[idx]); if (idx >= 16) { break; } @@ -224,76 +224,6 @@ __host__ void checkCUDA(cudaFreeHost(host_ptr)); } -template -__host__ void - save_tensor(T const *ptr, size_t num_elements, char const *file_name) { - T *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(T) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpy( - host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost)); - FILE *tensor_file; - tensor_file = fopen(file_name, "w"); - for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.8f, ", (float)host_ptr[i]); - } - - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); -} - -cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( - cudnnTensorDescriptor_t tensor, Domain domain, DataType data_type) { - int dims[MAX_TENSOR_DIM]; - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(data_type); - switch (domain.get_dim()) { - case 1: { - Rect<1> rect = domain; - dims[0] = rect.hi[0] - rect.lo[0] + 1; - return cudnnSetTensor4dDescriptor( - tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[0], 1, 1, 1); - } - case 2: { - Rect<2> rect = domain; - dims[0] = rect.hi[0] - rect.lo[0] + 1; - dims[1] = rect.hi[1] - rect.lo[1] + 1; - return cudnnSetTensor4dDescriptor( - tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[1], dims[0], 1, 1); - } - case 3: { - Rect<3> rect = domain; - dims[0] = rect.hi[0] - rect.lo[0] + 1; - dims[1] = rect.hi[1] - rect.lo[1] + 1; - dims[2] = rect.hi[2] - rect.lo[2] + 1; - return cudnnSetTensor4dDescriptor(tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - dims[2] * dims[1], - dims[0], - 1, - 1); - } - case 4: { - Rect<4> rect = domain; - dims[0] = rect.hi[0] - rect.lo[0] + 1; - dims[1] = rect.hi[1] - rect.lo[1] + 1; - dims[2] = rect.hi[2] - rect.lo[2] + 1; - dims[3] = rect.hi[3] - rect.lo[3] + 1; - return cudnnSetTensor4dDescriptor(tensor, - CUDNN_TENSOR_NCHW, 
- cudnn_data_type, - dims[3] * dims[2] * dims[1], - dims[0], - 1, - 1); - } - default: - assert(false && "Unsupported dim number"); - } - return CUDNN_STATUS_BAD_PARAM; -} - cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Domain domain) { int dims[MAX_TENSOR_DIM]; @@ -440,8 +370,3 @@ template __host__ void print_tensor(int32_t const *ptr, size_t rect, char const *prefix); template __host__ void print_tensor(int64_t const *ptr, size_t rect, char const *prefix); -template __host__ void - save_tensor(float const *ptr, size_t rect, char const *file_name); -template __host__ void save_tensor(int32_t const *ptr, - size_t rect, - char const *file_name); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 5310588477..4b55a39104 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1718,7 +1718,6 @@ GraphOptimalViewSerialized case OP_SOFTMAX: { Softmax *softmax = (Softmax *)op; sez.serialize(softmax->dim); - sez.serialize(softmax->last_layer); break; } case OP_REPARTITION: { @@ -2099,11 +2098,8 @@ void FFModel::deserialize_graph_optimal_view( case OP_SOFTMAX: { assert(num_inputs == 1); int softmax_dim; - bool last_layer; dez.deserialize(softmax_dim); - dez.deserialize(last_layer); - node = - get_or_create_node(inputs[0], {softmax_dim, last_layer}); + node = get_or_create_node(inputs[0], {softmax_dim}); break; } case OP_TRANSPOSE: { diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 6d52e135cd..2925eb7555 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -1930,8 +1930,8 @@ void GraphSearchHelper::graph_optimize( } } best_graph->print_strategy_computation_graph(optimal.views); - // std::cout << "PCG:" << std::endl; - // best_graph->print_dot(); + //std::cout << "PCG:" << std::endl; + //best_graph->print_dot(); optimal_views = real_optimal_views; } @@ -3120,7 +3120,7 @@ void FFModel::graph_optimize( std::unordered_map &optimal_views) { this->graph_search->graph_optimize( budget, only_data_parallel, best_graph, optimal_views); - best_graph->print_dot(); + best_graph->print_dot(); } bool FFModel::convert_graph_to_operators( @@ -3221,8 +3221,7 @@ bool FFModel::convert_graph_to_operators( case OP_SOFTMAX: { assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; - new_op = new Softmax( - *this, inputs[0], softmax->dim, softmax->last_layer, NULL); + new_op = new Softmax(*this, inputs[0], softmax->dim, NULL); break; } case OP_COMBINE: { From 2025d565e8bdcc9a03b8a0e84d64e7b69a77cb1d Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Wed, 5 Jul 2023 13:40:54 -0400 Subject: [PATCH 37/52] fix bert training issue. 
(#832) --- examples/python/pytorch/mt5/mt5_ff.py | 26 +- include/flexflow/model.h | 6 +- .../flexflow/ops/kernels/softmax_kernels.h | 6 +- include/flexflow/ops/layer_norm.h | 11 +- include/flexflow/ops/softmax.h | 2 + include/flexflow/ops/softmax_params.h | 1 + include/flexflow/parallel_ops/replicate.h | 17 +- include/flexflow/utils/cuda_helper.h | 6 + python/flexflow/core/flexflow_cffi.py | 23 +- python/flexflow/torch/model.py | 51 ++-- python/flexflow_c.cc | 7 +- python/flexflow_c.h | 1 + src/loss_functions/loss_functions.cc | 11 +- src/loss_functions/loss_functions.cu | 59 ++-- src/metrics_functions/metrics_functions.cc | 6 +- src/metrics_functions/metrics_functions.cu | 18 +- src/ops/element_unary.cu | 5 +- src/ops/kernels/softmax.cu | 37 ++- src/ops/layer_norm.cc | 254 +++++++++++++++--- src/ops/layer_norm.cu | 248 ++++++++++++----- src/ops/softmax.cc | 46 +++- src/runtime/cuda_helper.cu | 77 +++++- src/runtime/graph.cc | 6 +- src/runtime/substitution.cc | 9 +- 24 files changed, 729 insertions(+), 204 deletions(-) diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index 08af8d88a7..c2868e9d1e 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -64,8 +64,8 @@ def preprocess_train() -> None: y_shape = y.shape assert len(y.shape) == 2, \ "`y` should have shape (num examples, sequence length)" - y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) - lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) + y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) + lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) y_ids[:, :] = y[:, :-1] lm_labels[:, :] = y[:, 1:] @@ -89,26 +89,29 @@ def top_level_task(): #model = BertModel.from_pretrained("bert-base-uncased") # Load train data as numpy arrays print("Loading data...") - ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")) + ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")).astype('int32') ids = np.pad(ids, ((0,0), (0,17)), 'constant') #ids = np.random.randint(0, 5, (1000, 512)) #print('ids_shape', ids.shape) #print('ids', ids) - mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")) + mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")).astype('int32') mask = np.pad(mask, ((0,0), (0,17)), 'constant') #mask = np.random.randint(0, 2, (1000, 512)) #y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) - lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")) + lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")).astype('int32') lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant') #lm_labels = np.random.randint(-1, 5, (1000, 512)) + position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() + token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() + batch_size = ffconfig.batch_size input_ids_shape = (batch_size, ids.shape[1]) attention_mask_shape = (batch_size, mask.shape[1]) #decoder_input_ids_shape = (batch_size, y_ids.shape[1]) input_tensors = [ - ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids - ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask + ffmodel.create_tensor(input_ids_shape, DataType.DT_INT32), # input_ids + ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT32), # attention_mask #ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids ] encoder_seq_length = ids.shape[1] @@ 
-126,7 +129,7 @@ def top_level_task(): output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True) #from flexflow.torch.model import file_to_ff #file_to_ff("mt5.ff", ffmodel, input_tensors) - ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, adam_beta1=0.9, adam_beta2=0.98, weight_decay=0.0, adam_epsilon=2e-8) + ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, beta1=0.9, beta2=0.98, weight_decay=0.0, epsilon=2e-8) # ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) print("Compiling the model...") @@ -138,6 +141,9 @@ def top_level_task(): MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, ], ) + + # load weights here + ffmodel.load_bert_pretrained(checkpoint=model) print("Creating data loaders...") print('id_dtype', ids.dtype) @@ -148,6 +154,8 @@ def top_level_task(): #decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) # NOTE: We cast down the label tensor data to 32-bit to accommodate the # label tensor's required dtype + token_type_ids_dl = ffmodel.create_data_loader(input_tensors[2], token_type_ids) + position_id_dl = ffmodel.create_data_loader(input_tensors[3], position_id) labels_dl = ffmodel.create_data_loader( ffmodel.label_tensor, lm_labels.astype("int32") ) @@ -159,7 +167,7 @@ def top_level_task(): epochs = ffconfig.epochs ffmodel.fit( #x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], - x=[input_ids_dl, attention_mask_dl], + x=[input_ids_dl, attention_mask_dl, position_id_dl, token_type_ids_dl], y=labels_dl, batch_size=batch_size, epochs=epochs, ) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index c6bc6929ad..0496d5fa8f 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -472,6 +472,7 @@ class FFModel { std::vector const &axes, bool elementwise_affine, float eps, + DataType data_type = DT_NONE, char const *name = NULL); // Add a batch_norm layer Tensor @@ -518,7 +519,10 @@ class FFModel { // Add a flat layer Tensor flat(const Tensor input, char const *name = NULL); // Add a softmax layer - Tensor softmax(const Tensor input, int dim = -1, char const *name = NULL); + Tensor softmax(const Tensor input, + int dim = -1, + bool last_layer = false, + char const *name = NULL); // Create input tensors and constants Tensor transpose(const Tensor input, std::vector const &perm, diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 81b34d8558..9aec9f57c9 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -20,6 +20,7 @@ class SoftmaxMeta : public OpMeta { #endif bool profiling; int dim; + bool last_layer; char op_name[MAX_OPNAME]; }; @@ -33,6 +34,7 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements); namespace Internal { @@ -40,8 +42,10 @@ void forward_kernel(SoftmaxMeta const *m, float const *input_ptr, float *output_ptr, ffStream_t stream); -void backward_kernel(float *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements, ffStream_t stream); } // namespace Internal diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 8273b9ab52..552b9cf365 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -66,12 +66,11 @@ class LayerNorm : public Op { T *gamma_ptr, T 
*beta_ptr, ffStream_t stream); - template static void forward_kernel_wrapper(LayerNormMeta const *m, - T const *input_ptr, - T *output_ptr, - T *gamma_ptr, - T *beta_ptr); + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorW &gamma, + GenericTensorAccessorW &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, @@ -105,7 +104,7 @@ class LayerNormMeta : public OpMeta { bool elementwise_affine; int64_t effective_batch_size, effective_num_elements; float eps; - float *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; + void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; char op_name[MAX_OPNAME]; }; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 25a20315bd..2616294a3a 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -15,6 +15,7 @@ class Softmax : public Op { Softmax(FFModel &model, const ParallelTensor logit, int dim, + bool _last_layer, char const *name); Softmax(FFModel &model, Params const ¶ms, @@ -64,6 +65,7 @@ class Softmax : public Op { public: int dim; + bool last_layer; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/softmax_params.h b/include/flexflow/ops/softmax_params.h index d805d9966d..545e3a5cb9 100644 --- a/include/flexflow/ops/softmax_params.h +++ b/include/flexflow/ops/softmax_params.h @@ -7,6 +7,7 @@ namespace FlexFlow { struct SoftmaxParams { int dim; + bool last_layer; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(SoftmaxParams const &, SoftmaxParams const &); diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index 9514d055f3..ac41a6437e 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -32,9 +32,9 @@ class Replicate : public ParallelOp { bool append_parallel_op_info( std::vector ¶llel_ops) const override; static void init_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -46,16 +46,17 @@ class Replicate : public ParallelOp { template static void - forward_task_with_type(Legion::Task const *task, + forward_task_with_type(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); template - static void backward_task_with_type(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); + static void backward_task_with_type( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 46e323b186..a4b2be0a66 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -132,9 +132,15 @@ __host__ void updateGAS(float *para_ptr, template void print_tensor(T const *ptr, size_t num_elements, char const *prefix); +template +void save_tensor(T const *ptr, size_t num_elements, char const *file_name); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain); +cudnnStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(cudnnTensorDescriptor_t tensor, + Legion::Domain 
domain, + DataType data_type = DT_FLOAT); cudaDataType_t ff_to_cuda_datatype(DataType type); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 42339d781c..4c01057109 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1595,7 +1595,7 @@ def flat(self, input, name=None): self.add_layer(OpType.FLAT, name) return Tensor(handle, owner_op_type=OpType.FLAT) - def softmax(self, input, axis=-1, name=None): + def softmax(self, input, axis=-1, last_layer=False, name=None): """Softmax activation function. :param input: the input Tensor. @@ -1607,7 +1607,7 @@ def softmax(self, input, axis=-1, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, c_name) + handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, last_layer, c_name) self.add_layer(OpType.SOFTMAX, name) return Tensor(handle, owner_op_type=OpType.SOFTMAX) @@ -2041,6 +2041,25 @@ def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): ff_tensor.set_tensor(self, np_tensor) print("Compiled ffmodel!") + def load_bert_pretrained(self, checkpoint=None): + # store weights in dict + weights_dict = {} + for name, params in checkpoint.named_parameters(): + weights_dict[name.replace("LayerNorm", "layer_norm").replace(".", "_")] = params.detach().cpu().numpy() + print(name.replace("LayerNorm", "layer_norm").replace(".", "_")) + # some weights not in params + weights_dict['cls_predictions_decoder_weight'] = checkpoint.cls.predictions.decoder.weight.detach().cpu().numpy() + weights_dict['cls_predictions_decoder_bias'] = checkpoint.cls.predictions.decoder.bias.detach().cpu().numpy() + for i in range (self._nb_layers): + layer = self._layers[i] + if (layer.name + "_weight") in weights_dict: + print('weight: ' + layer.name) + weight = layer.get_parameter_by_id(0); + weight.set_tensor(self, weights_dict[layer.name + "_weight"]) + if (layer.name + "_bias") in weights_dict: + print('bias: ' + layer.name) + bias = layer.get_parameter_by_id(1); + bias.set_tensor(self, weights_dict[layer.name + "_bias"]) def fit(self, x=None, y=None, batch_size=None, epochs=1): """Trains the model for a fixed number of epochs (iterations on a dataset). 
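
The load_bert_pretrained helper added above copies weights from a Hugging Face checkpoint into the traced FlexFlow layers: parameter names are sanitized ("LayerNorm" -> "layer_norm", "." -> "_") and matched against each layer's name, with the weight written to parameter slot 0 and the bias to slot 1; layers without a matching key are left untouched. A minimal usage sketch under those assumptions, mirroring the call sequence in mt5_ff.py above (the model must already be traced and compiled so that its layers exist):

    from transformers import BertForMaskedLM

    model = BertForMaskedLM.from_pretrained("bert-base-uncased")
    # ... trace with hf_model.torch_to_ff(ffmodel, input_tensors) and call
    # ffmodel.compile(...) first, so ffmodel's layer list is populated ...
    ffmodel.load_bert_pretrained(checkpoint=model)
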
diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 11e0c16e48..8ebac2146c 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -664,12 +664,13 @@ def string_to_ff(string, ffmodel, node_to_output): def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] - axes = [len(input_tensor.dims) - 1] + axes = [0] + eps = self.module.eps return ffmodel.layer_norm( input=input_tensor, axes=axes, elementwise_affine=True, - eps=1e-6, + eps=eps, name=self.name, ) @@ -1197,16 +1198,24 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) - return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, name=self.name, - ) - + if self.scalar_pos == FunctionNode.ScalarPosition.RIGHT: + return ffmodel.scalar_sub( + input=input_tensor, scalar=scalar, inplace=False, name=self.name, + ) + else: + negative_input = ffmodel.scalar_multiply( + input=input_tensor, scalar=-1, inplace=False, name=self.name + '_negative', + ) + return ffmodel.scalar_sub( + input=negative_input, scalar=-scalar, inplace=False, name=self.name, + ) + class ScalarTrueDivNode(FunctionNode): def __init__(self, node): @@ -1231,15 +1240,16 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] scalar = self.innodes[1] assert type(scalar) is float + return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, name=self.name, + input=input_tensor, scalar=scalar, inplace=False, name=self.name, ) @@ -1652,14 +1662,14 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, name=self.name, + input=input_tensor, scalar=scalar, inplace=False, name=self.name, ) @@ -2359,11 +2369,13 @@ def string_to_ff(string, ffmodel, node_to_output): "since attributes require access to the PyTorch model" ) - def to_ff(self, ffmodel, node_to_output): - return self.attr_to_ff_tensor(ffmodel) + def to_ff(self, ffmodel, node_to_output, input_tensors): + return self.attr_to_ff_tensor(ffmodel, input_tensors) + + def attr_to_ff_tensor(self, ffmodel, input_tensors): + - def attr_to_ff_tensor(self, ffmodel): - torch_tensor = self.attr + torch_tensor = self.attr assert (torch_tensor.shape[0] == 1) batch_size = ffmodel._ffconfig.batch_size torch_tensor = np.repeat(torch_tensor, batch_size, axis=0) @@ -2382,15 +2394,16 @@ def attr_to_ff_tensor(self, ffmodel): np_tensor = np_tensor.astype(np.float32) print('attr: ', torch_tensor.shape) - assert (torch_tensor.shape[0] == batch_size) + assert (torch_tensor.shape[0] == batch_size) 
ff_tensor = ffmodel.create_tensor( - torch_tensor.shape, ff_dtype, requires_grad, + torch_tensor.shape, ff_dtype, True, ) # delay set_tensor, add to ffmodel ffmodel.attr_tensors[ff_tensor] = np_tensor # ff_tensor.set_tensor( # ffmodel, np_tensor # ) + input_tensors.append(ff_tensor) return ff_tensor @@ -2472,7 +2485,7 @@ def to_ff(self, ffmodel, node_to_output, output_tensors): # `CrossEntropyLoss()` implementation logits = node_to_output[other["logits"].name] softmax_logits = ffmodel.softmax( - input=logits, name=self.name, + input=logits, last_layer=True, name=self.name, ) output_tensors[:] += [softmax_logits] else: @@ -2606,6 +2619,8 @@ def torch_to_ff(self, ffmodel, input_tensors, verbose=False): elif isinstance(node, OutputNode): node.to_ff(ffmodel, node_to_output, output_tensors) node_output = None + elif isinstance(node, AttributeNode): + node_output = node.to_ff(ffmodel, node_to_output, input_tensors) else: node_output = node.to_ff(ffmodel, node_to_output) diff --git a/python/flexflow_c.cc b/python/flexflow_c.cc index 74a5da6ce1..fd688c0c6a 100644 --- a/python/flexflow_c.cc +++ b/python/flexflow_c.cc @@ -568,8 +568,8 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, for (int i = 0; i < n; i++) { axes_vec.push_back(axes[i]); } - Tensor tensor = - handle->layer_norm(input, axes_vec, elementwise_affine, eps, name); + Tensor tensor = handle->layer_norm( + input, axes_vec, elementwise_affine, eps, input->data_type, name); DEBUG_PRINT("[LayerNorm] new Tensor %p, input %p, elementwise_affine %d, eps " "%f, name %s", tensor, @@ -730,10 +730,11 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_, const flexflow_tensor_t input_, int dim, + bool last_layer, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor tensor = handle->softmax(input, dim, name); + Tensor tensor = handle->softmax(input, dim, last_layer, name); DEBUG_PRINT( "[Softmax] new Tensor %p, input %p, name %s", tensor, input, name); return FFCObjectWrapper::wrap(tensor); diff --git a/python/flexflow_c.h b/python/flexflow_c.h index fb64c78fd2..5409002e5e 100644 --- a/python/flexflow_c.h +++ b/python/flexflow_c.h @@ -276,6 +276,7 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle, const flexflow_tensor_t input, int dim, + bool last_layer, char const *name); flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle, diff --git a/src/loss_functions/loss_functions.cc b/src/loss_functions/loss_functions.cc index ae89c3d469..d887ee9243 100644 --- a/src/loss_functions/loss_functions.cc +++ b/src/loss_functions/loss_functions.cc @@ -49,6 +49,8 @@ void Loss::backward(FFModel *model, if (loss_type == LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE) { assert(logit->get_volume() == label->get_volume()); scale_factor = 2.0f / logit->get_volume(); + } else if (loss_type == LOSS_SPARSE_CATEGORICAL_CROSSENTROPY) { + scale_factor = 1.0f; } else { scale_factor = 1.0f / model->config.batchSize; } @@ -131,9 +133,12 @@ void Loss::backward_task_with_dim(Task const *task, regions[2], task->regions[2], FID_DATA, ctx, runtime); // assertion the outter-most dim is replica dim and replica degree is 1 assert(acc_logit.rect.hi[NDIM - 1] == acc_logit.rect.lo[NDIM - 1]); - int num_samples = - acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; - int num_classes = 
acc_logit.rect.volume() / num_samples; + + int num_classes = acc_logit.rect.hi[0] - acc_logit.rect.lo[0] + 1; + int num_samples = acc_logit.rect.volume() / num_classes; + // int num_samples = + // acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; + // int num_classes = acc_logit.rect.volume() / num_samples; assert(acc_logit_grad.rect == acc_logit.rect); int k = 1; if (loss->repl_labels) { diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index 01766347b0..edd8f03fa4 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -18,6 +18,7 @@ namespace FlexFlow { +int const MASK_TOKEN = -100; using namespace Legion; __global__ void @@ -32,6 +33,25 @@ __global__ void } } +__global__ void + sparse_categorical_crossentropy_loss_backward_with_mask(float *logit_grad, + int const *label, + coord_t num_samples, + coord_t num_classes, + int const k, + float *num) { + CUDA_KERNEL_LOOP(i, num_samples * num_classes) { + int sample_id = i / num_classes; + int label_idx = label[i / (k * num_classes)]; + if (label_idx != MASK_TOKEN && (i == sample_id * num_classes + label_idx)) { + logit_grad[i] -= 1.0f; + atomicAdd(&num[0], 1.0f); + } else if (label_idx == MASK_TOKEN) { + logit_grad[i] = 0.0f; + } + } +} + __global__ void categorical_crossentropy_loss_backward(float *logit_grad, float const *logit, float const *label, @@ -74,14 +94,25 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_ptr, logit_volume * sizeof(float), cudaMemcpyDeviceToDevice)); - sparse_categorical_crossentropy_loss_backward<<>>( - logit_grad_ptr, label_ptr, num_samples, num_classes, k); - // Scale logit gradients by op->scale_factor + // calculate the scale factor inside kernel; + assert(scale_factor == 1.0f); + float *num; + checkCUDA(cudaMalloc(&num, sizeof(float))); + float effective_tokens; + int parallelism = num_samples * num_classes; + // sparse_categorical_crossentropy_loss_backward<<>>( + // logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); + sparse_categorical_crossentropy_loss_backward_with_mask<<< + GET_BLOCKS(parallelism), + CUDA_NUM_THREADS, + 0, + stream>>>(logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); + cudaMemcpy(&effective_tokens, num, sizeof(float), cudaMemcpyDeviceToHost); scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); + logit_grad_ptr, logit_grad_volume, 0, 1.0f / effective_tokens); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( @@ -122,19 +153,17 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( logit_grad_ptr, logit_grad_volume, 0, scale_factor); } -void Loss::identity_loss_backward_kernel_wrapper( - float *loss_grad_ptr, - float const *loss_ptr, - size_t loss_volume, - size_t loss_grad_volume, - float scale_factor) { +void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, + float const *loss_ptr, + size_t loss_volume, + size_t loss_grad_volume, + float scale_factor) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); identity_loss_backward<<>>( - loss_grad_ptr, loss_ptr, loss_volume); + stream>>>(loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( loss_grad_ptr, loss_grad_volume, 0, scale_factor); diff --git a/src/metrics_functions/metrics_functions.cc b/src/metrics_functions/metrics_functions.cc index 7244b06925..8c7e23ad8a 100644 --- a/src/metrics_functions/metrics_functions.cc +++ 
b/src/metrics_functions/metrics_functions.cc @@ -91,8 +91,8 @@ void Metrics::compute(FFModel *model, false /*must*/, 0 /*mapper_id*/, logit->machine_view.hash()); - std::cout << "logit shape: " << logit->get_shape() << std::endl; - std::cout << "label shape: " << label->get_shape() << std::endl; + // std::cout << "logit shape: " << logit->get_shape() << std::endl; + // std::cout << "label shape: " << label->get_shape() << std::endl; launcher.add_region_requirement(RegionRequirement( logit->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, logit->region)); launcher.add_field(0, FID_DATA); @@ -157,7 +157,7 @@ PerfMetrics assert(acc_label.rect.lo[0] == acc_label.rect.hi[0]); // Cannot measure categorical_crossentropy w/ sparse labels // Use measure_sparse_categorical_crossentropy instead - std::cout << "num_classes: " << num_classes << std::endl; + // std::cout << "num_classes: " << num_classes << std::endl; assert(!me->measure_categorical_crossentropy); Metrics::update_metrics_sparse_label_kernel_wrapper(acc_logit.ptr, acc_label.ptr, diff --git a/src/metrics_functions/metrics_functions.cu b/src/metrics_functions/metrics_functions.cu index b68b10d873..8c584c397c 100644 --- a/src/metrics_functions/metrics_functions.cu +++ b/src/metrics_functions/metrics_functions.cu @@ -19,6 +19,7 @@ namespace FlexFlow { float const LOG_MIN_VALUE = 0.00000001f; +int const MASK_TOKEN = -100; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, @@ -29,7 +30,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = 0; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { @@ -38,14 +39,19 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, } } assert(my_label >= 0); - atomicAdd(&(perf->train_all), 1); - if (labels[b] == my_label) { - atomicAdd(&(perf->train_correct), 1); + if (labels[b] != MASK_TOKEN) { + atomicAdd(&(perf->train_all), 1); + if (labels[b] == my_label) { + atomicAdd(&(perf->train_correct), 1); + } } } if (metrics.measure_sparse_categorical_crossentropy) { - float my_logit = max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); - atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + if (labels[b] != MASK_TOKEN) { + float my_logit = + max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + } } if (metrics.measure_mean_squared_error || metrics.measure_root_mean_squared_error || diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index d6e5bcfdc3..187e60282f 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -202,8 +202,9 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * erfc(-input[i] * M_SQRT1_2) - - 0.5 * M_SQRT1_2 * input[i] * exp(-input[i] * input[i] * 0.5))); + (0.5 * erfc(-input[i] * M_SQRT1_2) + + 0.5 * M_SQRT1_2 * input[i] * + ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5f)))); break; } case OP_RSQRT: { diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index d83d9952c9..e163c9a0c7 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -26,8 +26,10 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, Domain const &input_domain) : OpMeta(handler) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); - 
checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + inputTensor, input_domain, softmax->data_type)); dim = softmax->dim; + last_layer = softmax->last_layer; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); } @@ -66,6 +68,7 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -77,7 +80,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, cudaEventRecord(t_start, stream); } Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + m, input_grad_ptr, output_grad_ptr, output_ptr, num_elements, stream); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -113,15 +116,33 @@ void forward_kernel(SoftmaxMeta const *m, output_ptr)); } -void backward_kernel(float *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements, cudaStream_t stream) { - checkCUDA(cudaMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(float), - cudaMemcpyDeviceToDevice, - stream)); + + if (m->last_layer) { + checkCUDA(cudaMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); + } else { + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->inputTensor, + output_ptr, + m->inputTensor, + output_grad_ptr, + &beta, + m->inputTensor, + input_grad_ptr)); + } } } // namespace Internal diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index e8c65b4b03..915041d2bb 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -61,10 +61,27 @@ Tensor FFModel::layer_norm(const Tensor input, std::vector const &axes, bool elementwise_affine, float eps, + DataType data_type, char const *name) { - // FIXME: currently disable elementwise_affine - elementwise_affine = false; - // axes must be the last axes.size() dimensions + // In PyTorch, axes must be the sizes of the last axes.size() dimensions of + // the input tensor. However, since the tensor dimensions are reversed in + // FlexFlow (batch size is the last dimension), we require that axes must be + // the sizes of the FIRST axes.size() dimensions of the input tensor. + + // Another difference is that in PyTorch, the axes vector should contain the + // sizes of the dimensions with respect to which you want to compute the + // layernorm. In FlexFlow, instead, axes should contain the INDICES of the + // dimensions in question. We do this because the size of a dimension might be + // different when splitting a tensor in model parallelism. 
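+  // For example, a PyTorch activation of shape (batch, seq_len, hidden) is
+  // stored with reversed dimensions in FlexFlow, i.e. (hidden, seq_len,
+  // batch), so normalizing over the hidden dimension corresponds to
+  // axes = {0} here (this is what the PyTorch tracer emits), whereas
+  // PyTorch's nn.LayerNorm would instead be constructed with
+  // normalized_shape = (hidden,).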
+ assert( + axes.size() <= input->num_dims && + "number of axes must be less than tensor dimensions"); // input does not + // have replica + // dimension here + for (int i = 0; i < axes.size(); i++) { + assert(axes[i] == i && "axes must be the first axes.size() dimensions"); + } +#ifdef DEADCODE for (int i = 0; i < axes.size(); i++) { bool found = false; for (int j = 0; j < axes.size(); j++) { @@ -76,15 +93,33 @@ Tensor FFModel::layer_norm(const Tensor input, assert(false && "axes must be the last axes.size() dimensions"); } } +#endif + if (data_type == DT_NONE) { + data_type = input->data_type; + } int num_weights = elementwise_affine ? 2 : 0; - Layer *ln = new Layer(this, - OP_LAYERNORM, - DT_FLOAT, - name, - 1 /*inputs*/, - num_weights, - 1 /*outputs*/, - input); + Layer *ln = nullptr; + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for layer_norm"); + ln = new Layer(this, + OP_LAYERNORM, + data_type, + name, + 1 /*inputs*/, + num_weights, + 1 /*outputs*/, + casted_input); + } else { + ln = new Layer(this, + OP_LAYERNORM, + data_type, + name, + 1 /*inputs*/, + num_weights, + 1 /*outputs*/, + input); + } + ln->outputs[0] = create_tensor_legion_ordering(input->num_dims, input->dims, input->data_type, @@ -92,19 +127,19 @@ Tensor FFModel::layer_norm(const Tensor input, 0, true /*create_grad*/); if (num_weights == 2) { - int M = 1; - for (int i = 0; i < axes.size(); i++) { - M *= input->dims[input->num_dims - 1 - axes[i]]; + int numdims = axes.size(); + int dims[numdims]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[axes[i]]; } - int dims[1] = {M}; - ln->weights[0] = create_weight_legion_ordering(1, + ln->weights[0] = create_weight_legion_ordering(numdims, dims, input->data_type, ln, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); - ln->weights[1] = create_weight_legion_ordering(1, + ln->weights[1] = create_weight_legion_ordering(numdims, dims, input->data_type, ln, @@ -179,19 +214,36 @@ LayerNorm::LayerNorm(FFModel &model, ParallelDim output_dims[MAX_TENSOR_DIM]; int M = 1; for (int i = 0; i < axes.size(); i++) { - M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; + M *= inputs[0]->dims[axes[i]].size; } effective_num_elements = M; - effective_batch_size = inputs[0]->get_shape().get_piece_num_elements() / M; + effective_batch_size = inputs[0]->get_volume() / M; + assert(elementwise_affine == (numWeights == 2)); if (numWeights > 0 && allocate_weights) { - int kernel_dims = 2; - assert(false); - // weights[0] = model.create_parallel_weight_legion_ordering( - // kernel_dims, - } else { - // do nothing + ParallelDim dims[axes.size()]; + for (int i = 0; i < axes.size(); i++) { + dims[i] = inputs[0]->dims[i]; + } + int seed = std::rand(); + Initializer *gamma_initializer = new UniformInitializer(seed, 0.0f, 1.0f); + Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 1.0f); + weights[0] = + model.create_parallel_weight_legion_ordering(axes.size(), + dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + gamma_initializer, + CHOSEN_SYNC_TYPE); + weights[1] = + model.create_parallel_weight_legion_ordering(axes.size(), + dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + beta_initializer, + CHOSEN_SYNC_TYPE); } - return; } void LayerNorm::init(FFModel const &ff) { @@ -221,6 +273,20 @@ void LayerNorm::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(1, FID_DATA); + if (elementwise_affine) { + 
launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -233,6 +299,8 @@ OpMeta *LayerNorm::init_task(Task const *task, LayerNorm *ln = (LayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); LayerNormMeta *meta = new LayerNormMeta(handle, ln); + meta->input_type[0] = ln->inputs[0]->data_type; + meta->output_type[0] = ln->outputs[0]->data_type; return meta; } @@ -292,14 +360,21 @@ void LayerNorm::forward_task(Task const *task, assert(task->regions.size() == regions.size()); float const *in_ptr = NULL; float *out_ptr = NULL, *gamma_ptr = NULL, *beta_ptr = NULL; + GenericTensorAccessorR in; + GenericTensorAccessorW out, gamma, beta; + Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); + // in_ptr = helperGetTensorPointerRO( + // regions[0], task->regions[0], FID_DATA, ctx, runtime); + in = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); + // out_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + out = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(in_domain == out_domain); assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); @@ -307,20 +382,28 @@ void LayerNorm::forward_task(Task const *task, assert(regions.size() == 4); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - gamma_ptr = helperGetTensorPointerRW( - regions[2], task->regions[2], FID_DATA, ctx, runtime); + // gamma_ptr = helperGetTensorPointerRW( + // regions[2], task->regions[2], FID_DATA, ctx, runtime); + gamma = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); Domain beta_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - beta_ptr = helperGetTensorPointerRW( - regions[3], task->regions[3], FID_DATA, ctx, runtime); + // beta_ptr = helperGetTensorPointerRW( + // regions[3], task->regions[3], FID_DATA, ctx, runtime); + beta = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(gamma_domain == beta_domain); assert(gamma_domain.get_volume() == m->effective_num_elements); + int numdims = gamma_domain.get_dim(); + for (int i = 0; i < numdims; i++) { + int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; + int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; + assert(g_d == in_d); + } } else { assert(regions.size() == 2); } - - LayerNorm::forward_kernel_wrapper( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr); + LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); } void LayerNorm::backward(FFModel const &ff) { @@ -447,7 +530,100 
@@ void LayerNorm::backward_task(Task const *task, bool LayerNorm::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { - return false; + ParallelTensorBase sub_output, sub_input; + if (!outputs[0]->get_sub_tensor(mv, sub_output)) { + return false; + } + if (!inputs[0]->get_sub_tensor(mv, sub_input)) { + return false; + } + Domain input_domain = sub_input.get_domain(); + Domain output_domain = sub_output.get_domain(); + LayerNormMeta *m = new LayerNormMeta(sim->handler, this); + + sim->free_all(); + float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + assert(in_ptr != NULL); + GenericTensorAccessorR input1_acc(inputs[0]->data_type, input_domain, in_ptr); + cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + float *out_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + assert(out_ptr != NULL); + GenericTensorAccessorW output_acc( + outputs[0]->data_type, output_domain, out_ptr); + cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + // FIXME please add gamma_ptr and beta_ptr after finish the implementation + float *gamma_ptr = NULL, *beta_ptr = NULL; + GenericTensorAccessorW gamma_acc; + GenericTensorAccessorW beta_acc; + + bool out_of_memory = + (in_ptr == NULL) || (out_ptr == NULL) || + (((gamma_ptr == NULL) || (beta_ptr == NULL)) && (m->elementwise_affine)); + if (out_of_memory) { + cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + return true; + } + + std::function forward, backward; + forward = [&] { + forward_kernel_wrapper(m, input1_acc, output_acc, gamma_acc, beta_acc); + }; + + if (sim->computationMode == COMP_MODE_TRAINING) { + float *in_grad_ptr = + (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + assert(in_grad_ptr != NULL); + cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + float *out_grad_ptr = NULL; + out_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + assert(out_grad_ptr != NULL); + cost_metrics.outputs_memory += + cost_metrics.total_mem_diff_from(sim->offset); + + float *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + + out_of_memory = (in_grad_ptr == NULL) || (out_grad_ptr == NULL) || + (((gamma_grad_ptr == NULL) || (beta_grad_ptr == NULL)) && + (m->elementwise_affine)); + if (out_of_memory) { + cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; + return true; + } + + backward = [&] { + backward_kernel_wrapper(m, + out_grad_ptr, + in_ptr, + in_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr); + }; + } + + inner_measure_operator_cost(sim, forward, backward, cost_metrics); + + if (sim->computationMode == COMP_MODE_TRAINING) { + log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) " + "forward_time(%.4lf) backward_time(%.4lf)\n", + name, + sub_output.get_volume(), + cost_metrics.forward_time, + cost_metrics.backward_time); + } else { + log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) " + "forward_time(%.4lf)\n", + name, + sub_output.get_volume(), + cost_metrics.forward_time); + } + + return true; } void LayerNorm::serialize(Legion::Serializer &sez) const { @@ -512,4 +688,4 @@ size_t hash::operator()( hash_combine(key, params.elementwise_affine); return key; } -}; // namespace std +}; // namespace std \ No newline at end of file diff --git a/src/ops/layer_norm.cu 
b/src/ops/layer_norm.cu index ac477ba2ad..f0539f8405 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -13,6 +13,7 @@ * limitations under the License. */ +#include "flexflow/ffconst_utils.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/utils/cuda_helper.h" @@ -30,12 +31,19 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln) effective_num_elements = ln->effective_num_elements; profiling = ln->profiling; eps = ln->eps; - checkCUDA(cudaMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&db_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + DataType data_type = ln->data_type; + checkCUDA( + cudaMalloc(&mean_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&rstd_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&ds_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&db_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&scale_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&bias_ptr, data_type_size(data_type) * effective_batch_size)); } template @@ -77,26 +85,26 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void - RowwiseMomentsCUDAKernel(int64_t N, T eps, T const *X, T *mean, T *rstd) { - __shared__ T m_shared[C10_WARP_SIZE]; - __shared__ T v_shared[C10_WARP_SIZE]; +__global__ void RowwiseMomentsCUDAKernel( + int64_t N, float eps, T const *X, T *mean, T *rstd) { + __shared__ float m_shared[C10_WARP_SIZE]; + __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - T sum1 = 0; - T sum2 = 0; + float sum1 = 0.0f; + float sum2 = 0.0f; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { - const T scale = T(1) / static_cast(N); + float const scale = float(1) / static_cast(N); sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, T(0)); - mean[i] = sum1; - rstd[i] = rsqrt(sum2 + static_cast(eps)); + sum2 = max(sum2 * scale - sum1 * sum1, float(0)); + mean[i] = static_cast(sum1); + rstd[i] = static_cast(rsqrt(sum2 + eps)); } } @@ -130,27 +138,30 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T *gamma_ptr, T *beta_ptr, cudaStream_t stream) { - RowwiseMomentsCUDAKernel + RowwiseMomentsCUDAKernel <<effective_batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( - m->effective_num_elements, m->eps, in_ptr, m->mean_ptr, m->rstd_ptr); - LayerNormForwardCUDAKernel + m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr)); + LayerNormForwardCUDAKernel <<effective_batch_size, kCUDANumThreads, 0, stream>>>( m->effective_num_elements, in_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_ptr, beta_ptr, out_ptr); } /*static*/ 
-template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - T const *in_ptr, - T *out_ptr, - T *gamma_ptr, - T *beta_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorW &gamma, + GenericTensorAccessorW &beta) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -160,8 +171,24 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - LayerNorm::forward_kernel( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream); + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + gamma.get_float_ptr(), + beta.get_float_ptr(), + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel(m, + input.get_half_ptr(), + output.get_half_ptr(), + gamma.get_half_ptr(), + beta.get_half_ptr(), + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -170,8 +197,8 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventDestroy(t_start); cudaEventDestroy(t_end); printf("[LayerNorm] forward time (CF) = %.2fms\n", elapsed); - print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); - print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); } } @@ -352,6 +379,82 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, } } +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] = f_grad_input; + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, N, buf); +} + /*static*/ template void LayerNorm::backward_kernel(LayerNormMeta const *m, @@ -366,17 +469,34 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, const int64_t N = m->effective_num_elements; ComputeInternalGradientsCUDAKernel <<>>( - N, output_grad_ptr, input_ptr, gamma_ptr, m->ds_ptr, m->db_ptr); + N, + output_grad_ptr, + input_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; ComputeGradientFusedParamsCUDAKernel <<>>(M, N, - m->mean_ptr, - m->rstd_ptr, - m->ds_ptr, - m->db_ptr, - m->scale_ptr, - m->bias_ptr); + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly @@ -386,8 +506,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_grad_ptr, beta_grad_ptr); } else { @@ -396,14 +516,15 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, constexpr int kThreadX = kColwiseReduceTileSize; constexpr int kThreadY = kColwiseReduceTileSize / 2; GammaBetaBackwardCUDAKernel - <<>>(M, - N, - output_grad_ptr, - input_ptr, - m->mean_ptr, - m->rstd_ptr, - gamma_grad_ptr, - beta_grad_ptr); + <<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } } } @@ -419,21 +540,18 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, T *beta_grad_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad_ptr, + input_ptr, + input_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr, + stream); + } } -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - float const *in_ptr, - float *out_ptr, - float *gamma_ptr, - float *beta_ptr); template void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, float const *output_grad_ptr, @@ -443,4 +561,4 @@ template void float *gamma_grad_ptr, float *beta_grad_ptr); -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 029b20afd1..ab65db542e 100644 --- a/src/ops/softmax.cc +++ 
b/src/ops/softmax.cc @@ -52,7 +52,10 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(const Tensor _input, int dim, char const *name) { +Tensor FFModel::softmax(const Tensor _input, + int dim, + bool last_layer, + char const *name) { Layer *sm = new Layer(this, OP_SOFTMAX, DT_FLOAT, @@ -69,6 +72,8 @@ Tensor FFModel::softmax(const Tensor _input, int dim, char const *name) { sm->outputs[0] = create_tensor_legion_ordering( numdims, dims, DT_FLOAT, sm, 0, true /*create_grad*/); sm->add_int_property("softmax_dim", dim); + + sm->add_int_property("last_layer", last_layer); layers.push_back(sm); return sm->outputs[0]; } @@ -80,15 +85,19 @@ Op *Softmax::create_operator_from_layer( long long value; layer->get_int_property("softmax_dim", value); int dim = (int)value; + layer->get_int_property("last_layer", value); + bool last_layer = (bool)value; return new Softmax(model, inputs[0], (inputs[0]->num_dims - 1 - dim) % inputs[0]->num_dims, + last_layer, layer->name); } Softmax::Softmax(FFModel &model, const ParallelTensor _input, int _dim, + bool _last_layer, char const *name) : Op(model, OP_SOFTMAX, @@ -98,7 +107,7 @@ Softmax::Softmax(FFModel &model, 0 /*weights*/, 1 /*outputs*/, _input), - dim(_dim) { + dim(_dim), last_layer(_last_layer) { // Currently assume we always perform softmax along the inner most dim assert(dim == 0); ParallelDim dims[MAX_TENSOR_DIM]; @@ -113,7 +122,7 @@ Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, const ParallelTensor input, char const *name) - : Softmax(model, input, params.dim, name) {} + : Softmax(model, input, params.dim, params.last_layer, name) {} void Softmax::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -283,6 +292,13 @@ void Softmax::backward(FFModel const &ff) { EXCLUSIVE, outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); + + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); runtime->execute_index_space(ctx, launcher); } @@ -315,8 +331,8 @@ void Softmax::backward_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); + assert(regions.size() == 3); + assert(task->regions.size() == 3); // const Softmax* softmax = (Softmax*) task->args; SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); TensorAccessorW acc_input_grad(regions[0], @@ -327,11 +343,16 @@ void Softmax::backward_task_with_dim(Task const *task, true /*readOutput*/); TensorAccessorR acc_output_grad( regions[1], task->regions[1], FID_DATA, ctx, runtime); + TensorAccessorR acc_output( + regions[2], task->regions[1], FID_DATA, ctx, runtime); // make sure the image indices match! 
assert(acc_input_grad.rect == acc_output_grad.rect); - backward_kernel_wrapper( - m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); + backward_kernel_wrapper(m, + acc_input_grad.ptr, + acc_output_grad.ptr, + acc_output.ptr, + acc_input_grad.rect.volume()); } bool Softmax::get_int_parameter(PMParameter para, int *value) const { @@ -377,11 +398,17 @@ bool Softmax::measure_operator_cost(Simulator *sim, float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_grad_ptr != NULL); + float *output_ptr = + (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper( - m, input_grad_ptr, output_grad_ptr, sub_output.get_volume()); + backward_kernel_wrapper(m, + input_grad_ptr, + output_grad_ptr, + output_ptr, + sub_output.get_volume()); }; } @@ -413,6 +440,7 @@ size_t hash::operator()( FlexFlow::SoftmaxParams const ¶ms) const { size_t key = 0; hash_combine(key, params.dim); + hash_combine(key, params.last_layer); return key; } }; // namespace std diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 53e61b90d9..b6004af14a 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -215,7 +215,7 @@ __host__ void int idx = 0; printf("%s", prefix); for (idx = 0; idx < num_elements; idx++) { - printf(" %.4lf", (float)host_ptr[idx]); + printf(" %.10lf", (float)host_ptr[idx]); if (idx >= 16) { break; } @@ -224,6 +224,76 @@ __host__ void checkCUDA(cudaFreeHost(host_ptr)); } +template +__host__ void + save_tensor(T const *ptr, size_t num_elements, char const *file_name) { + T *host_ptr; + checkCUDA(cudaHostAlloc(&host_ptr, + sizeof(T) * num_elements, + cudaHostAllocPortable | cudaHostAllocMapped)); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost)); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%.8f, ", (float)host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(cudaFreeHost(host_ptr)); +} + +cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( + cudnnTensorDescriptor_t tensor, Domain domain, DataType data_type) { + int dims[MAX_TENSOR_DIM]; + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(data_type); + switch (domain.get_dim()) { + case 1: { + Rect<1> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + return cudnnSetTensor4dDescriptor( + tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[0], 1, 1, 1); + } + case 2: { + Rect<2> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + return cudnnSetTensor4dDescriptor( + tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[1], dims[0], 1, 1); + } + case 3: { + Rect<3> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + return cudnnSetTensor4dDescriptor(tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + dims[2] * dims[1], + dims[0], + 1, + 1); + } + case 4: { + Rect<4> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return cudnnSetTensor4dDescriptor(tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + dims[3] * dims[2] * dims[1], + dims[0], + 1, + 1); + } + default: + assert(false && "Unsupported dim number"); + } + return 
CUDNN_STATUS_BAD_PARAM; +} + cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Domain domain) { int dims[MAX_TENSOR_DIM]; @@ -370,3 +440,8 @@ template __host__ void print_tensor(int32_t const *ptr, size_t rect, char const *prefix); template __host__ void print_tensor(int64_t const *ptr, size_t rect, char const *prefix); +template __host__ void + save_tensor(float const *ptr, size_t rect, char const *file_name); +template __host__ void save_tensor(int32_t const *ptr, + size_t rect, + char const *file_name); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 4b55a39104..5310588477 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1718,6 +1718,7 @@ GraphOptimalViewSerialized case OP_SOFTMAX: { Softmax *softmax = (Softmax *)op; sez.serialize(softmax->dim); + sez.serialize(softmax->last_layer); break; } case OP_REPARTITION: { @@ -2098,8 +2099,11 @@ void FFModel::deserialize_graph_optimal_view( case OP_SOFTMAX: { assert(num_inputs == 1); int softmax_dim; + bool last_layer; dez.deserialize(softmax_dim); - node = get_or_create_node(inputs[0], {softmax_dim}); + dez.deserialize(last_layer); + node = + get_or_create_node(inputs[0], {softmax_dim, last_layer}); break; } case OP_TRANSPOSE: { diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 2925eb7555..6d52e135cd 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -1930,8 +1930,8 @@ void GraphSearchHelper::graph_optimize( } } best_graph->print_strategy_computation_graph(optimal.views); - //std::cout << "PCG:" << std::endl; - //best_graph->print_dot(); + // std::cout << "PCG:" << std::endl; + // best_graph->print_dot(); optimal_views = real_optimal_views; } @@ -3120,7 +3120,7 @@ void FFModel::graph_optimize( std::unordered_map &optimal_views) { this->graph_search->graph_optimize( budget, only_data_parallel, best_graph, optimal_views); - best_graph->print_dot(); + best_graph->print_dot(); } bool FFModel::convert_graph_to_operators( @@ -3221,7 +3221,8 @@ bool FFModel::convert_graph_to_operators( case OP_SOFTMAX: { assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; - new_op = new Softmax(*this, inputs[0], softmax->dim, NULL); + new_op = new Softmax( + *this, inputs[0], softmax->dim, softmax->last_layer, NULL); break; } case OP_COMBINE: { From 5f793c1ef5dfee3e26932d171550f42c44ece2cb Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Tue, 25 Jul 2023 05:40:45 -0700 Subject: [PATCH 38/52] Improve machine_view hash --- src/runtime/machine_view.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/runtime/machine_view.cc b/src/runtime/machine_view.cc index dadece7691..0e3112da12 100644 --- a/src/runtime/machine_view.cc +++ b/src/runtime/machine_view.cc @@ -1,4 +1,5 @@ #include "flexflow/machine_view.h" +#include "flexflow/utils/hash_utils.h" namespace FlexFlow { @@ -47,13 +48,13 @@ size_t MachineView::num_parts() const { } size_t MachineView::hash() const { - size_t ret = 17; - ret = ret * 31 + std::hash()(device_type); - ret = ret * 31 + std::hash()(ndims); - ret = ret * 31 + std::hash()(start_device_id); + size_t h = 0; + hash_combine(h, device_type); + hash_combine(h, ndims); + hash_combine(start_device_id); for (int i = 0; i < ndims; i++) { - ret = ret * 31 + std::hash()(dim[i]); - ret = ret * 31 + std::hash()(stride[i]); + hash_combine(h, dim[i]); + hash_combine(h, stride[i]); } return ret; } From 2c09397312c005e29eb03d63b9b48c7cda1c6791 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Tue, 25 Jul 
2023 05:43:08 -0700 Subject: [PATCH 39/52] Fix bugs in improved hashing --- src/runtime/machine_view.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/machine_view.cc b/src/runtime/machine_view.cc index 0e3112da12..44dff5a2da 100644 --- a/src/runtime/machine_view.cc +++ b/src/runtime/machine_view.cc @@ -51,12 +51,12 @@ size_t MachineView::hash() const { size_t h = 0; hash_combine(h, device_type); hash_combine(h, ndims); - hash_combine(start_device_id); + hash_combine(h, start_device_id); for (int i = 0; i < ndims; i++) { hash_combine(h, dim[i]); hash_combine(h, stride[i]); } - return ret; + return h; } int MachineView::get_device_id(DomainPoint const &p) const { From 862e9d7cac0a87c3af9adf5c17959458665ed861 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 25 Jul 2023 22:36:15 +0000 Subject: [PATCH 40/52] fix weight dimension in layernorm --- src/ops/layer_norm.cc | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 915041d2bb..062eb28955 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -220,15 +220,24 @@ LayerNorm::LayerNorm(FFModel &model, effective_batch_size = inputs[0]->get_volume() / M; assert(elementwise_affine == (numWeights == 2)); if (numWeights > 0 && allocate_weights) { - ParallelDim dims[axes.size()]; - for (int i = 0; i < axes.size(); i++) { + ParallelDim dims[axes.size() + 1]; + int num_dims = axes.size(); + for (int i = 0; i < num_dims; i++) { dims[i] = inputs[0]->dims[i]; } + assert(numInputs == 1); + dims[num_dims].degree = inputs[0]->dims[inputs[0]->num_dims - 2].degree; + dims[num_dims].size = dims[num_dims].degree; + dims[num_dims].parallel_idx = + inputs[0]->dims[inputs[0]->num_dims - 2].parallel_idx; + dims[num_dims].is_replica_dim = true; + num_dims += 1; + int seed = std::rand(); Initializer *gamma_initializer = new UniformInitializer(seed, 0.0f, 1.0f); Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 1.0f); weights[0] = - model.create_parallel_weight_legion_ordering(axes.size(), + model.create_parallel_weight_legion_ordering(num_dims, dims, _input->data_type, NULL /*owner_op*/, @@ -236,7 +245,7 @@ LayerNorm::LayerNorm(FFModel &model, gamma_initializer, CHOSEN_SYNC_TYPE); weights[1] = - model.create_parallel_weight_legion_ordering(axes.size(), + model.create_parallel_weight_legion_ordering(num_dims, dims, _input->data_type, NULL /*owner_op*/, @@ -376,8 +385,9 @@ void LayerNorm::forward_task(Task const *task, out = helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(in_domain == out_domain); - assert(in_domain.get_volume() == - m->effective_num_elements * m->effective_batch_size); + // assert(in_domain.get_volume() == + // m->effective_num_elements * m->effective_batch_size); + if (m->elementwise_affine) { assert(regions.size() == 4); Domain gamma_domain = runtime->get_index_space_domain( @@ -394,7 +404,7 @@ void LayerNorm::forward_task(Task const *task, m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(gamma_domain == beta_domain); assert(gamma_domain.get_volume() == m->effective_num_elements); - int numdims = gamma_domain.get_dim(); + int numdims = gamma_domain.get_dim() - 1; for (int i = 0; i < numdims; i++) { int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; @@ -495,8 +505,8 @@ void LayerNorm::backward_task(Task const *task, in_grad_ptr = 
helperGetTensorPointerRW( regions[2], task->regions[2], FID_DATA, ctx, runtime); assert(in_domain == out_grad_domain); - assert(in_domain.get_volume() == - m->effective_num_elements * m->effective_batch_size); + // assert(in_domain.get_volume() == + // m->effective_num_elements * m->effective_batch_size); if (m->elementwise_affine) { assert(regions.size() == 6); Domain gamma_domain = runtime->get_index_space_domain( From 2eee87525591144db8d96817dbe8d300f89719cf Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 11 Aug 2023 19:21:22 +0000 Subject: [PATCH 41/52] fix `preregister_task_variant` issue, linting --- include/flexflow/graph.h | 2 +- src/ops/element_binary.cc | 2 +- src/ops/element_unary.cc | 2 +- src/ops/fused.cu | 6 ++- src/ops/linear.cc | 10 ++-- .../kernels/replicate_kernels.cpp | 48 +++++++++---------- src/parallel_ops/kernels/replicate_kernels.cu | 48 +++++++++---------- src/parallel_ops/replicate.cc | 35 +++++++------- src/runtime/model.cc | 18 +++++-- src/runtime/parallel_tensor.cc | 1 - 10 files changed, 91 insertions(+), 81 deletions(-) diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h index 5a0754a426..c4fed8ff58 100644 --- a/include/flexflow/graph.h +++ b/include/flexflow/graph.h @@ -91,7 +91,7 @@ struct NodeCompare { struct GraphOptimalViewSerialized { #ifdef LEGION_MAX_RETURN_SIZE - static const size_t buffer_size = 4*LEGION_MAX_RETURN_SIZE - 8; + static const size_t buffer_size = 4 * LEGION_MAX_RETURN_SIZE - 8; #else static const size_t buffer_size = 1024 * 1024 - 8; #endif diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 6adea1480c..0e0d8e7c31 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -448,7 +448,7 @@ __host__ void std::vector const ®ions, Context ctx, Runtime *runtime) { - const ElementBinary* ele = (const ElementBinary*) task->args; + ElementBinary const *ele = (ElementBinary const *)task->args; ElementBinaryMeta const *m = *((ElementBinaryMeta **)task->local_args); Domain in1_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index aa50cf935e..46643b655b 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -131,7 +131,7 @@ Tensor FFModel::tanh(const Tensor x, char const *name) { } Tensor FFModel::identity(const Tensor x, char const *name) { - //return this->unary(OP_IDENTITY, x, false /*inplace*/, name); + // return this->unary(OP_IDENTITY, x, false /*inplace*/, name); return x; } diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 720f1ed693..15072513a7 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -414,7 +414,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); - assert(my_input_accessor[0].data_type == my_output_accessor[0].data_type); + assert(my_input_accessor[0].data_type == + my_output_accessor[0].data_type); if (my_input_accessor[0].data_type == DT_INT64) { Kernels::Reshape::forward_kernel_wrapper( my_input_accessor[0].get_int64_ptr(), @@ -442,7 +443,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - assert(my_input_accessor[0].data_type == my_output_accessor[0].data_type); + assert(my_input_accessor[0].data_type == + my_output_accessor[0].data_type); 
Kernels::Transpose::forward_kernel_wrapper( m, my_input_accessor[0].get_float_ptr(), diff --git a/src/ops/linear.cc b/src/ops/linear.cc index cfea321dda..d5257b9c3e 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -624,11 +624,11 @@ void Linear::backward_task_with_dim(Task const *task, float *acc_bias_grad_ptr = NULL; if (m->use_bias) { TensorAccessorW acc_bias_grad(regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); + task->regions[rid], + FID_DATA, + ctx, + runtime, + true /*readOutput*/); rid++; assert(acc_bias_grad.rect.volume() == static_cast(out_dim)); acc_bias_grad_ptr = static_cast(acc_bias_grad.ptr); diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index ff9751ee34..c66995877e 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -77,43 +77,43 @@ template void backward_kernel(float const *output_grad_ptr, size_t num_replicas); template void forward_kernel(double const *input_ptr, - double *output_ptr, - size_t num_elements); + double *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(double const *input_ptr, - double *output_ptr, - size_t num_elements, - size_t num_replicas); + double *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(double const *output_grad_ptr, - double *input_grad_ptr, - size_t num_elements, - size_t num_replicas); + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); template void forward_kernel(int64_t const *input_ptr, - int64_t *output_ptr, - size_t num_elements); + int64_t *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(int64_t const *input_ptr, - int64_t *output_ptr, - size_t num_elements, - size_t num_replicas); + int64_t *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(int64_t const *output_grad_ptr, - int64_t *input_grad_ptr, - size_t num_elements, - size_t num_replicas); + int64_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); template void forward_kernel(int32_t const *input_ptr, - int32_t *output_ptr, - size_t num_elements); + int32_t *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(int32_t const *input_ptr, - int32_t *output_ptr, - size_t num_elements, - size_t num_replicas); + int32_t *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(int32_t const *output_grad_ptr, - int32_t *input_grad_ptr, - size_t num_elements, - size_t num_replicas); + int32_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); } // namespace Replicate } // namespace Kernels diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index 2b8ff55eac..6ed4f424cf 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -69,41 +69,41 @@ template void backward_kernel(float const *output_grad_ptr, size_t num_elements, size_t num_replicas); template void forward_kernel(double const *input_ptr, - double *output_ptr, - size_t num_elements); + double *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(double const *input_ptr, - double *output_ptr, - size_t num_elements, - size_t num_replicas); + double *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(double 
const *output_grad_ptr, - double *input_grad_ptr, - size_t num_elements, - size_t num_replicas); + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); template void forward_kernel(int32_t const *input_ptr, - int32_t *output_ptr, - size_t num_elements); + int32_t *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(int32_t const *input_ptr, - int32_t *output_ptr, - size_t num_elements, - size_t num_replicas); + int32_t *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(int32_t const *output_grad_ptr, - int32_t *input_grad_ptr, - size_t num_elements, - size_t num_replicas); + int32_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); template void forward_kernel(int64_t const *input_ptr, - int64_t *output_ptr, - size_t num_elements); + int64_t *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(int64_t const *input_ptr, - int64_t *output_ptr, - size_t num_elements, - size_t num_replicas); + int64_t *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(int64_t const *output_grad_ptr, - int64_t *input_grad_ptr, - size_t num_elements, - size_t num_replicas); + int64_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); } // namespace Replicate } // namespace Kernels diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 1714684f75..322ab061e5 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -230,15 +230,14 @@ bool Replicate::append_parallel_op_info( void Replicate::init_task(Task const *task, std::vector const ®ions, - Context ctx, - Runtime *runtime) { -} + Context ctx, + Runtime *runtime) {} /*static*/ void Replicate::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { + std::vector const ®ions, + Context ctx, + Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); DataType data_type = *((DataType *)task->args); @@ -256,10 +255,11 @@ void Replicate::forward_task(Task const *task, } template -void Replicate::forward_task_with_type(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +void Replicate::forward_task_with_type( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); Domain input_domain = runtime->get_index_space_domain( @@ -281,9 +281,9 @@ void Replicate::forward_task_with_type(Task const *task, } void Replicate::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { + std::vector const ®ions, + Context ctx, + Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); DataType data_type = *((DataType *)task->args); @@ -301,10 +301,11 @@ void Replicate::backward_task(Task const *task, } template -void Replicate::backward_task_with_type(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +void Replicate::backward_task_with_type( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); Domain output_grad_domain = runtime->get_index_space_domain( diff --git a/src/runtime/model.cc b/src/runtime/model.cc index b5a2dcbd0c..fc5dc7d740 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2635,11 +2635,12 @@ Op *FFModel::create_operator_from_layer( 
assert(tensor->parallel_tensor == nullptr); tensor->parallel_tensor = pt; // start from data parllel tensor - if (config.only_data_parallel && config.numNodes * config.workersPerNode > 1) { - if (pt->dims[num_dims-1].size == 1) { + if (config.only_data_parallel && + config.numNodes * config.workersPerNode > 1) { + if (pt->dims[num_dims - 1].size == 1) { Replicate *repl = new Replicate( *this, pt, num_dims, config.numNodes * config.workersPerNode); - repl->outputs[0]->dims[num_dims].is_replica_dim = true; + repl->outputs[0]->dims[num_dims].is_replica_dim = true; operators.push_back(repl); } else { Repartition *part = new Repartition( @@ -5137,8 +5138,15 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Replicate Init Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Replicate Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(REPLICATE_FWD_TASK_ID, "Replicate Forward"); diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 479bde3898..6cc0d0067e 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -146,7 +146,6 @@ size_t ParallelTensorShape::get_piece_num_elements() const { return piece_num_elements; } - RecordFormatter ParallelTensorShape::as_dot() const { RecordFormatter r; for (int i = 0; i < this->num_dims; i++) { From b9d13327861a161915a8b78bc408b4a4ccb5718f Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 14 Aug 2023 14:16:19 -0400 Subject: [PATCH 42/52] try to run graph_optimize on each node --- SPECINFER.md | 145 +++++++++++++++++++++++++++++++++++++++ include/flexflow/graph.h | 2 + src/runtime/graph.cc | 23 +++++-- src/runtime/model.cc | 16 +++-- 4 files changed, 174 insertions(+), 12 deletions(-) create mode 100644 SPECINFER.md diff --git a/SPECINFER.md b/SPECINFER.md new file mode 100644 index 0000000000..347b394db2 --- /dev/null +++ b/SPECINFER.md @@ -0,0 +1,145 @@ +# FlexFlow Serve: Low-Latency, High-Performance LLM Serving +![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=master) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=master) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=master) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=master) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=master) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=master) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest) + +

+[Figure: A SpecInfer Demo]
+ +## What is FlexFlow Serve + +

+[Figure: An overview of SpecInfer]
+
+The high computational and memory requirements of generative large language
+models (LLMs) make it challenging to serve them quickly and cheaply.
+FlexFlow Serve is an open-source compiler and distributed system for
+__low latency__, __high performance__ LLM serving. FlexFlow Serve outperforms
+existing systems by 1.3-2.0x for single-node, multi-GPU inference and by
+1.4-2.4x for multi-node, multi-GPU inference.
+
+SpecInfer accelerates generative LLM
+inference with __speculative inference__ and __token tree verification__. A key insight
+behind SpecInfer is to combine various collectively boost-tuned small speculative
+models (SSMs) to jointly predict the LLM’s outputs; the predictions are organized as a
+token tree, whose nodes each represent a candidate token sequence. The correctness
+of all candidate token sequences represented by a token tree is verified against the
+LLM’s output in parallel using a novel tree-based parallel decoding mechanism.
+SpecInfer uses an LLM as a token tree verifier instead of an incremental decoder,
+which largely reduces the end-to-end inference latency and computational requirement
+for serving generative LLMs while provably preserving model quality.
+
+

+[Figure: Performance comparison]
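+The verification step can be sketched in a few lines of Python. This is an
+illustrative, simplified sketch only (greedy decoding, toy stand-in models,
+one branch checked at a time): the function names are placeholders rather than
+FlexFlow/SpecInfer APIs, and the real system verifies all branches of the token
+tree against the LLM in a single batched pass.
+
+```python
+def llm_greedy_token(prefix):
+    # Toy stand-in for the LLM verifier: deterministically emits "w<position>".
+    return f"w{len(prefix)}"
+
+def ssm_propose_tree(prefix, depth=3):
+    # Toy stand-in for the SSMs: a token tree, represented here by its
+    # root-to-leaf paths; one branch agrees with the LLM, one diverges early.
+    n = len(prefix)
+    return [[f"w{n + i}" for i in range(depth)], [f"w{n}", "x", "y"]]
+
+def speculate_and_verify(prompt, steps=4):
+    tokens = list(prompt)
+    for _ in range(steps):
+        best = []
+        for path in ssm_propose_tree(tokens):
+            accepted = []
+            for tok in path:
+                # Keep a speculated token only if the LLM would emit it too.
+                if tok != llm_greedy_token(tokens + accepted):
+                    break
+                accepted.append(tok)
+            best = max(best, accepted, key=len)
+        # Always make progress: the verified prefix plus one fresh LLM token.
+        tokens += best + [llm_greedy_token(tokens + best)]
+    return tokens
+
+print(speculate_and_verify(["<s>"]))
+```
+
+Each round thus emits the longest SSM branch the LLM agrees with, plus one
+token produced by the LLM itself, so the LLM's greedy output is reproduced
+exactly while skipping most of its incremental decoding steps.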
+ +## Build/Install SpecInfer +SpecInfer is built on top of FlexFlow. You can build/install SpecInfer by building the inference branch of FlexFlow. Please read the [instructions](../INSTALL.md) for building/installing FlexFlow from source code. If you would like to quickly try SpecInfer, we also provide pre-built Docker packages ([specinfer-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/specinfer-cuda) with a CUDA backend, [specinfer-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/specinfer-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. + +## Run SpecInfer +The source code of the SpecInfer pipeline is available at [this folder](../inference/spec_infer/). The SpecInfer executable will be available at `/build_dir/inference/spec_infer/spec_infer` at compilation. You can use the following command-line arguments to run SpecInfer: + +* `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) +* `-ll:fsize`: size of device memory on each GPU in MB +* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. SpecInfer keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. +* `-llm-model`: the LLM model type as a case-insensitive string (e.g. "opt" or "llama") +* `-llm-weight`: path to the folder that stores the LLM weights +* `-llm-config`: path to the json file that stores the LLM model configs +* `-ssm-model`: the LLM model type as a case-insensitive string (e.g. "opt" or "llama"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. +* `-ssm-weight`: path to the folder that stores the small speculative models' weights. The number of `-ssm-weight`s must match the number of `-ssm-model`s and `-ssm-config`s. +* `-ssm-config`: path to the json file that stores the SSM model configs. The number of `-ssm-config`s must match the number of `-ssm-model`s and `-ssm-weight`s. +* `-tokenizer`: path to the tokenizer file (see [Tokenizers](#tokenizers) for preparing a tokenizer for SpecInfer). +* `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. +* `-prompt`: (optional) path to the prompt file. SpecInfer expects a json format file for prompts, all of which will be served by SpecInfer. In addition, users can also use the following API for registering requests: +* `-output-file`: (optional) filepath to use to save the output of the model, together with the generation latency + + +```c++ +class RequestManager { + RequestGuid register_new_request(std::string const &prompt, int max_sequence_length); +} +``` +For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-190M models for speculative inference. 
+ +```bash +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm/config.json -ssm-model llama -smm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json --use-full-precision -tensor-parallelism-degree 2 -pipeline-parallelism-degree 2 +``` + +### Tokenizers +SpecInfer supports two tokenizers: + +* The SentencePiece tokenizer is used to support the LLaMA model family (e.g., LLaMA-6B, LLaMA-13B, and LLaMA-190M in our demo). We used the pretrained sentencepiece tokenizer from LLAMA, which is also available on Hugging Face (model id: `decapoda-research/llama-7b-hf`). If you are using our LLAMA-160M weights for the demo, however, you should use the tokenizer from the [JackFram/llama-160m](https://huggingface.co/JackFram/llama-160m/resolve/main/tokenizer.model) HuggingFace repo. +* The GPT2 tokenizer is used to support the Open Pre-trained Transformer model family (e.g., OPT-13B and OPT-125M). To use it, download the [vocab](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json) and [merges](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt) files and pass the folder containing them as a parameter. + +### Mixed-precision Support +SpecInfer now supports single-precision floating points and half-precision floating points. By default we use half-precision. Add `--use-full-precision` to the command line to run the demo with single-precision, please make sure to use the correct weight files in the form below. + +### CPU Offloading +SpecInfer offers offloading-based inference for running large models (e.g., llama-7B) on a single GPU. CPU offloading is a choice to save tensors in CPU memory, and only copy the tensor to GPU when doing calculation. Notice that now we selectively offload the largest weight tensors (weights tensor in Linear, Attention). Besides, since the small model occupies considerably less space, it it does not pose a bottleneck for GPU memory, the offloading will bring more runtime space and computational cost, so we only do the offloading for the large model. You can run the offloading example by adding `-offload` and `-offload-reserve-space-size` flags. +#### Quantization +To reduce data transferred between the CPU and GPU, SpecInfer provides int4 and int8 quantization. The compressed tensors are stored on the CPU side. Once copied to the GPU, these tensors undergo decompression and conversion back to their original precision. Please find the compressed weight files in our s3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from [FlexGen](https://github.com/FMInference/FlexGen) project to do the compression manually. The quantization method can be selected using the `--4bit-quantization` and `--8bit-quantization` flags. + +Below is an example command line to use offloading and quantization in SpecInfer. 
+ +```bash +./inference/spec_infer/spec_infer -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm/config.json -ssm-model llama -smm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json --use-full-precision -offload -offload-reserve-space-size 6000 --8bit-quantization +``` + + + +### LLM Weights +The weight files used in our demo are extracted from HuggingFace, and stored in our AWS S3 bucket. + +| Model | Model id on Hugging Face | Storage Location (single precision) | Storage Location (half precision) | +| :---- | :---- | :---- | :---- | +| LLaMA-7B | decapoda-research/llama-7b-hf | s3://specinfer/weights/llama_7B_weights.tar.gz | s3://specinfer/half_weights/llama_7B_weights.tar.gz +| LLaMA-190M | JackFram/llama-160m | s3://specinfer/weights/llama_160M_weights.tar.gz | s3://specinfer/half_weights/llama_160M_weights.tar.gz +| OPT-6.7B | facebook/opt-6.7b | s3://specinfer/weights/opt_6B_weights.tar.gz | s3://specinfer/half_weights/opt_6B_weights.tar.gz +| OPT-125M | facebook/opt-125m | s3://specinfer/weights/opt_125M_weights.tar.gz | s3://specinfer/half_weights/opt_125M_weights.tar.gz + +You can use [this script](../inference/utils/download_llama_weights.py) to automatically download and convert the weights of a HuggingFace LLAMA LLM and a LLAMA SSM to the SpecInfer weight format. The script also downloads the LLAMA tokenizer. If you would like to try the OPT model instead, use [this script](../inference/utils/download_opt_weights.py) to download (and convert) the OPT weights and tokenizer. + +### Prompt Datasets +We have evaluated SpecInfer on the following prompts datasets: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json). + +### Script to run the demo +You can take a look at [this script](../tests/inference_tests.sh), which is run in CI for each new commit, for an example of how to run the demo. + +## Difference between SpecInfer and HuggingFace Assistant Model + +There are two major differences between the two systems. + +* First, the HuggingFace assistant model produces a single candidate token sequence during speculation, while SpecInfer generates and verifies a speculated token tree, whose tokens each represent a candidate token sequence. To deal with the more complex verification task, SpecInfer includes a number of systems and algorithmic optimizations to quickly and efficiently verify all tokens of a token tree in parallel. + +* Second, instead of considering a single assistant model, SpecInfer combines a variety of collectively boost-tuned small speculative models (SSMs) to jointly predict the LLM's outputs. We observe that using multiple boost-tuned SSMs is critical for improving speculative performance. + +## TODOs + +SpecInfer is under active development. We currently focus on the following tasks and strongly welcome all contributions to SpecInfer from bug fixes to new features and extensions. + +* Low-precision and mixed-precision support. 
The current version uses single-precision floating points for computing tree attention. We are actively working on support half-precision floating points, and int4 and int8 quantizations. +* Offloading-based generative LLM inference. Another promising avenue for future work is using speculative inference and token tree verification to reduce the end-to-end inference for offloading-based generative LLM inference. A potential application of this technique is enabling a single commodity GPU to serve LLMs for latency critical tasks. + +## Acknowledgements +This project is initiated by members from CMU, Stanford, and UCSD. We will be continuing developing and supporting SpecInfer and the underlying FlexFlow runtime system. The following paper describes design, implementation, and key optimizations of SpecInfer. + +* Xupeng Miao*, Gabriele Oliaro*, Zhihao Zhang*, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. [SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification](https://arxiv.org/abs/2305.09781). + +\* Denotes equal contribution + +### Citation +Please cite as: + +``` bibtex +@misc{miao2023specinfer, + title={SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification}, + author={Xupeng Miao and Gabriele Oliaro and Zhihao Zhang and Xinhao Cheng and Zeyu Wang and Rae Ying Yee Wong and Zhuoming Chen and Daiyaan Arfeen and Reyna Abhyankar and Zhihao Jia}, + year={2023}, + eprint={2305.09781}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +## License +Both SpecInfer and FlexFlow use Apache License 2.0. diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h index c4fed8ff58..2c92eeeb31 100644 --- a/include/flexflow/graph.h +++ b/include/flexflow/graph.h @@ -332,6 +332,8 @@ class Graph { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static GraphOptimalViewSerialized + graph_optimize_wrapper(FFModel * model); Node find_bottleneck_node(Node const &sink_node, Node const &source_node) const; void print_strategy_computation_graph( diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 15ba8ce951..5c49687712 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1882,11 +1882,11 @@ namespace { */ std::pair, std::unordered_map> try_one_lambda(std::pair &lambda, - Task const *task, + FFModel *model, std::shared_ptr &cached_simulator, bool perform_memory_search) { // Create a new fresh model - FFModel *model = *((FFModel **)task->args); + //FFModel *model = *((FFModel **)task->args); model->clear_graph_search_cache(); if (model->config.search_num_nodes.has_value()) { @@ -1900,6 +1900,9 @@ std::pair, std::unordered_map> model->config.workersPerNode, model->config.cpusPerNode, model->all_valid_views); + Runtime *runtime = model->config.lg_hlr; + Context ctx = model->config.lg_ctx; + const Task* task = runtime->get_current_task(ctx); Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) .best_affinity_to(task->target_proc) @@ -2045,12 +2048,20 @@ bool is_valid_strategy( * @param runtime Not used * @return GraphOptimalViewSerialized Serialized optimal PCG */ + GraphOptimalViewSerialized Graph::graph_optimize_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - auto model_config = (*((FFModel **)task->args))->config; + FFModel* model = *((FFModel **)task->args); + return Graph::graph_optimize_wrapper(model); +} + +/*static*/ 
+GraphOptimalViewSerialized + Graph::graph_optimize_wrapper(FFModel *model) { + auto model_config = model->config; bool perform_memory_search = model_config.perform_memory_search; float memory_threshold = model_config.device_mem; bool only_data_parallel = model_config.only_data_parallel; @@ -2066,7 +2077,7 @@ GraphOptimalViewSerialized // Be optimistic lambdas.emplace_back(std::make_pair(1.0, MemorySearchResult{})); auto try_result = try_one_lambda( - lambdas.back(), task, cached_simulator, perform_memory_search); + lambdas.back(), model, cached_simulator, perform_memory_search); best_graph = std::move(try_result.first); optimal_views = try_result.second; @@ -2082,7 +2093,7 @@ GraphOptimalViewSerialized // Not found the strategy; need to do binary search lambdas.emplace_back(std::make_pair(0.0, MemorySearchResult{})); try_result = try_one_lambda( - lambdas.back(), task, cached_simulator, perform_memory_search); + lambdas.back(), model, cached_simulator, perform_memory_search); best_graph = std::move(try_result.first); optimal_views = try_result.second; @@ -2109,7 +2120,7 @@ GraphOptimalViewSerialized lambdas.emplace_back(std::make_pair(mid, MemorySearchResult{})); try_result = try_one_lambda( - lambdas.back(), task, cached_simulator, perform_memory_search); + lambdas.back(), model, cached_simulator, perform_memory_search); if (!is_valid_strategy(lambdas, try_result.first.get(), diff --git a/src/runtime/model.cc b/src/runtime/model.cc index fc5dc7d740..8461668aaa 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2831,12 +2831,16 @@ void FFModel::compile(LossType loss_type, // Launch the graph optimize task { FFModel *model = this; - TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, - TaskArgument(&model, sizeof(FFModel *))); - Future future = runtime->execute_task(ctx, launcher); - - PCG::GraphOptimalViewSerialized ret = - future.get_result(); + PCG::GraphOptimalViewSerialized ret; + if (false) { + TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, + TaskArgument(&model, sizeof(FFModel *))); + Future future = runtime->execute_task(ctx, launcher); + ret = + future.get_result(); + } else { + ret = PCG::Graph::graph_optimize_wrapper(this); + } Deserializer dez(ret.data, ret.total_bytes); // Reconstruct operators PCG::Graph *best_graph = new PCG::Graph(this); From b5b0815d83fd9a73afb4bd3b99a2f9cc5ca977ea Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 14 Aug 2023 14:16:38 -0400 Subject: [PATCH 43/52] remove unnecessary file --- SPECINFER.md | 145 --------------------------------------------------- 1 file changed, 145 deletions(-) delete mode 100644 SPECINFER.md diff --git a/SPECINFER.md b/SPECINFER.md deleted file mode 100644 index 347b394db2..0000000000 --- a/SPECINFER.md +++ /dev/null @@ -1,145 +0,0 @@ -# FlexFlow Serve: Low-Latency, High-Performance LLM Serving -![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=master) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=master) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=master) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=master) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=master) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=master) [![Documentation 
Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest) - -

-[Figure: A SpecInfer Demo]
- -## What is FlexFlow Serve - -

-[Figure: An overview of SpecInfer]
- -The high computational and memory requirements of generative large language -models (LLMs) make it challenging to serve them quickly and cheaply. -FlexFlow Serve is an open-source compiler and distributed system for -__low latency__, __high performance__ LLM serving. FlexFlow Serve outperforms -existing systems by 1.3-2.0x for single-node, multi-GPU inference and by -1.4-2.4x for multi-node, multi-GPU inference. - -that accelerates generative LLM -inference with __speculative inference__ and __token tree verification__. A key insight -behind SpecInfer is to combine various collectively boost-tuned small speculative -models (SSMs) to jointly predict the LLM’s outputs; the predictions are organized as a -token tree, whose nodes each represent a candidate token sequence. The correctness -of all candidate token sequences represented by a token tree is verified against the -LLM’s output in parallel using a novel tree-based parallel decoding mechanism. -SpecInfer uses an LLM as a token tree verifier instead of an incremental decoder, -which largely reduces the end-to-end inference latency and computational requirement -for serving generative LLMs while provably preserving model quality. - -

-[Figure: Performance comparison]
- -## Build/Install SpecInfer -SpecInfer is built on top of FlexFlow. You can build/install SpecInfer by building the inference branch of FlexFlow. Please read the [instructions](../INSTALL.md) for building/installing FlexFlow from source code. If you would like to quickly try SpecInfer, we also provide pre-built Docker packages ([specinfer-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/specinfer-cuda) with a CUDA backend, [specinfer-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/specinfer-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. - -## Run SpecInfer -The source code of the SpecInfer pipeline is available at [this folder](../inference/spec_infer/). The SpecInfer executable will be available at `/build_dir/inference/spec_infer/spec_infer` at compilation. You can use the following command-line arguments to run SpecInfer: - -* `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) -* `-ll:fsize`: size of device memory on each GPU in MB -* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. SpecInfer keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. -* `-llm-model`: the LLM model type as a case-insensitive string (e.g. "opt" or "llama") -* `-llm-weight`: path to the folder that stores the LLM weights -* `-llm-config`: path to the json file that stores the LLM model configs -* `-ssm-model`: the LLM model type as a case-insensitive string (e.g. "opt" or "llama"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. -* `-ssm-weight`: path to the folder that stores the small speculative models' weights. The number of `-ssm-weight`s must match the number of `-ssm-model`s and `-ssm-config`s. -* `-ssm-config`: path to the json file that stores the SSM model configs. The number of `-ssm-config`s must match the number of `-ssm-model`s and `-ssm-weight`s. -* `-tokenizer`: path to the tokenizer file (see [Tokenizers](#tokenizers) for preparing a tokenizer for SpecInfer). -* `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. -* `-prompt`: (optional) path to the prompt file. SpecInfer expects a json format file for prompts, all of which will be served by SpecInfer. In addition, users can also use the following API for registering requests: -* `-output-file`: (optional) filepath to use to save the output of the model, together with the generation latency - - -```c++ -class RequestManager { - RequestGuid register_new_request(std::string const &prompt, int max_sequence_length); -} -``` -For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-190M models for speculative inference. 
-
-```bash
-./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm1/config.json -ssm-model llama -ssm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json --use-full-precision -tensor-parallelism-degree 2 -pipeline-parallelism-degree 2
-```
-
-### Tokenizers
-SpecInfer supports two tokenizers:
-
-* The SentencePiece tokenizer is used to support the LLaMA model family (e.g., LLaMA-7B, LLaMA-13B, and LLaMA-190M in our demo). We used the pretrained SentencePiece tokenizer from LLaMA, which is also available on Hugging Face (model id: `decapoda-research/llama-7b-hf`). If you are using our LLaMA-160M weights for the demo, however, you should use the tokenizer from the [JackFram/llama-160m](https://huggingface.co/JackFram/llama-160m/resolve/main/tokenizer.model) HuggingFace repo.
-* The GPT2 tokenizer is used to support the Open Pre-trained Transformer model family (e.g., OPT-13B and OPT-125M). To use it, download the [vocab](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json) and [merges](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt) files and pass the folder containing them as a parameter.
-
-### Mixed-precision Support
-SpecInfer supports both single-precision and half-precision floating points; half-precision is the default. Add `--use-full-precision` to the command line to run the demo in single precision, and make sure to use the matching weight files listed in the table below.
-
-### CPU Offloading
-SpecInfer offers offloading-based inference for running large models (e.g., LLaMA-7B) on a single GPU. CPU offloading keeps selected tensors in CPU memory and copies them to the GPU only when they are needed for computation. Currently, we selectively offload the largest weight tensors (the weight tensors of the Linear and Attention operators). Since the small speculative models occupy considerably less memory and do not pose a bottleneck for GPU memory, while offloading adds extra transfer and computational cost, we only offload the large model's weights. You can run the offloading example by adding the `-offload` and `-offload-reserve-space-size` flags.
-
-#### Quantization
-To reduce the amount of data transferred between the CPU and GPU, SpecInfer provides int4 and int8 quantization. The compressed tensors are stored on the CPU side; once copied to the GPU, they are decompressed and converted back to their original precision. Please find the compressed weight files in our S3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from the [FlexGen](https://github.com/FMInference/FlexGen) project to do the compression manually. The quantization method can be selected using the `--4bit-quantization` and `--8bit-quantization` flags.
-
-Below is an example command line to use offloading and quantization in SpecInfer.
-
-```bash
-./inference/spec_infer/spec_infer -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm1/config.json -ssm-model llama -ssm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json --use-full-precision -offload -offload-reserve-space-size 6000 --8bit-quantization
-```
-
-
-### LLM Weights
-The weight files used in our demo are extracted from HuggingFace and stored in our AWS S3 bucket.
-
-| Model | Model id on Hugging Face | Storage Location (single precision) | Storage Location (half precision) |
-| :---- | :---- | :---- | :---- |
-| LLaMA-7B | decapoda-research/llama-7b-hf | s3://specinfer/weights/llama_7B_weights.tar.gz | s3://specinfer/half_weights/llama_7B_weights.tar.gz |
-| LLaMA-190M | JackFram/llama-160m | s3://specinfer/weights/llama_160M_weights.tar.gz | s3://specinfer/half_weights/llama_160M_weights.tar.gz |
-| OPT-6.7B | facebook/opt-6.7b | s3://specinfer/weights/opt_6B_weights.tar.gz | s3://specinfer/half_weights/opt_6B_weights.tar.gz |
-| OPT-125M | facebook/opt-125m | s3://specinfer/weights/opt_125M_weights.tar.gz | s3://specinfer/half_weights/opt_125M_weights.tar.gz |
-
-You can use [this script](../inference/utils/download_llama_weights.py) to automatically download and convert the weights of a HuggingFace LLaMA LLM and a LLaMA SSM to the SpecInfer weight format. The script also downloads the LLaMA tokenizer. If you would like to try the OPT model instead, use [this script](../inference/utils/download_opt_weights.py) to download (and convert) the OPT weights and tokenizer.
-
-### Prompt Datasets
-We have evaluated SpecInfer on the following prompt datasets: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json).
-
-### Script to run the demo
-You can take a look at [this script](../tests/inference_tests.sh), which is run in CI for each new commit, for an example of how to run the demo.
-
-## Difference between SpecInfer and HuggingFace Assistant Model
-
-There are two major differences between the two systems.
-
-* First, the HuggingFace assistant model produces a single candidate token sequence during speculation, while SpecInfer generates and verifies a speculated token tree, whose nodes each represent a candidate token sequence. To deal with the more complex verification task, SpecInfer includes a number of systems and algorithmic optimizations to quickly and efficiently verify all tokens of a token tree in parallel.
-
-* Second, instead of considering a single assistant model, SpecInfer combines a variety of collectively boost-tuned small speculative models (SSMs) to jointly predict the LLM's outputs. We observe that using multiple boost-tuned SSMs is critical for improving speculative performance.
-
-## TODOs
-
-SpecInfer is under active development. We currently focus on the following tasks and strongly welcome all contributions to SpecInfer, from bug fixes to new features and extensions.
-
-* Low-precision and mixed-precision support. 
The current version uses single-precision floating points for computing tree attention. We are actively working on support half-precision floating points, and int4 and int8 quantizations. -* Offloading-based generative LLM inference. Another promising avenue for future work is using speculative inference and token tree verification to reduce the end-to-end inference for offloading-based generative LLM inference. A potential application of this technique is enabling a single commodity GPU to serve LLMs for latency critical tasks. - -## Acknowledgements -This project is initiated by members from CMU, Stanford, and UCSD. We will be continuing developing and supporting SpecInfer and the underlying FlexFlow runtime system. The following paper describes design, implementation, and key optimizations of SpecInfer. - -* Xupeng Miao*, Gabriele Oliaro*, Zhihao Zhang*, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. [SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification](https://arxiv.org/abs/2305.09781). - -\* Denotes equal contribution - -### Citation -Please cite as: - -``` bibtex -@misc{miao2023specinfer, - title={SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification}, - author={Xupeng Miao and Gabriele Oliaro and Zhihao Zhang and Xinhao Cheng and Zeyu Wang and Rae Ying Yee Wong and Zhuoming Chen and Daiyaan Arfeen and Reyna Abhyankar and Zhihao Jia}, - year={2023}, - eprint={2305.09781}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} -``` - -## License -Both SpecInfer and FlexFlow use Apache License 2.0. From 94e35d96f33f7c7a8a7be1399e7294dc7f191852 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 15 Aug 2023 17:05:47 +0000 Subject: [PATCH 44/52] fix hip build --- include/flexflow/utils/hip_helper.h | 4 + src/loss_functions/loss_functions.cpp | 37 +++- src/metrics_functions/metrics_functions.cpp | 16 +- src/ops/element_unary.cpp | 5 +- src/ops/kernels/softmax.cpp | 37 +++- src/ops/layer_norm.cpp | 232 +++++++++++++++----- src/runtime/hip_helper.cpp | 51 +++++ 7 files changed, 309 insertions(+), 73 deletions(-) diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 2ea09770d6..709e78f517 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -137,6 +137,10 @@ miopenStatus_t cudnnSetTensorDescriptorFromDomain(miopenTensorDescriptor_t tensor, Legion::Domain domain); +miopenStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, + Legion::Domain domain); + hipblasDatatype_t ff_to_cuda_datatype(DataType type); miopenDataType_t ff_to_cudnn_datatype(DataType type); diff --git a/src/loss_functions/loss_functions.cpp b/src/loss_functions/loss_functions.cpp index a87aaade84..3453f3fbf6 100644 --- a/src/loss_functions/loss_functions.cpp +++ b/src/loss_functions/loss_functions.cpp @@ -20,6 +20,7 @@ namespace FlexFlow { using namespace Legion; +int const MASK_TOKEN = -100; __global__ void sparse_categorical_crossentropy_loss_backward(float *logit_grad, @@ -33,6 +34,25 @@ __global__ void } } +__global__ void + sparse_categorical_crossentropy_loss_backward_with_mask(float *logit_grad, + int const *label, + coord_t num_samples, + coord_t num_classes, + int const k, + float *num) { + CUDA_KERNEL_LOOP(i, num_samples * num_classes) { + int sample_id = i / num_classes; + int label_idx = label[i / (k * num_classes)]; + if (label_idx != MASK_TOKEN && (i 
== sample_id * num_classes + label_idx)) { + logit_grad[i] -= 1.0f; + atomicAdd(&num[0], 1.0f); + } else if (label_idx == MASK_TOKEN) { + logit_grad[i] = 0.0f; + } + } +} + __global__ void categorical_crossentropy_loss_backward(float *logit_grad, float const *logit, float const *label, @@ -75,8 +95,14 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_ptr, logit_volume * sizeof(float), hipMemcpyDeviceToDevice)); - hipLaunchKernelGGL(sparse_categorical_crossentropy_loss_backward, - GET_BLOCKS(num_samples), + + assert(scale_factor == 1.0f); + float *num; + checkCUDA(hipMalloc(&num, sizeof(float))); + float effective_tokens; + int parallelism = num_samples * num_classes; + hipLaunchKernelGGL(sparse_categorical_crossentropy_loss_backward_with_mask, + GET_BLOCKS(parallelism), CUDA_NUM_THREADS, 0, stream, @@ -84,7 +110,10 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( label_ptr, num_samples, num_classes, - k); + k, + num); + + hipMemcpy(&effective_tokens, num, sizeof(float), hipMemcpyDeviceToHost); // Scale logit gradients by op->scale_factor hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), @@ -94,7 +123,7 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, logit_grad_volume, 0, - scale_factor * k); + 1.0f / effective_tokens); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( diff --git a/src/metrics_functions/metrics_functions.cpp b/src/metrics_functions/metrics_functions.cpp index d30686be24..1c57bd6ba9 100644 --- a/src/metrics_functions/metrics_functions.cpp +++ b/src/metrics_functions/metrics_functions.cpp @@ -20,6 +20,7 @@ namespace FlexFlow { float const LOG_MIN_VALUE = 0.00000001f; +int const MASK_TOKEN = -100; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, @@ -39,14 +40,19 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, } } assert(my_label >= 0); - atomicAdd(&(perf->train_all), 1); - if (labels[b] == my_label) { - atomicAdd(&(perf->train_correct), 1); + if (labels[b] != MASK_TOKEN) { + atomicAdd(&(perf->train_all), 1); + if (labels[b] == my_label) { + atomicAdd(&(perf->train_correct), 1); + } } } if (metrics.measure_sparse_categorical_crossentropy) { - float my_logit = max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); - atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + if (labels[b] != MASK_TOKEN) { + float my_logit = + max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + } } if (metrics.measure_mean_squared_error || metrics.measure_root_mean_squared_error || diff --git a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index 43c84b0c41..38c6043297 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -189,8 +189,9 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * erfc(-input[i] * M_SQRT1_2) - - 0.5 * M_SQRT1_2 * input[i] * exp(-input[i] * input[i] * 0.5))); + (0.5 * erfc(-input[i] * M_SQRT1_2) + + 0.5 * M_SQRT1_2 * input[i] * + ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5f)))); break; } case OP_RSQRT: { diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index d63bd0edc5..6df6351bb0 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -27,8 +27,11 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, Domain const &input_domain) : OpMeta(handler) { 
checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); - checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); + // checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); + checkCUDNN( + cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); dim = softmax->dim; + last_layer = softmax->last_layer; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); } @@ -67,6 +70,7 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -78,7 +82,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, hipEventRecord(t_start, stream); } Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + m, input_grad_ptr, output_grad_ptr, output_ptr, num_elements, stream); if (m->profiling) { hipEventRecord(t_end, stream); checkCUDA(hipEventSynchronize(t_end)); @@ -114,15 +118,32 @@ void forward_kernel(SoftmaxMeta const *m, MIOPEN_SOFTMAX_MODE_CHANNEL)); } -void backward_kernel(float *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + float *input_grad_ptr, float const *output_grad_ptr, + float const *output_ptr, size_t num_elements, hipStream_t stream) { - checkCUDA(hipMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(float), - hipMemcpyDeviceToDevice, - stream)); + if (m->last_layer) { + checkCUDA(hipMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(float), + hipMemcpyDeviceToDevice, + stream)); + } else { + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(miopenSoftmaxBackward_V2(m->handle.dnn, + &alpha, + m->inputTensor, + output_ptr, + m->inputTensor, + output_grad_ptr, + &beta, + m->inputTensor, + input_grad_ptr, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + } } } // namespace Internal diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index c3030e20b4..e03da48935 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/layer_norm.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/utils/hip_helper.h" #include @@ -30,12 +31,26 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln) effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; eps = ln->eps; - checkCUDA(hipMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&db_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&db_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + + DataType data_type = ln->data_type; + checkCUDA( + hipMalloc(&mean_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&rstd_ptr, data_type_size(data_type) 
* effective_batch_size)); + checkCUDA( + hipMalloc(&ds_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&db_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&scale_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&bias_ptr, data_type_size(data_type) * effective_batch_size)); } template @@ -43,12 +58,10 @@ __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) { -#if 0 #ifndef __HIP_PLATFORM_HCC__ - return __shfl_down_sync(mask, value, delta, width); + return __shfl_down_sync(mask, value, delta, width); #else - return __shfl_down(value, delta, width); -#endif + return __shfl_down(value, delta, width); #endif } @@ -79,26 +92,26 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void - RowwiseMomentsCUDAKernel(int64_t N, T eps, T const *X, T *mean, T *rstd) { - __shared__ T m_shared[C10_WARP_SIZE]; - __shared__ T v_shared[C10_WARP_SIZE]; +__global__ void RowwiseMomentsCUDAKernel( + int64_t N, float eps, T const *X, T *mean, T *rstd) { + __shared__ float m_shared[C10_WARP_SIZE]; + __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - T sum1 = 0; - T sum2 = 0; + float sum1 = 0.0f; + float sum2 = 0.0f; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { - const T scale = T(1) / static_cast(N); + float const scale = float(1) / static_cast(N); sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, T(0)); - mean[i] = sum1; - rstd[i] = rsqrt(sum2 + static_cast(eps)); + sum2 = max(sum2 * scale - sum1 * sum1, float(0)); + mean[i] = static_cast(sum1); + rstd[i] = static_cast(rsqrt(sum2 + eps)); } } @@ -132,7 +145,7 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T *gamma_ptr, T *beta_ptr, hipStream_t stream) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), m->effective_batch_size, kCUDABlockReduceNumThreads, 0, @@ -140,33 +153,50 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, m->effective_num_elements, m->eps, in_ptr, - m->mean_ptr, - m->rstd_ptr); - hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormForwardCUDAKernel), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormForwardCUDAKernel), m->effective_batch_size, kCUDANumThreads, 0, stream, m->effective_num_elements, in_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_ptr, beta_ptr, out_ptr); } /*static*/ -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - T const *in_ptr, - T *out_ptr, - T *gamma_ptr, - T *beta_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorW &gamma, + GenericTensorAccessorW &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::forward_kernel( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream); + // LayerNorm::forward_kernel( + // m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream); + + if (m->input_type[0] == DT_FLOAT) 
{ + LayerNorm::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + gamma.get_float_ptr(), + beta.get_float_ptr(), + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel(m, + input.get_half_ptr(), + output.get_half_ptr(), + gamma.get_half_ptr(), + beta.get_half_ptr(), + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } } template @@ -346,6 +376,82 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, } } +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] = f_grad_input; + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, N, buf); +} + /*static*/ template void LayerNorm::backward_kernel(LayerNormMeta const *m, @@ -367,8 +473,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, output_grad_ptr, input_ptr, gamma_ptr, - m->ds_ptr, - m->db_ptr); + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), B, @@ -377,12 +483,29 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, stream, M, N, - m->mean_ptr, - m->rstd_ptr, - m->ds_ptr, - m->db_ptr, - m->scale_ptr, - m->bias_ptr); + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly @@ -396,8 +519,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_grad_ptr, beta_grad_ptr); } else { @@ -414,8 +537,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_grad_ptr, beta_grad_ptr); } @@ -443,11 +566,12 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, stream); } -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - float const *in_ptr, - float *out_ptr, - float *gamma_ptr, - float *beta_ptr); +// template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const +// *m, +// float const *in_ptr, +// float *out_ptr, +// float *gamma_ptr, +// float *beta_ptr); template void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, float const *output_grad_ptr, diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 375b4f3d53..ffdcf0dac1 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -298,6 +298,57 @@ miopenStatus_t return miopenStatusBadParm; } +miopenStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, + Domain domain) { + int dims[MAX_TENSOR_DIM]; + switch (domain.get_dim()) { + case 1: { + Rect<1> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + return miopenSet4dTensorDescriptor(tensor, miopenFloat, dims[0], 1, 1, 1); + } + case 2: { + Rect<2> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + return 
miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[1], dims[0], 1, 1); + } + case 3: { + Rect<3> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[2] * dims[1], dims[0], 1, 1); + } + case 4: { + Rect<4> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[3] * dims[2] * dims[1], dims[0], 1, 1); + } + case 5: { + Rect<5> rect = domain; + int leading_dim_size = rect.hi[4] - rect.lo[4] + 1; + assert(leading_dim_size == 1); + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[3], dims[2], dims[1], dims[0]); + } + default: + assert(false && "Unsupported dim number"); + } + return miopenStatusBadParm; +} + miopenDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { case DT_FLOAT: From ded175c67d043cec87fdfaa34164f2b566db22d7 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 18 Aug 2023 13:01:34 -0400 Subject: [PATCH 45/52] bypass simulator creation when only_data_parallel is specified --- src/runtime/graph.cc | 77 ++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 5c49687712..f17b4dd547 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1900,6 +1900,37 @@ std::pair, std::unordered_map> model->config.workersPerNode, model->config.cpusPerNode, model->all_valid_views); + if (model->config.only_data_parallel) { + Graph *graph = new Graph(model); + graph->print_dot(); + std::unordered_map op_to_node_map; + for (FlexFlow::Op const *dstOp : model->operators) { + Node dstNode; + dstNode.ptr = dstOp; + dstNode.guid = model->node_global_guid++; + op_to_node_map[dstOp] = dstNode; + for (int j = 0; j < dstOp->numInputs; j++) { + FlexFlow::Op const *srcOp = dstOp->inputs[j]->owner_op; + assert(op_to_node_map.find(srcOp) != op_to_node_map.end()); + Node srcNode = op_to_node_map[srcOp]; + graph->add_edge(srcNode, dstNode, dstOp->inputs[j]->owner_idx, j); + } + } + graph->print_dot(); + curr_best_graph = std::unique_ptr(graph); + MachineView data_parallel_view; + data_parallel_view.device_type = MachineView::GPU; + data_parallel_view.ndims = 1; + data_parallel_view.dim[0] = + model->config.numNodes * model->config.workersPerNode; + data_parallel_view.stride[0] = 1; + data_parallel_view.start_device_id = 0; + for (auto const &node : curr_best_graph->inEdges) { + curr_optimal_views[node.first] = data_parallel_view; + } + return std::make_pair(std::move(curr_best_graph), curr_optimal_views); + } + Runtime *runtime = model->config.lg_hlr; Context ctx = model->config.lg_ctx; const Task* task = runtime->get_current_task(ctx); @@ -1939,44 +1970,14 @@ std::pair, std::unordered_map> std::unique_ptr curr_best_graph; std::unordered_map curr_optimal_views; - if (model->config.only_data_parallel) { - Graph *graph = new Graph(model); - graph->print_dot(); - std::unordered_map op_to_node_map; - for (FlexFlow::Op const *dstOp : model->operators) { - Node dstNode; - dstNode.ptr = dstOp; - dstNode.guid = model->node_global_guid++; - op_to_node_map[dstOp] = dstNode; - for (int j 
= 0; j < dstOp->numInputs; j++) { - FlexFlow::Op const *srcOp = dstOp->inputs[j]->owner_op; - assert(op_to_node_map.find(srcOp) != op_to_node_map.end()); - Node srcNode = op_to_node_map[srcOp]; - graph->add_edge(srcNode, dstNode, dstOp->inputs[j]->owner_idx, j); - } - } - graph->print_dot(); - curr_best_graph = std::unique_ptr(graph); - MachineView data_parallel_view; - data_parallel_view.device_type = MachineView::GPU; - data_parallel_view.ndims = 1; - data_parallel_view.dim[0] = - model->config.numNodes * model->config.workersPerNode; - data_parallel_view.stride[0] = 1; - data_parallel_view.start_device_id = 0; - for (auto const &node : curr_best_graph->inEdges) { - curr_optimal_views[node.first] = data_parallel_view; - } - } else { - // Main step to optimize the PCG of an FFModel - model->graph_optimize(model->config.search_budget, - model->config.only_data_parallel, - curr_best_graph, - curr_optimal_views, - perform_memory_search, - MemoryOptimConfig{lambda.first}, - lambda.second); - } + // Main step to optimize the PCG of an FFModel + model->graph_optimize(model->config.search_budget, + model->config.only_data_parallel, + curr_best_graph, + curr_optimal_views, + perform_memory_search, + MemoryOptimConfig{lambda.first}, + lambda.second); // Return the best result of the current search return std::make_pair(std::move(curr_best_graph), curr_optimal_views); }; From 1f7e8b79ba3ae341fb0cd100dc42dbeb1683a334 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 18 Aug 2023 14:33:59 -0400 Subject: [PATCH 46/52] add nccl prints --- src/runtime/model.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 8461668aaa..c52d5e7f0c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -584,9 +584,10 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task, } } ncclComm_t ncclComm; + fprintf(stderr, "Before ncclCommInitRank\n"); checkNCCL(ncclCommInitRank(&ncclComm, allRanks, ncclId, myRank)); - // fprintf(stderr, "ncclComm(%p) allRanks(%d) myRank(%d) ncclId(%p)\n", - // ncclComm, allRanks, myRank, ncclId); + fprintf(stderr, "After ncclCommInitRank ncclComm(%p) allRanks(%d) myRank(%d) ncclId(%p)\n", + ncclComm, allRanks, myRank, ncclId); return ncclComm; } #endif From 3fb70f6be3c53c9338ad936e7a0ea63350fc17d7 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 21 Aug 2023 15:59:05 -0400 Subject: [PATCH 47/52] . 
--- src/runtime/graph.cc | 2 ++ src/runtime/parallel_tensor.cc | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index f17b4dd547..7447125197 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1917,6 +1917,8 @@ std::pair, std::unordered_map> } } graph->print_dot(); + std::unique_ptr curr_best_graph; + std::unordered_map curr_optimal_views; curr_best_graph = std::unique_ptr(graph); MachineView data_parallel_view; data_parallel_view.device_type = MachineView::GPU; diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 6cc0d0067e..18318db3ce 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -672,6 +672,20 @@ bool ParallelTensorBase::set_tensor(FFModel const *ff, for (size_t i = 0; i < dim_sizes.size(); i++) { volume = volume * dim_sizes[i]; } + // Debug prints + { + std::string tensor_name; + if (owner_op == nullptr) { + tensor_name = "No OwnerOp"; + } else { + tensor_name = std::string(owner_op->name); + } + std::ostringstream oss; + for (int i = 0; i < dim_sizes.size(); i++) + oss << dim_sizes[i] << ", "; + printf("%s num_replicas(%zu) volume(%zu) dims(%s)\n", tensor_name.c_str(), + num_replicas, volume, oss.str().c_str()); + } RegionRequirement req(region, READ_WRITE, EXCLUSIVE, region); req.add_field(FID_DATA); InlineLauncher launcher(req); From d652b6202e25570c23903375ba0d36c87b522c4d Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 29 Aug 2023 13:21:06 -0400 Subject: [PATCH 48/52] rccl --- CMakeLists.txt | 17 ++++++++++------- config/config.linux | 4 +--- include/flexflow/config.h | 4 +++- include/flexflow/machine_view.h | 4 +++- src/runtime/optimizer_kernel.cpp | 2 ++ 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e7504d7026..81845dd7b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -158,10 +158,6 @@ endif() # option for nccl option(FF_USE_NCCL "Run FlexFlow with NCCL" OFF) -if (FF_GPU_BACKEND STREQUAL "hip_rocm" AND FF_USE_NCCL STREQUAL "ON") - message(FATAL_ERROR "NCCL: ON for FF_GPU_BACKEND: hip_rocm. hip_rocm backend must have NCCL disabled.") -endif() - # option for avx2 option(FF_USE_AVX2 "Run FlexFlow with AVX2" OFF) @@ -224,7 +220,9 @@ endif() # NCCL if(FF_USE_NCCL) - include(nccl) + if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda") + include(nccl) + endif() list(APPEND FF_CC_FLAGS -DFF_USE_NCCL) list(APPEND FF_NVCC_FLAGS @@ -369,11 +367,13 @@ elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm") elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") find_package(hipblas REQUIRED) find_package(miopen REQUIRED) + if(FF_USE_NCCL) + find_package(rccl REQUIRED) + endif() # find_package(rocrand REQUIRED) find_library(HIP_RAND_LIBRARY hiprand REQUIRED) add_compile_definitions(FF_USE_HIP_ROCM) - # The hip cmake config module defines three targets, # hip::amdhip64, hip::host, and hip::device. 
# @@ -387,12 +387,15 @@ elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm") # Docs (outdated): # https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY}) + if(FF_USE_NCCL) + target_link_libraries(flexflow rccl) + endif() endif() else() message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}") endif() -if(FF_USE_NCCL) +if(FF_USE_NCCL AND (FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda")) add_dependencies(flexflow ${NCCL_NAME}) endif() diff --git a/config/config.linux b/config/config.linux index a5bb093584..d3729aea4c 100755 --- a/config/config.linux +++ b/config/config.linux @@ -70,11 +70,9 @@ FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid." exit 1 -elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" ]]; then +elif [["$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm"]]; then # enable NCCL FF_USE_NCCL=${FF_USE_NCCL:-ON} -else - FF_USE_NCCL=OFF fi function get_build_configs() { diff --git a/include/flexflow/config.h b/include/flexflow/config.h index d82b1377c7..d1fe6231da 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -28,8 +28,10 @@ #error "Unknown device" #endif #include "tl/optional.hpp" -#ifdef FF_USE_NCCL +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include +#else +#include #endif namespace FlexFlow { diff --git a/include/flexflow/machine_view.h b/include/flexflow/machine_view.h index 8843dc4d6a..b843555e06 100644 --- a/include/flexflow/machine_view.h +++ b/include/flexflow/machine_view.h @@ -3,8 +3,10 @@ #include "legion.h" #include -#ifdef FF_USE_NCCL +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include +#else +#include #endif #include "flexflow/config.h" diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index 232799e027..e71adc87a8 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -87,6 +87,7 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, #ifdef FF_USE_NCCL __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, + OpMeta const *meta, float const *w_grad_ptr, size_t size, float *w_ptr, @@ -208,6 +209,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, #ifdef FF_USE_NCCL __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, + OpMeta const *meta, float const *w_grad_ptr, size_t size, float *w_ptr, From b39528b635adee41e98db319466663d962461ff2 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 6 Oct 2023 16:15:59 +0000 Subject: [PATCH 49/52] fix fuse --- include/flexflow/ops/layer_norm.h | 8 +- src/ops/fused.cpp | 1181 +++++++++++++++++------------ src/ops/fused.cu | 171 ++++- src/ops/layer_norm.cc | 4 +- src/ops/layer_norm.cpp | 4 +- src/ops/layer_norm.cu | 8 +- 6 files changed, 882 insertions(+), 494 deletions(-) diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 552b9cf365..de5ed48df2 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -63,14 +63,14 @@ class LayerNorm : public Op { static void forward_kernel(LayerNormMeta const *m, T const *input_ptr, T *output_ptr, - T *gamma_ptr, - T *beta_ptr, + T const *gamma_ptr, + T const *beta_ptr, 
ffStream_t stream); static void forward_kernel_wrapper(LayerNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW &output, - GenericTensorAccessorW &gamma, - GenericTensorAccessorW &beta); + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index a602c5d6b1..348680b2fd 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -18,6 +18,7 @@ #include "flexflow/ops/batch_norm.h" #include "flexflow/ops/element_unary.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" +#include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/ops/kernels/dropout_kernels.h" @@ -26,7 +27,9 @@ #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" #include "flexflow/utils/hip_helper.h" #include @@ -281,8 +284,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::forward_kernel_wrapper( m, @@ -292,488 +295,730 @@ __host__ void FusedOp::forward_task(Task const *task, break; break; } - case OP_RELU: - case OP_SIGMOID: - case OP_TANH: - case OP_ELU: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; - } - case OP_POOL2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - break; - } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Flat::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; - } - case OP_RESHAPE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - 
my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; - } - case OP_TRANSPOSE: { + + case OP_EMBEDDING: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); - break; - } - default: { - fprintf(stderr, - "Fusion currently does not support type = %d\n", - fused->op_op_type[op]); - assert(false && "Fusion currently does not support type"); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + if (m->aggr == AGGR_MODE_NONE) { + // assert(kernel_domain.get_dim() == 2); + assert(my_input_accessor[0].domain.get_dim() + 1 == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i + 1]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i + 1]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } else { + assert(my_input_accessor[0].domain.get_dim() == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM); + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + assert(my_input_accessor[0].data_type == DT_INT32 || + my_input_accessor[0].data_type == DT_INT64); + Kernels::Embedding::forward_kernel_wrapper(m, + my_input_accessor[0], + my_output_accessor[0], + my_weight_accessor[0], + in_dim, + out_dim, + effective_batch_size); + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + 
my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_POOL2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } + case OP_FLAT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Flat::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (m->input_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } else { + assert(false); + } + break; + } + case OP_RESHAPE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_TRANSPOSE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_INT64) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_output_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + } else { + assert(false); + } + default: { + fprintf(stderr, + "Fusion currently does not support 
type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; } + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_ptr[i], output_domain[i].get_volume(), + // "[Fused:forward:output]"); } - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; - } - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_ptr[i], output_domain[i].get_volume(), - // "[Fused:forward:output]"); -} -/* - regions[...](I): input - regions[...](I): weight - regions[...](I): output - regions[...](I/O): input_grad - regions[...](I/O): weight_grad - regions[...](I/O): output_grad -*/ + /* + regions[...](I): input + regions[...](I): weight + regions[...](I): output + regions[...](I/O): input_grad + regions[...](I/O): weight_grad + regions[...](I/O): output_grad + */ -__host__ void FusedOp::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - // const FusedOp* fused = (FusedOp*) task->args; - FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); - FusedOp const *fused = metas->fused_op; + __host__ void FusedOp::backward_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; - assert(metas->numOperators == fused->numOperators); - assert(regions.size() == task->regions.size()); - { - int sum = fused->numInputs + fused->numWeights + fused->numOutputs; - assert(sum * 2 == (int)regions.size()); - } - GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR output_accessor[MAX_NUM_OUTPUTS]; - GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; - int roff = 0; - assert(fused->numInputs <= MAX_NUM_INPUTS); - for (int i = 0; i < fused->numInputs; i++) { - input_accessor[i] = - helperGetGenericTensorAccessorRO(fused->input_data_types[i], - regions[i], - task->regions[i], - FID_DATA, - ctx, - runtime); - } - roff += fused->numInputs; - assert(fused->numWeights <= MAX_NUM_WEIGHTS); - for (int i = 0; i < fused->numWeights; i++) { - weight_accessor[i] = - helperGetGenericTensorAccessorRO(fused->weight_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); - } - roff += fused->numWeights; - assert(fused->numOutputs <= MAX_NUM_OUTPUTS); - for (int i = 0; i < fused->numOutputs; i++) { - output_accessor[i] = - helperGetGenericTensorAccessorRO(fused->output_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); - } - roff += fused->numOutputs; - for (int i = 0; i < fused->numInputs; i++) { - input_grad_accessor[i] = - helperGetGenericTensorAccessorRW(fused->input_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); - assert(input_grad_accessor[i].domain == input_accessor[i].domain); - } - roff += fused->numInputs; - for (int i = 0; i < fused->numWeights; i++) { - weight_grad_accessor[i] = - helperGetGenericTensorAccessorRW(fused->weight_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, 
- runtime); - assert(weight_grad_accessor[i].domain.get_volume() == - weight_accessor[i].domain.get_volume()); - } - roff += fused->numWeights; - for (int i = 0; i < fused->numOutputs; i++) { - output_grad_accessor[i] = - helperGetGenericTensorAccessorRW(fused->output_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); - assert(output_grad_accessor[i].domain == output_accessor[i].domain); - } - roff += fused->numOutputs; - // Assert that all meta share the same dnn/blas handler - int start = 0; - for (start = 0; start < fused->numOperators; start++) { - if (metas->meta[start] != NULL) { - break; - } - } - for (int op = start + 1; op < fused->numOperators; op++) { - if (metas->meta[op] != NULL) { - assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); - assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); - } - } - - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - int ioff = 0, woff = 0, ooff = 0; - GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; - GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW my_weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; - // Do backpropagation in the reverse ordering - for (int op = 0; op < fused->numOperators; op++) { - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; - } - - for (int op = fused->numOperators - 1; op >= 0; op--) { - ioff -= fused->op_num_inputs[op]; - woff -= fused->op_num_weights[op]; - ooff -= fused->op_num_outputs[op]; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - my_input_accessor[i] = input_accessor[my_off]; - my_input_grad_accessor[i] = input_grad_accessor[my_off]; - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - my_input_accessor[i] = output_accessor[my_off]; - my_input_grad_accessor[i] = output_grad_accessor[my_off]; - assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); - } else { - assert(false); + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + { + int sum = fused->numInputs + fused->numWeights + fused->numOutputs; + assert(sum * 2 == (int)regions.size()); } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; - my_weight_grad_accessor[i] = - weight_grad_accessor[fused->op_weight_idx[i + woff]]; - assert(my_weight_grad_accessor[i].domain.get_volume() == - my_weight_accessor[i].domain.get_volume()); - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = output_accessor[fused->op_output_idx[i + ooff]]; - my_output_grad_accessor[i] = - output_grad_accessor[fused->op_output_idx[i + ooff]]; - assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); - } - switch (fused->op_op_type[op]) { - case OP_CONCAT: { - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - ConcatMeta *m = (ConcatMeta *)metas->meta[op]; - int num_inputs = fused->op_num_inputs[op]; - 
Kernels::Concat::backward_kernel_wrapper(m, - my_output_grad_accessor[0], - my_input_grad_accessor, - num_inputs, - m->legion_axis); - break; + GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW weight_grad_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorR output_accessor[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + int roff = 0; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + input_accessor[i] = + helperGetGenericTensorAccessorRO(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); } - case OP_CONV2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; - Kernels::Conv2D::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_grad_accessor[0].get_float_ptr(), - my_weight_grad_accessor[1].get_float_ptr()); - break; + roff += fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + weight_accessor[i] = + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); } - case OP_BATCHNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 2); - assert(my_weight_accessor[1].domain.get_dim() == 2); - assert(my_output_accessor[0].domain.get_dim() == 5); - BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; - BatchNorm::backward_kernel( - m, - (float const *)my_input_accessor[0].get_float_ptr(), - (float *)my_output_grad_accessor[0].get_float_ptr(), - (float const *)my_output_accessor[0].get_float_ptr(), - (float *)my_input_grad_accessor[0].get_float_ptr(), - (float const *)my_weight_accessor[0].get_float_ptr(), - (float *)my_weight_grad_accessor[0].get_float_ptr(), - (float *)my_weight_grad_accessor[1].get_float_ptr(), - my_output_accessor[0].domain.get_volume()); - break; + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + output_accessor[i] = + helperGetGenericTensorAccessorRO(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); } - case OP_DROPOUT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Kernels::Dropout::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr()); - break; - } - case OP_LINEAR: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - Domain kernel_domain = my_weight_accessor[0].domain; - int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; - int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; - int batch_size = my_input_accessor[0].domain.get_volume() / 
in_dim; - assert(my_output_accessor[0].domain.get_volume() == - out_dim * batch_size); - assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - float *bias_grad_ptr = nullptr; - if (fused->op_num_weights[op] == 2) { - assert(my_weight_accessor[1].domain.get_volume() == out_dim); - bias_grad_ptr = my_weight_grad_accessor[1].get_float_ptr(); - } else { - assert(fused->op_num_weights[op] == 1); - } - LinearMeta *m = (LinearMeta *)metas->meta[op]; - Kernels::Linear::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_grad_accessor[0].get_float_ptr(), - bias_grad_ptr, - in_dim, - out_dim, - batch_size); - break; - } - case OP_BATCHMATMUL: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - Domain out_domain = my_output_accessor[0].domain; - Domain a_domain = my_input_accessor[0].domain; - Domain b_domain = my_input_accessor[1].domain; - // check dims - int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; - assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); - int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; - assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); - int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; - assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); - assert(a_domain.get_dim() == b_domain.get_dim()); - assert(a_domain.get_dim() == out_domain.get_dim()); - int batch = 1; - for (int i = 2; i < a_domain.get_dim(); i++) { - int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; - assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); - assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); - batch *= dim_size; - } - BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; - Kernels::BatchMatmul::backward_kernel_wrapper( - meta, - (float const *)my_output_accessor[0].get_float_ptr(), - (float const *)my_output_grad_accessor[0].get_float_ptr(), - (float const *)my_input_accessor[0].get_float_ptr(), - (float *)my_input_grad_accessor[0].get_float_ptr(), - (float const *)my_input_accessor[1].get_float_ptr(), - (float *)my_input_grad_accessor[1].get_float_ptr(), - (float *)nullptr, - m, - n, - k, - batch); - break; + roff += fused->numOutputs; + for (int i = 0; i < fused->numInputs; i++) { + input_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->input_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + assert(input_grad_accessor[i].domain == input_accessor[i].domain); } - case OP_EW_ADD: - case OP_EW_SUB: - case OP_EW_MUL: - case OP_EW_DIV: - case OP_EW_MAX: - case OP_EW_MIN: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; - Kernels::ElementBinary::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_accessor[0].get_float_ptr(), - my_input_accessor[1].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[1].get_float_ptr()); - break; + roff += fused->numInputs; + for (int i = 0; i < fused->numWeights; i++) { + weight_grad_accessor[i] = + 
helperGetGenericTensorAccessorRW(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + assert(weight_grad_accessor[i].domain.get_volume() == + weight_accessor[i].domain.get_volume()); } - case OP_RELU: - case OP_SIGMOID: - case OP_TANH: - case OP_ELU: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; + roff += fused->numWeights; + for (int i = 0; i < fused->numOutputs; i++) { + output_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + assert(output_grad_accessor[i].domain == output_accessor[i].domain); } - case OP_POOL2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr()); - break; + roff += fused->numOutputs; + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain.get_volume() == - my_output_grad_accessor[0].domain.get_volume()); - Kernels::Flat::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); - break; + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + assert(metas->meta[start]->handle.blas == + metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } } - case OP_RESHAPE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain.get_volume() == - my_output_grad_accessor[0].domain.get_volume()); - Kernels::Reshape::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); - break; + + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int ioff = 0, woff = 0, ooff = 0; + GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorW my_weight_grad_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; + // Do backpropagation in the reverse ordering + 
for (int op = 0; op < fused->numOperators; op++) { + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; } - case OP_TRANSPOSE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain.get_volume() == - my_output_grad_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::backward_kernel_wrapper( - m, - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain, - my_output_grad_accessor[0].domain); - break; + + for (int op = fused->numOperators - 1; op >= 0; op--) { + ioff -= fused->op_num_inputs[op]; + woff -= fused->op_num_weights[op]; + ooff -= fused->op_num_outputs[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + my_input_accessor[i] = input_accessor[my_off]; + my_input_grad_accessor[i] = input_grad_accessor[my_off]; + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + my_input_accessor[i] = output_accessor[my_off]; + my_input_grad_accessor[i] = output_grad_accessor[my_off]; + assert(my_input_grad_accessor[i].domain == + my_input_accessor[i].domain); + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + my_weight_accessor[i] = + weight_accessor[fused->op_weight_idx[i + woff]]; + my_weight_grad_accessor[i] = + weight_grad_accessor[fused->op_weight_idx[i + woff]]; + assert(my_weight_grad_accessor[i].domain.get_volume() == + my_weight_accessor[i].domain.get_volume()); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + my_output_accessor[i] = + output_accessor[fused->op_output_idx[i + ooff]]; + my_output_grad_accessor[i] = + output_grad_accessor[fused->op_output_idx[i + ooff]]; + assert(my_output_grad_accessor[i].domain == + my_output_accessor[i].domain); + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + int num_inputs = fused->op_num_inputs[op]; + Kernels::Concat::backward_kernel_wrapper(m, + my_output_grad_accessor[0], + my_input_grad_accessor, + num_inputs, + m->legion_axis); + break; + } + case OP_CONV2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; + Kernels::Conv2D::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + 
assert(my_output_accessor[0].domain.get_dim() == 5); + BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + BatchNorm::backward_kernel( + m, + (float const *)my_input_accessor[0].get_float_ptr(), + (float *)my_output_grad_accessor[0].get_float_ptr(), + (float const *)my_output_accessor[0].get_float_ptr(), + (float *)my_input_grad_accessor[0].get_float_ptr(), + (float const *)my_weight_accessor[0].get_float_ptr(), + (float *)my_weight_grad_accessor[0].get_float_ptr(), + (float *)my_weight_grad_accessor[1].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + break; + } + case OP_DROPOUT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + DropoutMeta *m = (DropoutMeta *)metas->meta[op]; + Kernels::Dropout::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == + in_dim * batch_size); + float *bias_grad_ptr = nullptr; + if (fused->op_num_weights[op] == 2) { + assert(my_weight_accessor[1].domain.get_volume() == out_dim); + bias_grad_ptr = my_weight_grad_accessor[1].get_float_ptr(); + } else { + assert(fused->op_num_weights[op] == 1); + } + LinearMeta *m = (LinearMeta *)metas->meta[op]; + Kernels::Linear::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + bias_grad_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_accessor[0].domain; + Domain a_domain = my_input_accessor[0].domain; + Domain b_domain = my_input_accessor[1].domain; + // check dims + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + Kernels::BatchMatmul::backward_kernel_wrapper( + meta, + (float const *)my_output_accessor[0].get_float_ptr(), + (float const *)my_output_grad_accessor[0].get_float_ptr(), + (float const *)my_input_accessor[0].get_float_ptr(), + (float *)my_input_grad_accessor[0].get_float_ptr(), + (float const *)my_input_accessor[1].get_float_ptr(), + (float 
*)my_input_grad_accessor[1].get_float_ptr(), + (float *)nullptr, + m, + n, + k, + batch); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // assert(my_input_accessor[0].domain == + // my_input_accessor[1].domain); assert(my_input_accessor[0].domain + // == my_output_accessor[0].domain); + ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + Kernels::ElementBinary::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_accessor[1].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[1].get_float_ptr()); + break; + } + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + assert(my_input_accessor[0].data_type == DT_INT64 || + my_input_accessor[0].data_type == DT_INT32); + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_grad_accessor[0].domain.hi()[0] - + my_output_grad_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_grad_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_grad_accessor[0].domain.hi()[0] - + my_output_grad_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_grad_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + Kernels::Embedding::backward_kernel_wrapper( + m, + my_input_accessor[0], + my_output_grad_accessor[0], + my_weight_grad_accessor[0], + in_dim, + out_dim, + effective_batch_size); + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_POOL2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // assert(my_input_accessor[0].domain == + // my_output_accessor[0].domain); + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr()); + break; + } + case OP_FLAT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + 
my_output_grad_accessor[0].domain.get_volume()); + Kernels::Flat::backward_kernel_wrapper( + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + break; + } + case OP_RESHAPE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + if (my_input_grad_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_int64_ptr(), + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].get_int32_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_TRANSPOSE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain, + my_output_grad_accessor[0].domain); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + gamma.get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_INT64) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } 
else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + default: + assert(false && "Fusion currently does not support type"); + } + } + assert(ioff == 0); + assert(woff == 0); + assert(ooff == 0); + // for (int i = 0; i < fused->numWeights; i++) + // print_tensor(weight_grad_ptr[i], + // weight_grad_domain[i].get_volume(), + // "[Fused:backward:weight_grad]"); + // for (int i = 0; i < fused->numInputs; i++) + // print_tensor(input_grad_ptr[i], + // input_grad_domain[i].get_volume(), + // "[Fused:backward:input_grad]"); + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_grad_ptr[i], + // output_grad_domain[i].get_volume(), + // "[Fused:backward:output_grad]"); } - default: - assert(false && "Fusion currently does not support type"); - } - } - assert(ioff == 0); - assert(woff == 0); - assert(ooff == 0); - // for (int i = 0; i < fused->numWeights; i++) - // print_tensor(weight_grad_ptr[i], - // weight_grad_domain[i].get_volume(), "[Fused:backward:weight_grad]"); - // for (int i = 0; i < fused->numInputs; i++) - // print_tensor(input_grad_ptr[i], input_grad_domain[i].get_volume(), - // "[Fused:backward:input_grad]"); - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_grad_ptr[i], - // output_grad_domain[i].get_volume(), "[Fused:backward:output_grad]"); -} -}; // namespace FlexFlow + }; // namespace FlexFlow diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 15072513a7..62262c89af 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -21,6 +21,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/ops/fused.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" +#include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/ops/kernels/dropout_kernels.h" @@ -30,7 +31,9 @@ #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/layer_norm.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -294,8 +297,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::forward_kernel_wrapper( m, @@ -358,7 +361,8 @@ __host__ void FusedOp::forward_task(Task const *task, my_input_accessor[0].domain.get_volume()); } - assert(my_input_accessor[0].data_type == DT_INT64); + assert(my_input_accessor[0].data_type == DT_INT32 || + my_input_accessor[0].data_type == DT_INT64); Kernels::Embedding::forward_kernel_wrapper(m, my_input_accessor[0], my_output_accessor[0], @@ -368,10 +372,15 @@ __host__ void FusedOp::forward_task(Task const *task, effective_batch_size); break; } + case OP_GELU: case OP_RELU: case 
OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -428,14 +437,31 @@ __host__ void FusedOp::forward_task(Task const *task, my_input_accessor[0].domain.get_volume()); } else if (my_input_accessor[0].data_type == DT_FLOAT) { Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_int64_ptr(), - my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), my_input_accessor[0].domain.get_volume()); } else { assert(false && "Unsupported data type"); } break; } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } else { + assert(false); + } + break; + } case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); @@ -453,6 +479,43 @@ __host__ void FusedOp::forward_task(Task const *task, my_output_accessor[0].domain); break; } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_output_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + } else { + assert(false); + } + + break; + } default: { fprintf(stderr, "Fusion currently does not support type = %d\n", @@ -770,8 +833,8 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::backward_kernel_wrapper( m, @@ -787,7 +850,8 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); EmbeddingMeta *m = 
(EmbeddingMeta *)metas->meta[op]; - assert(my_input_accessor[0].data_type == DT_INT64); + assert(my_input_accessor[0].data_type == DT_INT64 || + my_input_accessor[0].data_type == DT_INT32); int in_dim, out_dim, effective_batch_size; if (m->aggr == AGGR_MODE_NONE) { in_dim = 1; @@ -848,10 +912,15 @@ __host__ void FusedOp::backward_task(Task const *task, batch_size); break; } + case OP_GELU: case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -898,10 +967,43 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_grad_accessor[0].domain.get_volume() == my_output_grad_accessor[0].domain.get_volume()); - Kernels::Reshape::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); + if (my_input_grad_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_int64_ptr(), + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].get_int32_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false); + } break; } case OP_TRANSPOSE: { @@ -919,6 +1021,47 @@ __host__ void FusedOp::backward_task(Task const *task, my_output_grad_accessor[0].domain); break; } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + gamma.get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + 
Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + + break; + } default: assert(false && "Fusion currently does not support type"); } diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 664b8c9f0b..a6130bb425 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -565,8 +565,8 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, // FIXME please add gamma_ptr and beta_ptr after finish the implementation float *gamma_ptr = NULL, *beta_ptr = NULL; - GenericTensorAccessorW gamma_acc; - GenericTensorAccessorW beta_acc; + GenericTensorAccessorR gamma_acc; + GenericTensorAccessorR beta_acc; bool out_of_memory = (in_ptr == NULL) || (out_ptr == NULL) || diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index e03da48935..75cf06b18b 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -173,8 +173,8 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW &output, - GenericTensorAccessorW &gamma, - GenericTensorAccessorW &beta) { + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); // LayerNorm::forward_kernel( diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index f0539f8405..736d122513 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -135,8 +135,8 @@ template void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *in_ptr, T *out_ptr, - T *gamma_ptr, - T *beta_ptr, + T const *gamma_ptr, + T const *beta_ptr, cudaStream_t stream) { RowwiseMomentsCUDAKernel <<effective_batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( @@ -160,8 +160,8 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW &output, - GenericTensorAccessorW &gamma, - GenericTensorAccessorW &beta) { + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); From 0cf3c8edde8e916b0963e802f3cc5b6ccfe35628 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 6 Oct 2023 12:37:46 -0400 Subject: [PATCH 50/52] fix hip --- src/ops/fused.cpp | 1268 ++++++++++++++++++++-------------------- src/ops/layer_norm.cpp | 4 +- 2 files changed, 634 insertions(+), 638 deletions(-) diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 348680b2fd..226184f1a2 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -13,16 +13,18 @@ * limitations under the License. 
*/ -#include "flexflow/ops/fused.h" #include "flexflow/model.h" #include "flexflow/ops/batch_norm.h" #include "flexflow/ops/element_unary.h" +#include "flexflow/ops/embedding.h" +#include "flexflow/ops/fused.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/ops/kernels/dropout_kernels.h" #include "flexflow/ops/kernels/element_binary_kernels.h" +#include "flexflow/ops/kernels/embedding_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" @@ -293,7 +295,6 @@ __host__ void FusedOp::forward_task(Task const *task, my_input_accessor[1].get_float_ptr(), my_output_accessor[0].get_float_ptr()); break; - break; } case OP_EMBEDDING: { @@ -358,667 +359,662 @@ __host__ void FusedOp::forward_task(Task const *task, in_dim, out_dim, effective_batch_size); - case OP_GELU: - case OP_RELU: - case OP_SIGMOID: - case OP_TANH: - case OP_ELU: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; - } - case OP_POOL2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::forward_kernel_wrapper( + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_POOL2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } + case OP_FLAT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Flat::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( m, my_input_accessor[0].get_float_ptr(), 
my_output_accessor[0].get_float_ptr()); - break; - } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Flat::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; - } - case OP_SOFTMAX: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - } else { - assert(false); - } - break; + } else { + assert(false); } - case OP_RESHAPE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; + break; + } + case OP_RESHAPE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_TRANSPOSE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; } - case OP_TRANSPOSE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::forward_kernel_wrapper( m, - my_input_accessor[0].get_float_ptr(), + 
my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_output_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); - break; - } - case OP_LAYERNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - beta = my_weight_accessor[1]; - } - LayerNorm::forward_kernel_wrapper( - m, my_input_accessor[0], my_output_accessor[0], gamma, beta); - break; - } - case OP_CAST: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - CastMeta const *m = (CastMeta *)metas->meta[op]; - if (m->input_data_type == DT_INT32 && - m->output_data_type == DT_INT64) { - Kernels::Cast::forward_kernel_wrapper( - m, - my_input_accessor[0].get_int32_ptr(), - my_output_accessor[0].get_int64_ptr(), - my_output_accessor[0].domain.get_volume()); - } else if (m->input_data_type == DT_INT32 && - m->output_data_type == DT_FLOAT) { - Kernels::Cast::forward_kernel_wrapper( - m, - my_input_accessor[0].get_int32_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_accessor[0].domain.get_volume()); - } else { - assert(false); - } - default: { - fprintf(stderr, - "Fusion currently does not support type = %d\n", - fused->op_op_type[op]); - assert(false && "Fusion currently does not support type"); - } + my_output_accessor[0].domain.get_volume()); + } else { + assert(false); } - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); } - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_ptr[i], output_domain[i].get_volume(), - // "[Fused:forward:output]"); } + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_ptr[i], output_domain[i].get_volume(), + // "[Fused:forward:output]"); +} - /* - regions[...](I): input - regions[...](I): weight - regions[...](I): output - regions[...](I/O): input_grad - regions[...](I/O): weight_grad - regions[...](I/O): output_grad - */ +/* + regions[...](I): input + regions[...](I): weight + regions[...](I): output + regions[...](I/O): input_grad + regions[...](I/O): weight_grad + regions[...](I/O): output_grad +*/ - __host__ void FusedOp::backward_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - // const FusedOp* fused = (FusedOp*) task->args; - FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); - FusedOp const *fused = metas->fused_op; +__host__ void FusedOp::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; - assert(metas->numOperators == fused->numOperators); - assert(regions.size() 
== task->regions.size()); - { - int sum = fused->numInputs + fused->numWeights + fused->numOutputs; - assert(sum * 2 == (int)regions.size()); - } - GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR output_accessor[MAX_NUM_OUTPUTS]; - GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; - int roff = 0; - assert(fused->numInputs <= MAX_NUM_INPUTS); - for (int i = 0; i < fused->numInputs; i++) { - input_accessor[i] = - helperGetGenericTensorAccessorRO(fused->input_data_types[i], - regions[i], - task->regions[i], - FID_DATA, - ctx, - runtime); + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + { + int sum = fused->numInputs + fused->numWeights + fused->numOutputs; + assert(sum * 2 == (int)regions.size()); + } + GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW weight_grad_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorR output_accessor[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + int roff = 0; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + input_accessor[i] = + helperGetGenericTensorAccessorRO(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); + } + roff += fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + weight_accessor[i] = + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + output_accessor[i] = + helperGetGenericTensorAccessorRO(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numOutputs; + for (int i = 0; i < fused->numInputs; i++) { + input_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->input_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + assert(input_grad_accessor[i].domain == input_accessor[i].domain); + } + roff += fused->numInputs; + for (int i = 0; i < fused->numWeights; i++) { + weight_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + assert(weight_grad_accessor[i].domain.get_volume() == + weight_accessor[i].domain.get_volume()); + } + roff += fused->numWeights; + for (int i = 0; i < fused->numOutputs; i++) { + output_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + assert(output_grad_accessor[i].domain == output_accessor[i].domain); + } + roff += fused->numOutputs; + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } + } + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + assert(metas->meta[start]->handle.blas == 
metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } + } + + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int ioff = 0, woff = 0, ooff = 0; + GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorW my_weight_grad_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; + // Do backpropagation in the reverse ordering + for (int op = 0; op < fused->numOperators; op++) { + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + + for (int op = fused->numOperators - 1; op >= 0; op--) { + ioff -= fused->op_num_inputs[op]; + woff -= fused->op_num_weights[op]; + ooff -= fused->op_num_outputs[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + my_input_accessor[i] = input_accessor[my_off]; + my_input_grad_accessor[i] = input_grad_accessor[my_off]; + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + my_input_accessor[i] = output_accessor[my_off]; + my_input_grad_accessor[i] = output_grad_accessor[my_off]; + assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); + } else { + assert(false); } - roff += fused->numInputs; - assert(fused->numWeights <= MAX_NUM_WEIGHTS); - for (int i = 0; i < fused->numWeights; i++) { - weight_accessor[i] = - helperGetGenericTensorAccessorRO(fused->weight_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; + my_weight_grad_accessor[i] = + weight_grad_accessor[fused->op_weight_idx[i + woff]]; + assert(my_weight_grad_accessor[i].domain.get_volume() == + my_weight_accessor[i].domain.get_volume()); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + my_output_accessor[i] = output_accessor[fused->op_output_idx[i + ooff]]; + my_output_grad_accessor[i] = + output_grad_accessor[fused->op_output_idx[i + ooff]]; + assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + int num_inputs = fused->op_num_inputs[op]; + Kernels::Concat::backward_kernel_wrapper(m, + my_output_grad_accessor[0], + my_input_grad_accessor, + num_inputs, + m->legion_axis); + break; } - roff += fused->numWeights; - assert(fused->numOutputs <= MAX_NUM_OUTPUTS); - for (int i = 0; i < fused->numOutputs; i++) { - output_accessor[i] = - helperGetGenericTensorAccessorRO(fused->output_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); + case OP_CONV2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + Conv2DMeta *m = (Conv2DMeta 
*)metas->meta[op]; + Kernels::Conv2D::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; } - roff += fused->numOutputs; - for (int i = 0; i < fused->numInputs; i++) { - input_grad_accessor[i] = - helperGetGenericTensorAccessorRW(fused->input_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); - assert(input_grad_accessor[i].domain == input_accessor[i].domain); + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + assert(my_output_accessor[0].domain.get_dim() == 5); + BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + BatchNorm::backward_kernel( + m, + (float const *)my_input_accessor[0].get_float_ptr(), + (float *)my_output_grad_accessor[0].get_float_ptr(), + (float const *)my_output_accessor[0].get_float_ptr(), + (float *)my_input_grad_accessor[0].get_float_ptr(), + (float const *)my_weight_accessor[0].get_float_ptr(), + (float *)my_weight_grad_accessor[0].get_float_ptr(), + (float *)my_weight_grad_accessor[1].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + break; } - roff += fused->numInputs; - for (int i = 0; i < fused->numWeights; i++) { - weight_grad_accessor[i] = - helperGetGenericTensorAccessorRW(fused->weight_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); - assert(weight_grad_accessor[i].domain.get_volume() == - weight_accessor[i].domain.get_volume()); + case OP_DROPOUT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + DropoutMeta *m = (DropoutMeta *)metas->meta[op]; + Kernels::Dropout::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr()); + break; } - roff += fused->numWeights; - for (int i = 0; i < fused->numOutputs; i++) { - output_grad_accessor[i] = - helperGetGenericTensorAccessorRW(fused->output_data_types[i], - regions[i + roff], - task->regions[i + roff], - FID_DATA, - ctx, - runtime); - assert(output_grad_accessor[i].domain == output_accessor[i].domain); + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + float *bias_grad_ptr = nullptr; + if (fused->op_num_weights[op] == 2) { + assert(my_weight_accessor[1].domain.get_volume() == out_dim); + bias_grad_ptr = my_weight_grad_accessor[1].get_float_ptr(); + } else { + assert(fused->op_num_weights[op] == 1); + } + LinearMeta *m = (LinearMeta *)metas->meta[op]; + Kernels::Linear::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + 
my_output_grad_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + bias_grad_ptr, + in_dim, + out_dim, + batch_size); + break; } - roff += fused->numOutputs; - // Assert that all meta share the same dnn/blas handler - int start = 0; - for (start = 0; start < fused->numOperators; start++) { - if (metas->meta[start] != NULL) { - break; + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_accessor[0].domain; + Domain a_domain = my_input_accessor[0].domain; + Domain b_domain = my_input_accessor[1].domain; + // check dims + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; } + BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + Kernels::BatchMatmul::backward_kernel_wrapper( + meta, + (float const *)my_output_accessor[0].get_float_ptr(), + (float const *)my_output_grad_accessor[0].get_float_ptr(), + (float const *)my_input_accessor[0].get_float_ptr(), + (float *)my_input_grad_accessor[0].get_float_ptr(), + (float const *)my_input_accessor[1].get_float_ptr(), + (float *)my_input_grad_accessor[1].get_float_ptr(), + (float *)nullptr, + m, + n, + k, + batch); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // assert(my_input_accessor[0].domain == + // my_input_accessor[1].domain); assert(my_input_accessor[0].domain + // == my_output_accessor[0].domain); + ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + Kernels::ElementBinary::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_accessor[1].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[1].get_float_ptr()); + break; } - for (int op = start + 1; op < fused->numOperators; op++) { - if (metas->meta[op] != NULL) { - assert(metas->meta[start]->handle.blas == - metas->meta[op]->handle.blas); - assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + assert(my_input_accessor[0].data_type == DT_INT64 || + my_input_accessor[0].data_type == DT_INT32); + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_grad_accessor[0].domain.hi()[0] - + my_output_grad_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_grad_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + 
my_input_accessor[0].domain.get_volume()); + } else { + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_grad_accessor[0].domain.hi()[0] - + my_output_grad_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_grad_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); } + Kernels::Embedding::backward_kernel_wrapper(m, + my_input_accessor[0], + my_output_grad_accessor[0], + my_weight_grad_accessor[0], + in_dim, + out_dim, + effective_batch_size); + break; } - - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - int ioff = 0, woff = 0, ooff = 0; - GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; - GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW my_weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; - // Do backpropagation in the reverse ordering - for (int op = 0; op < fused->numOperators; op++) { - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; } - - for (int op = fused->numOperators - 1; op >= 0; op--) { - ioff -= fused->op_num_inputs[op]; - woff -= fused->op_num_weights[op]; - ooff -= fused->op_num_outputs[op]; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - my_input_accessor[i] = input_accessor[my_off]; - my_input_grad_accessor[i] = input_grad_accessor[my_off]; - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - my_input_accessor[i] = output_accessor[my_off]; - my_input_grad_accessor[i] = output_grad_accessor[my_off]; - assert(my_input_grad_accessor[i].domain == - my_input_accessor[i].domain); - } else { - assert(false); - } + case OP_POOL2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // assert(my_input_accessor[0].domain == + // my_output_accessor[0].domain); + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::backward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr()); + break; + } + case OP_FLAT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + Kernels::Flat::backward_kernel_wrapper( + 
my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + break; + } + case OP_RESHAPE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + if (my_input_grad_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_int64_ptr(), + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].get_int32_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else { + assert(false); } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - my_weight_accessor[i] = - weight_accessor[fused->op_weight_idx[i + woff]]; - my_weight_grad_accessor[i] = - weight_grad_accessor[fused->op_weight_idx[i + woff]]; - assert(my_weight_grad_accessor[i].domain.get_volume() == - my_weight_accessor[i].domain.get_volume()); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false); } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = - output_accessor[fused->op_output_idx[i + ooff]]; - my_output_grad_accessor[i] = - output_grad_accessor[fused->op_output_idx[i + ooff]]; - assert(my_output_grad_accessor[i].domain == - my_output_accessor[i].domain); + break; + } + case OP_TRANSPOSE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain, + my_output_grad_accessor[0].domain); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; } - switch (fused->op_op_type[op]) { - case OP_CONCAT: { - 
assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - ConcatMeta *m = (ConcatMeta *)metas->meta[op]; - int num_inputs = fused->op_num_inputs[op]; - Kernels::Concat::backward_kernel_wrapper(m, - my_output_grad_accessor[0], - my_input_grad_accessor, - num_inputs, - m->legion_axis); - break; - } - case OP_CONV2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; - Kernels::Conv2D::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_grad_accessor[0].get_float_ptr(), - my_weight_grad_accessor[1].get_float_ptr()); - break; - } - case OP_BATCHNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 2); - assert(my_weight_accessor[1].domain.get_dim() == 2); - assert(my_output_accessor[0].domain.get_dim() == 5); - BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; - BatchNorm::backward_kernel( - m, - (float const *)my_input_accessor[0].get_float_ptr(), - (float *)my_output_grad_accessor[0].get_float_ptr(), - (float const *)my_output_accessor[0].get_float_ptr(), - (float *)my_input_grad_accessor[0].get_float_ptr(), - (float const *)my_weight_accessor[0].get_float_ptr(), - (float *)my_weight_grad_accessor[0].get_float_ptr(), - (float *)my_weight_grad_accessor[1].get_float_ptr(), - my_output_accessor[0].domain.get_volume()); - break; - } - case OP_DROPOUT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Kernels::Dropout::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr()); - break; - } - case OP_LINEAR: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - Domain kernel_domain = my_weight_accessor[0].domain; - int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; - int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; - int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; - assert(my_output_accessor[0].domain.get_volume() == - out_dim * batch_size); - assert(my_input_accessor[0].domain.get_volume() == - in_dim * batch_size); - float *bias_grad_ptr = nullptr; - if (fused->op_num_weights[op] == 2) { - assert(my_weight_accessor[1].domain.get_volume() == out_dim); - bias_grad_ptr = my_weight_grad_accessor[1].get_float_ptr(); - } else { - assert(fused->op_num_weights[op] == 1); - } - LinearMeta *m = (LinearMeta *)metas->meta[op]; - Kernels::Linear::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_grad_accessor[0].get_float_ptr(), - bias_grad_ptr, - in_dim, - out_dim, - batch_size); - break; - } - case OP_BATCHMATMUL: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - Domain out_domain = 
my_output_accessor[0].domain; - Domain a_domain = my_input_accessor[0].domain; - Domain b_domain = my_input_accessor[1].domain; - // check dims - int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; - assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); - int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; - assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); - int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; - assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); - assert(a_domain.get_dim() == b_domain.get_dim()); - assert(a_domain.get_dim() == out_domain.get_dim()); - int batch = 1; - for (int i = 2; i < a_domain.get_dim(); i++) { - int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; - assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); - assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); - batch *= dim_size; - } - BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; - Kernels::BatchMatmul::backward_kernel_wrapper( - meta, - (float const *)my_output_accessor[0].get_float_ptr(), - (float const *)my_output_grad_accessor[0].get_float_ptr(), - (float const *)my_input_accessor[0].get_float_ptr(), - (float *)my_input_grad_accessor[0].get_float_ptr(), - (float const *)my_input_accessor[1].get_float_ptr(), - (float *)my_input_grad_accessor[1].get_float_ptr(), - (float *)nullptr, - m, - n, - k, - batch); - break; - } - case OP_EW_ADD: - case OP_EW_SUB: - case OP_EW_MUL: - case OP_EW_DIV: - case OP_EW_MAX: - case OP_EW_MIN: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == - // my_input_accessor[1].domain); assert(my_input_accessor[0].domain - // == my_output_accessor[0].domain); - ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; - Kernels::ElementBinary::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_accessor[0].get_float_ptr(), - my_input_accessor[1].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[1].get_float_ptr()); - break; - } - case OP_EMBEDDING: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 1); - EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; - assert(my_input_accessor[0].data_type == DT_INT64 || - my_input_accessor[0].data_type == DT_INT32); - int in_dim, out_dim, effective_batch_size; - if (m->aggr == AGGR_MODE_NONE) { - in_dim = 1; - out_dim = my_output_grad_accessor[0].domain.hi()[0] - - my_output_grad_accessor[0].domain.lo()[0] + 1; - effective_batch_size = - my_output_grad_accessor[0].domain.get_volume() / out_dim; - assert(effective_batch_size * in_dim == - my_input_accessor[0].domain.get_volume()); - } else { - in_dim = my_input_accessor[0].domain.hi()[0] - - my_input_accessor[0].domain.lo()[0] + 1; - out_dim = my_output_grad_accessor[0].domain.hi()[0] - - my_output_grad_accessor[0].domain.lo()[0] + 1; - effective_batch_size = - my_output_grad_accessor[0].domain.get_volume() / out_dim; - assert(effective_batch_size * in_dim == - my_input_accessor[0].domain.get_volume()); - } - Kernels::Embedding::backward_kernel_wrapper( - m, - my_input_accessor[0], - my_output_grad_accessor[0], - my_weight_grad_accessor[0], - in_dim, - out_dim, - effective_batch_size); - break; - } - case OP_GELU: - case OP_RELU: - case OP_SIGMOID: - case OP_TANH: - case OP_ELU: - case OP_SCALAR_ADD: - case OP_SCALAR_MULTIPLY: - case OP_SCALAR_SUB: - case OP_SCALAR_TRUE_DIV: { - 
assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - break; - } - case OP_POOL2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == - // my_output_accessor[0].domain); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::backward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr()); - break; - } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain.get_volume() == - my_output_grad_accessor[0].domain.get_volume()); - Kernels::Flat::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); - break; - } - case OP_RESHAPE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain.get_volume() == - my_output_grad_accessor[0].domain.get_volume()); - if (my_input_grad_accessor[0].data_type == DT_INT64) { - Kernels::Reshape::backward_kernel_wrapper( - my_input_grad_accessor[0].get_int64_ptr(), - my_output_grad_accessor[0].get_int64_ptr(), - my_input_grad_accessor[0].domain.get_volume()); - } else if (my_input_grad_accessor[0].data_type == DT_INT32) { - Kernels::Reshape::forward_kernel_wrapper( - my_input_grad_accessor[0].get_int32_ptr(), - my_output_grad_accessor[0].get_int32_ptr(), - my_input_grad_accessor[0].domain.get_volume()); - } else if (my_input_grad_accessor[0].data_type == DT_FLOAT) { - Kernels::Reshape::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); - } else { - assert(false); - } - break; - } - case OP_SOFTMAX: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (my_input_accessor[0].data_type == DT_FLOAT) { - Kernels::Softmax::backward_kernel_wrapper( - m, - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - } else { - assert(false); - } - break; - } - case OP_TRANSPOSE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain.get_volume() == - my_output_grad_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::backward_kernel_wrapper( - m, - 
my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain, - my_output_grad_accessor[0].domain); - break; - } - case OP_LAYERNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - 2 * (int)(m->elementwise_affine)); - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - beta = my_weight_accessor[1]; - } - LayerNorm::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - gamma.get_float_ptr(), - my_weight_grad_accessor[0].get_float_ptr(), - my_weight_grad_accessor[1].get_float_ptr()); - break; - } - case OP_CAST: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - CastMeta const *m = (CastMeta *)metas->meta[op]; - if (m->input_data_type == DT_INT32 && - m->output_data_type == DT_INT64) { - Kernels::Cast::backward_kernel_wrapper( - my_output_grad_accessor[0].get_int64_ptr(), - my_input_grad_accessor[0].get_int32_ptr(), - my_output_grad_accessor[0].domain.get_volume()); - } else if (m->input_data_type == DT_INT32 && - m->output_data_type == DT_FLOAT) { - Kernels::Cast::backward_kernel_wrapper( - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_int32_ptr(), - my_output_grad_accessor[0].domain.get_volume()); - } else { - assert(false); - } - default: - assert(false && "Fusion currently does not support type"); - } + LayerNorm::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + gamma.get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else { + assert(false); } - assert(ioff == 0); - assert(woff == 0); - assert(ooff == 0); - // for (int i = 0; i < fused->numWeights; i++) - // print_tensor(weight_grad_ptr[i], - // weight_grad_domain[i].get_volume(), - // "[Fused:backward:weight_grad]"); - // for (int i = 0; i < fused->numInputs; i++) - // print_tensor(input_grad_ptr[i], - // input_grad_domain[i].get_volume(), - // "[Fused:backward:input_grad]"); - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_grad_ptr[i], - // output_grad_domain[i].get_volume(), - // "[Fused:backward:output_grad]"); + break; } + default: + assert(false && "Fusion currently does not support type"); + } + } + assert(ioff == 0); + assert(woff == 0); + assert(ooff == 0); + // for (int i = 0; i < fused->numWeights; i++) + // print_tensor(weight_grad_ptr[i], + // weight_grad_domain[i].get_volume(), + // "[Fused:backward:weight_grad]"); + // for (int 
i = 0; i < fused->numInputs; i++) + // print_tensor(input_grad_ptr[i], + // input_grad_domain[i].get_volume(), + // "[Fused:backward:input_grad]"); + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_grad_ptr[i], + // output_grad_domain[i].get_volume(), + // "[Fused:backward:output_grad]"); +} - }; // namespace FlexFlow +}; // namespace FlexFlow diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 75cf06b18b..8ea2ebba9a 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -142,8 +142,8 @@ template void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *in_ptr, T *out_ptr, - T *gamma_ptr, - T *beta_ptr, + T const *gamma_ptr, + T const *beta_ptr, hipStream_t stream) { hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), m->effective_batch_size, From 17a1c4e6bb6ae499f76c2fc49092e25dc2813993 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 6 Oct 2023 12:47:53 -0400 Subject: [PATCH 51/52] more fix to hip --- src/ops/fused.cpp | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 226184f1a2..cb2b81fbcf 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -13,11 +13,11 @@ * limitations under the License. */ +#include "flexflow/ops/fused.h" #include "flexflow/model.h" #include "flexflow/ops/batch_norm.h" #include "flexflow/ops/element_unary.h" #include "flexflow/ops/embedding.h" -#include "flexflow/ops/fused.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" @@ -365,7 +365,11 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -424,10 +428,24 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (my_input_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int64_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int32_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type"); + } break; } case OP_TRANSPOSE: { From f65044dcd63c91e0729a3c53a9f49cc3f5871278 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 3 Nov 2023 04:59:11 -0400 Subject: [PATCH 52/52] customized kernel for broadcasting add. 
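
This patch adds a dedicated HIP kernel for element-wise add when the second
operand is broadcast: ElementBinaryMeta now carries batch_size and
replicate_size, and the OP_EW_ADD forward path with broadcast_input2 set
dispatches to elewise_binary_forward_kernel_broadcast2 over the full output
volume (batch_size * replicate_size) instead of going through the
miopenOpTensor path.

For reference, below is a minimal CPU sketch of the broadcasting-add pattern
this kernel targets, assuming the second operand holds a single
replicate_size-long slice that is reused for every one of the batch_size rows.
The function and variable names are illustrative only and do not exist in the
FlexFlow sources.

    // Illustrative CPU reference for broadcasting add; not the HIP kernel itself.
    // out[b, r] = in1[b, r] + in2[r], with in2 reused for every batch index b.
    #include <cassert>
    #include <cstddef>
    #include <vector>

    void broadcast_add_reference(std::vector<float> const &in1,
                                 std::vector<float> const &in2,
                                 std::vector<float> &out,
                                 std::size_t batch_size,
                                 std::size_t replicate_size) {
      assert(in1.size() == batch_size * replicate_size);
      assert(in2.size() == replicate_size);
      out.resize(batch_size * replicate_size);
      for (std::size_t i = 0; i < batch_size * replicate_size; ++i) {
        out[i] = in1[i] + in2[i % replicate_size];
      }
    }

    int main() {
      std::vector<float> a = {1, 2, 3, 4, 5, 6}; // batch_size = 2, replicate_size = 3
      std::vector<float> b = {10, 20, 30};       // broadcast across both batches
      std::vector<float> c;
      broadcast_add_reference(a, b, c, 2, 3);
      // c is now {11, 22, 33, 14, 25, 36}
      return 0;
    }
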
--- include/flexflow/ops/element_binary.h | 1 + .../ops/kernels/element_binary_kernels.h | 2 ++ src/ops/element_binary.cc | 9 ++++++ src/ops/kernels/element_binary_kernels.cpp | 32 ++++++++++++++++--- 4 files changed, 40 insertions(+), 4 deletions(-) diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index cfacec50f7..f83ce20609 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -58,6 +58,7 @@ class ElementBinary : public Op { public: bool inplace_a, has_same_operands; bool broadcast_input1, broadcast_input2; + int batch_size; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/element_binary_kernels.h b/include/flexflow/ops/kernels/element_binary_kernels.h index 529859195e..50c7f2b80c 100644 --- a/include/flexflow/ops/kernels/element_binary_kernels.h +++ b/include/flexflow/ops/kernels/element_binary_kernels.h @@ -22,6 +22,8 @@ class ElementBinaryMeta : public OpMeta { OperatorType op_type; bool inplace_a, has_same_operands; bool broadcast_input1, broadcast_input2; + int batch_size; + size_t replicate_size; char op_name[MAX_OPNAME]; }; diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 0e0d8e7c31..12895cfd98 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -211,6 +211,9 @@ ElementBinary::ElementBinary(FFModel &model, numdim, dims, in1->data_type, this); broadcast_input1 = (inputs[0]->get_volume() != outputs[0]->get_volume()); broadcast_input2 = (inputs[1]->get_volume() != outputs[0]->get_volume()); + + batch_size = dims[numdim - 2].size; + } ElementBinary::ElementBinary( @@ -337,6 +340,8 @@ OpMeta *ElementBinary::init_task(Task const *task, m->has_same_operands = eb->has_same_operands; m->broadcast_input1 = eb->broadcast_input1; m->broadcast_input2 = eb->broadcast_input2; + m->batch_size = eb->batch_size; + std::strcpy(m->op_name, eb->name); Domain input1_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); @@ -368,6 +373,10 @@ OpMeta *ElementBinary::init_task(Task const *task, } else { output_domain = input1_domain; } + m->replicate_size = m->broadcast_input1 + ? 
(input1_domain.get_volume() / m->batch_size) + : (input2_domain.get_volume() / m->batch_size); + assert(task->regions.size() == regions.size()); assert(regions.size() == num_regions); init_kernel(m, input1_domain, input2_domain, output_domain); diff --git a/src/ops/kernels/element_binary_kernels.cpp b/src/ops/kernels/element_binary_kernels.cpp index 4cdc839b59..325edba6d0 100644 --- a/src/ops/kernels/element_binary_kernels.cpp +++ b/src/ops/kernels/element_binary_kernels.cpp @@ -72,15 +72,12 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, float *out_ptr) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - hipEvent_t t_start, t_end; if (m->profiling) { hipEventCreate(&t_start); hipEventCreate(&t_end); hipEventRecord(t_start, stream); } - // print_tensor(in1_ptr, in1_domain.get_volume(), "input1:"); - // print_tensor(in2_ptr, in2_domain.get_volume(), "input2:"); Internal::forward_kernel(m, in1_ptr, in2_ptr, out_ptr, stream); // print_tensor(out_ptr, in1_domain.get_volume(), "output:"); if (m->profiling) { @@ -199,6 +196,21 @@ __global__ void elewise_binary_forward_kernel(coord_t volume, } } +// for simplicity, assume the replicate dimension is the batchsize +__global__ void + elewise_binary_forward_kernel_broadcast2(float const *in1_ptr, + float const *in2_ptr, + float *output_ptr, + size_t volume, + size_t batch_size, + size_t replicate_size) { + CUDA_KERNEL_LOOP(i, volume) { + size_t batch = i / replicate_size; + output_ptr[i] = + in1_ptr[i] + in2_ptr[batch * replicate_size + i % replicate_size]; + } +} + __global__ void elewise_binary_backward_kernel(coord_t volume, float const alpha, float const beta, @@ -245,7 +257,6 @@ void forward_kernel(ElementBinaryMeta const *m, hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - float alpha1 = 1.0f, alpha2 = 1.0f, beta = 0.0f; switch (m->op_type) { case OP_EW_SUB: @@ -284,6 +295,19 @@ void forward_kernel(ElementBinaryMeta const *m, &alpha1, m->outputTensor, out_ptr)); + } else if (m->op_type == OP_EW_ADD && m->broadcast_input2) { + int parallelism = m->batch_size * m->replicate_size; + hipLaunchKernelGGL(elewise_binary_forward_kernel_broadcast2, + GET_BLOCKS(parallelism), + CUDA_NUM_THREADS, + 0, + stream, + in1_ptr, + in2_ptr, + out_ptr, + m->batch_size * m->replicate_size, + m->batch_size, + m->replicate_size); } else { checkCUDNN(miopenOpTensor(m->handle.dnn, m->opDesc,