diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ce4f7044..377ede0d0d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -168,6 +168,17 @@ if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") include(cudnn) endif() +# NCCL +if(FF_USE_NCCL) + if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda") + include(nccl) + endif() + list(APPEND FF_CC_FLAGS + -DFF_USE_NCCL) + list(APPEND FF_NVCC_FLAGS + -DFF_USE_NCCL) +endif() + # Legion include(legion) @@ -383,19 +394,78 @@ if(NOT BUILD_LEGION_ONLY) add_dependencies(flexflow ${NCCL_NAME}) endif() - target_include_directories(flexflow PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) - # LEGION_URL is defined if we found a precompiled Legion library to download - if(LEGION_URL) - # Legion builds produce two library files: one for the Legion runtime and one for the Realm runtime. - # When linking FlexFlow to a precompiled version of Legion, we need to manually link to both library files. - target_link_libraries(flexflow ${LEGION_LIBRARY} ${REALM_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional) - add_dependencies(flexflow ${LEGION_NAME}) - else() - # When building Legion from source, we do so by calling add_subdirectory(), and obtain a library with both the - # Legion and Realm runtimes. The library's name is saved into the LEGION_LIBRARY variable. Hence, we only need - # to link FlexFlow to ${LEGION_LIBRARY} - target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional) + list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/hip ${ROCM_PATH}) + + find_package(hip REQUIRED) + + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + # The targets defined by the hip cmake config only target amd devices. + # For targeting nvidia devices, we'll make our own interface target, + # hip_device_nvidia, that includes the rocm and hip headers. + add_library(hip_device_nvidia INTERFACE) + + if (NOT FF_CUDA_ARCH STREQUAL "") + target_compile_options(hip_device_nvidia INTERFACE -arch=compute_${FF_CUDA_ARCH}) + endif() + + target_include_directories(hip_device_nvidia SYSTEM INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include) + target_include_directories(hip_device_nvidia INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include) + + add_compile_definitions(FF_USE_HIP_CUDA) + + # Linking cuda: + # We do not explicitly link cuda. hipcc when targeting nvidia will + # use nvcc under the hood. nvcc when used for linking will handle + # linking cuda dependencies + target_link_libraries(flexflow hip_device_nvidia) + elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + find_package(hipblas REQUIRED) + find_package(miopen REQUIRED) + if(FF_USE_NCCL) + find_package(rccl REQUIRED) + endif() + # find_package(rocrand REQUIRED) + find_library(HIP_RAND_LIBRARY hiprand REQUIRED) + + add_compile_definitions(FF_USE_HIP_ROCM) + # The hip cmake config module defines three targets, + # hip::amdhip64, hip::host, and hip::device. + # + # hip::host and hip::device are interface targets. hip::amdhip64 is an + # imported target for libamdhip. + # + # You do not directly link to hip::amdhip64. hip::host links to hip::amdhip64 + # and hip::device links to hip::host. Link to hip::host to just use hip without + # compiling any GPU code. Link to hip::device to compile the GPU device code. 
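For reference, the collective-library wiring these CMake changes set up can be summarized as follows (an illustrative Python sketch, not build code; the helper name is hypothetical): NCCL is pulled in for the cuda and hip_cuda backends, rccl for hip_rocm, and -DFF_USE_NCCL is defined whenever FF_USE_NCCL is on.

    def collective_backend(gpu_backend: str, use_nccl: bool):
        # Mirrors the CMake logic in this patch: which collectives library gets linked.
        if not use_nccl:
            return None                      # no -DFF_USE_NCCL, nothing extra linked
        if gpu_backend in ("cuda", "hip_cuda"):
            return "nccl"                    # include(nccl), add_dependencies(flexflow ${NCCL_NAME})
        if gpu_backend == "hip_rocm":
            return "rccl"                    # find_package(rccl REQUIRED), target_link_libraries(flexflow rccl)
        raise ValueError(f"unsupported FF_GPU_BACKEND: {gpu_backend}")  # cmake FATAL_ERROR case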
+ # + # Docs (outdated): + # https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html + target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY}) + if(FF_USE_NCCL) + target_link_libraries(flexflow rccl) + endif() endif() +else() + message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}") +endif() + +if(FF_USE_NCCL AND (FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda")) + add_dependencies(flexflow ${NCCL_NAME}) +endif() + +target_include_directories(flexflow PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) +# LEGION_URL is defined if we found a precompiled Legion library to download +if(LEGION_URL) + # Legion builds produce two library files: one for the Legion runtime and one for the Realm runtime. + # When linking FlexFlow to a precompiled version of Legion, we need to manually link to both library files. + target_link_libraries(flexflow ${LEGION_LIBRARY} ${REALM_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional) + add_dependencies(flexflow ${LEGION_NAME}) +else() + # When building Legion from source, we do so by calling add_subdirectory(), and obtain a library with both the + # Legion and Realm runtimes. The library's name is saved into the LEGION_LIBRARY variable. Hence, we only need + # to link FlexFlow to ${LEGION_LIBRARY} + target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional) +endif() #library api version, bump from time to time set(SOVERSION 1) diff --git a/cmake/json.cmake b/cmake/json.cmake index 63ac50b203..3cf57a7864 100644 --- a/cmake/json.cmake +++ b/cmake/json.cmake @@ -1,4 +1 @@ -include(FetchContent) - -FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz) -FetchContent_MakeAvailable(json) \ No newline at end of file +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/json) diff --git a/config/config.inc b/config/config.inc index 7d7b2db9cf..35709971ad 100644 --- a/config/config.inc +++ b/config/config.inc @@ -118,6 +118,11 @@ if [ "$FF_LEGION_NETWORKS" = "gasnet" ]; then SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=mpi" elif [ "$FF_GASNET_CONDUIT" = "udp" ]; then SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=udp" + elif [ "$FF_GASNET_CONDUIT" = "ucx" ]; then + SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ucx" + SET_LEGION_NETWORKS+=" -DFF_UCX_URL=$FF_UCX_URL" + elif [ "$FF_GASNET_CONDUIT" = "ofi" ]; then + SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ofi" fi elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then SET_LEGION_NETWORKS+=" -DFF_LEGION_NETWORKS=ucx" @@ -235,7 +240,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then SET_CXX="-DCMAKE_CXX_COMPILER=$(pwd)/nvidia_hipcc -DCMAKE_CXX_LINKER=$(pwd)/nvidia_hipcc" else ADD_ROCM_TO_PATH="PATH=${PATH}:${ROCM_PATH}/bin" - #SET_CXX="-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_CXX_LINKER=/opt/rocm/bin/hipcc" + SET_CXX="-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc -DCMAKE_CXX_LINKER=$ROCM_PATH/bin/hipcc -DHIP_PATH=$ROCM_PATH/hip -DCMAKE_CXX_FLAGS='-I${MPICH_DIR}/include' -DCMAKE_EXE_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi' -DCMAKE_SHARED_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi'" fi fi fi diff --git a/config/config.linux b/config/config.linux index acffc210f5..9a00d3b69f 100755 --- a/config/config.linux +++ b/config/config.linux @@ -59,7 +59,7 @@ FF_USE_PYTHON=${FF_USE_PYTHON:-ON} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-} # select GASNET conduit -FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} 
+FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ofi} # set UCX dir if Legion networks is set to ucx UCX_DIR=${UCX_DIR:-""} @@ -99,11 +99,9 @@ FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid." exit 1 -elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm" ]]; then +elif [["$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm"]]; then # enable NCCL FF_USE_NCCL=${FF_USE_NCCL:-ON} -else - FF_USE_NCCL=OFF fi function get_build_configs() { diff --git a/deps/legion b/deps/legion index 24e8c45234..626b55689c 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835 +Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index 41b84a269e..c2868e9d1e 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -3,16 +3,17 @@ import sys import numpy as np +import torch from flexflow.core import * from flexflow.torch.model import PyTorchModel -from transformers import MT5ForConditionalGeneration, T5Tokenizer - +#from transformers import MT5ForConditionalGeneration, T5Tokenizer +from transformers import BertForMaskedLM, BertTokenizer sys.path.append("./examples/python/pytorch/mt5") from mt5_torch import DataPreparer, get_dataloaders, set_seed BASE_DIR = "examples/python/pytorch/mt5" DATA_DIR = os.path.join(BASE_DIR, "data") -NUMPY_DIR = os.path.join(DATA_DIR, "numpy") +NUMPY_DIR = os.path.join(DATA_DIR, "numpy_candle") def data_to_numpy() -> None: @@ -28,7 +29,8 @@ def data_to_numpy() -> None: """ model_params = { "SEED": 42, - "MODEL": "google/mt5-small", + #"MODEL": "google/mt5-small", + "MODEL": "bert-base-uncased", "TRAIN_BATCH_SIZE": None, # use the full dataset as one batch "EVAL_BATCH_SIZE": None, # use the full dataset as one batch "TRAIN_EPOCHS": 1, # unused @@ -36,7 +38,8 @@ def data_to_numpy() -> None: "MAX_TARGET_TEXT_LENGTH": 48, } set_seed(model_params) - tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"]) + #tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"]) + tokenizer = BertTokenizer.from_pretrained(model_params["MODEL"]) print("Getting dataloaders...") train_loader, eval_loader = get_dataloaders(tokenizer, model_params) assert len(train_loader) == 1 @@ -61,8 +64,8 @@ def preprocess_train() -> None: y_shape = y.shape assert len(y.shape) == 2, \ "`y` should have shape (num examples, sequence length)" - y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) - lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long) + y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) + lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32) y_ids[:, :] = y[:, :-1] lm_labels[:, :] = y[:, 1:] @@ -81,28 +84,42 @@ def preprocess_train() -> None: def top_level_task(): ffconfig = FFConfig() ffmodel = FFModel(ffconfig) - model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") - + #model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") + model = BertForMaskedLM.from_pretrained("bert-base-uncased") + #model = BertModel.from_pretrained("bert-base-uncased") # Load train data as numpy arrays print("Loading data...") - ids = np.load(os.path.join(NUMPY_DIR, "train_source_ids.npy")) - mask = np.load(os.path.join(NUMPY_DIR, 
"train_source_mask.npy")) - y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) - lm_labels = np.load(os.path.join(NUMPY_DIR, "train_lm_labels.npy")) + ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")).astype('int32') + ids = np.pad(ids, ((0,0), (0,17)), 'constant') + #ids = np.random.randint(0, 5, (1000, 512)) + #print('ids_shape', ids.shape) + #print('ids', ids) + mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")).astype('int32') + mask = np.pad(mask, ((0,0), (0,17)), 'constant') + #mask = np.random.randint(0, 2, (1000, 512)) + #y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy")) + lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")).astype('int32') + lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant') + #lm_labels = np.random.randint(-1, 5, (1000, 512)) + position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() + token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy() + batch_size = ffconfig.batch_size input_ids_shape = (batch_size, ids.shape[1]) attention_mask_shape = (batch_size, mask.shape[1]) - decoder_input_ids_shape = (batch_size, y_ids.shape[1]) + #decoder_input_ids_shape = (batch_size, y_ids.shape[1]) input_tensors = [ - ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids - ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask - ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids + ffmodel.create_tensor(input_ids_shape, DataType.DT_INT32), # input_ids + ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT32), # attention_mask + #ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids ] encoder_seq_length = ids.shape[1] - decoder_seq_length = y_ids.shape[1] - seq_length = (encoder_seq_length, decoder_seq_length) - input_names = ["input_ids", "attention_mask", "decoder_input_ids"] + #decoder_seq_length = y_ids.shape[1] + #seq_length = (encoder_seq_length, decoder_seq_length) + seq_length = encoder_seq_length + #input_names = ["input_ids", "attention_mask", "decoder_input_ids"] + input_names = ["input_ids", "attention_mask"] print("Tracing the model...") hf_model = PyTorchModel( @@ -110,7 +127,10 @@ def top_level_task(): batch_size=batch_size, seq_length=seq_length, ) output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True) - ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) + #from flexflow.torch.model import file_to_ff + #file_to_ff("mt5.ff", ffmodel, input_tensors) + ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, beta1=0.9, beta2=0.98, weight_decay=0.0, epsilon=2e-8) + # ffoptimizer = SGDOptimizer(ffmodel, lr=0.01) print("Compiling the model...") ffmodel.compile( @@ -121,13 +141,21 @@ def top_level_task(): MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, ], ) + + # load weights here + ffmodel.load_bert_pretrained(checkpoint=model) print("Creating data loaders...") + print('id_dtype', ids.dtype) + print('mask_dtype', mask.dtype) + print('labels_dtype', lm_labels.dtype) input_ids_dl = ffmodel.create_data_loader(input_tensors[0], ids) attention_mask_dl = ffmodel.create_data_loader(input_tensors[1], mask) - decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) + #decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids) # NOTE: We cast down the label tensor data to 32-bit to accommodate the # label tensor's required dtype + token_type_ids_dl = ffmodel.create_data_loader(input_tensors[2], 
token_type_ids) + position_id_dl = ffmodel.create_data_loader(input_tensors[3], position_id) labels_dl = ffmodel.create_data_loader( ffmodel.label_tensor, lm_labels.astype("int32") ) @@ -138,31 +166,32 @@ def top_level_task(): print("Training...") epochs = ffconfig.epochs ffmodel.fit( - x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], + #x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl], + x=[input_ids_dl, attention_mask_dl, position_id_dl, token_type_ids_dl], y=labels_dl, batch_size=batch_size, epochs=epochs, ) if __name__ == "__main__": - # Generate the .tsv files if needed - if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \ - not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")): - DataPreparer.data_to_tsv() - # Convert the .tsv files to .npy if needed - if not os.path.exists(NUMPY_DIR): - os.mkdir(NUMPY_DIR) - prefixes = ["train_", "eval_"] - suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"] - npy_filenames = [ - pre + suf for pre, suf in itertools.product(prefixes, suffixes) - ] - if any( - not os.path.exists(os.path.join(NUMPY_DIR, filename)) - for filename in npy_filenames - ): - data_to_numpy() - # Preprocess the training data if needed - if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \ - not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")): - preprocess_train() + ## Generate the .tsv files if needed + #if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \ + # not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")): + # DataPreparer.data_to_tsv() + ## Convert the .tsv files to .npy if needed + #if not os.path.exists(NUMPY_DIR): + # os.mkdir(NUMPY_DIR) + #prefixes = ["train_", "eval_"] + #suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"] + #npy_filenames = [ + # pre + suf for pre, suf in itertools.product(prefixes, suffixes) + #] + #if any( + # not os.path.exists(os.path.join(NUMPY_DIR, filename)) + # for filename in npy_filenames + #): + # data_to_numpy() + ## Preprocess the training data if needed + #if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \ + # not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")): + # preprocess_train() top_level_task() diff --git a/examples/python/pytorch/mt5/mt5_torch.py b/examples/python/pytorch/mt5/mt5_torch.py index 78886eed6c..4d741c44a5 100644 --- a/examples/python/pytorch/mt5/mt5_torch.py +++ b/examples/python/pytorch/mt5/mt5_torch.py @@ -7,7 +7,7 @@ import os import numpy as np -import pandas as pd +#import pandas as pd import torch from torch.utils.data import DataLoader, Dataset from transformers import MT5ForConditionalGeneration, T5Tokenizer @@ -311,5 +311,5 @@ def TorchMT5Trainer( "MAX_TARGET_TEXT_LENGTH": 48, "LEARNING_RATE": 1e-4, } - device = torch.device(0) + device = torch.device('cpu') TorchMT5Trainer(model_params, device) diff --git a/gdb/pretty_print.py b/gdb/pretty_print.py index 4cccc9b76b..e6fbe298ce 100644 --- a/gdb/pretty_print.py +++ b/gdb/pretty_print.py @@ -61,7 +61,11 @@ def to_string(self): size = dim['size'] degree = dim['degree'] parallel_idx = dim['parallel_idx'] - toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]') + if dim['is_replica_dim']: + is_replica = 'r=t' + else: + is_replica = 'r=f' + toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx} {is_replica}]') return f'TensorShape<{" ".join(toks)}>' class ParallelTensorBasePrinter: @@ -77,9 +81,31 @@ def to_string(self): size = dim['size'] degree = dim['degree'] parallel_idx = dim['parallel_idx'] - 
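For reference, the input preparation performed by the mt5_ff.py changes above (casting to int32, right-padding the sequence dimension, and synthesizing the position and token-type inputs that BertForMaskedLM normally builds internally) can be sketched as a small helper. This is illustrative only; build_bert_inputs and the pad_to parameter are not part of the patch, which hard-codes the pad width.

    import numpy as np
    import torch

    def build_bert_inputs(ids, mask, labels, pad_to=512):
        # FlexFlow's data loaders and label tensor expect 32-bit integers.
        pad = pad_to - ids.shape[1]
        ids = np.pad(ids.astype(np.int32), ((0, 0), (0, pad)), "constant")
        mask = np.pad(mask.astype(np.int32), ((0, 0), (0, pad)), "constant")
        labels = np.pad(labels.astype(np.int32), ((0, 0), (0, pad)), "constant")
        seq_len = ids.shape[1]
        # Extra inputs the traced BERT graph expects as explicit tensors.
        position_ids = torch.arange(seq_len, dtype=torch.int32).expand((1, -1)).numpy()
        token_type_ids = torch.zeros(seq_len, dtype=torch.int32).expand((1, -1)).numpy()
        return ids, mask, labels, position_ids, token_type_ids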
toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]') + tok = f'{i}=[s={size} d={degree} pi={parallel_idx} ' + if dim['is_replica_dim']: + tok += 'r=t' + else: + tok += 'r=f' + tok += ']' + toks.append(tok) return f'ParallelTensorBase<{" ".join(toks)}>' +class ParallelDimPrinter: + def __init__(self, val): + self.val = val + + def to_string(self): + size = self.val['size'] + degree = self.val['degree'] + parallel_idx = self.val['parallel_idx'] + tok = f's={size} d={degree} pi={parallel_idx} ' + if dim['is_replica_dim']: + tok += 'r=t' + else: + tok += 'r=f' + return f'ParallelDim<{tok}>' + + def build_pretty_printer(): pp = gdb.printing.RegexpCollectionPrettyPrinter( "flexflow") @@ -89,6 +115,7 @@ def build_pretty_printer(): pp.add_printer('Domain', '^Legion::Domain$', DomainPrinter) pp.add_printer('ParallelTensorShape', '^FlexFlow::ParallelTensorShape$', TensorShapePrinter) pp.add_printer('ParallelTensorBase', '^FlexFlow::ParallelTensorBase$', ParallelTensorBasePrinter) + pp.add_printer('ParallelDim', '^FlexFlow::ParallelDim$', ParallelDimPrinter) return pp gdb.printing.register_pretty_printer( diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 0b74b7fce4..38cdfc9688 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -337,6 +337,7 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle, const flexflow_tensor_t input, int dim, + bool last_layer, char const *name); flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle, diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h index 2e0cf1ca4b..2c92eeeb31 100644 --- a/include/flexflow/graph.h +++ b/include/flexflow/graph.h @@ -91,7 +91,7 @@ struct NodeCompare { struct GraphOptimalViewSerialized { #ifdef LEGION_MAX_RETURN_SIZE - static const size_t buffer_size = LEGION_MAX_RETURN_SIZE - 8; + static const size_t buffer_size = 4 * LEGION_MAX_RETURN_SIZE - 8; #else static const size_t buffer_size = 1024 * 1024 - 8; #endif @@ -332,6 +332,8 @@ class Graph { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static GraphOptimalViewSerialized + graph_optimize_wrapper(FFModel * model); Node find_bottleneck_node(Node const &sink_node, Node const &source_node) const; void print_strategy_computation_graph( diff --git a/include/flexflow/initializer.h b/include/flexflow/initializer.h index 062530a655..3c44d1184a 100644 --- a/include/flexflow/initializer.h +++ b/include/flexflow/initializer.h @@ -46,7 +46,7 @@ class GlorotUniform : public Initializer { class Op; struct ZeroInitMeta { - static int const MAX_NUM_REGIONS = 64; + static int const MAX_NUM_REGIONS = 128; int num_regions; Op *op_ptr; DataType data_types[MAX_NUM_REGIONS]; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 95be9ab581..b7f4b37795 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -657,6 +657,7 @@ class FFModel { Tensor softmax(const Tensor input, int dim = -1, DataType data_type = DT_NONE, + bool last_layer = false, char const *name = NULL); // Create input tensors and constants Tensor transpose(const Tensor input, diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index 08747bb9a4..4df7ea7bd8 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -78,6 +78,7 @@ class ElementBinary : public Op { public: bool inplace_a, has_same_operands; bool broadcast_input1, 
broadcast_input2; + int batch_size; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/element_binary_kernels.h b/include/flexflow/ops/kernels/element_binary_kernels.h index 5a375fb661..5740fde660 100644 --- a/include/flexflow/ops/kernels/element_binary_kernels.h +++ b/include/flexflow/ops/kernels/element_binary_kernels.h @@ -23,6 +23,8 @@ class ElementBinaryMeta : public OpMeta { OperatorType op_type; bool inplace_a, has_same_operands; bool broadcast_input1, broadcast_input2; + int batch_size; + size_t replicate_size; }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 8cfaf3c586..21b546c360 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -24,6 +24,7 @@ class SoftmaxMeta : public OpMeta { bool inference_debugging; int dim; DataType input_type, output_type; + bool last_layer; }; namespace Kernels { @@ -36,6 +37,7 @@ template void backward_kernel_wrapper(SoftmaxMeta const *m, DT *input_grad_ptr, DT const *output_grad_ptr, + DT const *output_ptr, size_t num_elements); namespace Internal { @@ -46,8 +48,10 @@ void forward_kernel(SoftmaxMeta const *m, ffStream_t stream); template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, + DT const *output_ptr, size_t num_elements, ffStream_t stream); } // namespace Internal diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 61094f7361..b145d7ca37 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -17,6 +17,7 @@ class Softmax : public Op { LayerID const &_layer_guid, const ParallelTensor logit, int dim, + bool _last_layer, char const *name); Softmax(FFModel &model, Params const ¶ms, @@ -84,6 +85,7 @@ class Softmax : public Op { public: int dim; + bool last_layer; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/softmax_params.h b/include/flexflow/ops/softmax_params.h index 63dc87641f..a6ad3a2dbd 100644 --- a/include/flexflow/ops/softmax_params.h +++ b/include/flexflow/ops/softmax_params.h @@ -9,6 +9,7 @@ struct SoftmaxParams { LayerID layer_guid; int dim; char name[MAX_OPNAME]; + bool last_layer; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(SoftmaxParams const &, SoftmaxParams const &); diff --git a/include/flexflow/ops/split.h b/include/flexflow/ops/split.h index cb9c6bdb57..95c569738d 100644 --- a/include/flexflow/ops/split.h +++ b/include/flexflow/ops/split.h @@ -59,6 +59,8 @@ class Split : public Op { Params get_params() const; + tl::optional as_dot() const override; + public: int legion_axis; std::vector splits; diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index 65d69d8564..719d99dd8d 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -63,6 +63,20 @@ class Replicate : public ParallelOp { GenericTensorAccessorW const &output, size_t num_elements, size_t num_replicas); + template + static void + forward_task_with_type(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + + template + static void backward_task_with_type( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff 
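The softmax header changes above thread a last_layer flag and the forward output into the backward path. The kernels themselves are not shown in this diff; the intended semantics can be sketched in NumPy as follows (an assumption, not the actual implementation): when softmax directly feeds the sparse cross-entropy loss, the loss backward already produced d(loss)/d(logits), so the input gradient is a plain copy; otherwise the usual softmax Jacobian-vector product is applied, which needs the forward output.

    import numpy as np

    def softmax_backward(output, output_grad, last_layer):
        # output: softmax forward result, output_grad: upstream gradient.
        if last_layer:
            # Loss kernel already wrote dL/dlogits into the output gradient.
            return output_grad.copy()
        # General case: dL/dx_i = s_i * (g_i - sum_j g_j * s_j) along the class axis.
        dot = np.sum(output_grad * output, axis=-1, keepdims=True)
        return output * (output_grad - dot)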
--git a/include/flexflow/parallel_tensor.h b/include/flexflow/parallel_tensor.h index d06ecd7bac..a04c1afe86 100644 --- a/include/flexflow/parallel_tensor.h +++ b/include/flexflow/parallel_tensor.h @@ -101,6 +101,7 @@ struct ParallelTensorShape { RecordFormatter as_dot() const; size_t get_piece_size() const; + size_t get_piece_num_elements() const; bool is_valid() const; int get_num_replica_dims() const; diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 5d3c831d4f..af00afa206 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -23,7 +23,7 @@ do { \ std::stringstream _error; \ if (status != miopenStatusSuccess) { \ - _error << "CUDNN failure: " << status; \ + _error << "CUDNN failure: " << miopenGetErrorString(status); \ FatalError(_error.str()); \ } \ } while (0) @@ -155,6 +155,10 @@ miopenStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, Legion::Domain domain); +miopenStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, + Legion::Domain domain); + hipblasDatatype_t ff_to_cuda_datatype(DataType type); miopenDataType_t ff_to_cudnn_datatype(DataType type); diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index df4042748f..5d4f892ccc 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -1,4 +1,4 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# Copyright 2020 Stanford University, Los Alamos National Laboratory # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,6 @@ from collections import OrderedDict from enum import Enum from typing import List -import copy import numpy as np from flexflow.core.flexflow_cffi import Tensor, NormInitializer @@ -26,6 +25,7 @@ try: import torch + print(torch.__version__) from torch.fx.immutable_collections import immutable_dict except: pass @@ -653,14 +653,26 @@ def string_to_ff(string, ffmodel, node_to_output): data = Node.StringData(string) name = data.name input_tensor = node_to_output[data.innodes[0]] - return ffmodel.identity(input=input_tensor, name=name) - # TODO: Change to ffmodel.layernorm() once supported + axes = [len(input_tensor.dims) - 1] + return ffmodel.layer_norm( + input=input_tensor, + axes=axes, + elementwise_affine=True, + eps=1e-6, + name=name, + ) def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] - return ffmodel.identity(input=input_tensor, name=self.name) - # TODO: Change to ffmodel.layernorm() once supported - + axes = [0] + eps = self.module.eps + return ffmodel.layer_norm( + input=input_tensor, + axes=axes, + elementwise_affine=True, + eps=eps, + name=self.name, + ) class T5LayerNormNode(Node): """ @@ -931,7 +943,7 @@ def construct_node(node): elif name.find("contiguous") >= 0: return ContiguousNode(node) elif name.find("tanh") >= 0: return TanhFNode(node) elif name.find("gelu") >= 0: return GeluFNode(node) - assert 0, f"Unknown function or method: {name}" + assert 0, f"Unknown function or method: {name} {node}" @staticmethod def is_right_scalar_op(node): @@ -1186,16 +1198,24 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def 
to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) - return ffmodel.scalar_sub( - input=input_tensor, scalar=scalar, name=self.name, - ) - + if self.scalar_pos == FunctionNode.ScalarPosition.RIGHT: + return ffmodel.scalar_sub( + input=input_tensor, scalar=scalar, inplace=False, name=self.name, + ) + else: + negative_input = ffmodel.scalar_multiply( + input=input_tensor, scalar=-1, inplace=False, name=self.name + '_negative', + ) + return ffmodel.scalar_sub( + input=negative_input, scalar=-scalar, inplace=False, name=self.name, + ) + class ScalarTrueDivNode(FunctionNode): def __init__(self, node): @@ -1220,15 +1240,16 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor = node_to_output[self.innodes[0].name] scalar = self.innodes[1] assert type(scalar) is float + return ffmodel.scalar_true_divide( - input=input_tensor, scalar=scalar, name=self.name, + input=input_tensor, scalar=scalar, inplace=False, name=self.name, ) @@ -1409,6 +1430,10 @@ def to_ff(self, ffmodel, node_to_output): @staticmethod def slice_tensor(ffmodel, tensor, slices, name): + + print('slices', slices) + old_shape = tensor.dims + print('old_shape', tensor.dims) """Returns a reshaped tensor based on the given slices.""" def is_colon(slice_elem): """Returns if the slice is equivalent to `:`.""" @@ -1424,11 +1449,20 @@ def is_truncate(slice_elem, old_size): stop = old_size if slice_elem.stop == None else slice_elem.stop new_size = stop - start return new_size < old_size - + def is_single_element(slice_elem): return isinstance(slice_elem, int) + def is_exact(slice_elem, old_size): + if slice_elem is None: + return False + start = 0 if slice_elem.start == None else slice_elem.start + stop = old_size if slice_elem.stop == None else slice_elem.stop + new_size = stop - start + return new_size == old_size + shape = tensor.dims + print('input dims', tensor.dims) # Fewer slices than input dimensions diff = len(shape) - len(slices) @@ -1441,12 +1475,18 @@ def is_single_element(slice_elem): # Match dimensions from right to left new_shape = [] # append then reverse j = len(shape) - 1 + import copy curr_tensor = copy.copy(tensor) for slice_elem in reversed(slices): - if is_colon(slice_elem): + print('slice_elem', slice_elem) + if is_colon(slice_elem) or is_exact(slice_elem, shape[j]): + print('shape', shape) assert j >= 0 + print('j', j) + print('new_shape_bef', new_shape) new_shape.append(shape[j]) + print('new_shape_aft', new_shape) j -= 1 elif is_unsqueeze(slice_elem): new_shape.append(1) @@ -1456,6 +1496,8 @@ def is_single_element(slice_elem): curr_tensor = ffmodel.split(input=curr_tensor, sizes=splits, axis=j, name=name)[0] new_shape.append(1) j -= 1 + elif is_exact(slice_elem, shape[j]): + print('exact') elif is_truncate(slice_elem, shape[j]): assert j >= 0 start = 0 if slice_elem.start == None else slice_elem.start @@ -1481,8 +1523,45 @@ def is_single_element(slice_elem): assert 0, f"Unsupported slice element: {slice_elem}" new_shape.reverse() - return ffmodel.reshape(input=curr_tensor, shape=new_shape, name=name,) - + if len(new_shape) == 0: + return curr_tensor + else: + print('new_shape', new_shape) + if old_shape == new_shape: + return curr_tensor + return 
ffmodel.reshape(input=curr_tensor, shape=new_shape, name=name,) + + + +# """Returns a reshaped tensor based on the given slices.""" +# def is_colon(slice_elem): +# """Returns if the slice is equivalent to `:`.""" +# return slice_elem == slice(None, None, None) +# +# def is_unsqueeze(slice_elem): +# """Returns if the slice is equivalent to unsqueezing that +# dimension.""" +# return slice_elem is None +# shape = tensor.dims +# # Match dimensions from right to left +# new_shape = [] # append then reverse +# j = len(shape) - 1 +# for slice_elem in reversed(slices): +# if is_colon(slice_elem): +# assert j >= 0 +# new_shape.append(shape[j]) +# j -= 1 +# elif is_unsqueeze(slice_elem): +# new_shape.append(1) +# else: +# assert 0, f"Unsupported slice element: {slice_elem}" +# new_shape.reverse() +# return ffmodel.reshape( +# input=tensor, shape=new_shape, name=name, +# ) + + + @staticmethod def strings_to_slices(strings: List[str]): # Extract slice elements @@ -1583,14 +1662,14 @@ def string_to_ff(string, ffmodel, node_to_output): input_tensor = node_to_output[data.innodes[0]] scalar = float(data.items[4]) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, name=name, + input=input_tensor, scalar=scalar, inplace=False, name=name, ) def to_ff(self, ffmodel, node_to_output): input_tensor, scalar = \ FunctionNode.parse_scalar_op(self, node_to_output) return ffmodel.scalar_multiply( - input=input_tensor, scalar=scalar, name=self.name, + input=input_tensor, scalar=scalar, inplace=False, name=self.name, ) @@ -1751,7 +1830,7 @@ def __init__(self, node): def parse(self): s = [self.name] scalar = self.innodes[1] - if type(scalar) is not int or type(scalar) is not float: + if not isinstance(scalar, [int, float]): assert 0, "FlexFlow does not support tensor floor division" innodes = (self.innodes[0],) s.append(self.parse_inoutnodes(innodes)) @@ -2290,11 +2369,16 @@ def string_to_ff(string, ffmodel, node_to_output): "since attributes require access to the PyTorch model" ) - def to_ff(self, ffmodel, node_to_output): - return self.attr_to_ff_tensor(ffmodel) + def to_ff(self, ffmodel, node_to_output, input_tensors): + return self.attr_to_ff_tensor(ffmodel, input_tensors) + + def attr_to_ff_tensor(self, ffmodel, input_tensors): - def attr_to_ff_tensor(self, ffmodel): - torch_tensor = self.attr + + torch_tensor = self.attr + assert (torch_tensor.shape[0] == 1) + batch_size = ffmodel._ffconfig.batch_size + torch_tensor = np.repeat(torch_tensor, batch_size, axis=0) ff_dtype = Node.torch_to_ff_dtype(torch_tensor.dtype) requires_grad = torch_tensor.requires_grad @@ -2309,14 +2393,17 @@ def attr_to_ff_tensor(self, ffmodel): ff_dtype = DataType.DT_FLOAT np_tensor = np_tensor.astype(np.float32) + print('attr: ', torch_tensor.shape) + assert (torch_tensor.shape[0] == batch_size) ff_tensor = ffmodel.create_tensor( - torch_tensor.shape, ff_dtype, requires_grad, + torch_tensor.shape, ff_dtype, True, ) # delay set_tensor, add to ffmodel ffmodel.attr_tensors[ff_tensor] = np_tensor # ff_tensor.set_tensor( # ffmodel, np_tensor # ) + input_tensors.append(ff_tensor) return ff_tensor @@ -2398,7 +2485,7 @@ def to_ff(self, ffmodel, node_to_output, output_tensors): # `CrossEntropyLoss()` implementation logits = node_to_output[other["logits"].name] softmax_logits = ffmodel.softmax( - input=logits, name=self.name, + input=logits, last_layer=True, name=self.name, ) output_tensors[:] += [softmax_logits] else: @@ -2440,6 +2527,11 @@ def _trace_model(self): batch_size=self.batch_size, sequence_length=self.seq_length, ) + + 
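The updated slice_tensor handling above distinguishes several kinds of slice elements; the classification can be summarized with a small sketch (illustrative, with a hypothetical helper name; the real code additionally lowers single-element and truncating slices to ffmodel.split before reshaping):

    def classify_slice_elem(slice_elem, old_size):
        # e.g. classify_slice_elem(slice(0, 4), 8) -> "truncate"
        if slice_elem is None:
            return "unsqueeze"        # insert a new dimension of size 1
        if isinstance(slice_elem, int):
            return "single_element"   # select one index, keep a size-1 dim
        if slice_elem == slice(None, None, None):
            return "keep"             # plain ':', dimension unchanged
        start = 0 if slice_elem.start is None else slice_elem.start
        stop = old_size if slice_elem.stop is None else slice_elem.stop
        if stop - start == old_size:
            return "keep"             # is_exact: covers the whole dimension, treated like ':'
        return "truncate"             # shorter range, handled via split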
#import pickle + #with open('symbolic_trace', 'rb') as f: + #traced = pickle.load(f) + else: traced = torch.fx.symbolic_trace(self.model) @@ -2527,6 +2619,8 @@ def torch_to_ff(self, ffmodel, input_tensors, verbose=False): elif isinstance(node, OutputNode): node.to_ff(ffmodel, node_to_output, output_tensors) node_output = None + elif isinstance(node, AttributeNode): + node_output = node.to_ff(ffmodel, node_to_output, input_tensors) else: node_output = node.to_ff(ffmodel, node_to_output) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 5714c8fe3d..7c6013d994 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -916,10 +916,11 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_, const flexflow_tensor_t input_, int dim, + bool last_layer, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor tensor = handle->softmax(input, dim, input->data_type, name); + Tensor tensor = handle->softmax(input, dim, input->data_type, last_layer, name); DEBUG_PRINT( "[Softmax] new Tensor %p, input %p, name %s", tensor, input, name); return FFCObjectWrapper::wrap(tensor); diff --git a/src/loss_functions/loss_functions.cc b/src/loss_functions/loss_functions.cc index ae89c3d469..d887ee9243 100644 --- a/src/loss_functions/loss_functions.cc +++ b/src/loss_functions/loss_functions.cc @@ -49,6 +49,8 @@ void Loss::backward(FFModel *model, if (loss_type == LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE) { assert(logit->get_volume() == label->get_volume()); scale_factor = 2.0f / logit->get_volume(); + } else if (loss_type == LOSS_SPARSE_CATEGORICAL_CROSSENTROPY) { + scale_factor = 1.0f; } else { scale_factor = 1.0f / model->config.batchSize; } @@ -131,9 +133,12 @@ void Loss::backward_task_with_dim(Task const *task, regions[2], task->regions[2], FID_DATA, ctx, runtime); // assertion the outter-most dim is replica dim and replica degree is 1 assert(acc_logit.rect.hi[NDIM - 1] == acc_logit.rect.lo[NDIM - 1]); - int num_samples = - acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; - int num_classes = acc_logit.rect.volume() / num_samples; + + int num_classes = acc_logit.rect.hi[0] - acc_logit.rect.lo[0] + 1; + int num_samples = acc_logit.rect.volume() / num_classes; + // int num_samples = + // acc_logit.rect.hi[NDIM - 2] - acc_logit.rect.lo[NDIM - 2] + 1; + // int num_classes = acc_logit.rect.volume() / num_samples; assert(acc_logit_grad.rect == acc_logit.rect); int k = 1; if (loss->repl_labels) { diff --git a/src/loss_functions/loss_functions.cpp b/src/loss_functions/loss_functions.cpp index a87aaade84..3453f3fbf6 100644 --- a/src/loss_functions/loss_functions.cpp +++ b/src/loss_functions/loss_functions.cpp @@ -20,6 +20,7 @@ namespace FlexFlow { using namespace Legion; +int const MASK_TOKEN = -100; __global__ void sparse_categorical_crossentropy_loss_backward(float *logit_grad, @@ -33,6 +34,25 @@ __global__ void } } +__global__ void + sparse_categorical_crossentropy_loss_backward_with_mask(float *logit_grad, + int const *label, + coord_t num_samples, + coord_t num_classes, + int const k, + float *num) { + CUDA_KERNEL_LOOP(i, num_samples * num_classes) { + int sample_id = i / num_classes; + int label_idx = label[i / (k * num_classes)]; + if (label_idx != MASK_TOKEN && (i == sample_id * num_classes + label_idx)) { + logit_grad[i] -= 1.0f; + atomicAdd(&num[0], 1.0f); + } else if (label_idx == MASK_TOKEN) { + logit_grad[i] = 0.0f; + } + } +} + 
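A NumPy reference for the masked loss backward added above (a sketch, not the kernel): logit_grad starts as a copy of the softmax output, the true class of every unmasked sample gets 1 subtracted, masked samples (label == -100) contribute zero gradient, and the result is scaled by 1/(number of unmasked tokens) instead of 1/batch_size.

    import numpy as np

    MASK_TOKEN = -100  # sentinel used for positions that should be ignored

    def masked_sparse_ce_backward(probs, labels):
        # probs: (num_samples, num_classes) softmax output, labels: (num_samples,) int
        grad = probs.copy()
        keep = labels != MASK_TOKEN
        rows = np.flatnonzero(keep)
        grad[rows, labels[rows]] -= 1.0   # subtract the one-hot target
        grad[~keep, :] = 0.0              # masked samples: zero gradient
        effective_tokens = int(keep.sum())
        # Guard against an all-masked batch; the kernel assumes at least one real token.
        return grad / max(effective_tokens, 1)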
__global__ void categorical_crossentropy_loss_backward(float *logit_grad, float const *logit, float const *label, @@ -75,8 +95,14 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_ptr, logit_volume * sizeof(float), hipMemcpyDeviceToDevice)); - hipLaunchKernelGGL(sparse_categorical_crossentropy_loss_backward, - GET_BLOCKS(num_samples), + + assert(scale_factor == 1.0f); + float *num; + checkCUDA(hipMalloc(&num, sizeof(float))); + float effective_tokens; + int parallelism = num_samples * num_classes; + hipLaunchKernelGGL(sparse_categorical_crossentropy_loss_backward_with_mask, + GET_BLOCKS(parallelism), CUDA_NUM_THREADS, 0, stream, @@ -84,7 +110,10 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( label_ptr, num_samples, num_classes, - k); + k, + num); + + hipMemcpy(&effective_tokens, num, sizeof(float), hipMemcpyDeviceToHost); // Scale logit gradients by op->scale_factor hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), @@ -94,7 +123,7 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, logit_grad_volume, 0, - scale_factor * k); + 1.0f / effective_tokens); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index f78311980c..edd8f03fa4 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -18,6 +18,7 @@ namespace FlexFlow { +int const MASK_TOKEN = -100; using namespace Legion; __global__ void @@ -32,6 +33,25 @@ __global__ void } } +__global__ void + sparse_categorical_crossentropy_loss_backward_with_mask(float *logit_grad, + int const *label, + coord_t num_samples, + coord_t num_classes, + int const k, + float *num) { + CUDA_KERNEL_LOOP(i, num_samples * num_classes) { + int sample_id = i / num_classes; + int label_idx = label[i / (k * num_classes)]; + if (label_idx != MASK_TOKEN && (i == sample_id * num_classes + label_idx)) { + logit_grad[i] -= 1.0f; + atomicAdd(&num[0], 1.0f); + } else if (label_idx == MASK_TOKEN) { + logit_grad[i] = 0.0f; + } + } +} + __global__ void categorical_crossentropy_loss_backward(float *logit_grad, float const *logit, float const *label, @@ -74,14 +94,25 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_ptr, logit_volume * sizeof(float), cudaMemcpyDeviceToDevice)); - sparse_categorical_crossentropy_loss_backward<<>>( - logit_grad_ptr, label_ptr, num_samples, num_classes, k); - // Scale logit gradients by op->scale_factor + // calculate the scale factor inside kernel; + assert(scale_factor == 1.0f); + float *num; + checkCUDA(cudaMalloc(&num, sizeof(float))); + float effective_tokens; + int parallelism = num_samples * num_classes; + // sparse_categorical_crossentropy_loss_backward<<>>( + // logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); + sparse_categorical_crossentropy_loss_backward_with_mask<<< + GET_BLOCKS(parallelism), + CUDA_NUM_THREADS, + 0, + stream>>>(logit_grad_ptr, label_ptr, num_samples, num_classes, k, num); + cudaMemcpy(&effective_tokens, num, sizeof(float), cudaMemcpyDeviceToHost); scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); + logit_grad_ptr, logit_grad_volume, 0, 1.0f / effective_tokens); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( diff --git a/src/metrics_functions/metrics_functions.cc b/src/metrics_functions/metrics_functions.cc index e8ccbfe2e4..8c7e23ad8a 100644 --- 
a/src/metrics_functions/metrics_functions.cc +++ b/src/metrics_functions/metrics_functions.cc @@ -15,6 +15,7 @@ #include "flexflow/metrics_functions.h" #include "flexflow/model.h" +#include namespace FlexFlow { @@ -90,6 +91,8 @@ void Metrics::compute(FFModel *model, false /*must*/, 0 /*mapper_id*/, logit->machine_view.hash()); + // std::cout << "logit shape: " << logit->get_shape() << std::endl; + // std::cout << "label shape: " << label->get_shape() << std::endl; launcher.add_region_requirement(RegionRequirement( logit->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, logit->region)); launcher.add_field(0, FID_DATA); @@ -154,6 +157,7 @@ PerfMetrics assert(acc_label.rect.lo[0] == acc_label.rect.hi[0]); // Cannot measure categorical_crossentropy w/ sparse labels // Use measure_sparse_categorical_crossentropy instead + // std::cout << "num_classes: " << num_classes << std::endl; assert(!me->measure_categorical_crossentropy); Metrics::update_metrics_sparse_label_kernel_wrapper(acc_logit.ptr, acc_label.ptr, diff --git a/src/metrics_functions/metrics_functions.cpp b/src/metrics_functions/metrics_functions.cpp index 90d727b9b1..1c57bd6ba9 100644 --- a/src/metrics_functions/metrics_functions.cpp +++ b/src/metrics_functions/metrics_functions.cpp @@ -20,6 +20,7 @@ namespace FlexFlow { float const LOG_MIN_VALUE = 0.00000001f; +int const MASK_TOKEN = -100; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, @@ -30,7 +31,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = -1; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { @@ -39,14 +40,19 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, } } assert(my_label >= 0); - atomicAdd(&(perf->train_all), 1); - if (labels[b] == my_label) { - atomicAdd(&(perf->train_correct), 1); + if (labels[b] != MASK_TOKEN) { + atomicAdd(&(perf->train_all), 1); + if (labels[b] == my_label) { + atomicAdd(&(perf->train_correct), 1); + } } } if (metrics.measure_sparse_categorical_crossentropy) { - float my_logit = max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); - atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + if (labels[b] != MASK_TOKEN) { + float my_logit = + max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + } } if (metrics.measure_mean_squared_error || metrics.measure_root_mean_squared_error || diff --git a/src/metrics_functions/metrics_functions.cu b/src/metrics_functions/metrics_functions.cu index 2e037eb472..8c584c397c 100644 --- a/src/metrics_functions/metrics_functions.cu +++ b/src/metrics_functions/metrics_functions.cu @@ -19,6 +19,7 @@ namespace FlexFlow { float const LOG_MIN_VALUE = 0.00000001f; +int const MASK_TOKEN = -100; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, @@ -29,7 +30,7 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, CUDA_KERNEL_LOOP(b, num_samples) { if (metrics.measure_accuracy) { float max_val = -1.0f; - int my_label = -1; + int my_label = 0; for (int i = 0; i < num_classes; i++) { float my_logit = logits[b * num_classes + i]; if (my_logit > max_val) { @@ -38,14 +39,19 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, } } assert(my_label >= 0); - atomicAdd(&(perf->train_all), 1); - if (labels[b] == 
my_label) { - atomicAdd(&(perf->train_correct), 1); + if (labels[b] != MASK_TOKEN) { + atomicAdd(&(perf->train_all), 1); + if (labels[b] == my_label) { + atomicAdd(&(perf->train_correct), 1); + } } } if (metrics.measure_sparse_categorical_crossentropy) { - float my_logit = max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); - atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + if (labels[b] != MASK_TOKEN) { + float my_logit = + max(logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(&(perf->sparse_cce_loss), -log(my_logit)); + } } if (metrics.measure_mean_squared_error || metrics.measure_root_mean_squared_error || diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 4352f459b9..03a2f1a67c 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -239,6 +239,9 @@ ElementBinary::ElementBinary(FFModel &model, numdim, dims, in1->data_type, this); broadcast_input1 = (inputs[0]->get_volume() != outputs[0]->get_volume()); broadcast_input2 = (inputs[1]->get_volume() != outputs[0]->get_volume()); + + batch_size = dims[numdim - 2].size; + } ElementBinary::ElementBinary( @@ -438,6 +441,8 @@ OpMeta *ElementBinary::init_task(Task const *task, m->has_same_operands = eb->has_same_operands; m->broadcast_input1 = eb->broadcast_input1; m->broadcast_input2 = eb->broadcast_input2; + m->batch_size = eb->batch_size; + std::strcpy(m->op_name, eb->name); m->layer_guid = eb->layer_guid; Domain input1_domain = runtime->get_index_space_domain( @@ -470,6 +475,10 @@ OpMeta *ElementBinary::init_task(Task const *task, } else { output_domain = input1_domain; } + m->replicate_size = m->broadcast_input1 + ? (input1_domain.get_volume() / m->batch_size) + : (input2_domain.get_volume() / m->batch_size); + assert(task->regions.size() == regions.size()); assert(regions.size() == num_regions); init_kernel(m, input1_domain, input2_domain, output_domain); @@ -483,7 +492,7 @@ void ElementBinary::forward(FFModel const &ff) { set_argumentmap_for_forward(ff, argmap); IndexLauncher launcher(ELEMENTBINARY_FWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(ElementBinary)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -740,7 +749,7 @@ __host__ void std::vector const ®ions, Context ctx, Runtime *runtime) { - // const ElementBinary* ele = (const ElementBinary*) task->args; + ElementBinary const *ele = (ElementBinary const *)task->args; ElementBinaryMeta const *m = *((ElementBinaryMeta **)task->local_args); GenericTensorAccessorR in1, in2; GenericTensorAccessorW out; diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 0e1d115557..31c0f9f8a3 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -131,7 +131,8 @@ Tensor FFModel::tanh(const Tensor x, char const *name) { } Tensor FFModel::identity(const Tensor x, char const *name) { - return this->unary(OP_IDENTITY, x, false /*inplace*/, name); + // return this->unary(OP_IDENTITY, x, false /*inplace*/, name); + return x; } Tensor FFModel::gelu(const Tensor x, char const *name) { diff --git a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index e20200420f..9c1e61032a 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -192,9 +192,9 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * static_cast(erfc(-input[i] * M_SQRT1_2)) + + (0.5 * erfc(-input[i] * M_SQRT1_2) + 0.5 * M_SQRT1_2 * input[i] * - ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5)))); + ((2 / sqrt(M_PI)) * 
exp(-input[i] * input[i] * 0.5f)))); break; } case OP_RSQRT: { diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index c7f5e90f4c..20ca34e9d7 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -205,9 +205,9 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * static_cast(erfc(-input[i] * M_SQRT1_2)) + + (0.5 * erfc(-input[i] * M_SQRT1_2) + 0.5 * M_SQRT1_2 * input[i] * - ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5)))); + ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5f)))); break; } case OP_RSQRT: { diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index e630563b63..0b79ec727d 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -148,7 +148,7 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { int const OUT_CHANNELS = Output::OUT_CHANNELS; if (aggr == AGGR_MODE_NONE) { int num_dims = input->num_dims + 1; - for (int i = 1; i < num_dims - 1; i++) { + for (int i = 1; i < num_dims; i++) { output_dims[i] = input->dims[i - 1]; } assert(OUT_CHANNELS == 0); @@ -160,7 +160,7 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { return num_dims; } else { int num_dims = input->num_dims; - for (int i = 1; i < num_dims - 1; i++) { + for (int i = 1; i < num_dims; i++) { output_dims[i] = input->dims[i]; } assert(OUT_CHANNELS == 0); diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 9ad5c4dc9c..a1c911db75 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -167,7 +167,9 @@ bool FusedOp::add_operator( // paralel_is in forward and backward assert(!op->is_parallel_op() || op->op_type == OP_ALLREDUCE); // Currently don't consider nested fusion - assert(op->op_type != OP_FUSED); + if (op->op_type == OP_FUSED) { + return false; + } MachineView my_view = outputs[0]->machine_view; MachineView op_view = op->outputs[0]->machine_view; if (my_view == op_view) { diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 3282bc57d9..309a77dfcd 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -23,6 +23,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" +#include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/ops/kernels/dropout_kernels.h" @@ -42,6 +43,7 @@ #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/ops/linear.h" #include "flexflow/utils/hip_helper.h" #include @@ -297,8 +299,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::forward_kernel_wrapper(m, my_input_accessor[0], @@ -359,7 +361,6 @@ __host__ void FusedOp::forward_task(Task const *task, assert(effective_batch_size * in_dim == my_input_accessor[0].domain.get_volume()); } - assert(my_input_accessor[0].data_type == DT_INT32 || 
my_input_accessor[0].data_type == DT_INT64); Kernels::Embedding::forward_kernel_wrapper(m, @@ -375,7 +376,11 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -428,6 +433,8 @@ __host__ void FusedOp::forward_task(Task const *task, m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); + } else { + assert(false); } break; } @@ -437,10 +444,24 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (my_input_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int64_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int32_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type"); + } break; } case OP_TRANSPOSE: { @@ -1050,6 +1071,29 @@ __host__ void m, bc, my_input_accessor[0], my_output_accessor[0]); break; } + + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_output_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } default: { fprintf(stderr, "Fusion currently does not support type = %d\n", @@ -1412,11 +1456,111 @@ __host__ void FusedOp::backward_task(Task const *task, batch_size); break; } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_accessor[0].domain; + Domain a_domain = my_input_accessor[0].domain; + Domain b_domain = my_input_accessor[1].domain; + // check dims + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; 
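The m/n/k/batch bookkeeping in the fused BatchMatmul backward corresponds to the following NumPy reference (only the math is shown; FlexFlow stores dimensions innermost-first, so the index arithmetic in the C++ differs): for out = a @ b with a of shape (batch, n, k) and b of shape (batch, k, m), the input gradients are out_grad @ b^T and a^T @ out_grad.

    import numpy as np

    def batch_matmul_backward(a, b, out_grad):
        # a: (batch, n, k), b: (batch, k, m), out_grad: (batch, n, m)
        a_grad = out_grad @ np.swapaxes(b, -1, -2)   # (batch, n, k)
        b_grad = np.swapaxes(a, -1, -2) @ out_grad   # (batch, k, m)
        return a_grad, b_grad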
+ for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + Kernels::BatchMatmul::backward_kernel_wrapper( + meta, + (float const *)my_output_accessor[0].get_float_ptr(), + (float const *)my_output_grad_accessor[0].get_float_ptr(), + (float const *)my_input_accessor[0].get_float_ptr(), + (float *)my_input_grad_accessor[0].get_float_ptr(), + (float const *)my_input_accessor[1].get_float_ptr(), + (float *)my_input_grad_accessor[1].get_float_ptr(), + (float *)nullptr, + m, + n, + k, + batch); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // assert(my_input_accessor[0].domain == + // my_input_accessor[1].domain); assert(my_input_accessor[0].domain + // == my_output_accessor[0].domain); + ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + Kernels::ElementBinary::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_accessor[1].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[1].get_float_ptr()); + break; + } + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + assert(my_input_accessor[0].data_type == DT_INT64 || + my_input_accessor[0].data_type == DT_INT32); + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_grad_accessor[0].domain.hi()[0] - + my_output_grad_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_grad_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_grad_accessor[0].domain.hi()[0] - + my_output_grad_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_grad_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + Kernels::Embedding::backward_kernel_wrapper(m, + my_input_accessor[0], + my_output_grad_accessor[0], + my_weight_grad_accessor[0], + in_dim, + out_dim, + effective_batch_size); + break; + } case OP_GELU: case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -1435,7 +1579,8 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == + // my_output_accessor[0].domain); Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; Kernels::Pool2D::backward_kernel_wrapper( m, @@ -1463,10 +1608,43 @@ __host__ void 
FusedOp::backward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_grad_accessor[0].domain.get_volume() == my_output_grad_accessor[0].domain.get_volume()); - Kernels::Reshape::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); + if (my_input_grad_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_int64_ptr(), + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].get_int32_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false); + } break; } case OP_TRANSPOSE: { @@ -1484,6 +1662,46 @@ __host__ void FusedOp::backward_task(Task const *task, my_output_grad_accessor[0].domain); break; } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + gamma.get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } default: assert(false && "Fusion currently does not support type"); } @@ -1493,13 +1711,16 @@ __host__ void FusedOp::backward_task(Task const *task, assert(ooff == 0); // for (int i = 0; i < fused->numWeights; i++) // 
print_tensor(weight_grad_ptr[i], - // weight_grad_domain[i].get_volume(), "[Fused:backward:weight_grad]"); + // weight_grad_domain[i].get_volume(), + // "[Fused:backward:weight_grad]"); // for (int i = 0; i < fused->numInputs; i++) - // print_tensor(input_grad_ptr[i], input_grad_domain[i].get_volume(), + // print_tensor(input_grad_ptr[i], + // input_grad_domain[i].get_volume(), // "[Fused:backward:input_grad]"); // for (int i = 0; i < fused->numOutputs; i++) // print_tensor(output_grad_ptr[i], - // output_grad_domain[i].get_volume(), "[Fused:backward:output_grad]"); + // output_grad_domain[i].get_volume(), + // "[Fused:backward:output_grad]"); } }; // namespace FlexFlow diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 483028599e..b6ffe0ee46 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -23,6 +23,7 @@ #include "flexflow/ops/fused.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" +#include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/ops/kernels/dropout_kernels.h" @@ -310,8 +311,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::forward_kernel_wrapper(m, my_input_accessor[0], @@ -388,7 +389,11 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -451,10 +456,43 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + assert(my_input_accessor[0].data_type == + my_output_accessor[0].data_type); + if (my_input_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int64_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int32_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type"); + } + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + 
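+        // Only the float32 path is wired up for the fused softmax below; any
+        // other data type deliberately falls into assert(false). The axis being
+        // normalized is fixed by the tensor descriptor built in SoftmaxMeta, so
+        // the wrapper only needs the raw input/output pointers.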
assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } else { + assert(false); + } break; } case OP_TRANSPOSE: { @@ -464,6 +502,8 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + assert(my_input_accessor[0].data_type == + my_output_accessor[0].data_type); Kernels::Transpose::forward_kernel_wrapper( m, my_input_accessor[0].get_float_ptr(), @@ -476,15 +516,11 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; - if (m->elementwise_affine) { - assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); - } + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); GenericTensorAccessorR gamma, beta; if (m->elementwise_affine) { gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; - } + beta = my_weight_accessor[1]; } LayerNorm::forward_kernel_wrapper( m, my_input_accessor[0], my_output_accessor[0], gamma, beta); @@ -1087,6 +1123,28 @@ __host__ void m, bc, my_input_accessor[0], my_output_accessor[0]); break; } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_int64_ptr(), + my_output_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::forward_kernel_wrapper( + m, + my_input_accessor[0].get_int32_ptr(), + my_output_accessor[0].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } default: { fprintf(stderr, "Fusion currently does not support type = %d\n", @@ -1434,8 +1492,8 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + // assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; Kernels::ElementBinary::backward_kernel_wrapper( m, @@ -1451,7 +1509,8 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; - assert(my_input_accessor[0].data_type == DT_INT64); + assert(my_input_accessor[0].data_type == DT_INT64 || + my_input_accessor[0].data_type == DT_INT32); int in_dim, out_dim, effective_batch_size; if (m->aggr == AGGR_MODE_NONE) { in_dim = 1; @@ -1516,7 +1575,11 @@ __host__ void FusedOp::backward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_ADD: + 
case OP_SCALAR_MULTIPLY: + case OP_SCALAR_SUB: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -1563,10 +1626,43 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); assert(my_input_grad_accessor[0].domain.get_volume() == my_output_grad_accessor[0].domain.get_volume()); - Kernels::Reshape::backward_kernel_wrapper( - my_input_grad_accessor[0].get_float_ptr(), - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].domain.get_volume()); + if (my_input_grad_accessor[0].data_type == DT_INT64) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_int64_ptr(), + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_INT32) { + Kernels::Reshape::forward_kernel_wrapper( + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].get_int32_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else if (my_input_grad_accessor[0].data_type == DT_FLOAT) { + Kernels::Reshape::backward_kernel_wrapper( + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (my_input_accessor[0].data_type == DT_FLOAT) { + Kernels::Softmax::backward_kernel_wrapper( + m, + my_input_grad_accessor[0].get_float_ptr(), + my_output_grad_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false); + } break; } case OP_TRANSPOSE: { @@ -1584,6 +1680,47 @@ __host__ void FusedOp::backward_task(Task const *task, my_output_grad_accessor[0].domain); break; } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + gamma.get_float_ptr(), + my_weight_grad_accessor[0].get_float_ptr(), + my_weight_grad_accessor[1].get_float_ptr()); + break; + } + case OP_CAST: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + CastMeta const *m = (CastMeta *)metas->meta[op]; + if (m->input_data_type == DT_INT32 && m->output_data_type == DT_INT64) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_int64_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else if (m->input_data_type == DT_INT32 && + m->output_data_type == DT_FLOAT) { + Kernels::Cast::backward_kernel_wrapper( + my_output_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[0].get_int32_ptr(), + my_output_grad_accessor[0].domain.get_volume()); + } else { + assert(false); + } + + 
break; + } default: assert(false && "Fusion currently does not support type"); } diff --git a/src/ops/kernels/element_binary_kernels.cpp b/src/ops/kernels/element_binary_kernels.cpp index a65372de85..3ae1e76cb6 100644 --- a/src/ops/kernels/element_binary_kernels.cpp +++ b/src/ops/kernels/element_binary_kernels.cpp @@ -73,7 +73,6 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, GenericTensorAccessorW const &out) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); @@ -201,6 +200,21 @@ __global__ void elewise_binary_forward_kernel(coord_t volume, } } +// for simplicity, assume the replicate dimension is the batchsize +__global__ void + elewise_binary_forward_kernel_broadcast2(float const *in1_ptr, + float const *in2_ptr, + float *output_ptr, + size_t volume, + size_t batch_size, + size_t replicate_size) { + CUDA_KERNEL_LOOP(i, volume) { + size_t batch = i / replicate_size; + output_ptr[i] = + in1_ptr[i] + in2_ptr[batch * replicate_size + i % replicate_size]; + } +} + __global__ void elewise_binary_backward_kernel(coord_t volume, float const alpha, float const beta, @@ -248,7 +262,6 @@ void forward_kernel(ElementBinaryMeta const *m, hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - float alpha1 = 1.0f, alpha2 = 1.0f, beta = 0.0f; switch (m->op_type) { case OP_EW_SUB: @@ -287,6 +300,19 @@ void forward_kernel(ElementBinaryMeta const *m, &alpha1, m->outputTensor, out_ptr)); + } else if (m->op_type == OP_EW_ADD && m->broadcast_input2) { + int parallelism = m->batch_size * m->replicate_size; + hipLaunchKernelGGL(elewise_binary_forward_kernel_broadcast2, + GET_BLOCKS(parallelism), + CUDA_NUM_THREADS, + 0, + stream, + in1_ptr, + in2_ptr, + out_ptr, + m->batch_size * m->replicate_size, + m->batch_size, + m->replicate_size); } else { checkCUDNN(miopenOpTensor(m->handle.dnn, m->opDesc, diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index 89c9f14a01..c4b3394181 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -33,6 +33,7 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, checkCUDNN( cudnnSetTensorDescriptorFromDomain4SoftMax(outputTensor, input_domain)); dim = softmax->dim; + last_layer = softmax->last_layer; profiling = softmax->profiling; inference_debugging = softmax->inference_debugging; std::strcpy(op_name, softmax->name); @@ -74,6 +75,7 @@ template void backward_kernel_wrapper(SoftmaxMeta const *m, DT *input_grad_ptr, DT const *output_grad_ptr, + DT const *output_ptr, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -85,7 +87,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, checkCUDA(hipEventRecord(t_start, stream)); } Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + m, input_grad_ptr, output_grad_ptr, output_ptr, num_elements, stream); if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -138,15 +140,32 @@ void forward_kernel(SoftmaxMeta const *m, } template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, + DT const *output_ptr, size_t num_elements, hipStream_t stream) { - checkCUDA(hipMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(DT), - hipMemcpyDeviceToDevice, - stream)); + if (m->last_layer) { + 
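+    // When softmax is the final layer, the output gradient handed in by the
+    // loss is assumed to already fold in the softmax derivative (e.g.
+    // softmax(x) - labels for cross-entropy), so it is copied through to the
+    // input gradient unchanged; the copy is equivalent to:
+    //
+    //   for (size_t i = 0; i < num_elements; i++)
+    //     input_grad_ptr[i] = output_grad_ptr[i];
+    //
+    // Otherwise the full softmax backward dx = y * (dy - sum_c(dy * y)) is
+    // applied via MIOpen in the else branch below.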
checkCUDA(hipMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(DT), + hipMemcpyDeviceToDevice, + stream)); + } else { + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(miopenSoftmaxBackward_V2(m->handle.dnn, + &alpha, + m->inputTensor, + output_ptr, + m->inputTensor, + output_grad_ptr, + &beta, + m->inputTensor, + input_grad_ptr, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + } } } // namespace Internal diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index e47006cc9d..2d180c1f49 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -32,6 +32,7 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( outputTensor, input_domain, softmax->data_type)); dim = softmax->dim; + last_layer = softmax->last_layer; profiling = softmax->profiling; inference_debugging = softmax->inference_debugging; std::strcpy(op_name, softmax->name); @@ -72,6 +73,7 @@ template void backward_kernel_wrapper(SoftmaxMeta const *m, DT *input_grad_ptr, DT const *output_grad_ptr, + DT const *output_ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -83,7 +85,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, cudaEventRecord(t_start, stream); } Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + m, input_grad_ptr, output_grad_ptr, output_ptr, num_elements, stream); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -135,15 +137,33 @@ void forward_kernel(SoftmaxMeta const *m, } template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, + DT const *output_ptr, size_t num_elements, cudaStream_t stream) { - checkCUDA(cudaMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(DT), - cudaMemcpyDeviceToDevice, - stream)); + + if (m->last_layer) { + checkCUDA(cudaMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream)); + } else { + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->inputTensor, + output_ptr, + m->inputTensor, + output_grad_ptr, + &beta, + m->inputTensor, + input_grad_ptr)); + } } } // namespace Internal diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 2218ffe392..665cb2d5ae 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -132,8 +132,7 @@ Tensor FFModel::layer_norm(const Tensor input, ln, 0, true /*create_grad*/); - if (num_weights > 0) { - assert(elementwise_affine); + if (num_weights == 2) { int numdims = axes.size(); int dims[numdims]; for (int i = 0; i < numdims; i++) { @@ -611,8 +610,9 @@ void LayerNorm::forward_task(Task const *task, out = helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(in_domain == out_domain); - assert(in_domain.get_volume() == - m->effective_num_elements * m->effective_batch_size); + // assert(in_domain.get_volume() == + // m->effective_num_elements * m->effective_batch_size); + if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 4)); Domain gamma_domain = runtime->get_index_space_domain( @@ -740,8 +740,8 @@ void LayerNorm::backward_task(Task const *task, in_grad_ptr = helperGetTensorPointerRW( regions[2], task->regions[2], FID_DATA, ctx, runtime); assert(in_domain == 
out_grad_domain); - assert(in_domain.get_volume() == - m->effective_num_elements * m->effective_batch_size); + // assert(in_domain.get_volume() == + // m->effective_num_elements * m->effective_batch_size); if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 6)); Domain gamma_domain = runtime->get_index_space_domain( @@ -807,6 +807,7 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, GenericTensorAccessorW gamma_acc; GenericTensorAccessorW beta_acc; + bool out_of_memory = (in_ptr == NULL) || (out_ptr == NULL) || (((gamma_ptr == NULL) || (beta_ptr == NULL)) && (m->elementwise_affine)); diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 07dbdb3dfb..1b13ba968a 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/layer_norm.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/utils/hip_helper.h" #include @@ -33,12 +34,26 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, effective_num_elements = ln->effective_num_elements; use_bias = ln->use_bias; eps = ln->eps; - checkCUDA(hipMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&db_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&db_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); + // checkCUDA(hipMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + + DataType data_type = ln->data_type; + checkCUDA( + hipMalloc(&mean_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&rstd_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&ds_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&db_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&scale_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + hipMalloc(&bias_ptr, data_type_size(data_type) * effective_batch_size)); } LayerNormMeta::~LayerNormMeta(void) {} @@ -555,6 +570,7 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, stream); } + template void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, float const *output_grad_ptr, diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 44979c48fe..a921d9af41 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -56,6 +56,7 @@ LayerNormMeta::~LayerNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { reserveInst.destroy(); } + } template @@ -220,7 +221,6 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *gamma_ptr, T const *beta_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); std::pair kernel2_parallelism = @@ -585,7 +585,6 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, gamma_ptr, input_grad_ptr, N); - if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise 
reduce directly @@ -649,6 +648,7 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, // beta_grad_ptr, // stream); // } + } template void @@ -660,4 +660,4 @@ template void float *gamma_grad_ptr, float *beta_grad_ptr); -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 03618423be..4c2a78d165 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -92,9 +92,10 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(const Tensor _input, +Tensor FFModel::softmax(Tensor const _input, int dim, DataType data_type, + bool last_layer, char const *name) { if (data_type == DT_NONE) { data_type = _input->data_type; @@ -115,6 +116,8 @@ Tensor FFModel::softmax(const Tensor _input, sm->outputs[0] = create_tensor_legion_ordering( numdims, dims, data_type, sm, 0, true /*create_grad*/); sm->add_int_property("softmax_dim", dim); + + sm->add_int_property("last_layer", last_layer); layers.push_back(sm); return sm->outputs[0]; } @@ -126,17 +129,21 @@ Op *Softmax::create_operator_from_layer( long long value; layer->get_int_property("softmax_dim", value); int dim = (int)value; + layer->get_int_property("last_layer", value); + bool last_layer = (bool)value; return new Softmax(model, layer->layer_guid, inputs[0], (inputs[0]->num_dims - 1 - dim) % inputs[0]->num_dims, + last_layer, layer->name); } Softmax::Softmax(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _dim, + bool _last_layer, char const *name) : Op(model, OP_SOFTMAX, @@ -146,7 +153,7 @@ Softmax::Softmax(FFModel &model, 0 /*weights*/, 1 /*outputs*/, _input), - dim(_dim) { + dim(_dim), last_layer(_last_layer) { // Currently assume we always perform softmax along the inner most dim assert(dim == 0); layer_guid = _layer_guid; @@ -160,9 +167,14 @@ Softmax::Softmax(FFModel &model, Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) - : Softmax(model, params.layer_guid, input, params.dim, params.name) {} + : Softmax(model, + params.layer_guid, + input, + params.dim, + params.last_layer, + params.name) {} void Softmax::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -392,6 +404,13 @@ void Softmax::backward(FFModel const &ff) { EXCLUSIVE, outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); + + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); runtime->execute_index_space(ctx, launcher); } @@ -431,8 +450,8 @@ void Softmax::backward_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); + assert(regions.size() == 3); + assert(task->regions.size() == 3); // const Softmax* softmax = (Softmax*) task->args; SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); TensorAccessorW acc_input_grad(regions[0], @@ -443,11 +462,16 @@ void Softmax::backward_task_with_dim(Task const *task, true /*readOutput*/); TensorAccessorR acc_output_grad( regions[1], task->regions[1], FID_DATA, ctx, runtime); + TensorAccessorR acc_output( + regions[2], task->regions[1], FID_DATA, ctx, runtime); // make sure the image indices match! 
assert(acc_input_grad.rect == acc_output_grad.rect); - backward_kernel_wrapper( - m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); + backward_kernel_wrapper(m, + acc_input_grad.ptr, + acc_output_grad.ptr, + acc_output.ptr, + acc_input_grad.rect.volume()); } void Softmax::inference_task(Task const *task, @@ -526,11 +550,17 @@ bool Softmax::measure_operator_cost(Simulator *sim, float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_grad_ptr != NULL); + float *output_ptr = + (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper( - m, input_grad_ptr, output_grad_ptr, sub_output.get_volume()); + backward_kernel_wrapper(m, + input_grad_ptr, + output_grad_ptr, + output_ptr, + sub_output.get_volume()); }; } @@ -563,6 +593,7 @@ size_t hash::operator()( size_t key = 0; hash_combine(key, params.layer_guid.id); hash_combine(key, params.dim); + hash_combine(key, params.last_layer); return key; } }; // namespace std diff --git a/src/ops/split.cc b/src/ops/split.cc index 7c6b631b20..e3745f834f 100644 --- a/src/ops/split.cc +++ b/src/ops/split.cc @@ -413,6 +413,17 @@ void Split::backward_task(Task const *task, split->numOutputs); } +tl::optional Split::as_dot() const { + RecordFormatter rr; + RecordFormatter r; + + r << this->inputs[0]->get_shape().as_dot(); + r << this->outputs[0]->get_shape().as_dot(); + rr << r; + + return rr; +} + bool Split::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index 1647f014be..10d91e3a16 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -83,6 +83,45 @@ template void backward_kernel(float const *output_grad_ptr, size_t num_elements, size_t num_replicas); +template void forward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(double const *output_grad_ptr, + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + +template void forward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int64_t const *output_grad_ptr, + int64_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + +template void forward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int32_t const *output_grad_ptr, + int32_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); + } // namespace Replicate } // namespace Kernels } // namespace FlexFlow diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index 35bc109bd3..11202323c1 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -75,6 +75,42 @@ template void 
backward_kernel(float const *output_grad_ptr, float *input_grad_ptr, size_t num_elements, size_t num_replicas); +template void forward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(double const *input_ptr, + double *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(double const *output_grad_ptr, + double *input_grad_ptr, + size_t num_elements, + size_t num_replicas); +template void forward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int32_t const *input_ptr, + int32_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int32_t const *output_grad_ptr, + int32_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); +template void forward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements); +template __global__ void + replicate_backward_kernel(int64_t const *input_ptr, + int64_t *output_ptr, + size_t num_elements, + size_t num_replicas); +template void backward_kernel(int64_t const *output_grad_ptr, + int64_t *input_grad_ptr, + size_t num_elements, + size_t num_replicas); } // namespace Replicate } // namespace Kernels diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index ba7bb6677f..05ec5d6cfa 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -256,9 +256,10 @@ void Replicate::forward(FFModel const &ff) { assert(numOutputs == 1); assert(numInputs == 1); set_argumentmap_for_forward(ff, argmap); + DataType data_type = inputs[0]->data_type; IndexLauncher launcher(REPLICATE_FWD_TASK_ID, outputs[0]->parallel_is, - TaskArgument(NULL, 0), + TaskArgument(&data_type, sizeof(DataType)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -284,7 +285,7 @@ void Replicate::backward(FFModel const &ff) { assert(numInputs == 1); IndexLauncher launcher(REPLICATE_BWD_TASK_ID, inputs[0]->parallel_is, - TaskArgument(NULL, 0), + TaskArgument(&data_type, sizeof(DataType)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -342,15 +343,21 @@ bool Replicate::append_parallel_op_info( return true; } +void Replicate::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) {} + +/*static*/ void Replicate::forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); - ReplicateMeta const *m = *((ReplicateMeta **)task->local_args); + Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( @@ -361,7 +368,6 @@ void Replicate::forward_task(Task const *task, assert(output_domain.hi()[i] == input_domain.hi()[i]); } assert(input_domain.get_volume() == output_domain.get_volume()); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( @@ -387,6 +393,28 @@ void Replicate::backward_task(Task const *task, Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); + DataType data_type = *((DataType *)task->args); + if (data_type == DT_FLOAT) { + backward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_DOUBLE) { + backward_task_with_type(task, regions, ctx, runtime); + 
} else if (data_type == DT_INT32) { + backward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT64) { + backward_task_with_type(task, regions, ctx, runtime); + } else { + assert(false && "Unsupported data type in Embedding forward"); + } +} + +template +void Replicate::backward_task_with_type( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); Domain output_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); Domain input_grad_domain = runtime->get_index_space_domain( @@ -398,12 +426,12 @@ void Replicate::backward_task(Task const *task, } size_t num_elements = input_grad_domain.get_volume(); size_t num_replicas = output_grad_domain.get_volume() / num_elements; - float const *output_grad_ptr = helperGetTensorPointerRO( + T const *output_grad_ptr = helperGetTensorPointerRO( regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *input_grad_ptr = helperGetTensorPointerRW( + T *input_grad_ptr = helperGetTensorPointerRW( regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel( + backward_kernel( output_grad_ptr, input_grad_ptr, num_elements, num_replicas); } diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index c7b6e1257a..257bc2fcdf 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -203,6 +203,8 @@ std::string get_operator_type_name(OperatorType type) { return "Pipeline"; case OP_FUSED_PARALLEL: return "FusedParallelOp"; + case OP_GELU: + return "Gelu"; default: throw std::runtime_error("Operator type unsupported: " + std::to_string(type)); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index f8e8240ccf..57a76cf20d 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1896,11 +1896,11 @@ namespace { */ std::pair, std::unordered_map> try_one_lambda(std::pair &lambda, - Task const *task, + FFModel *model, std::shared_ptr &cached_simulator, bool perform_memory_search) { // Create a new fresh model - FFModel *model = *((FFModel **)task->args); + //FFModel *model = *((FFModel **)task->args); model->clear_graph_search_cache(); if (model->config.search_num_nodes.has_value()) { @@ -1914,6 +1914,42 @@ std::pair, std::unordered_map> model->config.workersPerNode, model->config.cpusPerNode, model->all_valid_views); + if (model->config.only_data_parallel) { + Graph *graph = new Graph(model); + graph->print_dot(); + std::unordered_map op_to_node_map; + for (FlexFlow::Op const *dstOp : model->operators) { + Node dstNode; + dstNode.ptr = dstOp; + dstNode.guid = model->node_global_guid++; + op_to_node_map[dstOp] = dstNode; + for (int j = 0; j < dstOp->numInputs; j++) { + FlexFlow::Op const *srcOp = dstOp->inputs[j]->owner_op; + assert(op_to_node_map.find(srcOp) != op_to_node_map.end()); + Node srcNode = op_to_node_map[srcOp]; + graph->add_edge(srcNode, dstNode, dstOp->inputs[j]->owner_idx, j); + } + } + graph->print_dot(); + std::unique_ptr curr_best_graph; + std::unordered_map curr_optimal_views; + curr_best_graph = std::unique_ptr(graph); + MachineView data_parallel_view; + data_parallel_view.device_type = MachineView::GPU; + data_parallel_view.ndims = 1; + data_parallel_view.dim[0] = + model->config.numNodes * model->config.workersPerNode; + data_parallel_view.stride[0] = 1; + data_parallel_view.start_device_id = 0; + for (auto const &node : curr_best_graph->inEdges) { + curr_optimal_views[node.first] = data_parallel_view; + } + return 
std::make_pair(std::move(curr_best_graph), curr_optimal_views); + } + + Runtime *runtime = model->config.lg_hlr; + Context ctx = model->config.lg_ctx; + const Task* task = runtime->get_current_task(ctx); Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) .best_affinity_to(task->target_proc) @@ -1949,7 +1985,6 @@ std::pair, std::unordered_map> // Perform the search std::unique_ptr curr_best_graph; std::unordered_map curr_optimal_views; - if (model->config.only_data_parallel) { Graph *graph = new Graph(model); std::unordered_map op_to_node_map; @@ -2104,12 +2139,20 @@ bool is_valid_strategy( * @param runtime Not used * @return GraphOptimalViewSerialized Serialized optimal PCG */ + GraphOptimalViewSerialized Graph::graph_optimize_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - auto model_config = (*((FFModel **)task->args))->config; + FFModel* model = *((FFModel **)task->args); + return Graph::graph_optimize_wrapper(model); +} + +/*static*/ +GraphOptimalViewSerialized + Graph::graph_optimize_wrapper(FFModel *model) { + auto model_config = model->config; bool perform_memory_search = model_config.perform_memory_search; float memory_threshold = model_config.device_mem; bool only_data_parallel = model_config.only_data_parallel; @@ -2125,7 +2168,7 @@ GraphOptimalViewSerialized // Be optimistic lambdas.emplace_back(std::make_pair(1.0, MemorySearchResult{})); auto try_result = try_one_lambda( - lambdas.back(), task, cached_simulator, perform_memory_search); + lambdas.back(), model, cached_simulator, perform_memory_search); best_graph = std::move(try_result.first); optimal_views = try_result.second; @@ -2141,7 +2184,7 @@ GraphOptimalViewSerialized // Not found the strategy; need to do binary search lambdas.emplace_back(std::make_pair(0.0, MemorySearchResult{})); try_result = try_one_lambda( - lambdas.back(), task, cached_simulator, perform_memory_search); + lambdas.back(), model, cached_simulator, perform_memory_search); best_graph = std::move(try_result.first); optimal_views = try_result.second; @@ -2168,7 +2211,7 @@ GraphOptimalViewSerialized lambdas.emplace_back(std::make_pair(mid, MemorySearchResult{})); try_result = try_one_lambda( - lambdas.back(), task, cached_simulator, perform_memory_search); + lambdas.back(), model, cached_simulator, perform_memory_search); if (!is_valid_strategy(lambdas, try_result.first.get(), @@ -2398,6 +2441,12 @@ GraphOptimalViewSerialized sez.serialize(attn->name, strlen(attn->name)); break; } + case OP_SOFTMAX: { + Softmax *softmax = (Softmax *)op; + sez.serialize(softmax->dim); + sez.serialize(softmax->last_layer); + break; + } case OP_REPARTITION: { Repartition *repart = (Repartition *)op; sez.serialize(repart->repartition_dim); @@ -3021,7 +3070,13 @@ void FFModel::deserialize_graph_optimal_view( break; } case OP_SOFTMAX: { - node = Softmax::deserialize(*this, dez, inputs, num_inputs); + assert(num_inputs == 1); + int softmax_dim; + bool last_layer; + dez.deserialize(softmax_dim); + dez.deserialize(last_layer); + node = + get_or_create_node(inputs[0], {softmax_dim, last_layer}); break; } case OP_TRANSPOSE: { diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 613df1cbcf..7e89646465 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -501,6 +501,57 @@ miopenStatus_t return miopenStatusBadParm; } +miopenStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, + Domain domain) { + int dims[MAX_TENSOR_DIM]; + switch 
(domain.get_dim()) { + case 1: { + Rect<1> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + return miopenSet4dTensorDescriptor(tensor, miopenFloat, dims[0], 1, 1, 1); + } + case 2: { + Rect<2> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[1], dims[0], 1, 1); + } + case 3: { + Rect<3> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[2] * dims[1], dims[0], 1, 1); + } + case 4: { + Rect<4> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[3] * dims[2] * dims[1], dims[0], 1, 1); + } + case 5: { + Rect<5> rect = domain; + int leading_dim_size = rect.hi[4] - rect.lo[4] + 1; + assert(leading_dim_size == 1); + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[3], dims[2], dims[1], dims[0]); + } + default: + assert(false && "Unsupported dim number"); + } + return miopenStatusBadParm; +} + miopenDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { case DT_HALF: diff --git a/src/runtime/machine_view.cc b/src/runtime/machine_view.cc index dadece7691..44dff5a2da 100644 --- a/src/runtime/machine_view.cc +++ b/src/runtime/machine_view.cc @@ -1,4 +1,5 @@ #include "flexflow/machine_view.h" +#include "flexflow/utils/hash_utils.h" namespace FlexFlow { @@ -47,15 +48,15 @@ size_t MachineView::num_parts() const { } size_t MachineView::hash() const { - size_t ret = 17; - ret = ret * 31 + std::hash()(device_type); - ret = ret * 31 + std::hash()(ndims); - ret = ret * 31 + std::hash()(start_device_id); + size_t h = 0; + hash_combine(h, device_type); + hash_combine(h, ndims); + hash_combine(h, start_device_id); for (int i = 0; i < ndims; i++) { - ret = ret * 31 + std::hash()(dim[i]); - ret = ret * 31 + std::hash()(stride[i]); + hash_combine(h, dim[i]); + hash_combine(h, stride[i]); } - return ret; + return h; } int MachineView::get_device_id(DomainPoint const &p) const { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 1fa281777a..65969b45dd 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -93,10 +93,10 @@ Op::Op(FFModel &model, int numWeights, bool allocate_weights, int numOutputs, - const ParallelTensor input1, - const ParallelTensor input2, - const ParallelTensor input3, - const ParallelTensor input4) + ParallelTensor const input1, + ParallelTensor const input2, + ParallelTensor const input3, + ParallelTensor const input4) : Op(model, otype, dtype, @@ -116,10 +116,10 @@ Op::Op(FFModel &model, int _numInputs, int _numWeights, int _numOutputs, - const ParallelTensor _input1, - const ParallelTensor _input2, - const ParallelTensor _input3, - const ParallelTensor _input4) + ParallelTensor const _input1, + ParallelTensor const _input2, + ParallelTensor const _input3, + ParallelTensor const _input4) : op_type(_otype), data_type(_dtype), op_guid(model.op_global_guid++), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs), profiling(model.config.profiling), @@ -601,9 +601,15 @@ ncclComm_t Op::init_nccl_comms_task(Task 
const *task, } } ncclComm_t ncclComm; + fprintf(stderr, "Before ncclCommInitRank\n"); checkNCCL(ncclCommInitRank(&ncclComm, allRanks, ncclId, myRank)); - // fprintf(stderr, "ncclComm(%p) allRanks(%d) myRank(%d) ncclId(%p)\n", - // ncclComm, allRanks, myRank, ncclId); + fprintf(stderr, + "After ncclCommInitRank ncclComm(%p) allRanks(%d) myRank(%d) " + "ncclId(%p)\n", + ncclComm, + allRanks, + myRank, + ncclId); return ncclComm; } @@ -1035,9 +1041,9 @@ void Op::register_output_parallel_dims( operation); } -int Op::get_output_to_input_dim_mapping(const ParallelTensor output, +int Op::get_output_to_input_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor input) { + ParallelTensor const input) { int output_idx = -1, input_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1070,9 +1076,9 @@ int Op::get_output_to_input_dim_mapping(const ParallelTensor output, return -1; } -int Op::get_output_to_weight_dim_mapping(const ParallelTensor output, +int Op::get_output_to_weight_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor weight) { + ParallelTensor const weight) { int output_idx = -1, weight_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1706,7 +1712,7 @@ Tensor FFModel::create_tensor(int numdim, } ParallelTensor FFModel::create_parallel_tensor(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1739,7 +1745,7 @@ Tensor FFModel::create_tensor_legion_ordering(int numdim, ParallelTensor FFModel::create_parallel_tensor_legion_ordering(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1789,7 +1795,7 @@ Tensor FFModel::create_tensor(int const dims[], } template -ParallelTensor FFModel::create_parallel_tensor(const ParallelDim dims[], +ParallelTensor FFModel::create_parallel_tensor(ParallelDim const dims[], DataType data_type, Op const *owner_op, int owner_idx, @@ -1870,7 +1876,7 @@ Parameter FFModel::create_weight(int numdim, } template -ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], +ParallelParameter FFModel::create_parallel_weight(ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1901,7 +1907,7 @@ ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], } ParallelParameter FFModel::create_parallel_weight(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1921,7 +1927,7 @@ ParallelParameter FFModel::create_parallel_weight(int numdim, ParallelParameter FFModel::create_parallel_weight_legion_ordering( int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -2135,7 +2141,7 @@ void FFModel::map_weight_with_dim(ParallelTensor weight, } bool FFModel::get_parallel_tensor_from_tensor( - const Tensor tensor, ParallelTensor ¶llel_tensor) const { + Tensor const tensor, ParallelTensor ¶llel_tensor) const { // check if tensor->parallel_tensor is already set if (tensor->parallel_tensor != nullptr) { parallel_tensor = tensor->parallel_tensor; @@ -2172,7 +2178,7 @@ bool FFModel::get_parallel_tensor_from_tensor( } void FFModel::create_disjoint_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], IndexSpace const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2195,7 +2201,7 @@ void 
FFModel::create_disjoint_partition(int num_dims, template void FFModel::create_disjoint_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], IndexSpaceT const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2228,7 +2234,7 @@ void FFModel::create_disjoint_partition_with_dim2( } void FFModel::create_aliased_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpace const &part_is, LogicalRegion const ®ion, @@ -2252,7 +2258,7 @@ void FFModel::create_aliased_partition(int num_dims, template void FFModel::create_aliased_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpaceT const &part_is, LogicalRegion const ®ion, @@ -2289,7 +2295,7 @@ void FFModel::create_aliased_partition_with_dim2( } template -void FFModel::create_disjoint_partition(const ParallelTensor tensor, +void FFModel::create_disjoint_partition(ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2337,7 +2343,7 @@ void FFModel::create_disjoint_partition(const ParallelTensor tensor, template void FFModel::create_data_parallel_partition_with_diff_dims( - const ParallelTensor tensor, + ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2719,7 +2725,7 @@ IndexSpace FFModel::get_task_is(ParallelConfig const &pc) const { return get_task_is(view); } -IndexSpace FFModel::get_or_create_task_is(const ParallelTensor tensor) { +IndexSpace FFModel::get_or_create_task_is(ParallelTensor const tensor) { MachineView view; view.ndims = 0; for (int i = 0; i < tensor->num_dims; i++) { @@ -3067,10 +3073,17 @@ Op *FFModel::create_operator_from_layer( tensor->parallel_tensor = pt; // start from data parllel tensor if (config.only_data_parallel && - config.computationMode == COMP_MODE_TRAINING) { - Repartition *part = new Repartition( - *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); - operators.push_back(part); + config.numNodes * config.workersPerNode > 1) { + if (pt->dims[num_dims - 1].size == 1) { + Replicate *repl = new Replicate( + *this, pt, num_dims, config.numNodes * config.workersPerNode); + repl->outputs[0]->dims[num_dims].is_replica_dim = true; + operators.push_back(repl); + } else { + Repartition *part = new Repartition( + *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); + operators.push_back(part); + } } return operators[operators.size() - 1]; } @@ -3308,7 +3321,7 @@ bool FFModel::is_mlp_block(int layer_idx) const { } void FFModel::create_operators_from_layers() { - std::map tensors_to_parallel_tensors; + std::map tensors_to_parallel_tensors; // for (auto const &l : layers) { for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { auto const &l = layers[layer_idx]; @@ -3405,12 +3418,15 @@ void FFModel::compile(LossType loss_type, // Launch the graph optimize task { FFModel *model = this; - TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, - TaskArgument(&model, sizeof(FFModel *))); - Future future = runtime->execute_task(ctx, launcher); - - PCG::GraphOptimalViewSerialized ret = - future.get_result(); + PCG::GraphOptimalViewSerialized ret; + if (false) { + TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, + TaskArgument(&model, sizeof(FFModel *))); + Future future = runtime->execute_task(ctx, launcher); + ret = future.get_result(); + } else { + ret = PCG::Graph::graph_optimize_wrapper(this); + } Deserializer dez(ret.data, 
ret.total_bytes); // Reconstruct operators PCG::Graph *best_graph = new PCG::Graph(this); @@ -4061,39 +4077,39 @@ void FFIterationConfig::reset() { // Default Config Parameters struct DefaultConfig { - const static int epochs = 1; + static int const epochs = 1; // const static int iterations = 1; - const static int batchSize = 64; - const static bool profiling = false; - const static bool benchmarking = false; - const static bool inference_debugging = false; + static int const batchSize = 64; + static bool const profiling = false; + static bool const benchmarking = false; + static bool const inference_debugging = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; - const static size_t workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB - const static int numNodes = 1; - const static int workersPerNode = 0; - const static int cpusPerNode = 0; - const static size_t searchBudget = -1; - const static size_t simulatorWorkSpaceSize = + static size_t const workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB + static int const numNodes = 1; + static int const workersPerNode = 0; + static int const cpusPerNode = 0; + static size_t const searchBudget = -1; + static size_t const simulatorWorkSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2 GB constexpr static float searchAlpha = 1.2f; - const static bool searchOverlapBackwardUpdate = false; - const static size_t offloadReserveSpaceSize = + static bool const searchOverlapBackwardUpdate = false; + static size_t const offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB - const static bool cpuOffload = false; - const static bool onlyDataParallel = true; - const static bool enableSampleParallel = true; - const static bool enableParameterParallel = false; - const static bool enableAttributeParallel = false; - const static bool enableInplaceOptimizations = false; - const static bool allowTensorOpMathConversion = false; - const static int machine_model_version = 0; - const static int simulator_segment_size = 16777216; // 16 MB - const static int simulator_max_num_segments = 1; - const static int base_optimize_threshold = 10; - const static bool enable_control_replication = true; + static bool const cpuOffload = false; + static bool const onlyDataParallel = true; + static bool const enableSampleParallel = true; + static bool const enableParameterParallel = false; + static bool const enableAttributeParallel = false; + static bool const enableInplaceOptimizations = false; + static bool const allowTensorOpMathConversion = false; + static int const machine_model_version = 0; + static int const simulator_segment_size = 16777216; // 16 MB + static int const simulator_max_num_segments = 1; + static int const base_optimize_threshold = 10; + static bool const enable_control_replication = true; // The default python data loader type is 2 to enable control replication - const static int python_data_loader_type = 2; + static int const python_data_loader_type = 2; }; FFConfig::FFConfig() { diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 8f1be15fd1..693da36b1d 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -136,10 +136,15 @@ bool ParallelTensorShape::operator!=(ParallelTensorShape const &other) const { size_t ParallelTensorShape::get_piece_size() const { size_t piece_size = data_type_size(this->data_type); + return piece_size * this->get_piece_num_elements(); +} + +size_t ParallelTensorShape::get_piece_num_elements() const { + size_t 
piece_num_elements = 1; for (int i = 0; i < this->num_dims; i++) { - piece_size *= this->dims[i].size / this->dims[i].degree; + piece_num_elements *= this->dims[i].size / this->dims[i].degree; } - return piece_size; + return piece_num_elements; } RecordFormatter ParallelTensorShape::as_dot() const { @@ -270,7 +275,7 @@ void ParallelTensorBase::attach_raw_ptr(FFConfig &config, Runtime *runtime = config.lg_hlr; AttachLauncher launcher(EXTERNAL_INSTANCE, region, region); std::vector fields(1, FID_DATA); - const Memory local_sysmem = + Memory const local_sysmem = Machine::MemoryQuery(Machine::get_machine()) .has_affinity_to(runtime->get_executing_processor(ctx)) .only_kind(Memory::SYSTEM_MEM) diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index c0804d6e19..a585424afc 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -1214,6 +1214,7 @@ void Graph::export_strategy_computation_graph( for (auto const &node : s.get_nodes(*this)) { // Add node if (strategy.find(node) == strategy.end()) { + dot.add_node(node, {{"label", node.to_string()}}); // Check FusedParallel node here and print out the detailed information if (node.ptr->op_type == OperatorType::OP_FUSED_PARALLEL) { RecordFormatter rf; @@ -1924,6 +1925,7 @@ void GraphSearchHelper::graph_optimize( this->logger->debug() << "Starting graph optimization"; Graph *graph = this->construct_graph(); + graph->print_dot(); graph->duplicate_input_nodes(); std::unordered_map empty_strategy; if (!this->config.export_strategy_computation_graph_file.empty()) { @@ -3622,6 +3624,7 @@ void FFModel::graph_optimize( this->graph_search->graph_optimize( budget, only_data_parallel, best_graph, optimal_views); } + best_graph->print_dot(); } bool FFModel::convert_graph_to_operators( @@ -3754,7 +3757,7 @@ bool FFModel::convert_graph_to_operators( assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; new_op = new Softmax( - *this, softmax->layer_guid, inputs[0], softmax->dim, NULL); + *this, inputs[0], softmax->dim, softmax->last_layer, NULL); break; } case OP_COMBINE: {