
Commit 024d188

Merge remote-tracking branch 'xinhao/xinhao_candle' into xinhao_inference

2 parents d54e4b6 + f65044d
xinhaoc committed May 17, 2024
Showing 55 changed files with 1,377 additions and 310 deletions.
94 changes: 82 additions & 12 deletions CMakeLists.txt
@@ -168,6 +168,17 @@ if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
include(cudnn)
endif()

# NCCL
if(FF_USE_NCCL)
if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda")
include(nccl)
endif()
list(APPEND FF_CC_FLAGS
-DFF_USE_NCCL)
list(APPEND FF_NVCC_FLAGS
-DFF_USE_NCCL)
endif()

# Legion
include(legion)

@@ -383,19 +394,78 @@ if(NOT BUILD_LEGION_ONLY)
add_dependencies(flexflow ${NCCL_NAME})
endif()

target_include_directories(flexflow PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
# LEGION_URL is defined if we found a precompiled Legion library to download
if(LEGION_URL)
# Legion builds produce two library files: one for the Legion runtime and one for the Realm runtime.
# When linking FlexFlow to a precompiled version of Legion, we need to manually link to both library files.
target_link_libraries(flexflow ${LEGION_LIBRARY} ${REALM_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
add_dependencies(flexflow ${LEGION_NAME})
else()
# When building Legion from source, we do so by calling add_subdirectory(), and obtain a library with both the
# Legion and Realm runtimes. The library's name is saved into the LEGION_LIBRARY variable. Hence, we only need
# to link FlexFlow to ${LEGION_LIBRARY}
target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/hip ${ROCM_PATH})

find_package(hip REQUIRED)

if (FF_GPU_BACKEND STREQUAL "hip_cuda")
# The targets defined by the hip cmake config only target amd devices.
# For targeting nvidia devices, we'll make our own interface target,
# hip_device_nvidia, that includes the rocm and hip headers.
add_library(hip_device_nvidia INTERFACE)

if (NOT FF_CUDA_ARCH STREQUAL "")
target_compile_options(hip_device_nvidia INTERFACE -arch=compute_${FF_CUDA_ARCH})
endif()

target_include_directories(hip_device_nvidia SYSTEM INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include)
target_include_directories(hip_device_nvidia INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include)

add_compile_definitions(FF_USE_HIP_CUDA)

# Linking cuda:
# We do not explicitly link cuda. hipcc when targeting nvidia will
# use nvcc under the hood. nvcc when used for linking will handle
# linking cuda dependencies
target_link_libraries(flexflow hip_device_nvidia)
elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
find_package(hipblas REQUIRED)
find_package(miopen REQUIRED)
if(FF_USE_NCCL)
find_package(rccl REQUIRED)
endif()
# find_package(rocrand REQUIRED)
find_library(HIP_RAND_LIBRARY hiprand REQUIRED)

add_compile_definitions(FF_USE_HIP_ROCM)
# The hip cmake config module defines three targets,
# hip::amdhip64, hip::host, and hip::device.
#
# hip::host and hip::device are interface targets. hip::amdhip64 is an
# imported target for libamdhip.
#
# You do not directly link to hip::amdhip64. hip::host links to hip::amdhip64
# and hip::device links to hip::host. Link to hip::host to just use hip without
# compiling any GPU code. Link to hip::device to compile the GPU device code.
#
# Docs (outdated):
# https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html
target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY})
if(FF_USE_NCCL)
target_link_libraries(flexflow rccl)
endif()
endif()
else()
message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}")
endif()

if(FF_USE_NCCL AND (FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda"))
add_dependencies(flexflow ${NCCL_NAME})
endif()

target_include_directories(flexflow PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
# LEGION_URL is defined if we found a precompiled Legion library to download
if(LEGION_URL)
# Legion builds produce two library files: one for the Legion runtime and one for the Realm runtime.
# When linking FlexFlow to a precompiled version of Legion, we need to manually link to both library files.
target_link_libraries(flexflow ${LEGION_LIBRARY} ${REALM_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
add_dependencies(flexflow ${LEGION_NAME})
else()
# When building Legion from source, we do so by calling add_subdirectory(), and obtain a library with both the
# Legion and Realm runtimes. The library's name is saved into the LEGION_LIBRARY variable. Hence, we only need
# to link FlexFlow to ${LEGION_LIBRARY}
target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
endif()

#library api version, bump from time to time
set(SOVERSION 1)
5 changes: 1 addition & 4 deletions cmake/json.cmake
@@ -1,4 +1 @@
include(FetchContent)

FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz)
FetchContent_MakeAvailable(json)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/json)
7 changes: 6 additions & 1 deletion config/config.inc
@@ -118,6 +118,11 @@ if [ "$FF_LEGION_NETWORKS" = "gasnet" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=mpi"
elif [ "$FF_GASNET_CONDUIT" = "udp" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=udp"
elif [ "$FF_GASNET_CONDUIT" = "ucx" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ucx"
SET_LEGION_NETWORKS+=" -DFF_UCX_URL=$FF_UCX_URL"
elif [ "$FF_GASNET_CONDUIT" = "ofi" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ofi"
fi
elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then
SET_LEGION_NETWORKS+=" -DFF_LEGION_NETWORKS=ucx"
@@ -235,7 +240,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then
SET_CXX="-DCMAKE_CXX_COMPILER=$(pwd)/nvidia_hipcc -DCMAKE_CXX_LINKER=$(pwd)/nvidia_hipcc"
else
ADD_ROCM_TO_PATH="PATH=${PATH}:${ROCM_PATH}/bin"
#SET_CXX="-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_CXX_LINKER=/opt/rocm/bin/hipcc"
SET_CXX="-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc -DCMAKE_CXX_LINKER=$ROCM_PATH/bin/hipcc -DHIP_PATH=$ROCM_PATH/hip -DCMAKE_CXX_FLAGS='-I${MPICH_DIR}/include' -DCMAKE_EXE_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi' -DCMAKE_SHARED_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi'"
fi
fi
fi
6 changes: 2 additions & 4 deletions config/config.linux
@@ -59,7 +59,7 @@ FF_USE_PYTHON=${FF_USE_PYTHON:-ON}
FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-}

# select GASNET conduit
FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv}
FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ofi}

# set UCX dir if Legion networks is set to ucx
UCX_DIR=${UCX_DIR:-""}
@@ -99,11 +99,9 @@ FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda}
if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then
echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid."
exit 1
elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm" ]]; then
elif [["$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm"]]; then
# enable NCCL
FF_USE_NCCL=${FF_USE_NCCL:-ON}
else
FF_USE_NCCL=OFF
fi

function get_build_configs() {
2 changes: 1 addition & 1 deletion deps/legion
Submodule legion updated from 24e8c4 to 626b55
117 changes: 73 additions & 44 deletions examples/python/pytorch/mt5/mt5_ff.py
@@ -3,16 +3,17 @@
import sys

import numpy as np
import torch
from flexflow.core import *
from flexflow.torch.model import PyTorchModel
from transformers import MT5ForConditionalGeneration, T5Tokenizer

#from transformers import MT5ForConditionalGeneration, T5Tokenizer
from transformers import BertForMaskedLM, BertTokenizer
sys.path.append("./examples/python/pytorch/mt5")
from mt5_torch import DataPreparer, get_dataloaders, set_seed

BASE_DIR = "examples/python/pytorch/mt5"
DATA_DIR = os.path.join(BASE_DIR, "data")
NUMPY_DIR = os.path.join(DATA_DIR, "numpy")
NUMPY_DIR = os.path.join(DATA_DIR, "numpy_candle")


def data_to_numpy() -> None:
@@ -28,15 +29,17 @@ def data_to_numpy() -> None:
"""
model_params = {
"SEED": 42,
"MODEL": "google/mt5-small",
#"MODEL": "google/mt5-small",
"MODEL": "bert-base-uncased",
"TRAIN_BATCH_SIZE": None, # use the full dataset as one batch
"EVAL_BATCH_SIZE": None, # use the full dataset as one batch
"TRAIN_EPOCHS": 1, # unused
"MAX_SOURCE_TEXT_LENGTH": 48,
"MAX_TARGET_TEXT_LENGTH": 48,
}
set_seed(model_params)
tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
#tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
tokenizer = BertTokenizer.from_pretrained(model_params["MODEL"])
print("Getting dataloaders...")
train_loader, eval_loader = get_dataloaders(tokenizer, model_params)
assert len(train_loader) == 1
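
For context, a minimal sketch of what the swapped-in BertTokenizer produces; the sample sentence is made up and max_length=48 is taken from MAX_SOURCE_TEXT_LENGTH above, so the shapes are illustrative rather than those of the real dataset:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Encode one sample to a fixed length, roughly the way the dataloaders would;
# arrays of this kind are what the script later reloads as
# train_input_ids.npy / train_attention_mask.npy.
enc = tokenizer("a sample sentence", max_length=48,
                padding="max_length", truncation=True, return_tensors="np")
print(enc["input_ids"].shape, enc["attention_mask"].shape)  # (1, 48) each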
@@ -61,8 +64,8 @@ def preprocess_train() -> None:
y_shape = y.shape
assert len(y.shape) == 2, \
"`y` should have shape (num examples, sequence length)"
y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long)
lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long)
y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32)
lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32)
y_ids[:, :] = y[:, :-1]
lm_labels[:, :] = y[:, 1:]
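
A standalone illustration of the shift computed above, using the np.int32 dtype this change switches to in place of the deprecated np.long alias; the toy y array is made up:

import numpy as np

# Toy target batch: (num examples, sequence length).
y = np.array([[11, 12, 13, 14],
              [21, 22, 23, 24]], dtype=np.int32)

# Decoder inputs drop the last token and labels drop the first, so
# lm_labels[:, t] is the token to predict after seeing y_ids[:, :t + 1].
y_ids = np.empty((y.shape[0], y.shape[1] - 1), dtype=np.int32)
lm_labels = np.empty((y.shape[0], y.shape[1] - 1), dtype=np.int32)
y_ids[:, :] = y[:, :-1]      # [[11, 12, 13], [21, 22, 23]]
lm_labels[:, :] = y[:, 1:]   # [[12, 13, 14], [22, 23, 24]]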

@@ -81,36 +84,53 @@ def preprocess_train() -> None:
def top_level_task():
ffconfig = FFConfig()
ffmodel = FFModel(ffconfig)
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

#model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
#model = BertModel.from_pretrained("bert-base-uncased")
# Load train data as numpy arrays
print("Loading data...")
ids = np.load(os.path.join(NUMPY_DIR, "train_source_ids.npy"))
mask = np.load(os.path.join(NUMPY_DIR, "train_source_mask.npy"))
y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy"))
lm_labels = np.load(os.path.join(NUMPY_DIR, "train_lm_labels.npy"))
ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")).astype('int32')
ids = np.pad(ids, ((0,0), (0,17)), 'constant')
#ids = np.random.randint(0, 5, (1000, 512))
#print('ids_shape', ids.shape)
#print('ids', ids)
mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")).astype('int32')
mask = np.pad(mask, ((0,0), (0,17)), 'constant')
#mask = np.random.randint(0, 2, (1000, 512))
#y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy"))
lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")).astype('int32')
lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant')
#lm_labels = np.random.randint(-1, 5, (1000, 512))
position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy()
token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy()


batch_size = ffconfig.batch_size
input_ids_shape = (batch_size, ids.shape[1])
attention_mask_shape = (batch_size, mask.shape[1])
decoder_input_ids_shape = (batch_size, y_ids.shape[1])
#decoder_input_ids_shape = (batch_size, y_ids.shape[1])
input_tensors = [
ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids
ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask
ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids
ffmodel.create_tensor(input_ids_shape, DataType.DT_INT32), # input_ids
ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT32), # attention_mask
#ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids
]
encoder_seq_length = ids.shape[1]
decoder_seq_length = y_ids.shape[1]
seq_length = (encoder_seq_length, decoder_seq_length)
input_names = ["input_ids", "attention_mask", "decoder_input_ids"]
#decoder_seq_length = y_ids.shape[1]
#seq_length = (encoder_seq_length, decoder_seq_length)
seq_length = encoder_seq_length
#input_names = ["input_ids", "attention_mask", "decoder_input_ids"]
input_names = ["input_ids", "attention_mask"]

print("Tracing the model...")
hf_model = PyTorchModel(
model, is_hf_model=True, input_names=input_names,
batch_size=batch_size, seq_length=seq_length,
)
output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True)
ffoptimizer = SGDOptimizer(ffmodel, lr=0.01)
#from flexflow.torch.model import file_to_ff
#file_to_ff("mt5.ff", ffmodel, input_tensors)
ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, beta1=0.9, beta2=0.98, weight_decay=0.0, epsilon=2e-8)
# ffoptimizer = SGDOptimizer(ffmodel, lr=0.01)

print("Compiling the model...")
ffmodel.compile(
@@ -121,13 +141,21 @@ def top_level_task():
MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY,
],
)

# load weights here
ffmodel.load_bert_pretrained(checkpoint=model)

print("Creating data loaders...")
print('id_dtype', ids.dtype)
print('mask_dtype', mask.dtype)
print('labels_dtype', lm_labels.dtype)
input_ids_dl = ffmodel.create_data_loader(input_tensors[0], ids)
attention_mask_dl = ffmodel.create_data_loader(input_tensors[1], mask)
decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids)
#decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids)
# NOTE: We cast down the label tensor data to 32-bit to accommodate the
# label tensor's required dtype
token_type_ids_dl = ffmodel.create_data_loader(input_tensors[2], token_type_ids)
position_id_dl = ffmodel.create_data_loader(input_tensors[3], position_id)
labels_dl = ffmodel.create_data_loader(
ffmodel.label_tensor, lm_labels.astype("int32")
)
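
As a rough sketch of the arrays the new data loaders consume — the pre-padding width of 495 below is made up (the real width comes from the saved .npy files); only the padding and the auxiliary-id construction mirror the code above:

import numpy as np
import torch

# Hypothetical pre-padding shape: 8 examples, 495 tokens each.
ids = np.zeros((8, 495), dtype=np.int32)
mask = np.ones((8, 495), dtype=np.int32)

# Right-pad the token dimension with zeros, as the script does, so the
# arrays match the width of the FlexFlow input tensors.
ids = np.pad(ids, ((0, 0), (0, 17)), 'constant')
mask = np.pad(mask, ((0, 0), (0, 17)), 'constant')

# BERT also takes position ids (0..L-1) and token type ids (all zeros for
# single-segment input); the script builds a single (1, L) row of each.
position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy()
token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy()

print(ids.shape, mask.shape)                    # (8, 512) (8, 512)
print(position_id.shape, token_type_ids.shape)  # (1, 512) (1, 512)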
@@ -138,31 +166,32 @@ def top_level_task():
print("Training...")
epochs = ffconfig.epochs
ffmodel.fit(
x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl],
#x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl],
x=[input_ids_dl, attention_mask_dl, position_id_dl, token_type_ids_dl],
y=labels_dl, batch_size=batch_size, epochs=epochs,
)


if __name__ == "__main__":
# Generate the .tsv files if needed
if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \
not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")):
DataPreparer.data_to_tsv()
# Convert the .tsv files to .npy if needed
if not os.path.exists(NUMPY_DIR):
os.mkdir(NUMPY_DIR)
prefixes = ["train_", "eval_"]
suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"]
npy_filenames = [
pre + suf for pre, suf in itertools.product(prefixes, suffixes)
]
if any(
not os.path.exists(os.path.join(NUMPY_DIR, filename))
for filename in npy_filenames
):
data_to_numpy()
# Preprocess the training data if needed
if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \
not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")):
preprocess_train()
## Generate the .tsv files if needed
#if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \
# not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")):
# DataPreparer.data_to_tsv()
## Convert the .tsv files to .npy if needed
#if not os.path.exists(NUMPY_DIR):
# os.mkdir(NUMPY_DIR)
#prefixes = ["train_", "eval_"]
#suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"]
#npy_filenames = [
# pre + suf for pre, suf in itertools.product(prefixes, suffixes)
#]
#if any(
# not os.path.exists(os.path.join(NUMPY_DIR, filename))
# for filename in npy_filenames
#):
# data_to_numpy()
## Preprocess the training data if needed
#if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \
# not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")):
# preprocess_train()
top_level_task()
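
With the data-preparation steps in __main__ commented out, the script now assumes the pre-generated arrays already exist under numpy_candle. A hypothetical guard (check_inputs is not part of this change) that fails fast when they are missing could look like:

import os
import sys

REQUIRED = ("train_input_ids.npy", "train_attention_mask.npy", "train_labels.npy")

def check_inputs(numpy_dir: str) -> None:
    # Fail fast with a clear message if the pre-generated BERT arrays are
    # missing, since the in-script generation path is commented out above.
    missing = [f for f in REQUIRED
               if not os.path.exists(os.path.join(numpy_dir, f))]
    if missing:
        sys.exit(f"Missing {', '.join(missing)} in {numpy_dir}; "
                 "re-enable data_to_numpy() or copy the arrays in first.")

# e.g. check_inputs(NUMPY_DIR) before calling top_level_task()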
4 changes: 2 additions & 2 deletions examples/python/pytorch/mt5/mt5_torch.py
@@ -7,7 +7,7 @@
import os

import numpy as np
import pandas as pd
#import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import MT5ForConditionalGeneration, T5Tokenizer
@@ -311,5 +311,5 @@ def TorchMT5Trainer(
"MAX_TARGET_TEXT_LENGTH": 48,
"LEARNING_RATE": 1e-4,
}
device = torch.device(0)
device = torch.device('cpu')
TorchMT5Trainer(model_params, device)
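
The reference trainer is now pinned to the CPU. If GPU execution is still wanted when one is available, a common fallback pattern (not part of this change) is:

import torch

# Use the first visible GPU when present, otherwise fall back to the CPU
# that the committed code now uses unconditionally.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')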
(Diff truncated: 48 of the 55 changed files are not shown.)
