Bert fix2 #6

Open
wants to merge 82 commits into merged_bert
Changes from 81 commits
82 commits
2436152
push changes to print dot graph
Apr 17, 2023
7030ae0
padded input to 512, added isExact to slice_tensor
Apr 21, 2023
280d29a
Add multiprecision support to replicate
lockshaw May 7, 2023
8e84401
Add explicit template instantiations for replicate kernels
lockshaw May 7, 2023
a2ddb91
Fix incorrect instantiations
lockshaw May 8, 2023
52cc8e8
Add nop init_task for replicate
lockshaw May 8, 2023
9eb530a
Fix replicate init_task registration
lockshaw May 8, 2023
d7a219c
Hopefully print hip errors
lockshaw May 10, 2023
f323a6e
Instantiate extra hip replicate kernels
lockshaw May 10, 2023
9a75f24
fix
jiazhihao May 10, 2023
fe61cbe
Merge branch 'BertMLM_fixes' of https://github.com/flexflow/FlexFlow …
jiazhihao May 10, 2023
c9277f3
debug changes
jiazhihao May 10, 2023
fe66561
Add slice_tensor fix
May 12, 2023
9e04302
Merge branch 'BertMLM_fixes' of github.com:flexflow/FlexFlow into Ber…
May 12, 2023
1177748
Add logging for metrics
lockshaw May 12, 2023
5b7cace
Add the cuda metrics hack to hip kernel as well
lockshaw May 12, 2023
e798a91
Add parallel dim pretty printing
lockshaw May 12, 2023
90541cf
[Embedding] bug fix
jiazhihao May 12, 2023
63fcde6
Merge branch 'BertMLM_fixes' of https://github.com/flexflow/FlexFlow …
jiazhihao May 12, 2023
7862143
Add replica dim to pretty print
lockshaw May 12, 2023
9663d96
Merge remote-tracking branch 'refs/remotes/origin/BertMLM_fixes' into…
lockshaw May 12, 2023
ef43c36
Fix replicate issue with python hack
tnoyola May 12, 2023
dd8090e
Use local json submodule
lockshaw May 20, 2023
0dc6187
ofi conduit-related fixes
May 20, 2023
0950ac7
Add mpi flags for hip
lockshaw May 22, 2023
4b06040
fix fusion bug
jiazhihao May 24, 2023
6796b1c
Merge branch 'BertMLM_fixes' of https://github.com/flexflow/FlexFlow …
jiazhihao May 24, 2023
99e9f95
increase the max number of regions in a ZeroInitMeta from 64 to 128
jiazhihao May 24, 2023
282c44a
support mixed precision
jiazhihao May 24, 2023
992dcb9
undo changes to Fused::Transpose
jiazhihao May 24, 2023
f528774
undo changes to config.linux
jiazhihao May 24, 2023
a68150d
try to fix layernorm
jiazhihao Jun 2, 2023
2bf9afc
fix typo
jiazhihao Jun 2, 2023
f6f7a32
Add possible layernorm fix
lockshaw Jun 3, 2023
5e03b0a
Fix additional layernorm bug due to get_piece_size return size in bytes
lockshaw Jun 3, 2023
53fb8bd
Bugfixes
tnoyola Jun 3, 2023
449a14c
Actually check elementwise_affine
lockshaw Jun 3, 2023
c737be6
Revert "Actually check elementwise_affine"
tnoyola Jun 5, 2023
a98e09d
Change optimizer to adam with correct hyperparams
lockshaw Jun 6, 2023
66b805e
Merge remote-tracking branch 'refs/remotes/origin/BertMLM_fixes' into…
tnoyola Jun 6, 2023
4bec811
fix training bert model.
xinhaoc Jul 4, 2023
2d28c15
revert changes
xinhaoc Jul 4, 2023
2025d56
fix bert training issue. (#832)
xinhaoc Jul 5, 2023
5f793c1
Improve machine_view hash
lockshaw Jul 25, 2023
2c09397
Fix bugs in improved hashing
lockshaw Jul 25, 2023
862e9d7
fix weight dimension in layernorm
xinhaoc Jul 25, 2023
d29bf1d
Merge branch 'BertMLM_fixes' of https://github.com/flexflow/FlexFlow …
xinhaoc Jul 25, 2023
88ad5fa
Merge remote-tracking branch 'origin/master' into BertMLM_fixes
lockshaw Aug 8, 2023
2eee875
fix `preregister_task_variant` issue, linting
goliaro Aug 11, 2023
b9d1332
try to run graph_optimize on each node
jiazhihao Aug 14, 2023
b5b0815
remove unnecessary file
jiazhihao Aug 14, 2023
94e35d9
fix hip build
xinhaoc Aug 15, 2023
ac185e3
Merge branch 'BertMLM_fixes' of https://github.com/flexflow/FlexFlow …
xinhaoc Aug 15, 2023
ded175c
bypass simulator creation when only_data_parallel is specified
jiazhihao Aug 18, 2023
1f7e8b7
add nccl prints
jiazhihao Aug 18, 2023
3fb70f6
.
jiazhihao Aug 21, 2023
d652b62
rccl
xinhaoc Aug 29, 2023
b39528b
fix fuse
xinhaoc Oct 6, 2023
0cf3c8e
fix hip
xinhaoc Oct 6, 2023
17a1c4e
more fix to hip
xinhaoc Oct 6, 2023
f65044d
customized kernel for broadcasting add.
xinhaoc Nov 3, 2023
bcab56a
dropout
xinhaoc Dec 20, 2023
fa1fffc
optimizer
xinhaoc Dec 20, 2023
40d830c
opt
xinhaoc Dec 21, 2023
e825526
fix
xinhaoc Dec 21, 2023
d2bdb15
fix
xinhaoc Jan 12, 2024
3b9e1c6
.
xinhaoc Jan 12, 2024
9f8bb9e
fix
xinhaoc Jan 12, 2024
fb91122
remove print
xinhaoc Jan 12, 2024
c162d4c
fix hip
xinhaoc Jan 19, 2024
ea79317
fix multinodes
xinhaoc Jan 19, 2024
58d84ed
fix
xinhaoc Jan 19, 2024
01c9d4c
fix
xinhaoc Jan 25, 2024
a31f8e9
fix
xinhaoc Jan 26, 2024
9141c46
tp
xinhaoc Feb 2, 2024
8185289
timer
xinhaoc Feb 15, 2024
d958805
rmv
xinhaoc Feb 16, 2024
38dfd87
fix tp
xinhaoc Feb 22, 2024
355d4b4
try a fix
xinhaoc Feb 29, 2024
8488ba0
fix hip
xinhaoc Sep 13, 2024
1753e7e
Merge remote-tracking branch 'xinhao/merged_bert' into bert_fix2
xinhaoc Sep 26, 2024
d5496e9
update submodule
xinhaoc Sep 26, 2024
5 changes: 1 addition & 4 deletions cmake/json.cmake
@@ -1,4 +1 @@
include(FetchContent)

FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz)
FetchContent_MakeAvailable(json)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/json)
4 changes: 1 addition & 3 deletions config/config.linux
@@ -59,7 +59,7 @@ FF_USE_PYTHON=${FF_USE_PYTHON:-ON}
FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-}

# select GASNET conduit
FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv}
FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ofi}

# set UCX dir if Legion networks is set to ucx
UCX_DIR=${UCX_DIR:-""}
@@ -102,8 +102,6 @@ if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then
elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm" ]]; then
# enable NCCL
FF_USE_NCCL=${FF_USE_NCCL:-ON}
else
FF_USE_NCCL=OFF
fi

function get_build_configs() {
2 changes: 1 addition & 1 deletion deps/legion
Are you sure this is the right version of Legion? Looks like you used the old version of Legion.

Submodule legion updated from 0d32b3 to 626b55
1 change: 0 additions & 1 deletion deps/tokenizers-cpp
Submodule tokenizers-cpp deleted from c0fab1
127 changes: 83 additions & 44 deletions examples/python/pytorch/mt5/mt5_ff.py
@@ -3,16 +3,18 @@
import sys

import numpy as np
import torch
from flexflow.core import *
import flexflow.core as ff
from flexflow.torch.model import PyTorchModel
from transformers import MT5ForConditionalGeneration, T5Tokenizer

#from transformers import MT5ForConditionalGeneration, T5Tokenizer
from transformers import BertForMaskedLM, BertTokenizer, BertConfig
sys.path.append("./examples/python/pytorch/mt5")
from mt5_torch import DataPreparer, get_dataloaders, set_seed

BASE_DIR = "examples/python/pytorch/mt5"
DATA_DIR = os.path.join(BASE_DIR, "data")
NUMPY_DIR = os.path.join(DATA_DIR, "numpy")
NUMPY_DIR = os.path.join(DATA_DIR, "numpy_candle")


def data_to_numpy() -> None:
@@ -28,15 +30,17 @@ def data_to_numpy() -> None:
"""
model_params = {
"SEED": 42,
"MODEL": "google/mt5-small",
#"MODEL": "google/mt5-small",
"MODEL": "bert-base-uncased",
"TRAIN_BATCH_SIZE": None, # use the full dataset as one batch
"EVAL_BATCH_SIZE": None, # use the full dataset as one batch
"TRAIN_EPOCHS": 1, # unused
"MAX_SOURCE_TEXT_LENGTH": 48,
"MAX_TARGET_TEXT_LENGTH": 48,
}
set_seed(model_params)
tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
#tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
tokenizer = BertTokenizer.from_pretrained(model_params["MODEL"])
print("Getting dataloaders...")
train_loader, eval_loader = get_dataloaders(tokenizer, model_params)
assert len(train_loader) == 1
@@ -61,8 +65,8 @@ def preprocess_train() -> None:
y_shape = y.shape
assert len(y.shape) == 2, \
"`y` should have shape (num examples, sequence length)"
y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long)
lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long)
y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32)
lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32)
y_ids[:, :] = y[:, :-1]
lm_labels[:, :] = y[:, 1:]

@@ -81,36 +85,60 @@
def top_level_task():
ffconfig = FFConfig()
ffmodel = FFModel(ffconfig)
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

#model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
# config = BertConfig.from_pretrained('bert-base-uncased')

# # Modify the configuration to set a different number of layers
# config.num_hidden_layers = 1 # Set the number of layers you want
# model = BertForMaskedLM.from_pretrained("bert-base-uncased", config=config)
# model.num_layers = 1
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
#model = BertModel.from_pretrained("bert-base-uncased")
# Load train data as numpy arrays
print("Loading data...")
ids = np.load(os.path.join(NUMPY_DIR, "train_source_ids.npy"))
mask = np.load(os.path.join(NUMPY_DIR, "train_source_mask.npy"))
y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy"))
lm_labels = np.load(os.path.join(NUMPY_DIR, "train_lm_labels.npy"))
ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")).astype('int32')
ids = np.pad(ids, ((0,0), (0,17)), 'constant')
#ids = np.random.randint(0, 5, (1000, 512))
#print('ids_shape', ids.shape)
#print('ids', ids)
mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")).astype('int32')
mask = np.pad(mask, ((0,0), (0,17)), 'constant')
#mask = np.random.randint(0, 2, (1000, 512))
#y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy"))
lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")).astype('int32')
lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant')
#lm_labels = np.random.randint(-1, 5, (1000, 512))
position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy()
token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy()


batch_size = ffconfig.batch_size
input_ids_shape = (batch_size, ids.shape[1])
attention_mask_shape = (batch_size, mask.shape[1])
decoder_input_ids_shape = (batch_size, y_ids.shape[1])
#decoder_input_ids_shape = (batch_size, y_ids.shape[1])
input_tensors = [
ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids
ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask
ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids
ffmodel.create_tensor(input_ids_shape, DataType.DT_INT32), # input_ids
ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT32), # attention_mask
#ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids
]
encoder_seq_length = ids.shape[1]
decoder_seq_length = y_ids.shape[1]
seq_length = (encoder_seq_length, decoder_seq_length)
input_names = ["input_ids", "attention_mask", "decoder_input_ids"]
#decoder_seq_length = y_ids.shape[1]
#seq_length = (encoder_seq_length, decoder_seq_length)
seq_length = encoder_seq_length
#input_names = ["input_ids", "attention_mask", "decoder_input_ids"]
input_names = ["input_ids", "attention_mask"]

print("Tracing the model...")
print(batch_size)
hf_model = PyTorchModel(
model, is_hf_model=True, input_names=input_names,
batch_size=batch_size, seq_length=seq_length,
)
output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True)
ffoptimizer = SGDOptimizer(ffmodel, lr=0.01)
#from flexflow.torch.model import file_to_ff
#file_to_ff("mt5.ff", ffmodel, input_tensors)
ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, beta1=0.9, beta2=0.98, weight_decay=0.0, epsilon=2e-8)
# ffoptimizer = SGDOptimizer(ffmodel, lr=0.01)

print("Compiling the model...")
ffmodel.compile(
@@ -121,13 +149,21 @@ def top_level_task():
MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY,
],
)

# load weights here
ffmodel.load_bert_pretrained(checkpoint=model)

print("Creating data loaders...")
print('id_dtype', ids.dtype)
print('mask_dtype', mask.dtype)
print('labels_dtype', lm_labels.dtype)
input_ids_dl = ffmodel.create_data_loader(input_tensors[0], ids)
attention_mask_dl = ffmodel.create_data_loader(input_tensors[1], mask)
decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids)
#decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids)
# NOTE: We cast down the label tensor data to 32-bit to accommodate the
# label tensor's required dtype
token_type_ids_dl = ffmodel.create_data_loader(input_tensors[2], token_type_ids)
position_id_dl = ffmodel.create_data_loader(input_tensors[3], position_id)
labels_dl = ffmodel.create_data_loader(
ffmodel.label_tensor, lm_labels.astype("int32")
)
@@ -138,31 +174,34 @@
print("Training...")
epochs = ffconfig.epochs
ffmodel.fit(
x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl],
#x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl],
x=[input_ids_dl, attention_mask_dl, position_id_dl, token_type_ids_dl],
y=labels_dl, batch_size=batch_size, epochs=epochs,
)


if __name__ == "__main__":
# Generate the .tsv files if needed
if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \
not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")):
DataPreparer.data_to_tsv()
# Convert the .tsv files to .npy if needed
if not os.path.exists(NUMPY_DIR):
os.mkdir(NUMPY_DIR)
prefixes = ["train_", "eval_"]
suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"]
npy_filenames = [
pre + suf for pre, suf in itertools.product(prefixes, suffixes)
]
if any(
not os.path.exists(os.path.join(NUMPY_DIR, filename))
for filename in npy_filenames
):
data_to_numpy()
# Preprocess the training data if needed
if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \
not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")):
preprocess_train()
## Generate the .tsv files if needed
#if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \
# not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")):
# DataPreparer.data_to_tsv()
## Convert the .tsv files to .npy if needed
#if not os.path.exists(NUMPY_DIR):
# os.mkdir(NUMPY_DIR)
#prefixes = ["train_", "eval_"]
#suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"]
#npy_filenames = [
# pre + suf for pre, suf in itertools.product(prefixes, suffixes)
#]
#if any(
# not os.path.exists(os.path.join(NUMPY_DIR, filename))
# for filename in npy_filenames
#):
# data_to_numpy()
## Preprocess the training data if needed
#if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \
# not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")):
# preprocess_train()
configs = ff.get_configs()
ff.init_flexflow_runtime(configs)
top_level_task()
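Note on the padding in the hunks above: the script pads `ids`, `mask`, and `lm_labels` by a hard-coded 17 columns, while the commit history mentions padding inputs to 512. A minimal sketch of the same idea with the target length made explicit; `pad_to_length` and the 512 target are assumptions for illustration, not code from this PR:

```python
import numpy as np

def pad_to_length(arr: np.ndarray, target_len: int, value: int = 0) -> np.ndarray:
    """Right-pad a (batch, seq) integer array to target_len columns (hypothetical helper)."""
    extra = target_len - arr.shape[1]
    if extra < 0:
        raise ValueError(f"sequence length {arr.shape[1]} already exceeds {target_len}")
    return np.pad(arr, ((0, 0), (0, extra)), "constant", constant_values=value)

# e.g. ids = pad_to_length(ids, 512)  # instead of np.pad(ids, ((0, 0), (0, 17)), 'constant')
```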
4 changes: 2 additions & 2 deletions examples/python/pytorch/mt5/mt5_torch.py
@@ -7,7 +7,7 @@
import os

import numpy as np
import pandas as pd
#import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import MT5ForConditionalGeneration, T5Tokenizer
@@ -311,5 +311,5 @@ def TorchMT5Trainer(
"MAX_TARGET_TEXT_LENGTH": 48,
"LEARNING_RATE": 1e-4,
}
device = torch.device(0)
device = torch.device('cpu')
TorchMT5Trainer(model_params, device)
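The reference PyTorch script now pins the device to CPU. If the intent is only to avoid failing on machines without a GPU (an assumption, not stated in the PR), a guarded fallback keeps GPU runs possible:

```python
import torch

# Pick CUDA when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TorchMT5Trainer(model_params, device)
```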
31 changes: 29 additions & 2 deletions gdb/pretty_print.py
@@ -61,7 +61,11 @@ def to_string(self):
size = dim['size']
degree = dim['degree']
parallel_idx = dim['parallel_idx']
toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]')
if dim['is_replica_dim']:
is_replica = 'r=t'
else:
is_replica = 'r=f'
toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx} {is_replica}]')
return f'TensorShape<{" ".join(toks)}>'

class ParallelTensorBasePrinter:
@@ -77,9 +81,31 @@ def to_string(self):
size = dim['size']
degree = dim['degree']
parallel_idx = dim['parallel_idx']
toks.append(f'{i}=[s={size} d={degree} pi={parallel_idx}]')
tok = f'{i}=[s={size} d={degree} pi={parallel_idx} '
if dim['is_replica_dim']:
tok += 'r=t'
else:
tok += 'r=f'
tok += ']'
toks.append(tok)
return f'ParallelTensorBase<{" ".join(toks)}>'

class ParallelDimPrinter:
def __init__(self, val):
self.val = val

def to_string(self):
size = self.val['size']
degree = self.val['degree']
parallel_idx = self.val['parallel_idx']
tok = f's={size} d={degree} pi={parallel_idx} '
if self.val['is_replica_dim']:
tok += 'r=t'
else:
tok += 'r=f'
return f'ParallelDim<{tok}>'


def build_pretty_printer():
pp = gdb.printing.RegexpCollectionPrettyPrinter(
"flexflow")
@@ -89,6 +115,7 @@ def build_pretty_printer():
pp.add_printer('Domain', '^Legion::Domain$', DomainPrinter)
pp.add_printer('ParallelTensorShape', '^FlexFlow::ParallelTensorShape$', TensorShapePrinter)
pp.add_printer('ParallelTensorBase', '^FlexFlow::ParallelTensorBase$', ParallelTensorBasePrinter)
pp.add_printer('ParallelDim', '^FlexFlow::ParallelDim$', ParallelDimPrinter)
return pp

gdb.printing.register_pretty_printer(
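For a quick check of the new `r=t`/`r=f` replica-dim formatting outside of gdb, a plain dict can stand in for the `gdb.Value` that gdb normally passes to these printers (gdb values also support `[]` lookups). This is a standalone sketch, not code from the PR:

```python
# Mimic ParallelDimPrinter.to_string() with a dict standing in for the gdb.Value.
dim = {"size": 1024, "degree": 4, "parallel_idx": 0, "is_replica_dim": False}

tok = f"s={dim['size']} d={dim['degree']} pi={dim['parallel_idx']} "
tok += "r=t" if dim["is_replica_dim"] else "r=f"
print(f"ParallelDim<{tok}>")  # -> ParallelDim<s=1024 d=4 pi=0 r=f>
```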
5 changes: 5 additions & 0 deletions ichanges.txt
@@ -0,0 +1,5 @@
changes:
cudnnSetTensorDescriptorFromDomain4SoftMax
try_one_lambda in graph.cc

field_space = runtime->create_field_space(lg_ctx in model.cc
6 changes: 4 additions & 2 deletions include/flexflow/config.h
@@ -165,8 +165,10 @@
Legion::Context lg_ctx;
Legion::Runtime *lg_hlr;
Legion::IndexSpaceT<1> all_gpu_task_is;
Legion::FieldSpace field_space;
bool syntheticInput, profiling, perform_fusion;
// Legion::FieldSpace field_space;
bool benchmarking, profiling, perform_fusion;
bool benchmarking;
bool inference_debugging;
size_t simulator_work_space_size;
size_t search_budget;
@@ -227,4 +229,4 @@

}; // namespace FlexFlow

#endif //_FLEXFLOW_CONFIG_H_
#endif //_FLEXFLOW_CONFIG_H_
3 changes: 3 additions & 0 deletions include/flexflow/flexflow_c.h
@@ -127,6 +127,8 @@ void flexflow_model_compute_metrics(flexflow_model_t handle);

void flexflow_model_update(flexflow_model_t handle);

void flexflow_model_unified_update(flexflow_model_t handle);

void flexflow_model_compile(flexflow_model_t handle,
enum LossType loss_type,
int *metrics,
@@ -344,6 +346,7 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle,
flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle,
const flexflow_tensor_t input,
int dim,
bool last_layer,
char const *name);

flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle,
8 changes: 5 additions & 3 deletions include/flexflow/graph.h
@@ -91,9 +91,9 @@ struct NodeCompare {

struct GraphOptimalViewSerialized {
#ifdef LEGION_MAX_RETURN_SIZE
static const size_t buffer_size = LEGION_MAX_RETURN_SIZE - 8;
static size_t const buffer_size = 4 * LEGION_MAX_RETURN_SIZE - 8;
#else
static const size_t buffer_size = 1024 * 1024 - 8;
static size_t const buffer_size = 1024 * 1024 - 8;
#endif
size_t total_bytes;
char data[buffer_size];
@@ -279,7 +279,7 @@ class SearchHelper {

mutable std::unordered_map<size_t, float> cached_graph_costs;
mutable std::unordered_map<size_t,
std::unique_ptr<const std::vector<MachineView>>>
std::unique_ptr<std::vector<MachineView> const>>
cached_operator_valid_views;
};

@@ -332,6 +332,8 @@ class Graph {
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
// static GraphOptimalViewSerialized
// graph_optimize_wrapper(FFModel * model);
Node find_bottleneck_node(Node const &sink_node,
Node const &source_node) const;
void print_strategy_computation_graph(
2 changes: 1 addition & 1 deletion include/flexflow/initializer.h
@@ -46,7 +46,7 @@

class Op;
struct ZeroInitMeta {
static int const MAX_NUM_REGIONS = 64;
static int const MAX_NUM_REGIONS = 128;
int num_regions;
Op *op_ptr;
DataType data_types[MAX_NUM_REGIONS];
2 changes: 1 addition & 1 deletion include/flexflow/machine_view.h
@@ -16,7 +16,7 @@ namespace FlexFlow {
class FFConfig;

struct MachineView {
static const MachineView NO_VIEW;
static MachineView const NO_VIEW;
MachineView();

int get_device_id(Legion::DomainPoint const &p) const;