From e767ef75fecc874afecd0f932df5266152068fef Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 6 Jan 2024 15:25:00 -0500 Subject: [PATCH 01/11] init --- include/flexflow/accessor.h | 5 + include/flexflow/ffconst.h | 7 + .../inc_multihead_self_attention_utils.cuh | 114 +++++++++++++ inference/incr_decoding/incr_decoding.cc | 7 +- inference/models/llama.cc | 6 +- inference/models/llama.h | 2 +- inference/spec_infer/spec_infer.cc | 5 +- inference/utils/download_hf_model.py | 9 +- python/flexflow/serve/models/llama.py | 2 +- python/flexflow/serve/serve.py | 5 +- python/flexflow/type.py | 1 + src/ops/add_bias_residual_layer_norm.cu | 22 ++- src/ops/arg_topk.cu | 14 +- src/ops/argmax.cu | 35 +++- src/ops/element_unary.cu | 7 + src/ops/fused.cu | 16 ++ src/ops/inc_multihead_self_attention.cu | 45 ++++++ src/ops/kernels/decompress_kernels.cu | 17 ++ src/ops/kernels/element_binary_kernels.cu | 3 + src/ops/kernels/embedding_kernels.cu | 152 ++++++++++++++++++ src/ops/kernels/linear_kernels.cu | 29 ++++ src/ops/kernels/residual_rms_norm_kernels.cu | 10 +- src/ops/kernels/rms_norm_kernels.cu | 12 +- src/ops/kernels/softmax.cu | 11 +- src/ops/layer_norm.cu | 10 +- src/ops/linear.cc | 27 +++- src/ops/residual_layer_norm.cu | 13 +- src/ops/sampling.cu | 25 ++- src/ops/sigmoid_silu_multi.cu | 8 + src/ops/softmax.cc | 15 +- src/ops/spec_inc_multihead_self_attention.cu | 12 ++ src/ops/tree_inc_multihead_self_attention.cu | 19 +++ src/parallel_ops/combine.cc | 4 +- src/parallel_ops/kernels/combine_kernels.cu | 5 + src/parallel_ops/kernels/reduction_kernels.cu | 13 ++ src/parallel_ops/kernels/replicate_kernels.cu | 3 + src/parallel_ops/reduction.cc | 7 +- src/parallel_ops/replicate.cc | 6 +- src/runtime/accessor.cc | 33 ++++ src/runtime/cuda_helper.cu | 40 +++++ src/runtime/ffconst_utils.cc | 2 + src/runtime/initializer_kernel.cu | 6 + src/runtime/model.cc | 3 + src/runtime/operator.cc | 12 ++ 44 files changed, 761 insertions(+), 38 deletions(-) diff --git a/include/flexflow/accessor.h b/include/flexflow/accessor.h index 65ab33b513..26ed006646 100644 --- a/include/flexflow/accessor.h +++ b/include/flexflow/accessor.h @@ -5,10 +5,13 @@ #if defined(FF_USE_CUDA) #include +#include #elif defined(FF_USE_HIP_CUDA) #include +#include #elif defined(FF_USE_HIP_ROCM) #include +#include #endif // using namespace Legion; @@ -61,6 +64,7 @@ class GenericTensorAccessorW { float *get_float_ptr() const; double *get_double_ptr() const; half *get_half_ptr() const; + __nv_bfloat16 *get_bfloat16_ptr() const; char *get_byte_ptr() const; DataType data_type; Legion::Domain domain; @@ -80,6 +84,7 @@ class GenericTensorAccessorR { float const *get_float_ptr() const; double const *get_double_ptr() const; half const *get_half_ptr() const; + __nv_bfloat16 const *get_bfloat16_ptr() const; char const *get_byte_ptr() const; DataType data_type; Legion::Domain domain; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 512645e624..9613a041f7 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -35,6 +35,7 @@ enum DataType { DT_DOUBLE = 45, DT_INT4 = 46, DT_INT8 = 47, + DT_B16 = 48, DT_NONE = 49, }; @@ -269,4 +270,10 @@ enum { PARALLEL_TENSOR_GUID_FIRST_VALID = 4000000, NODE_GUID_FIRST_VALID = 5000000, }; + +enum InferencePrecision { + INFERENCE_FLOAT = 800, + INFERENCE_HALF = 801, + INFERENCE_BFLOAT16 = 802, +}; #endif // _FLEXFLOW_CONST_H_ diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index 
d1e0e050b2..dfc99bc612 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -23,6 +23,25 @@ struct half8 { half c; half d; }; + +struct __nv_bfloat164 { + __nv_bfloat16 x; + __nv_bfloat16 y; + __nv_bfloat16 z; + __nv_bfloat16 w; +}; + +struct __nv_bfloat168 { + __nv_bfloat16 x; + __nv_bfloat16 y; + __nv_bfloat16 z; + __nv_bfloat16 w; + __nv_bfloat16 a; + __nv_bfloat16 b; + __nv_bfloat16 c; + __nv_bfloat16 d; +}; + struct float8 { float x; float y; @@ -61,6 +80,18 @@ template <> struct VEC_K { using Type = half4; }; +template <> +struct VEC_K<__nv_bfloat16, 1> { + using Type = __nv_bfloat16; +}; +template <> +struct VEC_K<__nv_bfloat16, 2> { + using Type = __nv_bfloat162; +}; +template <> +struct VEC_K<__nv_bfloat16, 4> { + using Type = __nv_bfloat164; +}; // data type for QK production template @@ -95,6 +126,23 @@ struct Vec_fp32_ { using Type = float8; }; +template <> +struct Vec_fp32_<__nv_bfloat16> { + using Type = float; +}; +template <> +struct Vec_fp32_<__nv_bfloat162> { + using Type = float2; +}; +template <> +struct Vec_fp32_<__nv_bfloat164> { + using Type = float4; +}; +template <> +struct Vec_fp32_<__nv_bfloat168> { + using Type = float8; +}; + template struct VEC_V {}; template <> @@ -105,6 +153,10 @@ template <> struct VEC_V { using Type = half8; }; +template <> +struct VEC_V<__nv_bfloat16> { + using Type = __nv_bfloat168; +}; ////////////////data structures half/////////////// @@ -331,6 +383,42 @@ inline __device__ float8 cast_to_float(half8 u) { return tmp; } +inline __device__ float cast_to_float(__nv_bfloat16 u) { + return __bfloat162float(u); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 cast_to_float(__nv_bfloat162 u) { + float2 tmp; + tmp.x = __bfloat162float(u.x); + tmp.y = __bfloat162float(u.y); + return tmp; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 cast_to_float(__nv_bfloat164 u) { + float4 tmp; + tmp.x = __bfloat162float(u.x); + tmp.y = __bfloat162float(u.y); + tmp.z = __bfloat162float(u.z); + tmp.w = __bfloat162float(u.w); + return tmp; +} +inline __device__ float8 cast_to_float(__nv_bfloat168 u) { + float8 tmp; + tmp.x = __bfloat162float(u.x); + tmp.y = __bfloat162float(u.y); + tmp.z = __bfloat162float(u.z); + tmp.w = __bfloat162float(u.w); + tmp.a = __bfloat162float(u.a); + tmp.b = __bfloat162float(u.b); + tmp.c = __bfloat162float(u.c); + tmp.d = __bfloat162float(u.d); + return tmp; +} + inline __device__ void convert_from_float(float4 &dst, float4 src) { dst = src; } @@ -369,6 +457,32 @@ inline __device__ void convert_from_float(half &dst, float src) { dst = __float2half(src); } +// +inline __device__ void convert_from_float(__nv_bfloat164 &dst, float4 src) { + dst.x = __float2bfloat16(src.x); + dst.y = __float2bfloat16(src.y); + dst.z = __float2bfloat16(src.z); + dst.w = __float2bfloat16(src.w); +} + +inline __device__ void convert_from_float(__nv_bfloat168 &dst, float8 src) { + dst.x = __float2bfloat16(src.x); + dst.y = __float2bfloat16(src.y); + dst.z = __float2bfloat16(src.z); + dst.w = __float2bfloat16(src.w); + dst.a = __float2bfloat16(src.a); + dst.b = __float2bfloat16(src.b); + dst.c = __float2bfloat16(src.c); + dst.d = __float2bfloat16(src.d); +} +inline __device__ void convert_from_float(__nv_bfloat162 &dst, float2 src) { + dst.x = __float2bfloat16(src.x); + dst.y = 
__float2bfloat16(src.y); +} +inline __device__ void convert_from_float(__nv_bfloat16 &dst, float src) { + dst = __float2bfloat16(src); +} + //////////////////////////////////////utils/////////////////////////////////////////////// template diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index c3f9052305..61652b3ae3 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -39,6 +39,7 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, std::string &llm_model_name, + DataType &data_type, bool &use_full_precision, bool &verbose, bool &do_sample, @@ -72,7 +73,7 @@ void parse_input_args(char **argv, continue; } if (!strcmp(argv[i], "--use-full-precision")) { - use_full_precision = true; + data_type = DT_FLOAT; continue; } // verbose logging to stdout @@ -125,6 +126,7 @@ void FlexFlow::top_level_task(Task const *task, } FilePaths file_paths; std::string llm_model_name; + DataType data_type = DT_B16; bool use_full_precision = false; bool verbose = false; bool do_sample = false; @@ -141,6 +143,7 @@ void FlexFlow::top_level_task(Task const *task, argc, file_paths, llm_model_name, + data_type, use_full_precision, verbose, do_sample, @@ -219,7 +222,7 @@ void FlexFlow::top_level_task(Task const *task, weights_filepath, INC_DECODING_MODE, generationConfig, - use_full_precision); + data_type); } else if (model_type == ModelType::OPT) { OPT::create_opt_model(model, config_filepath, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e54d6d8811..f4655b1750 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -25,7 +25,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string const &weight_file_path, InferenceMode mode, GenerationConfig generation_config, - bool use_full_precision) { + DataType data_type) { // do not apply cpu offload in beam search model. LLAMAConfig llama_config(model_config_file_path); llama_config.print(); @@ -54,7 +54,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.vocab_size, llama_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? 
DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "tok_embeddings"); @@ -276,7 +276,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); + fileloader.load_weights(&ff, data_type); std::cout << "------load weight finished----------" << std::endl; // init operators diff --git a/inference/models/llama.h b/inference/models/llama.h index 8ecfcd7155..6c72b432ad 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -83,7 +83,7 @@ class LLAMA { std::string const &weight_file_path, InferenceMode mode, GenerationConfig generation_config, - bool use_full_precision = false); + DataType DataType); }; }; // namespace FlexFlow diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index b369a13c1d..8d3d23ae4e 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -264,6 +264,7 @@ void FlexFlow::top_level_task(Task const *task, FFConfig ffconfig; FilePaths file_paths; ModelMeta model_metadata; + DataType data_type = DT_B16; bool use_full_precision = false; bool verbose = false; int max_requests_per_batch = 16; @@ -313,7 +314,7 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.llm_weights_path, TREE_VERIFY_MODE, generationConfig, - use_full_precision); + data_type); } else if (model_metadata.llm_model_type == ModelType::OPT) { OPT::create_opt_model(tree_model, model_metadata.llm_model_config_path, @@ -357,7 +358,7 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.ssm_model_weights_paths[ssm_id], BEAM_SEARCH_MODE, generationConfig, - use_full_precision); + data_type); } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) { OPT::create_opt_model(beam_model, model_metadata.ssm_model_config_paths[ssm_id], diff --git a/inference/utils/download_hf_model.py b/inference/utils/download_hf_model.py index 03fc8e1633..f158d4e480 100644 --- a/inference/utils/download_hf_model.py +++ b/inference/utils/download_hf_model.py @@ -30,6 +30,11 @@ def parse_args(): action="store_true", help="Only download the half precision version of the weights", ) + group.add_argument( + "--bfloat16-precision-only", + action="store_true", + help="Only download the bfloat16 precision version of the weights", + ) args = parser.parse_args() return args @@ -39,8 +44,10 @@ def main(args): data_types = ff.DataType.DT_FLOAT elif args.half_precision_only: data_types = ff.DataType.DT_HALF + elif args.bfloat16_precision_only: + data_types = ff.DataType.DT_B16 else: - data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) + data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF, ff.DataType.DT_B16) for model_name in args.model_names: for data_type in data_types: diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 7ba0e78a37..f37a14ed9e 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -15,7 +15,7 @@ from flexflow.core import * from .base import FlexFlowModel import random - +import torch class LLAMAConfig: def __init__(self, hf_config): diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 55601f957e..5c1709a7ff 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -113,7 +113,7 @@ def __init__( self.config_class, ) = self.__get_ff_model_type() self.data_type = data_type - assert self.data_type == DataType.DT_HALF or 
self.data_type == DataType.DT_FLOAT + assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT or self.data_type == DataType.DT_B16 self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache self.output_file = output_file @@ -172,6 +172,8 @@ def download_hf_weights_if_needed(self): torch.set_default_tensor_type(torch.HalfTensor) elif self.data_type == DataType.DT_FLOAT: torch.set_default_tensor_type(torch.FloatTensor) + elif self.data_type == DataType.DT_B16: + torch.set_default_tensor_type(torch.BFloat16Tensor) else: assert False, "Data type not yet supported -- cannot download weights!" @@ -182,6 +184,7 @@ def download_hf_weights_if_needed(self): self.model_name.lower(), "full-precision" if self.data_type == DataType.DT_FLOAT + else "bfloat16-precision" if self.data_type == DataType.DT_B16 else "half-precision", ) if self.refresh_cache: diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 994a85f57e..df113438e8 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -35,6 +35,7 @@ class DataType(Enum): DT_HALF = 43 DT_FLOAT = 44 DT_DOUBLE = 45 + DT_B16 = 48 DT_NONE = 49 diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index ceb1a6514e..b137d6faef 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -107,13 +107,13 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, T *Y) { __shared__ float m_shared[C10_WARP_SIZE]; __shared__ float v_shared[C10_WARP_SIZE]; - const int64_t i = blockIdx.x; + int64_t const i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDABlockReduceNumThreads)) { - const int64_t index = i * N + j; - const int64_t bias_idx = index % attn_bias_dim; + int64_t const index = i * N + j; + int64_t const bias_idx = index % attn_bias_dim; X[index] = input_ptr[index] + attn_bias_ptr[bias_idx] + residual_ptr[index]; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); @@ -136,7 +136,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, using T_ACC = T; for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); const T_ACC beta_v = @@ -234,6 +234,20 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, stream); + } else if (m->input_type[0] == DT_B16) { + AddBiasResidualLayerNorm::inference_kernel<__nv_bfloat16>( + m, + attn_bias_dim, + residual_volume, + input.get_bfloat16_ptr(), + attn_bias.get_bfloat16_ptr(), + residual.get_bfloat16_ptr(), + added_output.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + m->elementwise_affine ? gamma.get_bfloat16_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_bfloat16_ptr() + : nullptr, + stream); } else { assert(false && "unsupport datatype in layernorm"); } diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 5b7978812c..7f43aba948 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -520,7 +520,19 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, m->sorted, m->speculative_decoding ? 
bc : nullptr, stream); - } else { + } else if (input.data_type == DT_B16) { + ArgTopK::forward_kernel(m, + input.get_bfloat16_ptr(), + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, + indices.get_int32_ptr(), + batch_size, + length, + k, + m->sorted, + m->speculative_decoding ? bc : nullptr, + stream); + }else { assert(false && "Unsupported data type"); } diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 05c84719c1..32cd872a55 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -123,6 +123,16 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, length, batch_size, stream); + } else if (input.data_type == DT_B16) { + ArgMax::forward_kernel<__nv_bfloat16>( + m, + input.get_bfloat16_ptr(), + indices.get_int32_ptr(), + m->probs, + m->beam_search ? parent.get_int32_ptr() : nullptr, + length, + batch_size, + stream); } else { assert(false && "Unsupported data type"); } @@ -153,20 +163,27 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, size_t d_offsets_size = batch_size; size_t prob_size = batch_size; - assert(data_type == DT_FLOAT || data_type == DT_HALF); + assert(data_type == DT_FLOAT || data_type == DT_HALF || data_type == DT_B16); size_t total_size = d_offsets_size * sizeof(int) + (data_type == DT_FLOAT ? sizeof(cub::KeyValuePair) * batch_size - : sizeof(cub::KeyValuePair) * batch_size) + + : (data_type == DT_HALF + ? sizeof(cub::KeyValuePair) * batch_size + : sizeof(cub::KeyValuePair) * + batch_size)) + prob_size * sizeof(float); gpu_mem_allocator.create_legion_instance(reserveInst, total_size); d_offsets = gpu_mem_allocator.allocate_instance(d_offsets_size); d_out = data_type == DT_FLOAT ? gpu_mem_allocator.allocate_instance_untyped( batch_size * sizeof(cub::KeyValuePair)) - : gpu_mem_allocator.allocate_instance_untyped( - batch_size * sizeof(cub::KeyValuePair)); + : (data_type == DT_HALF + ? 
gpu_mem_allocator.allocate_instance_untyped( + batch_size * sizeof(cub::KeyValuePair)) + : gpu_mem_allocator.allocate_instance_untyped( + batch_size * + sizeof(cub::KeyValuePair))); probs = gpu_mem_allocator.allocate_instance(prob_size); // init offset int parallelism = total_ele; @@ -197,6 +214,16 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, d_offsets, d_offsets + 1, stream)); + } else if (data_type == DT_B16) { + checkCUDA(cub::DeviceSegmentedReduce::ArgMax( + d_temp_storage, + temp_storage_bytes, + input.get_bfloat16_ptr(), + static_cast *>(d_out), + batch_size, + d_offsets, + d_offsets + 1, + stream)); } gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index c7f5e90f4c..ed4fb15e18 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -307,6 +307,12 @@ template void float const *input_ptr, float *output_ptr, size_t num_elements); +template void + ElementUnary::forward_kernel_wrapper<__nv_bfloat16>(ElementUnaryMeta const *m, + __nv_bfloat16 const *input_ptr, + __nv_bfloat16 *output_ptr, + size_t num_elements); + template void ElementUnary::forward_kernel_wrapper(ElementUnaryMeta const *m, double const *input_ptr, @@ -329,6 +335,7 @@ template void float const *output_ptr, float const *output_grad_ptr, size_t num_elements); + template void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, double const *input_ptr, diff --git a/src/ops/fused.cu b/src/ops/fused.cu index c6ba0b04c5..77e12828a1 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -442,6 +442,11 @@ __host__ void FusedOp::forward_task(Task const *task, m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); + } else if (m->input_type == DT_B16) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_bfloat16_ptr(), + my_output_accessor[0].get_bfloat16_ptr()); } break; } @@ -850,6 +855,12 @@ __host__ void my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr(), my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_B16) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_bfloat16_ptr(), + my_output_accessor[0].get_bfloat16_ptr(), + my_input_accessor[0].domain.get_volume()); } else { assert(false && "Unsupported data type in ElementUnary forward"); } @@ -1076,6 +1087,11 @@ __host__ void m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); + } else if (m->input_type == DT_B16) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_bfloat16_ptr(), + my_output_accessor[0].get_bfloat16_ptr()); } break; } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 42933cee27..4e8f97650a 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -792,6 +792,12 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, m->weightSize, cudaMemcpyHostToDevice, stream); + } else if (data_type == DT_B16) { + cudaMemcpyAsync(m->weight_ptr, + weight.get_bfloat16_ptr(), + m->weightSize, + cudaMemcpyHostToDevice, + stream); } else { assert(false); } @@ -1185,6 +1191,22 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); + }else if (input.data_type == DT_B16) { + if (m->offload) { + pre_build_weight_kernel<__nv_bfloat16>(m, weight, input.data_type, stream); + } + __nv_bfloat16 const *bias_ptr = + use_bias ? 
bias.get_bfloat16_ptr() : static_cast<__nv_bfloat16 const *>(nullptr); + Kernels::IncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_bfloat16_ptr(), + m->offload ? static_cast<__nv_bfloat16 *>(m->weight_ptr) + : weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } @@ -1493,6 +1515,12 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( DataType data_type, cudaStream_t stream); +template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel<__nv_bfloat16>( + IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream); + template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -1513,6 +1541,16 @@ template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( int num_tokens, cudaStream_t stream); +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias<__nv_bfloat16>( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + __nv_bfloat16 *output_ptr, + __nv_bfloat16 const *weight_ptr, + __nv_bfloat16 const *bias_ptr, + int num_tokens, + cudaStream_t stream); + template void Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( IncMultiHeadSelfAttentionMeta const *m, @@ -1526,4 +1564,11 @@ template void BatchConfig const *bc, half *output_ptr, cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation<__nv_bfloat16>( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + __nv_bfloat16 *output_ptr, + cudaStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/kernels/decompress_kernels.cu b/src/ops/kernels/decompress_kernels.cu index 2e02ce1eec..2165967477 100644 --- a/src/ops/kernels/decompress_kernels.cu +++ b/src/ops/kernels/decompress_kernels.cu @@ -209,10 +209,14 @@ template __global__ void decompress_int4_general_weights( char const *input_weight_ptr, float *weight_ptr, int in_dim, int valueSize); template __global__ void decompress_int4_general_weights( char const *input_weight_ptr, half *weight_ptr, int in_dim, int valueSize); +template __global__ void decompress_int4_general_weights<__nv_bfloat16>( + char const *input_weight_ptr, __nv_bfloat16 *weight_ptr, int in_dim, int valueSize); template __global__ void decompress_int8_general_weights( char const *input_weight_ptr, float *weight_ptr, int in_dim, int valueSize); template __global__ void decompress_int8_general_weights( char const *input_weight_ptr, half *weight_ptr, int in_dim, int valueSize); +template __global__ void decompress_int8_general_weights<__nv_bfloat16>( + char const *input_weight_ptr, __nv_bfloat16 *weight_ptr, int in_dim, int valueSize); template __global__ void decompress_int4_attention_weights(char *input_weight_ptr, float *weight_ptr, @@ -226,6 +230,12 @@ template __global__ void int qProjSize, int qSize, int num_heads); +template __global__ void + decompress_int4_attention_weights<__nv_bfloat16>(char *input_weight_ptr, + __nv_bfloat16 *weight_ptr, + int qProjSize, + int qSize, + int num_heads); template __global__ void decompress_int8_attention_weights(char *input_weight_ptr, @@ -240,6 +250,13 @@ template __global__ void int qProjSize, int qSize, int num_heads); + +template __global__ void + decompress_int8_attention_weights<__nv_bfloat16>(char *input_weight_ptr, + __nv_bfloat16 *weight_ptr, + int qProjSize, + int qSize, 
+ int num_heads); // template // void decompress_weight_bias(T1 *input_weight_ptr, // T2 *weight_ptr, diff --git a/src/ops/kernels/element_binary_kernels.cu b/src/ops/kernels/element_binary_kernels.cu index 42b31a664a..26d9ea9cc9 100644 --- a/src/ops/kernels/element_binary_kernels.cu +++ b/src/ops/kernels/element_binary_kernels.cu @@ -105,6 +105,9 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, in2.get_float_ptr(), out.get_float_ptr(), stream); + } else if (out.data_type == DT_B16) { + Internal::forward_kernel( + m, in1.get_bfloat16_ptr(), in2.get_bfloat16_ptr(), out.get_bfloat16_ptr(), stream); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 22d8161ff1..8e40f4bf95 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -70,6 +70,16 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (weight.data_type == DT_B16) { + Internal::forward_kernel(input.get_int32_ptr(), + output.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -104,6 +114,16 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (weight.data_type == DT_B16) { + Internal::forward_kernel(input.get_int64_ptr(), + output.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -161,6 +181,16 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (m->output_type[0] == DT_B16) { + Internal::backward_kernel(input.get_int32_ptr(), + output.get_bfloat16_ptr(), + weight_grad.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -195,6 +225,16 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (m->output_type[0] == DT_B16) { + Internal::backward_kernel(input.get_int64_ptr(), + output.get_bfloat16_ptr(), + weight_grad.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -322,6 +362,50 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } +template <> +__global__ void + embed_backward_no_aggr(int const *input, + __nv_bfloat16 const *output, + __nv_bfloat16 *embed, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + int wordIdx = input[idx]; +#if __CUDA_ARCH__ >= 700 + atomicAdd(embed + wordIdx * out_dim + off, output[i]); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += output[i]; +#endif + } +} + +template <> +__global__ void + embed_backward_no_aggr(int64_t const *input, + __nv_bfloat16 const *output, + __nv_bfloat16 *embed, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + int64_t wordIdx = input[idx]; +#if __CUDA_ARCH__ >= 700 
+ atomicAdd(embed + wordIdx * out_dim + off, output[i]); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += output[i]; +#endif + } +} + template __global__ void embed_backward_with_aggr(TI const *input, TD const *output, @@ -416,6 +500,74 @@ __global__ void embed_backward_with_aggr(int64_t const *input, } } +template <> +__global__ void + embed_backward_with_aggr(int const *input, + __nv_bfloat16 const *output, + __nv_bfloat16 *embed, + int out_dim, + int in_dim, + int batch_size, + AggrMode aggr) { + __nv_bfloat16 scale = 1.0f / in_dim; + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + __nv_bfloat16 gradient; + if (aggr == AGGR_MODE_SUM) { + gradient = output[i]; + } else { + assert(aggr == AGGR_MODE_AVG); + gradient = output[i] * scale; + } + for (int j = 0; j < in_dim; j++) { + int wordIdx = input[idx * in_dim + j]; +#if __CUDA_ARCH__ >= 700 + atomicAdd(embed + wordIdx * out_dim + off, gradient); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += gradient; +#endif + } + } +} + +template <> +__global__ void embed_backward_with_aggr( + int64_t const *input, + __nv_bfloat16 const *output, + __nv_bfloat16 *embed, + int out_dim, + int in_dim, + int batch_size, + AggrMode aggr) { + __nv_bfloat16 scale = 1.0f / in_dim; + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + __nv_bfloat16 gradient; + if (aggr == AGGR_MODE_SUM) { + gradient = output[i]; + } else { + assert(aggr == AGGR_MODE_AVG); + gradient = output[i] * scale; + } + for (int j = 0; j < in_dim; j++) { + int64_t wordIdx = input[idx * in_dim + j]; +#if __CUDA_ARCH__ >= 700 + atomicAdd(embed + wordIdx * out_dim + off, gradient); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += gradient; +#endif + } + } +} + /*static*/ template void forward_kernel(TI const *input_ptr, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 9373c2fb2f..87a884cb99 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -58,6 +58,12 @@ LinearMeta::LinearMeta(FFHandler handler, min(CUDA_NUM_THREADS, parallelism), 0, stream>>>((half *)one_ptr, batch_size); + } else if (data_type == DT_B16) { + Kernels::Linear::Internal:: + build_one_ptr<<>>((__nv_bfloat16 *)one_ptr, batch_size); } // Allocate descriptors @@ -152,6 +158,16 @@ void forward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); + } else if (m->input_type[0] == DT_B16) { + Internal::forward_kernel<__nv_bfloat16>(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); } if (m->profiling) { @@ -216,6 +232,19 @@ void backward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); + } else if (m->input_type[0] == DT_B16) { + Internal::backward_kernel<__nv_bfloat16>(m, + input_ptr, + input_grad_ptr, + output_ptr, + output_grad_ptr, + kernel_ptr, + kernel_grad_ptr, + bias_grad_ptr, + in_dim, + out_dim, + batch_size, + stream); } if (m->profiling) { diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 17ac14449b..16a113c9cb 100644 --- 
a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -132,7 +132,7 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, using T_ACC = T; for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); output[index] = Y[index] * weights[index % N]; } @@ -204,6 +204,14 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, residual_output.get_float_ptr(), output.get_float_ptr(), stream); + } else if (output.data_type == DT_B16) { + forward_kernel(m, + input1.get_bfloat16_ptr(), + input2.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + residual_output.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + stream); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 7c9f4a9f98..67a0b156b5 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -137,9 +137,9 @@ __global__ void template __global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { using T_ACC = T; - const int64_t i = blockIdx.x; + int64_t const i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; Y[index] = static_cast(X[index]) * static_cast(rstd[i]); } } @@ -186,7 +186,7 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, using T_ACC = T; for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; Y[index] = static_cast(X[index]) * static_cast(rms[i]); output[index] = Y[index] * weights[index % N]; } @@ -246,6 +246,12 @@ void forward_kernel_wrapper(RMSNormMeta const *m, weight.get_float_ptr(), output.get_float_ptr(), stream); + } else if (output.data_type == DT_B16) { + forward_kernel(m, + input.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + stream); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index e47006cc9d..2418f457dd 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -105,7 +105,10 @@ template void forward_kernel_wrapper(SoftmaxMeta const *m, template void forward_kernel_wrapper(SoftmaxMeta const *m, half const *input_ptr, half *output_ptr); - +template void + forward_kernel_wrapper<__nv_bfloat16>(SoftmaxMeta const *m, + __nv_bfloat16 const *input_ptr, + __nv_bfloat16 *output_ptr); template void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, @@ -114,6 +117,12 @@ template void backward_kernel_wrapper(SoftmaxMeta const *m, half *input_grad_ptr, half const *output_grad_ptr, size_t num_elements); + +template void + backward_kernel_wrapper<__nv_bfloat16>(SoftmaxMeta const *m, + __nv_bfloat16 *input_grad_ptr, + __nv_bfloat16 const *output_grad_ptr, + size_t num_elements); namespace Internal { template void forward_kernel(SoftmaxMeta const *m, diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 44979c48fe..e239023768 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -273,7 +273,15 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, stream); - } else { + } else if (m->input_type[0] == DT_B16) { + LayerNorm::forward_kernel<__nv_bfloat16>( + m, + input.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + m->elementwise_affine ? gamma.get_bfloat16_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_bfloat16_ptr() : nullptr, + stream); + }else { assert(false && "unsupport datatype in layernorm"); } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 63b26bfe7d..8cdc392c96 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -33,7 +33,7 @@ using namespace FlexFlow::Kernels::Linear; static constexpr int KERNEL_IDX = 0; static constexpr int BIAS_IDX = 1; -Tensor FFModel::dense(const Tensor input, +Tensor FFModel::dense(Tensor const input, int outDim, ActiMode activation, bool use_bias, @@ -157,7 +157,7 @@ Op *Linear::create_operator_from_layer( Linear::Linear(FFModel &model, Linear const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights) : Linear(model, other.layer_guid, @@ -194,7 +194,7 @@ Linear::Linear(FFModel &model, Linear::Linear(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int out_dim, ActiMode _activation, RegularizerMode _kernel_reg_type, @@ -437,6 +437,14 @@ OpMeta *Linear::init_task(Task const *task, return init_task_with_dim( \ task, regions, ctx, runtime); \ } \ + } else if (output.data_type == DT_B16) { \ + if (linear->quantization_type != DT_NONE) { \ + return init_task_with_dim<__nv_bfloat16, char, DIM>( \ + task, regions, ctx, runtime); \ + } else { \ + return init_task_with_dim<__nv_bfloat16, __nv_bfloat16, DIM>( \ + task, regions, ctx, runtime); \ + } \ } else { \ assert(false && "Unsupported data type"); \ } @@ -704,6 +712,14 @@ void Linear::forward_task(Task const *task, return forward_task_with_dim( \ task, regions, ctx, runtime); \ } \ + } else if (m->output_type[0] == DT_B16) { \ + if (m->quantization_type != DT_NONE) { \ + return forward_task_with_dim<__nv_bfloat16, char, DIM>( \ + task, regions, ctx, runtime); \ + } else { \ + return forward_task_with_dim<__nv_bfloat16, __nv_bfloat16, DIM>( \ + task, regions, ctx, runtime); \ + } \ } else { \ assert(false && "Unsupported data type"); \ } @@ -859,6 +875,9 @@ void Linear::backward_task(Task const *task, return backward_task_with_dim(task, regions, ctx, runtime); \ } else if (m->output_type[0] == DT_FLOAT) { \ return backward_task_with_dim(task, regions, ctx, runtime); \ + } else if (m->output_type[0] == DT_B16) { \ + return backward_task_with_dim<__nv_bfloat16, DIM>( \ + task, regions, ctx, runtime); \ } else { \ assert(false && "Unsupported data type"); \ } @@ -1341,7 +1360,7 @@ bool LinearParams::is_valid(ParallelTensorShape const &input_shape) const { * It takes a the input tensor as a parameter, instead of the input's * ParallelTensorShape. */ -void LinearParams::solve_dims(const ParallelTensor input, +void LinearParams::solve_dims(ParallelTensor const input, ParallelDim output_dims[MAX_TENSOR_DIM], int *output_ndims, ParallelDim kernel_dims[MAX_TENSOR_DIM], diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index e5ebdce6ed..cbb7e241f3 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -225,7 +225,18 @@ void ResidualLayerNorm::inference_kernel_wrapper( m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, stream); - } else { + } else if (m->input_type[0] == DT_B16) { + ResidualLayerNorm::inference_kernel<__nv_bfloat16>( + m, + input.get_bfloat16_ptr(), + residual1.get_bfloat16_ptr(), + m->use_two_residuals ? residual2.get_bfloat16_ptr() : nullptr, + added_output.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + m->elementwise_affine ? gamma.get_bfloat16_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_bfloat16_ptr() : nullptr, + stream); + }else { assert(false && "unsupport datatype in layernorm"); } diff --git a/src/ops/sampling.cu b/src/ops/sampling.cu index 461d72ec71..d4044a9cc3 100644 --- a/src/ops/sampling.cu +++ b/src/ops/sampling.cu @@ -182,6 +182,14 @@ void Sampling::forward_kernel_wrapper(SamplingMeta const *m, length, batch_size, stream); + } else if (input.data_type == DT_B16) { + Sampling::forward_kernel<__nv_bfloat16>(m, + input.get_bfloat16_ptr(), + indices.get_int32_ptr(), + m->top_p, + length, + batch_size, + stream); } else { assert(false && "Unsupported data type"); } @@ -270,7 +278,22 @@ SamplingMeta::SamplingMeta(FFHandler handler, 0, // begin_bit data_type_size(data_type) * 8, // end_bit = sizeof(KeyT) * 8 stream)); - } else { + } else if (data_type == DT_B16) { + checkCUDA(cub::DeviceSegmentedRadixSort::SortPairsDescending( + d_temp_storage, + temp_storage_bytes, + input.get_bfloat16_ptr(), + input.get_bfloat16_ptr(), + idx, + idx, + total_ele, + batch_size, + begin_offset, + end_offset + 1, + 0, // begin_bit + data_type_size(data_type) * 8, // end_bit = sizeof(KeyT) * 8 + stream)); + }else { assert(false && "input type in float and half"); } diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 590b641b5a..6373aaaae5 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -80,6 +80,14 @@ void SigmoidSiluMulti::inference_kernel_wrapper( input1.get_half_ptr(), input2.get_half_ptr(), output.get_half_ptr()); + } else if (m->input_type[0] == DT_B16) { + SigmoidSiluMultiKernel<<>>(input1.domain.get_volume(), + input1.get_bfloat16_ptr(), + input2.get_bfloat16_ptr(), + output.get_bfloat16_ptr()); } else { assert(false && "unsupport datatype in SigmoidSiluMulti"); } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index ba0a1288d6..101eff9417 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -52,7 +52,7 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(const Tensor _input, +Tensor FFModel::softmax(Tensor const _input, int dim, DataType data_type, char const *name) { @@ -93,7 +93,7 @@ Op *Softmax::create_operator_from_layer( } Softmax::Softmax(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, int _dim, char const *name) : Op(model, @@ -117,7 +117,7 @@ Softmax::Softmax(FFModel &model, Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) : Softmax(model, input, params.dim, name) {} @@ -319,6 +319,9 @@ void Softmax::forward_task(Task const *task, forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); } else if (m->output_type == DT_FLOAT) { forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); + } else if (m->output_type == DT_B16) { + forward_kernel_wrapper( + m, input.get_bfloat16_ptr(), output.get_bfloat16_ptr()); } else { assert(false && "Unsupported data type"); } @@ -366,6 +369,9 @@ void Softmax::backward_task(Task const *task, return backward_task_with_dim(task, regions, 
ctx, runtime); \ } else if (m->output_type == DT_FLOAT) { \ return backward_task_with_dim(task, regions, ctx, runtime); \ + } else if (m->output_type == DT_B16) { \ + return backward_task_with_dim<__nv_bfloat16, DIM>( \ + task, regions, ctx, runtime); \ } else { \ assert(false && "Unsupported data type"); \ } @@ -429,6 +435,9 @@ void Softmax::inference_task(Task const *task, forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); } else if (m->output_type == DT_FLOAT) { forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); + } else if (m->output_type == DT_B16) { + forward_kernel_wrapper( + m, input.get_bfloat16_ptr(), output.get_bfloat16_ptr()); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 2d80ed2221..0b978c23a6 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -789,6 +789,18 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); + }else if (input.data_type == DT_B16) { + __nv_bfloat16 const *bias_ptr = + use_bias ? bias.get_bfloat16_ptr() : static_cast<__nv_bfloat16 const *>(nullptr); + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + m, + bc, + shard_id, + input.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index fc86e1498e..1863dde1e7 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -1003,6 +1003,25 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); + } else if (input.data_type == DT_B16) { + if (m->offload) { + pre_build_weight_kernel<__nv_bfloat16>( + m, weight, input.data_type, stream); + } + + __nv_bfloat16 const *bias_ptr = + use_bias ? bias.get_bfloat16_ptr() + : static_cast<__nv_bfloat16 const *>(nullptr); + Kernels::TreeIncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_bfloat16_ptr(), + m->offload ? 
static_cast<__nv_bfloat16 *>(m->weight_ptr) + : weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 7c266c5392..ffe57006fc 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -72,7 +72,7 @@ Combine::Combine(FFModel &model, name) {} Combine::Combine(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, int _combine_legion_dim, int _combine_degree, char const *name) @@ -365,6 +365,8 @@ void Combine::forward_task(Task const *task, DataType data_type = m->input_type[0]; if (data_type == DT_HALF) { forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_B16) { + forward_task_with_type<__nv_bfloat16>(task, regions, ctx, runtime); } else if (data_type == DT_FLOAT) { forward_task_with_type(task, regions, ctx, runtime); } else if (data_type == DT_DOUBLE) { diff --git a/src/parallel_ops/kernels/combine_kernels.cu b/src/parallel_ops/kernels/combine_kernels.cu index 1ab79a7944..ca9451ed64 100644 --- a/src/parallel_ops/kernels/combine_kernels.cu +++ b/src/parallel_ops/kernels/combine_kernels.cu @@ -50,6 +50,11 @@ template void forward_kernel(half const *input_ptr, template void forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements); + +template void forward_kernel<__nv_bfloat16>(__nv_bfloat16 const *input_ptr, + __nv_bfloat16 *output_ptr, + size_t num_elements); + template void forward_kernel(double const *input_ptr, double *output_ptr, size_t num_elements); diff --git a/src/parallel_ops/kernels/reduction_kernels.cu b/src/parallel_ops/kernels/reduction_kernels.cu index 34ae8007da..67547c6d17 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cu +++ b/src/parallel_ops/kernels/reduction_kernels.cu @@ -71,6 +71,13 @@ template __global__ void reduction_forward_kernel(half const *input_ptr, half *output_ptr, size_t num_elements, size_t num_replicas); + +template __global__ void + reduction_forward_kernel<__nv_bfloat16>(__nv_bfloat16 const *input_ptr, + __nv_bfloat16 *output_ptr, + size_t num_elements, + size_t num_replicas); + template void forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements, @@ -79,6 +86,12 @@ template void forward_kernel(half const *input_ptr, half *output_ptr, size_t num_elements, size_t num_replicas); + +template void forward_kernel<__nv_bfloat16>(__nv_bfloat16 const *input_ptr, + __nv_bfloat16 *output_ptr, + size_t num_elements, + size_t num_replicas); + template void backward_kernel(float const *output_grad_ptr, float *input_grad_ptr, size_t num_elements); diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index 35bc109bd3..573de9ae2e 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -66,6 +66,9 @@ template void forward_kernel(float const *input_ptr, template void forward_kernel(half const *input_ptr, half *output_ptr, size_t num_elements); +template void forward_kernel<__nv_bfloat16>(__nv_bfloat16 const *input_ptr, + __nv_bfloat16 *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(float const *input_ptr, float *output_ptr, diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index 5dca591328..d53cae330b 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -60,7 +60,7 @@ ReductionParams 
Reduction::get_params() const { } Reduction::Reduction(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, int _reduction_legion_dim, int _reduction_degree, char const *name) @@ -380,6 +380,11 @@ void Reduction::forward_task(Task const *task, output.get_float_ptr(), num_elements, num_replicas); + } else if (input.data_type == DT_B16) { + forward_kernel<__nv_bfloat16>(input.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + num_elements, + num_replicas); } else { assert(false && "Unspported data type"); } diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 20face74e8..d9f7b2263b 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -59,7 +59,7 @@ ReplicateParams Replicate::get_params() const { } Replicate::Replicate(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, int _replicate_legion_dim, int _replicate_degree, char const *name) @@ -373,6 +373,10 @@ void Replicate::forward_task(Task const *task, forward_kernel(input.get_float_ptr(), output.get_float_ptr(), input_domain.get_volume()); + } else if (input.data_type == DT_B16) { + forward_kernel<__nv_bfloat16>(input.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + input_domain.get_volume()); } else { assert(false && "Unspported data type"); } diff --git a/src/runtime/accessor.cc b/src/runtime/accessor.cc index d3b94bf14a..9bb6c08eea 100644 --- a/src/runtime/accessor.cc +++ b/src/runtime/accessor.cc @@ -77,6 +77,15 @@ half const *GenericTensorAccessorR::get_half_ptr() const { } } +__nv_bfloat16 const *GenericTensorAccessorR::get_bfloat16_ptr() const { + if (data_type == DT_B16) { + return static_cast<__nv_bfloat16 const *>(ptr); + } else { + assert(false && "Invalid Accessor Type"); + return static_cast<__nv_bfloat16 const *>(nullptr); + } +} + char const *GenericTensorAccessorR::get_byte_ptr() const { if (data_type == DT_INT4 || data_type == DT_INT8) { return static_cast(ptr); @@ -165,6 +174,15 @@ half *GenericTensorAccessorW::get_half_ptr() const { } } +__nv_bfloat16 *GenericTensorAccessorW::get_bfloat16_ptr() const { + if (data_type == DT_B16) { + return static_cast<__nv_bfloat16 *>(ptr); + } else { + assert(false && "Invalid Accessor Type"); + return static_cast<__nv_bfloat16 *>(nullptr); + } +} + char *GenericTensorAccessorW::get_byte_ptr() const { if (data_type == DT_INT4 || data_type == DT_INT8) { return static_cast(ptr); @@ -271,6 +289,11 @@ GenericTensorAccessorR ptr = helperGetTensorPointerRO(region, req, fid, ctx, runtime); break; } + case DT_B16: { + ptr = helperGetTensorPointerRO<__nv_bfloat16>( + region, req, fid, ctx, runtime); + break; + } case DT_FLOAT: { ptr = helperGetTensorPointerRO(region, req, fid, ctx, runtime); break; @@ -317,6 +340,11 @@ GenericTensorAccessorW ptr = helperGetTensorPointerWO(region, req, fid, ctx, runtime); break; } + case DT_B16: { + ptr = helperGetTensorPointerWO<__nv_bfloat16>( + region, req, fid, ctx, runtime); + break; + } case DT_FLOAT: { ptr = helperGetTensorPointerWO(region, req, fid, ctx, runtime); break; @@ -363,6 +391,11 @@ GenericTensorAccessorW ptr = helperGetTensorPointerRW(region, req, fid, ctx, runtime); break; } + case DT_B16: { + ptr = helperGetTensorPointerRW<__nv_bfloat16>( + region, req, fid, ctx, runtime); + break; + } case DT_FLOAT: { ptr = helperGetTensorPointerRW(region, req, fid, ctx, runtime); break; diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index fa6bf55fe5..91d139a414 100644 --- a/src/runtime/cuda_helper.cu +++ 
b/src/runtime/cuda_helper.cu @@ -320,6 +320,33 @@ __host__ void checkCUDA(cudaFreeHost(host_ptr)); } +template <> +__host__ void save_tensor(__nv_bfloat16 const *ptr, + size_t num_elements, + char const *file_name) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + __nv_bfloat16 *host_ptr; + checkCUDA(cudaHostAlloc(&host_ptr, + sizeof(__nv_bfloat16) * num_elements, + cudaHostAllocPortable | cudaHostAllocMapped)); + checkCUDA(cudaMemcpyAsync(host_ptr, + ptr, + sizeof(__nv_bfloat16) * num_elements, + cudaMemcpyDeviceToHost, + stream)); + checkCUDA(cudaDeviceSynchronize()); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + assert(tensor_file != NULL); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%.9f, ", (float)host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(cudaFreeHost(host_ptr)); +} + template <> __host__ void save_tensor(int32_t const *ptr, size_t num_elements, @@ -519,6 +546,8 @@ cudnnDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { case DT_HALF: return CUDNN_DATA_HALF; + case DT_B16: + return CUDNN_DATA_BFLOAT16; case DT_FLOAT: return CUDNN_DATA_FLOAT; case DT_DOUBLE: @@ -535,6 +564,8 @@ cudaDataType_t ff_to_cuda_datatype(DataType type) { switch (type) { case DT_HALF: return CUDA_R_16F; + case DT_B16: + return CUDA_R_16BF; case DT_FLOAT: return CUDA_R_32F; case DT_DOUBLE: @@ -552,6 +583,8 @@ ncclDataType_t ff_to_nccl_datatype(DataType type) { switch (type) { case DT_HALF: return ncclHalf; + case DT_B16: + return ncclBfloat16; case DT_FLOAT: return ncclFloat; case DT_DOUBLE: @@ -595,6 +628,9 @@ cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type) { template __global__ void assign_kernel(half *ptr, coord_t size, half value); +template __global__ void assign_kernel<__nv_bfloat16>(__nv_bfloat16 *ptr, + coord_t size, + __nv_bfloat16 value); template __global__ void assign_kernel(float *ptr, coord_t size, float value); template __global__ void @@ -700,6 +736,10 @@ template __host__ void save_tensor(int64_t const *ptr, template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); +template __host__ void save_tensor<__nv_bfloat16>(__nv_bfloat16 const *ptr, + size_t rect, + char const *file_name); + template __host__ float *download_tensor(float const *ptr, size_t num_elements); template __host__ half *download_tensor(half const *ptr, diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index c7b6e1257a..764c938dc3 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -213,6 +213,8 @@ size_t data_type_size(DataType type) { switch (type) { case DT_HALF: return sizeof(half); + case DT_B16: + return sizeof(__nv_bfloat16); case DT_FLOAT: return sizeof(float); case DT_DOUBLE: diff --git a/src/runtime/initializer_kernel.cu b/src/runtime/initializer_kernel.cu index b6629ec90b..b6b9e29b8b 100644 --- a/src/runtime/initializer_kernel.cu +++ b/src/runtime/initializer_kernel.cu @@ -235,6 +235,12 @@ void ZeroInitializer::init_task(Task const *task, assign_kernel <<>>( w, domain.get_volume(), 0.0f); + } else if (meta->data_types[i] == DT_B16) { + __nv_bfloat16 *w = helperGetTensorPointerWO<__nv_bfloat16>( + regions[i], task->regions[i], FID_DATA, ctx, runtime); + assign_kernel<__nv_bfloat16> + <<>>( + w, domain.get_volume(), 0.0f); } else if (meta->data_types[i] == DT_FLOAT) { float *w = helperGetTensorPointerWO( regions[i], task->regions[i], FID_DATA, ctx, runtime); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 76bed36bda..f31be1bcdb 
100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1947,6 +1947,9 @@ void FFModel::map_tensor_with_dim2(ParallelTensor tensor, case DT_HALF: allocator.allocate_field(sizeof(half), FID_DATA); break; + case DT_B16: + allocator.allocate_field(sizeof(__nv_bfloat16), FID_DATA); + break; case DT_FLOAT: allocator.allocate_field(sizeof(float), FID_DATA); break; diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index 0b3813f41c..6349bfd1a5 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -62,6 +62,10 @@ void Op::save_inference_tensors_to_file( save_tensor(input_tensors[i].get_half_ptr(), input_tensors[i].domain.get_volume(), filename.c_str()); + } else if (input_tensors[i].data_type == DT_B16) { + save_tensor(input_tensors[i].get_bfloat16_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); } else if (input_tensors[i].data_type == DT_INT32) { save_tensor(input_tensors[i].get_int32_ptr(), input_tensors[i].domain.get_volume(), @@ -86,6 +90,10 @@ void Op::save_inference_tensors_to_file( save_tensor(weight_tensors[i].get_half_ptr(), weight_tensors[i].domain.get_volume(), filename.c_str()); + } else if (weight_tensors[i].data_type == DT_B16) { + save_tensor(weight_tensors[i].get_bfloat16_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); } else if (weight_tensors[i].data_type == DT_INT32) { save_tensor(weight_tensors[i].get_int32_ptr(), weight_tensors[i].domain.get_volume(), @@ -110,6 +118,10 @@ void Op::save_inference_tensors_to_file( save_tensor(output_tensors[i].get_half_ptr(), output_tensors[i].domain.get_volume(), filename.c_str()); + } else if (output_tensors[i].data_type == DT_B16) { + save_tensor(output_tensors[i].get_bfloat16_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); } else if (output_tensors[i].data_type == DT_INT32) { save_tensor(output_tensors[i].get_int32_ptr(), output_tensors[i].domain.get_volume(), From 73569cb9dcb26e63b6a71f3d3d655f5238538465 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 7 Jan 2024 00:48:40 -0500 Subject: [PATCH 02/11] . 
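This commit adds bfloat16 loading paths to the inference FileLoader (load_from_file_b16, load_attention_weights_v2_b16, load_attention_bias_v2_b16). The weight files on disk stay in float32; the bf16 path reads them as float and narrows each value with __float2bfloat16 while copying into the destination buffer. A minimal host-side sketch of that conversion pattern follows (hypothetical names, not the PR's exact code; assumes cuda_bf16.h from CUDA 11+ is on the include path):

    #include <cuda_bf16.h>   // __nv_bfloat16, __float2bfloat16
    #include <cassert>
    #include <cstddef>
    #include <fstream>
    #include <string>
    #include <vector>

    // Read `count` float32 values from `path` and store them as bfloat16 in `dst`.
    void load_bf16_weights(__nv_bfloat16 *dst, size_t count, std::string const &path) {
      std::ifstream in(path, std::ios::in | std::ios::binary);
      assert(in.good() && "incorrect weight file path");
      std::vector<float> host(count);
      in.read(reinterpret_cast<char *>(host.data()), sizeof(float) * count);
      assert(static_cast<size_t>(in.gcount()) == sizeof(float) * count);
      for (size_t i = 0; i < count; ++i) {
        dst[i] = __float2bfloat16(host[i]); // fp32 -> bf16 narrowing conversion
      }
    }

The attention weight and bias loaders below follow the same fp32-on-disk, bf16-in-memory convention, converting element by element per head/partition.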
--- inference/file_loader.cc | 336 ++++++++++++++++++++++++++++++++++++++- inference/file_loader.h | 9 +- 2 files changed, 342 insertions(+), 3 deletions(-) diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 7c6870d439..9666a60358 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -125,6 +125,335 @@ void load_attention_weights_multi_query(DT *ptr, } } +///////////////////////bfloat16 function/////////////////////// + +void load_from_file_b16(__nv_bfloat16 *ptr, size_t size, std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector host_array(size); + size_t loaded_data_size = sizeof(float) * size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != loaded_data_size) { + std::cout << "load weight data error " << in_get_size << ", " + << loaded_data_size << ", " << sizeof(float) << ", file path: " << filepath << std::endl; + assert(false); + } + assert(size == host_array.size()); + + // normal + long data_index = 0; + for (auto v : host_array) { + ptr[data_index++] = __float2bfloat16(v); + } + in.close(); +} + +void load_attention_weights_v2_b16(__nv_bfloat16 *ptr, + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weights_folder, + size_t volume, + int tensor_parallelism_degree) { + // layers_0_attention_wq_weight + // layers_0_self_attn_q_proj_weight + std::string q_file = layer_name + "_wq_weight"; + std::string k_file = layer_name + "_wk_weight"; + std::string v_file = layer_name + "_wv_weight"; + std::string o_file = layer_name + "_wo_weight"; + std::vector weight_filenames = {q_file, k_file, v_file}; + int file_index = 0; + + int base_index = 0; + size_t single_proj_size = + hidden_dim * + qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + size_t one_weight_file_size = + num_heads * single_proj_size; // size of each of Q/K/V/O for all heads + + size_t q_size = one_weight_file_size, o_size = one_weight_file_size; + size_t k_size = single_proj_size * num_kv_heads, + v_size = single_proj_size * num_kv_heads; + + size_t k_replicate_size = one_weight_file_size; + size_t v_replicate_size = one_weight_file_size; + + int replicate_num = num_heads / num_kv_heads; + + // stride for q, k, v, o + size_t stride_size = (q_size + v_replicate_size + k_replicate_size + o_size) / + tensor_parallelism_degree; + for (auto filename : weight_filenames) { + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + + int data_index = 0; + size_t partial_size = (file_index == 0 || file_index == 3) + ? 
one_weight_file_size + : single_proj_size * num_kv_heads; + size_t one_partition_size = + one_weight_file_size / tensor_parallelism_degree; + + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << weight_filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector host_array(partial_size); + size_t loaded_data_size = sizeof(float) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load attention data error " << in_get_size << ", " + << loaded_data_size << ", " << file_index << ", " + << weight_filepath << "\n"; + assert(false && "data size mismatch"); + } + // wq, wk, wo + if (file_index == 0) { + for (int i = 0; i < tensor_parallelism_degree; i++) { + for (int j = 0; j < one_partition_size; j++) { + ptr[base_index + i * stride_size + j] = + __float2bfloat16(host_array.at(data_index++)); + } + } + } else { + for (int i = 0; i < num_heads; i++) { + int kv_idx = i / (num_heads / num_kv_heads); + int head_idx = i % (num_heads / tensor_parallelism_degree); + int tp_idx = (i / (num_heads / tensor_parallelism_degree)); + for (int j = 0; j < single_proj_size; j++) { + ptr[base_index + tp_idx * stride_size + single_proj_size * head_idx + + j] = + __float2bfloat16(host_array.at(kv_idx * single_proj_size + j)); + } + } + } + + // assert(data_index == partial_size); + base_index += one_partition_size; + file_index++; + } + assert(base_index == (q_size + k_replicate_size + v_replicate_size) / + tensor_parallelism_degree); + + { + std::cout << "Loading weight file " << o_file << std::endl; + std::string weight_filepath = join_path({weights_folder, o_file}); + + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << weight_filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector host_array(one_weight_file_size); + size_t loaded_data_size = sizeof(float) * one_weight_file_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load data error" << std::endl; + assert(false); + } + assert(one_weight_file_size == host_array.size()); + int data_index = 0; + + int one_partition_size = + qkv_inner_dim * (num_heads / tensor_parallelism_degree); + for (int i = 0; i < one_weight_file_size; i++) { + int part_idx = (i / one_partition_size) % tensor_parallelism_degree; + int block_num = (i / one_partition_size); + int offset = block_num / tensor_parallelism_degree * one_partition_size + + (i % one_partition_size); + ptr[base_index + part_idx * stride_size + offset] = + __float2bfloat16(host_array.at(data_index++)); + } + + in.close(); + + assert(data_index == one_weight_file_size); + } +} + +void load_attention_bias_v2_b16(__nv_bfloat16 *ptr, + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + bool final_bias, + std::string layer_name, + std::string weights_folder) { + std::string q_file = layer_name + "_wq_bias"; + std::string k_file = layer_name + "_wk_bias"; + std::string v_file = layer_name + "_wv_bias"; + std::vector bias_files = {q_file, k_file, v_file}; + if (final_bias) { + std::string o_file = layer_name + "_wo_bias"; + bias_files.push_back(o_file); + } + 
+ int file_index = 0; + + // now only opt use this. + // assert(num_heads == num_kv_heads); + int idx = 0; + + for (auto filename : bias_files) { + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + + int n_heads = file_index == 0 ? num_heads : num_kv_heads; + + int replicate_num = num_heads / num_kv_heads; + + size_t qkv_partial_size = qkv_inner_dim * n_heads; + size_t qkv_replicate_size = qkv_inner_dim * num_heads; + size_t out_partial_size = hidden_dim; + size_t partial_size = + (file_index < 3) ? qkv_partial_size : out_partial_size; + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + assert(in.good() && "incorrect bias file path"); + std::vector host_array(partial_size); + size_t loaded_data_size = sizeof(float) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + printf( + "load bias data error: in_get_size (%lu) != loaded_data_size (%lu)\n", + in_get_size, + loaded_data_size); + assert(false); + } + assert(partial_size == host_array.size()); + + size_t data_index = 0; + + // q, o + if (file_index == 0 || file_index == 3) { + for (int i = 0; i < partial_size; i++) { + ptr[idx + i] = __float2bfloat16(host_array.at(data_index)); + data_index++; + } + } else { + // k, v + for (int i = 0; i < partial_size; i++) { + for (int j = 0; j < replicate_num; j++) { + ptr[idx + j * partial_size + i] = + __float2bfloat16(host_array.at(data_index)); + } + data_index++; + } + } + + file_index++; + idx += qkv_replicate_size; + + in.close(); + } +} + +void FileDataLoader::load_single_weight_tensor_b16(FFModel *ff, + Layer *l, + int weight_idx) { + Tensor weight = l->weights[weight_idx]; + + + // Create a buffer to store weight data from the file + size_t volume = 1; + std::vector dims_vec; + for (int i = 0; i < weight->num_dims; i++) { + dims_vec.push_back(weight->dims[i]); + volume *= weight->dims[i]; + } + assert(data_type_size(weight->data_type) == sizeof(__nv_bfloat16)); + __nv_bfloat16 *data = (__nv_bfloat16 *)malloc(sizeof(__nv_bfloat16) * volume); + + std::string weight_filename = removeGuidOperatorName(std::string(l->name)); + + if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { + if (weight_filename.find("self_attention") != std::string::npos) { + load_attention_weights_multi_query( + data, weight_filename, weights_folder, hidden_dim, num_heads); + } else if (weight_filename.find("attention") != std::string::npos && + weight_filename.rfind("attention") == + weight_filename.length() - strlen("attention")) { + if (weight_idx == 0) { + load_attention_weights_v2_b16(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); + } else { + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2_b16(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + final_bias, + weight_filename, + weights_folder); + } + + } else { + assert(false); + } + } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { + assert(weight_idx >= 0 || weight_idx <= 2); + weight_filename += (weight_idx == 0) + ? "_attn_bias" + : ((weight_idx == 1) ? 
"_weight" : "_bias"); + std::cout << "Loading weight file " << weight_filename << std::endl; + std::string weight_filepath = join_path({weights_folder, weight_filename}); + load_from_file_b16(data, volume, weight_filepath); + } else { + // default op + assert(weight_idx == 0 || weight_idx == 1); + // handle exception + if (weight_filename != "embed_tokens_weight_lm_head") { + weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + } + std::cout << "Loading weight file " << weight_filename << std::endl; + std::string weight_filepath = join_path({weights_folder, weight_filename}); + load_from_file_b16(data, volume, weight_filepath); + } + + // Copy the weight data from the buffer to the weight's ParallelTensor + ParallelTensor weight_pt; + ff->get_parallel_tensor_from_tensor(weight, weight_pt); + weight_pt->set_tensor<__nv_bfloat16>(ff, dims_vec, data); + + // Free buffer memory + delete data; +} + +///////////////////////////bdlot16 functions//////////// + template void load_attention_bias_v2(DT *ptr, int num_heads, @@ -339,7 +668,9 @@ void load_attention_weights_v2(DT *ptr, } template -void load_from_file(DT *ptr, size_t size, std::string filepath) { +void FileDataLoader::load_from_file(DT *ptr, + size_t size, + std::string filepath) { std::ifstream in(filepath, std::ios::in | std::ios::binary); if (!in.good()) { std::cout << "Could not open file: " << filepath << std::endl; @@ -806,6 +1137,9 @@ void FileDataLoader::load_weights(FFModel *ff, bool use_full_precision) { case DT_FLOAT: load_single_weight_tensor(ff, l, i); break; + case DT_B16: + load_single_weight_tensor_b16(ff, l, i); + break; case DT_INT4: case DT_INT8: // load weights in quantization diff --git a/inference/file_loader.h b/inference/file_loader.h index 6f01a79b80..579c6d4239 100644 --- a/inference/file_loader.h +++ b/inference/file_loader.h @@ -37,6 +37,8 @@ class FileDataLoader { template void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); + void load_single_weight_tensor_b16(FFModel *ff, Layer *l, int weight_idx); + void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx, @@ -48,9 +50,12 @@ class FileDataLoader { ParallelTensor position_pt, int max_seq_length, int offset); + template + void load_from_file(DT *ptr, size_t size, std::string filepath); -private: - int num_heads, num_kv_heads, tensor_parallelism_degree; + private : int num_heads, + num_kv_heads, + tensor_parallelism_degree; size_t hidden_dim, qkv_inner_dim; std::string prompts_filepath; std::string weights_folder; From 0f8337e41282cab284418bef06f9100cdd1010f3 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 7 Jan 2024 14:53:26 -0500 Subject: [PATCH 03/11] fix --- inference/file_loader.cc | 7 ++- inference/incr_decoding/incr_decoding.cc | 31 ++++++---- inference/models/falcon.cc | 6 +- inference/models/falcon.h | 2 +- inference/models/llama.cc | 2 +- inference/models/mpt.cc | 6 +- inference/models/mpt.h | 2 +- inference/models/opt.cc | 8 +-- inference/models/opt.h | 2 +- inference/models/starcoder.cc | 8 +-- inference/models/starcoder.h | 2 +- inference/spec_infer/spec_infer.cc | 52 ++++++++++------ src/ops/inc_multihead_self_attention.cu | 63 ++++++++++++-------- src/ops/kernels/linear_kernels.cu | 42 ++++++------- src/ops/spec_inc_multihead_self_attention.cu | 4 +- src/runtime/parallel_tensor.cc | 6 ++ 16 files changed, 145 insertions(+), 98 deletions(-) diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 9666a60358..9f9bce56b7 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ 
-142,7 +142,8 @@ void load_from_file_b16(__nv_bfloat16 *ptr, size_t size, std::string filepath) { size_t in_get_size = in.gcount(); if (in_get_size != loaded_data_size) { std::cout << "load weight data error " << in_get_size << ", " - << loaded_data_size << ", " << sizeof(float) << ", file path: " << filepath << std::endl; + << loaded_data_size << ", " << sizeof(float) + << ", file path: " << filepath << std::endl; assert(false); } assert(size == host_array.size()); @@ -373,7 +374,6 @@ void FileDataLoader::load_single_weight_tensor_b16(FFModel *ff, Layer *l, int weight_idx) { Tensor weight = l->weights[weight_idx]; - // Create a buffer to store weight data from the file size_t volume = 1; @@ -685,7 +685,8 @@ void FileDataLoader::load_from_file(DT *ptr, size_t in_get_size = in.gcount(); if (in_get_size != loaded_data_size) { std::cout << "load weight data error " << in_get_size << ", " - << loaded_data_size << ", " << sizeof(DT) << std::endl; + << loaded_data_size << ", " << sizeof(DT) + << ", filepath: " << filepath << std::endl; assert(false); } assert(size == host_array.size()); diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 61652b3ae3..e11eb14113 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -41,6 +41,7 @@ void parse_input_args(char **argv, std::string &llm_model_name, DataType &data_type, bool &use_full_precision, + bool &use_bfloat16_precision, bool &verbose, bool &do_sample, float &temperature, @@ -73,9 +74,15 @@ void parse_input_args(char **argv, continue; } if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; data_type = DT_FLOAT; continue; } + if (!strcmp(argv[i], "--use-bfloat16-precision")) { + use_bfloat16_precision = true; + data_type = DT_B16; + continue; + } // verbose logging to stdout if (!strcmp(argv[i], "--verbose")) { verbose = true; @@ -126,8 +133,9 @@ void FlexFlow::top_level_task(Task const *task, } FilePaths file_paths; std::string llm_model_name; - DataType data_type = DT_B16; + DataType data_type = DT_HALF; bool use_full_precision = false; + bool use_bfloat16_precision = false; bool verbose = false; bool do_sample = false; float temperature = 0.0f; @@ -145,6 +153,7 @@ void FlexFlow::top_level_task(Task const *task, llm_model_name, data_type, use_full_precision, + use_bfloat16_precision, verbose, do_sample, temperature, @@ -161,11 +170,13 @@ void FlexFlow::top_level_task(Task const *task, {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); std::string tokenizer_filepath = join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); - std::string weights_filepath = - join_path({file_paths.cache_folder_path, - "weights", - llm_model_name, - use_full_precision ? "full-precision" : "half-precision"}); + std::string weights_filepath = join_path( + {file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision + ? "full-precision" + : (use_bfloat16_precision ? "full-precision" : "half-precision")}); std::ifstream config_file_handle(config_filepath); if (!config_file_handle.good()) { std::cout << "Model config file " << config_filepath << " not found." 
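With the flag handling above, both --use-full-precision (DT_FLOAT) and --use-bfloat16-precision (DT_B16) resolve to the "full-precision" weights folder, since the bfloat16 path down-converts the float32 files at load time; only the default DT_HALF path reads "half-precision". A hypothetical helper that captures just that folder selection (not code from this patch) might look like:

#include "flexflow/ffconst.h"
#include <string>

// Illustrative only: bfloat16 shares the fp32 weight files and converts on load.
std::string weights_subfolder(DataType dt) {
  switch (dt) {
    case DT_FLOAT:
    case DT_B16:
      return "full-precision";
    default:
      return "half-precision";
  }
}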
@@ -228,27 +239,27 @@ void FlexFlow::top_level_task(Task const *task, config_filepath, weights_filepath, INC_DECODING_MODE, - use_full_precision); + data_type); } else if (model_type == ModelType::FALCON) { FALCON::create_falcon_model(model, config_filepath, weights_filepath, INC_DECODING_MODE, - use_full_precision); + data_type); } else if (model_type == ModelType::STARCODER) { STARCODER::create_starcoder_model(model, config_filepath, weights_filepath, INC_DECODING_MODE, generationConfig, - use_full_precision); + data_type); } else if (model_type == ModelType::MPT) { MPT::create_mpt_model(model, config_filepath, weights_filepath, INC_DECODING_MODE, generationConfig, - use_full_precision); + data_type); } else { assert(false && "unknow model type"); } diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 999ca37037..998abd7ae6 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -24,7 +24,7 @@ void FALCON::create_falcon_model(FFModel &ff, std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - bool use_full_precision) { + DataType data_type) { FalconConfig falcon_config(model_config_file_path); falcon_config.print(); @@ -53,7 +53,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.vocab_size, falcon_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "word_embeddings"); @@ -251,7 +251,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.hidden_size / falcon_config.n_head, ff.config.tensor_parallelism_degree); std::cout << "------load weights ----------" << std::endl; - fileloader.load_weights(&ff, use_full_precision); + fileloader.load_weights(&ff, false); std::cout << "------load weight finished----------" << std::endl; // init operators diff --git a/inference/models/falcon.h b/inference/models/falcon.h index 01226a30dc..b87d88fd0f 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -94,7 +94,7 @@ class FALCON { std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - bool use_full_precision = false); + DataType DataType); }; }; // namespace FlexFlow diff --git a/inference/models/llama.cc b/inference/models/llama.cc index f4655b1750..40075bcdbf 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -276,7 +276,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, data_type); + fileloader.load_weights(&ff, false); std::cout << "------load weight finished----------" << std::endl; // init operators diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 3df67b264c..74a275699c 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -25,7 +25,7 @@ void MPT::create_mpt_model(FFModel &ff, std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - bool use_full_precision) { + DataType data_type) { MPTConfig mpt_config(model_config_file_path); mpt_config.print(); @@ -54,7 +54,7 @@ void MPT::create_mpt_model(FFModel &ff, mpt_config.vocab_size, mpt_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? 
DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "transformer_wte"); @@ -257,7 +257,7 @@ void MPT::create_mpt_model(FFModel &ff, mpt_config.hidden_size, mpt_config.hidden_size / mpt_config.n_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); + fileloader.load_weights(&ff, false); im->init_operators_inference(&ff); } diff --git a/inference/models/mpt.h b/inference/models/mpt.h index 1969cd9c89..24c2ab9999 100644 --- a/inference/models/mpt.h +++ b/inference/models/mpt.h @@ -70,7 +70,7 @@ class MPT { std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - bool use_full_precision = false); + DataType DataType); }; }; // namespace FlexFlow diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 0279f83239..3c771fb005 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -24,7 +24,7 @@ void OPT::create_opt_model(FFModel &ff, std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - bool use_full_precision) { + DataType data_type) { OPTConfig opt_config(model_config_file_path); opt_config.print(); @@ -57,7 +57,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.vocab_size, opt_config.word_embed_proj_dim, AGGR_MODE_NONE, - use_full_precision ? DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "embed_tokens"); @@ -67,7 +67,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.max_position_embeddings, opt_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "embed_positions"); @@ -267,7 +267,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.hidden_size / opt_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); + fileloader.load_weights(&ff, false); std::cout << "------finished loading weights----------" << std::endl; im->init_operators_inference(&ff); } diff --git a/inference/models/opt.h b/inference/models/opt.h index 1ffe096bca..365cc3d512 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -97,7 +97,7 @@ class OPT { std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - bool use_full_precision = false); + DataType DataType); }; }; // namespace FlexFlow diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index e683376e47..5714ca1369 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -26,7 +26,7 @@ void STARCODER::create_starcoder_model( std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - bool use_full_precision) { + DataType data_type) { // do not apply cpu offload in beam search model. STARCODERConfig startcoder_config(model_config_file_path); startcoder_config.print(); @@ -62,7 +62,7 @@ void STARCODER::create_starcoder_model( startcoder_config.vocab_size, startcoder_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "transformer_wte"); @@ -72,7 +72,7 @@ void STARCODER::create_starcoder_model( startcoder_config.max_position_embeddings, startcoder_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? 
DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "transformer_wpe"); @@ -232,7 +232,7 @@ void STARCODER::create_starcoder_model( startcoder_config.hidden_size / startcoder_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); + fileloader.load_weights(&ff, false); std::cout << "------load weight finished----------" << std::endl; // init operators diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h index bc113e4d52..c37b7af5af 100644 --- a/inference/models/starcoder.h +++ b/inference/models/starcoder.h @@ -71,7 +71,7 @@ class STARCODER { std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - bool use_full_precision = false); + DataType DataType); }; }; // namespace FlexFlow diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 8d3d23ae4e..585a4a7ee7 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -57,7 +57,9 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, ModelNames &model_names, + DataType &data_type, bool &use_full_precision, + bool &use_bfloat16_precision, bool &verbose, int &max_requests_per_batch, int &max_tokens_per_batch, @@ -99,6 +101,10 @@ void parse_input_args(char **argv, use_full_precision = true; continue; } + if (!strcmp(argv[i], "--use-bfloat16-precision")) { + use_bfloat16_precision = true; + data_type = DT_B16; + } // verbose logging to stdout if (!strcmp(argv[i], "--verbose")) { verbose = true; @@ -129,7 +135,8 @@ void parse_input_args(char **argv, void get_model_meta(FilePaths &file_paths, ModelMeta &model_metadata, - bool use_full_precision) { + bool use_full_precision, + bool use_bfloat16_precision) { if (model_metadata.model_names.llm_model_name.empty() || model_metadata.model_names.ssm_model_names.size() == 0) { assert(false && "SpecInfer needs at least one LLM and one SSM for " @@ -144,11 +151,13 @@ void get_model_meta(FilePaths &file_paths, join_path({file_paths.cache_folder_path, "tokenizers", model_metadata.model_names.llm_model_name}); - model_metadata.llm_weights_path = - join_path({file_paths.cache_folder_path, - "weights", - model_metadata.model_names.llm_model_name, - use_full_precision ? "full-precision" : "half-precision"}); + model_metadata.llm_weights_path = join_path( + {file_paths.cache_folder_path, + "weights", + model_metadata.model_names.llm_model_name, + use_full_precision + ? "full-precision" + : (use_bfloat16_precision ? "full-precision" : "half-precision")}); std::ifstream llm_config_file_handle(model_metadata.llm_model_config_path); if (!llm_config_file_handle.good()) { @@ -195,11 +204,13 @@ void get_model_meta(FilePaths &file_paths, "config.json"}); std::string ssm_tokenizer_path = join_path({file_paths.cache_folder_path, "tokenizers", ssm_model_name}); - std::string ssm_weights_path = - join_path({file_paths.cache_folder_path, - "weights", - ssm_model_name, - use_full_precision ? "full-precision" : "half-precision"}); + std::string ssm_weights_path = join_path( + {file_paths.cache_folder_path, + "weights", + ssm_model_name, + use_full_precision + ? "full-precision" + : (use_bfloat16_precision ? 
"full-precision" : "half-precision")}); std::ifstream ssm_config_file_handle(ssm_config_path); if (!ssm_config_file_handle.good()) { @@ -264,8 +275,9 @@ void FlexFlow::top_level_task(Task const *task, FFConfig ffconfig; FilePaths file_paths; ModelMeta model_metadata; - DataType data_type = DT_B16; + DataType data_type = DT_HALF; bool use_full_precision = false; + bool use_bfloat16_precision = false; bool verbose = false; int max_requests_per_batch = 16; int max_tokens_per_batch = 256; @@ -278,13 +290,15 @@ void FlexFlow::top_level_task(Task const *task, argc, file_paths, model_metadata.model_names, + data_type, use_full_precision, + use_bfloat16_precision, verbose, max_requests_per_batch, max_tokens_per_batch, max_sequence_length); - get_model_meta(file_paths, model_metadata, use_full_precision); + get_model_meta(file_paths, model_metadata, use_full_precision, use_bfloat16_precision); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == @@ -320,20 +334,20 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.llm_model_config_path, model_metadata.llm_weights_path, TREE_VERIFY_MODE, - use_full_precision); + data_type); } else if (model_metadata.llm_model_type == ModelType::FALCON) { FALCON::create_falcon_model(tree_model, model_metadata.llm_model_config_path, model_metadata.llm_weights_path, TREE_VERIFY_MODE, - use_full_precision); + data_type); } else if (model_metadata.llm_model_type == ModelType::MPT) { MPT::create_mpt_model(tree_model, model_metadata.llm_model_config_path, model_metadata.llm_weights_path, TREE_VERIFY_MODE, generationConfig, - use_full_precision); + data_type); } else { assert(false && "Invalid LLM model type passed (or no type was passed)."); } @@ -364,21 +378,21 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], BEAM_SEARCH_MODE, - use_full_precision); + data_type); } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::FALCON) { FALCON::create_falcon_model( beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], BEAM_SEARCH_MODE, - use_full_precision); + data_type); } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::MPT) { MPT::create_mpt_model(beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], BEAM_SEARCH_MODE, generationConfig, - use_full_precision); + data_type); } else { assert(false && "Invalid SSM model type passed."); } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 4e8f97650a..37ee080eec 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -506,12 +506,14 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } else if (m->output_type[0] == DT_B16) { + compute_type = CUBLAS_COMPUTE_32F; } #endif // Step 1: Compute QKV projections { - DT alpha = 1.0f, beta = 0.0f; + float alpha = 1.0f, beta = 0.0f; // after transpositions int m_q = m->qProjSize * m->num_q_heads; int m_k = m->kProjSize * m->num_q_heads; @@ -637,10 +639,13 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else cudaDataType_t compute_type = cublas_data_type; + if (m->output_type[0] 
== DT_B16) { + compute_type = CUDA_R_32F; + } #endif // Project to output, save result directly on output tensor { - DT alpha = 1.0f, beta = 0.0f; + float alpha = 1.0f, beta = 0.0f; // after transpositions int m_ = m->oProjSize; int k = m->vProjSize * m->num_q_heads; @@ -920,6 +925,8 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + }else if (m->output_type[0] == DT_B16) { + compute_type = CUBLAS_COMPUTE_32F; } #endif // int num_requests = bc->num_active_requests(); @@ -944,7 +951,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // Step 1: compute query-key product QK.T/sqrt(d_k) { // Scale by sqrt(d_k) as per the original attention paper - DT alpha = 1.0f, beta = 0.0f; + float alpha = 1.0f, beta = 0.0f; if (*m->qk_prod_scaling) { alpha = static_cast
(1.0f / sqrt(m->kProjSize)); } @@ -1075,7 +1082,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ // softmax(QK.T/sqrt(d_k)).T { - DT alpha = 1.0f, beta = 0.0f; + float alpha = 1.0f, beta = 0.0f; // after transpositions int m_ = m->vProjSize; int n = num_new_tokens; @@ -1191,12 +1198,14 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - }else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_B16) { if (m->offload) { - pre_build_weight_kernel<__nv_bfloat16>(m, weight, input.data_type, stream); + pre_build_weight_kernel<__nv_bfloat16>( + m, weight, input.data_type, stream); } __nv_bfloat16 const *bias_ptr = - use_bias ? bias.get_bfloat16_ptr() : static_cast<__nv_bfloat16 const *>(nullptr); + use_bias ? bias.get_bfloat16_ptr() + : static_cast<__nv_bfloat16 const *>(nullptr); Kernels::IncMultiHeadAttention::inference_kernel( m, bc, @@ -1515,11 +1524,12 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( DataType data_type, cudaStream_t stream); -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel<__nv_bfloat16>( - IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - cudaStream_t stream); +template void + Kernels::IncMultiHeadAttention::pre_build_weight_kernel<__nv_bfloat16>( + IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream); template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( IncMultiHeadSelfAttentionMeta const *m, @@ -1541,15 +1551,16 @@ template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( int num_tokens, cudaStream_t stream); -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias<__nv_bfloat16>( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - __nv_bfloat16 *output_ptr, - __nv_bfloat16 const *weight_ptr, - __nv_bfloat16 const *bias_ptr, - int num_tokens, - cudaStream_t stream); +template void + Kernels::IncMultiHeadAttention::compute_o_prod_bias<__nv_bfloat16>( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + __nv_bfloat16 *output_ptr, + __nv_bfloat16 const *weight_ptr, + __nv_bfloat16 const *bias_ptr, + int num_tokens, + cudaStream_t stream); template void Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( @@ -1566,9 +1577,9 @@ template void cudaStream_t stream); template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation<__nv_bfloat16>( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - __nv_bfloat16 *output_ptr, - cudaStream_t stream); + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation< + __nv_bfloat16>(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + __nv_bfloat16 *output_ptr, + cudaStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 87a884cb99..db75a5bf32 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -160,14 +160,14 @@ void forward_kernel_wrapper(LinearMeta const *m, stream); } else if (m->input_type[0] == DT_B16) { Internal::forward_kernel<__nv_bfloat16>(m, - input_ptr, - output_ptr, - weight_ptr, - bias_ptr, - in_dim, - out_dim, - batch_size, - stream); + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, 
+ batch_size, + stream); } if (m->profiling) { @@ -234,17 +234,17 @@ void backward_kernel_wrapper(LinearMeta const *m, stream); } else if (m->input_type[0] == DT_B16) { Internal::backward_kernel<__nv_bfloat16>(m, - input_ptr, - input_grad_ptr, - output_ptr, - output_grad_ptr, - kernel_ptr, - kernel_grad_ptr, - bias_grad_ptr, - in_dim, - out_dim, - batch_size, - stream); + input_ptr, + input_grad_ptr, + output_ptr, + output_grad_ptr, + kernel_ptr, + kernel_grad_ptr, + bias_grad_ptr, + in_dim, + out_dim, + batch_size, + stream); } if (m->profiling) { @@ -333,7 +333,7 @@ void forward_kernel(LinearMeta const *m, } checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; + float alpha = 1.0f, beta = 0.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = m->offload ? ff_to_cuda_datatype(m->weight_ptr_type) @@ -349,6 +349,8 @@ void forward_kernel(LinearMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } else if (m->output_type[0] == DT_B16) { + compute_type = CUBLAS_COMPUTE_32F; } #endif checkCUDA(cublasGemmEx(m->handle.blas, diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 0b978c23a6..cc1a08ab9b 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -480,6 +480,8 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + }else if (m->output_type[0] == DT_B16) { + compute_type = CUBLAS_COMPUTE_32F; } #endif // int num_requests = bc->num_active_requests(); @@ -533,7 +535,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, int strideC = num_new_tokens * total_tokens; // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; + float alpha = 1.0f, beta = 0.0f; if (*m->qk_prod_scaling) { alpha = static_cast
(1.0f / sqrt(m->kProjSize)); } diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 8f1be15fd1..e329593970 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -847,6 +847,12 @@ template bool ParallelTensorBase::get_tensor(FFModel const *ff, half *data, bool get_gradients); +template bool ParallelTensorBase::set_tensor<__nv_bfloat16>( + FFModel const *ff, std::vector const &dims, __nv_bfloat16 const *data); +template bool ParallelTensorBase::get_tensor<__nv_bfloat16>(FFModel const *ff, + __nv_bfloat16 *data, + bool get_gradients); + template bool ParallelTensorBase::set_tensor(FFModel const *ff, std::vector const &dims, char const *data); From badbf53daa6a64b3e2b8ca362240e074f8310072 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 7 Jan 2024 15:02:03 -0500 Subject: [PATCH 04/11] . --- inference/spec_infer/spec_infer.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 585a4a7ee7..b5bd63d4b7 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -99,6 +99,7 @@ void parse_input_args(char **argv, } if (!strcmp(argv[i], "--use-full-precision")) { use_full_precision = true; + data_type = DT_FLOAT; continue; } if (!strcmp(argv[i], "--use-bfloat16-precision")) { From fbdaba21951e8b508af6328112d654ec326ed863 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 15 Jan 2024 11:22:08 -0500 Subject: [PATCH 05/11] hip --- include/flexflow/accessor.h | 12 +- include/flexflow/utils/cuda_helper.h | 14 ++ inference/file_loader.cc | 12 +- src/ops/arg_topk.cpp | 12 ++ src/ops/argmax.cpp | 12 +- src/ops/element_unary.cpp | 6 + src/ops/fused.cpp | 16 ++ src/ops/inc_multihead_self_attention.cpp | 24 +++ src/ops/inc_multihead_self_attention.cu | 14 +- src/ops/kernels/decompress_kernels.cpp | 23 ++- src/ops/kernels/element_binary_kernels.cpp | 20 ++- src/ops/kernels/embedding_kernels.cpp | 154 +++++++++++++++++- src/ops/kernels/linear_kernels.cpp | 23 +++ src/ops/kernels/linear_kernels.cu | 3 +- src/ops/kernels/rms_norm_kernels.cpp | 6 + src/ops/kernels/rms_norm_kernels.cu | 2 +- src/ops/kernels/softmax.cpp | 11 +- src/ops/layer_norm.cpp | 9 + src/ops/linear.cc | 16 +- src/ops/residual_layer_norm.cpp | 12 ++ src/ops/residual_layer_norm.cu | 5 +- src/ops/sigmoid_silu_multi.cpp | 8 + src/ops/softmax.cc | 8 +- src/ops/spec_inc_multihead_self_attention.cpp | 13 ++ src/ops/spec_inc_multihead_self_attention.cu | 10 +- src/ops/tree_inc_multihead_self_attention.cpp | 18 ++ src/parallel_ops/combine.cc | 4 +- src/parallel_ops/kernels/combine_kernels.cpp | 3 + .../kernels/reduction_kernels.cpp | 10 ++ .../kernels/replicate_kernels.cpp | 3 + src/parallel_ops/reduction.cc | 4 +- src/parallel_ops/replicate.cc | 4 +- src/runtime/accessor.cc | 18 +- src/runtime/ffconst_utils.cc | 2 +- src/runtime/hip_helper.cpp | 42 ++++- src/runtime/initializer_kernel.cpp | 11 ++ src/runtime/model.cc | 2 +- src/runtime/parallel_tensor.cc | 8 +- 38 files changed, 511 insertions(+), 63 deletions(-) diff --git a/include/flexflow/accessor.h b/include/flexflow/accessor.h index 26ed006646..121140c926 100644 --- a/include/flexflow/accessor.h +++ b/include/flexflow/accessor.h @@ -8,7 +8,7 @@ #include #elif defined(FF_USE_HIP_CUDA) #include -#include +#include #elif defined(FF_USE_HIP_ROCM) #include #include @@ -18,6 +18,12 @@ namespace FlexFlow { +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +typedef __nv_bfloat16 __ff_bfloat16; +#elif defined(FF_USE_HIP_ROCM) +typedef 
hip_bfloat16 __ff_bfloat16; +#endif + template using AccessorRO = Legion::FieldAccessor>; @@ -64,7 +70,7 @@ class GenericTensorAccessorW { float *get_float_ptr() const; double *get_double_ptr() const; half *get_half_ptr() const; - __nv_bfloat16 *get_bfloat16_ptr() const; + __ff_bfloat16 *get_bfloat16_ptr() const; char *get_byte_ptr() const; DataType data_type; Legion::Domain domain; @@ -84,7 +90,7 @@ class GenericTensorAccessorR { float const *get_float_ptr() const; double const *get_double_ptr() const; half const *get_half_ptr() const; - __nv_bfloat16 const *get_bfloat16_ptr() const; + __ff_bfloat16 const *get_bfloat16_ptr() const; char const *get_byte_ptr() const; DataType data_type; Legion::Domain domain; diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index f8bf67b3e1..f3038860d7 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -161,6 +161,20 @@ T *download_tensor(T const *ptr, size_t num_elements); template bool download_tensor(T const *ptr, T *dst, size_t num_elements); +// data type for cublasgemm +template +struct cublasAlphaBetaType { + using type = float; // default +}; +template <> +struct cublasAlphaBetaType { + using type = half; +}; +template <> +struct cublasAlphaBetaType<__nv_bfloat16> { + using type = float; +}; + cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain, DataType data_type = DT_FLOAT); diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 9f9bce56b7..60b2c80b06 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -127,7 +127,7 @@ void load_attention_weights_multi_query(DT *ptr, ///////////////////////bfloat16 function/////////////////////// -void load_from_file_b16(__nv_bfloat16 *ptr, size_t size, std::string filepath) { +void load_from_file_b16(__ff_bfloat16 *ptr, size_t size, std::string filepath) { std::ifstream in(filepath, std::ios::in | std::ios::binary); if (!in.good()) { std::cout << "Could not open file: " << filepath << std::endl; @@ -156,7 +156,7 @@ void load_from_file_b16(__nv_bfloat16 *ptr, size_t size, std::string filepath) { in.close(); } -void load_attention_weights_v2_b16(__nv_bfloat16 *ptr, +void load_attention_weights_v2_b16(__ff_bfloat16 *ptr, int num_heads, int num_kv_heads, size_t hidden_dim, @@ -290,7 +290,7 @@ void load_attention_weights_v2_b16(__nv_bfloat16 *ptr, } } -void load_attention_bias_v2_b16(__nv_bfloat16 *ptr, +void load_attention_bias_v2_b16(__ff_bfloat16 *ptr, int num_heads, int num_kv_heads, size_t hidden_dim, @@ -382,8 +382,8 @@ void FileDataLoader::load_single_weight_tensor_b16(FFModel *ff, dims_vec.push_back(weight->dims[i]); volume *= weight->dims[i]; } - assert(data_type_size(weight->data_type) == sizeof(__nv_bfloat16)); - __nv_bfloat16 *data = (__nv_bfloat16 *)malloc(sizeof(__nv_bfloat16) * volume); + assert(data_type_size(weight->data_type) == sizeof(__ff_bfloat16)); + __ff_bfloat16 *data = (__ff_bfloat16 *)malloc(sizeof(__ff_bfloat16) * volume); std::string weight_filename = removeGuidOperatorName(std::string(l->name)); @@ -446,7 +446,7 @@ void FileDataLoader::load_single_weight_tensor_b16(FFModel *ff, // Copy the weight data from the buffer to the weight's ParallelTensor ParallelTensor weight_pt; ff->get_parallel_tensor_from_tensor(weight, weight_pt); - weight_pt->set_tensor<__nv_bfloat16>(ff, dims_vec, data); + weight_pt->set_tensor<__ff_bfloat16>(ff, dims_vec, data); // Free buffer memory delete data; diff --git a/src/ops/arg_topk.cpp 
b/src/ops/arg_topk.cpp index f431d3d4bf..17cc7534ad 100644 --- a/src/ops/arg_topk.cpp +++ b/src/ops/arg_topk.cpp @@ -515,6 +515,18 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, m->sorted, m->speculative_decoding ? bc : nullptr, stream); + } else if (input.data_type == DT_B16) { + ArgTopK::forward_kernel(m, + input.get_bfloat16_ptr(), + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, + indices.get_int32_ptr(), + batch_size, + length, + k, + m->sorted, + m->speculative_decoding ? bc : nullptr, + stream); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 8a1cf0b3b0..7437b8b969 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -466,6 +466,16 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, length, batch_size, stream); + } else if (input.data_type == DT_B16) { + ArgMax::forward_kernel(m, + input.get_bfloat16_ptr(), + indices.get_int32_ptr(), + m->probs, + m->beam_search ? parent.get_int32_ptr() + : nullptr, + length, + batch_size, + stream); } else { assert(false && "Unsupported data type"); } @@ -491,7 +501,7 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, : OpMeta(handler, op) { DataType data_type = op->data_type; size_t prob_size = batch_size; - assert(data_type == DT_FLOAT || data_type == DT_HALF); + assert(data_type == DT_FLOAT || data_type == DT_HALF || data_type == DT_B16); size_t total_size = prob_size * sizeof(float); gpu_mem_allocator.create_legion_instance(reserveInst, total_size); probs = gpu_mem_allocator.allocate_instance(prob_size); diff --git a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index e20200420f..06a4d3c36a 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -314,6 +314,12 @@ template void int64_t *output_ptr, size_t num_elements); +template void ElementUnary::forward_kernel_wrapper( + ElementUnaryMeta const *m, + hip_bfloat16 const *input_ptr, + hip_bfloat16 *output_ptr, + size_t num_elements); + template void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, float const *input_ptr, diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 3282bc57d9..af98f153e2 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -428,6 +428,11 @@ __host__ void FusedOp::forward_task(Task const *task, m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); + } else if (m->input_type == DT_B16) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_bfloat16_ptr(), + my_output_accessor[0].get_bfloat16_ptr()); } break; } @@ -815,6 +820,12 @@ __host__ void my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr(), my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_B16) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_bfloat16_ptr(), + my_output_accessor[0].get_bfloat16_ptr(), + my_input_accessor[0].domain.get_volume()); } else { assert(false && "Unsupported data type in ElementUnary forward"); } @@ -1039,6 +1050,11 @@ __host__ void m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); + } else if (m->input_type == DT_B16) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_bfloat16_ptr(), + my_output_accessor[0].get_bfloat16_ptr()); } break; } diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index d60386f927..4d63ba628d 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp 
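The cublasAlphaBetaType trait added to include/flexflow/utils/cuda_helper.h above exists because cublasGemmEx expects alpha and beta in the compute type: half GEMMs run with CUBLAS_COMPUTE_16F and half scalars, while the float and bfloat16 paths accumulate in 32-bit and take float scalars. The sketch below restates that pairing in a self-contained form; the trait is redefined locally and the matrix arguments are placeholders, so it illustrates the call-site pattern rather than reproducing the patch's code.

#include <cublas_v2.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <type_traits>

// Local restatement of the trait: float is the default scalar type, which also
// covers __nv_bfloat16; only half GEMMs use half alpha/beta.
template <typename DT>
struct AlphaBetaType { using type = float; };
template <>
struct AlphaBetaType<half> { using type = half; };

template <typename DT>
void gemm_sketch(cublasHandle_t blas, DT const *A, DT const *B, DT *C,
                 int m, int n, int k, cudaDataType_t dt) {
  // The patch picks CUBLAS_COMPUTE_32F_FAST_16F for float and CUBLAS_COMPUTE_32F
  // for bfloat16; plain 32F compute is used here to keep the sketch short.
  cublasComputeType_t compute =
      std::is_same<DT, half>::value ? CUBLAS_COMPUTE_16F : CUBLAS_COMPUTE_32F;
  typename AlphaBetaType<DT>::type alpha = 1.0f, beta = 0.0f;
  cublasGemmEx(blas, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
               &alpha, A, dt, m, B, dt, k, &beta, C, dt, m,
               compute, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}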
@@ -802,6 +802,23 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); + } else if (input.data_type == DT_B16) { + if (m->offload) { + pre_build_weight_kernel(m, weight, input.data_type, stream); + } + hip_bfloat16 const *bias_ptr = + use_bias ? bias.get_bfloat16_ptr() + : static_cast(nullptr); + Kernels::IncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_bfloat16_ptr(), + m->offload ? static_cast(m->weight_ptr) + : weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } @@ -1098,4 +1115,11 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( DataType data_type, hipStream_t stream); +template void + Kernels::IncMultiHeadAttention::pre_build_weight_kernel( + IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream); + }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 37ee080eec..af77ac9f6d 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -513,7 +513,8 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, // Step 1: Compute QKV projections { - float alpha = 1.0f, beta = 0.0f; + typename cublasAlphaBetaType
<DT>::type alpha = 1.0; + typename cublasAlphaBetaType<DT>
::type beta = 0.0; // after transpositions int m_q = m->qProjSize * m->num_q_heads; int m_k = m->kProjSize * m->num_q_heads; @@ -645,7 +646,8 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, #endif // Project to output, save result directly on output tensor { - float alpha = 1.0f, beta = 0.0f; + typename cublasAlphaBetaType
<DT>::type alpha = 1.0; + typename cublasAlphaBetaType<DT>
::type beta = 0.0; // after transpositions int m_ = m->oProjSize; int k = m->vProjSize * m->num_q_heads; @@ -925,7 +927,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - }else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_B16) { compute_type = CUBLAS_COMPUTE_32F; } #endif @@ -951,7 +953,8 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // Step 1: compute query-key product QK.T/sqrt(d_k) { // Scale by sqrt(d_k) as per the original attention paper - float alpha = 1.0f, beta = 0.0f; + typename cublasAlphaBetaType
<DT>::type alpha = 1.0; + typename cublasAlphaBetaType<DT>
::type beta = 0.0; if (*m->qk_prod_scaling) { alpha = static_cast<DT>
(1.0f / sqrt(m->kProjSize)); } @@ -1082,7 +1085,8 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ // softmax(QK.T/sqrt(d_k)).T { - float alpha = 1.0f, beta = 0.0f; + typename cublasAlphaBetaType
<DT>::type alpha = 1.0; + typename cublasAlphaBetaType<DT>
::type beta = 0.0; // after transpositions int m_ = m->vProjSize; int n = num_new_tokens; diff --git a/src/ops/kernels/decompress_kernels.cpp b/src/ops/kernels/decompress_kernels.cpp index 22bf93d449..5127e0842e 100644 --- a/src/ops/kernels/decompress_kernels.cpp +++ b/src/ops/kernels/decompress_kernels.cpp @@ -54,10 +54,20 @@ template __global__ void decompress_int4_general_weights( char const *input_weight_ptr, float *weight_ptr, int in_dim, int valueSize); template __global__ void decompress_int4_general_weights( char const *input_weight_ptr, half *weight_ptr, int in_dim, int valueSize); +template __global__ void + decompress_int4_general_weights(char const *input_weight_ptr, + hip_bfloat16 *weight_ptr, + int in_dim, + int valueSize); template __global__ void decompress_int8_general_weights( char const *input_weight_ptr, float *weight_ptr, int in_dim, int valueSize); template __global__ void decompress_int8_general_weights( char const *input_weight_ptr, half *weight_ptr, int in_dim, int valueSize); +template __global__ void + decompress_int8_general_weights(char const *input_weight_ptr, + hip_bfloat16 *weight_ptr, + int in_dim, + int valueSize); template __global__ void decompress_int4_attention_weights(char *input_weight_ptr, float *weight_ptr, @@ -71,7 +81,12 @@ template __global__ void int qProjSize, int qSize, int num_heads); - +template __global__ void + decompress_int4_attention_weights(char *input_weight_ptr, + hip_bfloat16 *weight_ptr, + int qProjSize, + int qSize, + int num_heads); template __global__ void decompress_int8_attention_weights(char *input_weight_ptr, float *weight_ptr, @@ -86,5 +101,11 @@ template __global__ void int qSize, int num_heads); +template __global__ void + decompress_int8_attention_weights(char *input_weight_ptr, + hip_bfloat16 *weight_ptr, + int qProjSize, + int qSize, + int num_heads); } // namespace Kernels }; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/kernels/element_binary_kernels.cpp b/src/ops/kernels/element_binary_kernels.cpp index a65372de85..05467182e7 100644 --- a/src/ops/kernels/element_binary_kernels.cpp +++ b/src/ops/kernels/element_binary_kernels.cpp @@ -82,8 +82,24 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, } // print_tensor(in1_ptr, in1_domain.get_volume(), "input1:"); // print_tensor(in2_ptr, in2_domain.get_volume(), "input2:"); - Internal::forward_kernel( - m, in1.get_float_ptr(), in2.get_float_ptr(), out.get_float_ptr(), stream); + if (out.data_type == DT_HALF) { + Internal::forward_kernel( + m, in1.get_half_ptr(), in2.get_half_ptr(), out.get_half_ptr(), stream); + } else if (out.data_type == DT_FLOAT) { + Internal::forward_kernel(m, + in1.get_float_ptr(), + in2.get_float_ptr(), + out.get_float_ptr(), + stream); + } else if (out.data_type == DT_B16) { + Internal::forward_kernel(m, + in1.get_bfloat16_ptr(), + in2.get_bfloat16_ptr(), + out.get_bfloat16_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } // print_tensor(out_ptr, in1_domain.get_volume(), "output:"); if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); diff --git a/src/ops/kernels/embedding_kernels.cpp b/src/ops/kernels/embedding_kernels.cpp index ee4a6fcea1..7384f4e7e6 100644 --- a/src/ops/kernels/embedding_kernels.cpp +++ b/src/ops/kernels/embedding_kernels.cpp @@ -60,7 +60,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (weight.data_type == DT_HALF) { + } else if (weight.data_type == DT_DOUBLE) { 
Internal::forward_kernel(input.get_int32_ptr(), output.get_double_ptr(), weight.get_double_ptr(), @@ -70,6 +70,16 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (weight.data_type == DT_B16) { + Internal::forward_kernel(input.get_int32_ptr(), + output.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -104,6 +114,16 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (weight.data_type == DT_B16) { + Internal::forward_kernel(input.get_int64_ptr(), + output.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -162,6 +182,16 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (m->output_type[0] == DT_B16) { + Internal::backward_kernel(input.get_int32_ptr(), + output.get_bfloat16_ptr(), + weight_grad.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -196,6 +226,16 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (m->output_type[0] == DT_B16) { + Internal::backward_kernel(input.get_int64_ptr(), + output.get_bfloat16_ptr(), + weight_grad.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -332,6 +372,50 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } +template <> +__global__ void + embed_backward_no_aggr(int const *input, + hip_bfloat16 const *output, + hip_bfloat16 *embed, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + int wordIdx = input[idx]; +#if __CUDA_ARCH__ >= 700 + atomicAdd(embed + wordIdx * out_dim + off, output[i]); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += output[i]; +#endif + } +} + +template <> +__global__ void + embed_backward_no_aggr(int64_t const *input, + hip_bfloat16 const *output, + hip_bfloat16 *embed, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + int64_t wordIdx = input[idx]; +#if __CUDA_ARCH__ >= 700 + atomicAdd(embed + wordIdx * out_dim + off, output[i]); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += output[i]; +#endif + } +} + template __global__ void embed_backward_with_aggr(TI const *input, TD const *output, @@ -426,6 +510,74 @@ __global__ void embed_backward_with_aggr(int64_t const *input, } } +template <> +__global__ void + embed_backward_with_aggr(int const *input, + hip_bfloat16 const *output, + hip_bfloat16 *embed, + int out_dim, + int in_dim, + int batch_size, + AggrMode aggr) { + hip_bfloat16 scale = 1.0f / in_dim; + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + hip_bfloat16 gradient; + 
if (aggr == AGGR_MODE_SUM) { + gradient = output[i]; + } else { + assert(aggr == AGGR_MODE_AVG); + gradient = output[i] * scale; + } + for (int j = 0; j < in_dim; j++) { + int wordIdx = input[idx * in_dim + j]; +#if __CUDA_ARCH__ >= 700 + atomicAdd(embed + wordIdx * out_dim + off, gradient); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += gradient; +#endif + } + } +} + +template <> +__global__ void + embed_backward_with_aggr(int64_t const *input, + hip_bfloat16 const *output, + hip_bfloat16 *embed, + int out_dim, + int in_dim, + int batch_size, + AggrMode aggr) { + hip_bfloat16 scale = 1.0f / in_dim; + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + hip_bfloat16 gradient; + if (aggr == AGGR_MODE_SUM) { + gradient = output[i]; + } else { + assert(aggr == AGGR_MODE_AVG); + gradient = output[i] * scale; + } + for (int j = 0; j < in_dim; j++) { + int64_t wordIdx = input[idx * in_dim + j]; +#if __CUDA_ARCH__ >= 700 + atomicAdd(embed + wordIdx * out_dim + off, gradient); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += gradient; +#endif + } + } +} + /*static*/ template void forward_kernel(TI const *input_ptr, diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 072eb5e96b..3cd9862bba 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -124,6 +124,16 @@ void forward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); + } else if (m->input_type[0] == DT_B16) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); } if (m->profiling) { @@ -189,6 +199,19 @@ void backward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); + } else if (m->input_type[0] == DT_B16) { + Internal::backward_kernel(m, + input_ptr, + input_grad_ptr, + output_ptr, + output_grad_ptr, + kernel_ptr, + kernel_grad_ptr, + bias_grad_ptr, + in_dim, + out_dim, + batch_size, + stream); } if (m->profiling) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index db75a5bf32..ad416b433c 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -333,7 +333,8 @@ void forward_kernel(LinearMeta const *m, } checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - float alpha = 1.0f, beta = 0.0f; + typename cublasAlphaBetaType
<DT>::type alpha = 1.0; + typename cublasAlphaBetaType<DT>
::type beta = 0.0; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = m->offload ? ff_to_cuda_datatype(m->weight_ptr_type) diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 24ab7051e6..c81e73dd77 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -190,6 +190,12 @@ void forward_kernel_wrapper(RMSNormMeta const *m, weight.get_float_ptr(), output.get_float_ptr(), stream); + } else if (output.data_type == DT_B16) { + forward_kernel(m, + input.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + stream); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 67a0b156b5..19661724d2 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -137,7 +137,7 @@ __global__ void template __global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { using T_ACC = T; - int64_t const i = blockIdx.x; + const int64_t i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; Y[index] = static_cast(X[index]) * static_cast(rstd[i]); diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index 89c9f14a01..e785d404d7 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -107,7 +107,10 @@ template void forward_kernel_wrapper(SoftmaxMeta const *m, template void forward_kernel_wrapper(SoftmaxMeta const *m, half const *input_ptr, half *output_ptr); - +template void + forward_kernel_wrapper(SoftmaxMeta const *m, + hip_bfloat16 const *input_ptr, + hip_bfloat16 *output_ptr); template void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, @@ -116,7 +119,11 @@ template void backward_kernel_wrapper(SoftmaxMeta const *m, half *input_grad_ptr, half const *output_grad_ptr, size_t num_elements); - +template void + backward_kernel_wrapper(SoftmaxMeta const *m, + hip_bfloat16 *input_grad_ptr, + hip_bfloat16 const *output_grad_ptr, + size_t num_elements); namespace Internal { template void forward_kernel(SoftmaxMeta const *m, diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 07dbdb3dfb..ff420901e4 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -182,6 +182,15 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, gamma.get_half_ptr(), m->use_bias ? beta.get_half_ptr() : nullptr, stream); + } else if (m->input_type[0] == DT_B16) { + LayerNorm::forward_kernel( + m, + input.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + m->elementwise_affine ? gamma.get_bfloat16_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_bfloat16_ptr() + : nullptr, + stream); } else { assert(false && "unsupport datatype in layernorm"); } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 8cdc392c96..cce2fb9c9e 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -157,7 +157,7 @@ Op *Linear::create_operator_from_layer( Linear::Linear(FFModel &model, Linear const &other, - ParallelTensor const input, + const ParallelTensor input, bool allocate_weights) : Linear(model, other.layer_guid, @@ -175,7 +175,7 @@ Linear::Linear(FFModel &model, Linear::Linear(FFModel &model, LinearParams const ¶ms, - ParallelTensor const input, + const ParallelTensor input, char const *name, bool allocate_weights) : Linear(model, @@ -439,10 +439,10 @@ OpMeta *Linear::init_task(Task const *task, } \ } else if (output.data_type == DT_B16) { \ if (linear->quantization_type != DT_NONE) { \ - return init_task_with_dim<__nv_bfloat16, char, DIM>( \ + return init_task_with_dim<__ff_bfloat16, char, DIM>( \ task, regions, ctx, runtime); \ } else { \ - return init_task_with_dim<__nv_bfloat16, __nv_bfloat16, DIM>( \ + return init_task_with_dim<__ff_bfloat16, __ff_bfloat16, DIM>( \ task, regions, ctx, runtime); \ } \ } else { \ @@ -714,10 +714,10 @@ void Linear::forward_task(Task const *task, } \ } else if (m->output_type[0] == DT_B16) { \ if (m->quantization_type != DT_NONE) { \ - return forward_task_with_dim<__nv_bfloat16, char, DIM>( \ + return forward_task_with_dim<__ff_bfloat16, char, DIM>( \ task, regions, ctx, runtime); \ } else { \ - return forward_task_with_dim<__nv_bfloat16, __nv_bfloat16, DIM>( \ + return forward_task_with_dim<__ff_bfloat16, __ff_bfloat16, DIM>( \ task, regions, ctx, runtime); \ } \ } else { \ @@ -876,7 +876,7 @@ void Linear::backward_task(Task const *task, } else if (m->output_type[0] == DT_FLOAT) { \ return backward_task_with_dim(task, regions, ctx, runtime); \ } else if (m->output_type[0] == DT_B16) { \ - return backward_task_with_dim<__nv_bfloat16, DIM>( \ + return backward_task_with_dim<__ff_bfloat16, DIM>( \ task, regions, ctx, runtime); \ } else { \ assert(false && "Unsupported data type"); \ @@ -1360,7 +1360,7 @@ bool LinearParams::is_valid(ParallelTensorShape const &input_shape) const { * It takes a the input tensor as a parameter, instead of the input's * ParallelTensorShape. */ -void LinearParams::solve_dims(ParallelTensor const input, +void LinearParams::solve_dims(const ParallelTensor input, ParallelDim output_dims[MAX_TENSOR_DIM], int *output_ndims, ParallelDim kernel_dims[MAX_TENSOR_DIM], diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index f1b7a537b0..09823adf27 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -230,6 +230,18 @@ void ResidualLayerNorm::inference_kernel_wrapper( m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, stream); + } else if (m->input_type[0] == DT_B16) { + ResidualLayerNorm::inference_kernel( + m, + input.get_bfloat16_ptr(), + residual1.get_bfloat16_ptr(), + m->use_two_residuals ? residual2.get_bfloat16_ptr() : nullptr, + added_output.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + m->elementwise_affine ? gamma.get_bfloat16_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_bfloat16_ptr() + : nullptr, + stream); } else { assert(false && "unsupport datatype in layernorm"); } diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index cbb7e241f3..fc0bd5d5a2 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -234,9 +234,10 @@ void ResidualLayerNorm::inference_kernel_wrapper( added_output.get_bfloat16_ptr(), output.get_bfloat16_ptr(), m->elementwise_affine ? gamma.get_bfloat16_ptr() : nullptr, - (m->elementwise_affine && m->use_bias) ? beta.get_bfloat16_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_bfloat16_ptr() + : nullptr, stream); - }else { + } else { assert(false && "unsupport datatype in layernorm"); } diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index 7b7f30a288..b446ed0065 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -101,6 +101,14 @@ void SigmoidSiluMulti::inference_kernel_wrapper( input1.get_half_ptr(), input2.get_half_ptr(), output.get_half_ptr()); + } else if (m->input_type[0] == DT_B16) { + SigmoidSiluMultiKernel<<>>(input1.domain.get_volume(), + input1.get_bfloat16_ptr(), + input2.get_bfloat16_ptr(), + output.get_bfloat16_ptr()); } else { assert(false && "unsupport datatype in SigmoidSiluMulti"); } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 101eff9417..87598de634 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -52,7 +52,7 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(Tensor const _input, +Tensor FFModel::softmax(const Tensor _input, int dim, DataType data_type, char const *name) { @@ -93,7 +93,7 @@ Op *Softmax::create_operator_from_layer( } Softmax::Softmax(FFModel &model, - ParallelTensor const _input, + const ParallelTensor _input, int _dim, char const *name) : Op(model, @@ -117,7 +117,7 @@ Softmax::Softmax(FFModel &model, Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, - ParallelTensor const input, + const ParallelTensor input, char const *name) : Softmax(model, input, params.dim, name) {} @@ -370,7 +370,7 @@ void Softmax::backward_task(Task const *task, } else if (m->output_type == DT_FLOAT) { \ return backward_task_with_dim(task, regions, ctx, runtime); \ } else if (m->output_type == DT_B16) { \ - return backward_task_with_dim<__nv_bfloat16, DIM>( \ + return backward_task_with_dim<__ff_bfloat16, DIM>( \ task, regions, ctx, runtime); \ } else { \ assert(false && "Unsupported data type"); \ diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index b1687d12a2..a7b8d4535c 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -562,6 +562,19 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); + } else if (input.data_type == DT_B16) { + float const *bias_ptr = use_bias + ? 
bias.get_bfloat16_ptr() + : static_cast(nullptr); + Kernels::SpecIncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index cc1a08ab9b..885a09c980 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -480,7 +480,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - }else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_B16) { compute_type = CUBLAS_COMPUTE_32F; } #endif @@ -535,7 +535,8 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, int strideC = num_new_tokens * total_tokens; // a flag of using this scaling alpha - float alpha = 1.0f, beta = 0.0f; + typename cublasAlphaBetaType
<DT>::type alpha = 1.0; + typename cublasAlphaBetaType<DT>
::type beta = 0.0; if (*m->qk_prod_scaling) { alpha = static_cast
(1.0f / sqrt(m->kProjSize)); } @@ -791,9 +792,10 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - }else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_B16) { __nv_bfloat16 const *bias_ptr = - use_bias ? bias.get_bfloat16_ptr() : static_cast<__nv_bfloat16 const *>(nullptr); + use_bias ? bias.get_bfloat16_ptr() + : static_cast<__nv_bfloat16 const *>(nullptr); Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( m, bc, diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 26291fb3b4..4f399aafc5 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -570,6 +570,24 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); + } else if (input.data_type == DT_B16) { + if (m->offload) { + pre_build_weight_kernel(m, weight, input.data_type, stream); + } + + hip_bfloat16 const *bias_ptr = + use_bias ? bias.get_bfloat16_ptr() + : static_cast(nullptr); + Kernels::TreeIncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_bfloat16_ptr(), + m->offload ? static_cast(m->weight_ptr) + : weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index ffe57006fc..8bd963b3d3 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -72,7 +72,7 @@ Combine::Combine(FFModel &model, name) {} Combine::Combine(FFModel &model, - ParallelTensor const _input, + const ParallelTensor _input, int _combine_legion_dim, int _combine_degree, char const *name) @@ -366,7 +366,7 @@ void Combine::forward_task(Task const *task, if (data_type == DT_HALF) { forward_task_with_type(task, regions, ctx, runtime); } else if (data_type == DT_B16) { - forward_task_with_type<__nv_bfloat16>(task, regions, ctx, runtime); + forward_task_with_type<__ff_bfloat16>(task, regions, ctx, runtime); } else if (data_type == DT_FLOAT) { forward_task_with_type(task, regions, ctx, runtime); } else if (data_type == DT_DOUBLE) { diff --git a/src/parallel_ops/kernels/combine_kernels.cpp b/src/parallel_ops/kernels/combine_kernels.cpp index d6e9568223..b593a92a7d 100644 --- a/src/parallel_ops/kernels/combine_kernels.cpp +++ b/src/parallel_ops/kernels/combine_kernels.cpp @@ -57,6 +57,9 @@ template void forward_kernel(half const *input_ptr, template void forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements); +template void forward_kernel(hip_bfloat16 const *input_ptr, + hip_bfloat16 *output_ptr, + size_t num_elements); template void forward_kernel(double const *input_ptr, double *output_ptr, size_t num_elements); diff --git a/src/parallel_ops/kernels/reduction_kernels.cpp b/src/parallel_ops/kernels/reduction_kernels.cpp index 2a3fe5cca1..ade5d9b402 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cpp +++ b/src/parallel_ops/kernels/reduction_kernels.cpp @@ -78,6 +78,12 @@ template __global__ void reduction_forward_kernel(half const *input_ptr, half *output_ptr, size_t num_elements, size_t num_replicas); +template __global__ void + reduction_forward_kernel(hip_bfloat16 const *input_ptr, + hip_bfloat16 *output_ptr, + size_t num_elements, + size_t num_replicas); + template void forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements, @@ -86,6 +92,10 @@ template void 
forward_kernel(half const *input_ptr, half *output_ptr, size_t num_elements, size_t num_replicas); +template void forward_kernel(hip_bfloat16 const *input_ptr, + hip_bfloat16 *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(float const *output_grad_ptr, float *input_grad_ptr, size_t num_elements); diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index 1647f014be..7d6b2fc63a 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -73,6 +73,9 @@ template void forward_kernel(float const *input_ptr, template void forward_kernel(half const *input_ptr, half *output_ptr, size_t num_elements); +template void forward_kernel(hip_bfloat16 const *input_ptr, + hip_bfloat16 *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(float const *input_ptr, float *output_ptr, diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index d53cae330b..bd7cb4085b 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -60,7 +60,7 @@ ReductionParams Reduction::get_params() const { } Reduction::Reduction(FFModel &model, - ParallelTensor const _input, + const ParallelTensor _input, int _reduction_legion_dim, int _reduction_degree, char const *name) @@ -381,7 +381,7 @@ void Reduction::forward_task(Task const *task, num_elements, num_replicas); } else if (input.data_type == DT_B16) { - forward_kernel<__nv_bfloat16>(input.get_bfloat16_ptr(), + forward_kernel<__ff_bfloat16>(input.get_bfloat16_ptr(), output.get_bfloat16_ptr(), num_elements, num_replicas); diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index d9f7b2263b..18e4d26f26 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -59,7 +59,7 @@ ReplicateParams Replicate::get_params() const { } Replicate::Replicate(FFModel &model, - ParallelTensor const _input, + const ParallelTensor _input, int _replicate_legion_dim, int _replicate_degree, char const *name) @@ -374,7 +374,7 @@ void Replicate::forward_task(Task const *task, output.get_float_ptr(), input_domain.get_volume()); } else if (input.data_type == DT_B16) { - forward_kernel<__nv_bfloat16>(input.get_bfloat16_ptr(), + forward_kernel<__ff_bfloat16>(input.get_bfloat16_ptr(), output.get_bfloat16_ptr(), input_domain.get_volume()); } else { diff --git a/src/runtime/accessor.cc b/src/runtime/accessor.cc index 9bb6c08eea..30aef93df9 100644 --- a/src/runtime/accessor.cc +++ b/src/runtime/accessor.cc @@ -77,12 +77,12 @@ half const *GenericTensorAccessorR::get_half_ptr() const { } } -__nv_bfloat16 const *GenericTensorAccessorR::get_bfloat16_ptr() const { +__ff_bfloat16 const *GenericTensorAccessorR::get_bfloat16_ptr() const { if (data_type == DT_B16) { - return static_cast<__nv_bfloat16 const *>(ptr); + return static_cast<__ff_bfloat16 const *>(ptr); } else { assert(false && "Invalid Accessor Type"); - return static_cast<__nv_bfloat16 const *>(nullptr); + return static_cast<__ff_bfloat16 const *>(nullptr); } } @@ -174,12 +174,12 @@ half *GenericTensorAccessorW::get_half_ptr() const { } } -__nv_bfloat16 *GenericTensorAccessorW::get_bfloat16_ptr() const { +__ff_bfloat16 *GenericTensorAccessorW::get_bfloat16_ptr() const { if (data_type == DT_B16) { - return static_cast<__nv_bfloat16 *>(ptr); + return static_cast<__ff_bfloat16 *>(ptr); } else { assert(false && "Invalid Accessor Type"); - return static_cast<__nv_bfloat16 *>(nullptr); + 
return static_cast<__ff_bfloat16 *>(nullptr); } } @@ -290,7 +290,7 @@ GenericTensorAccessorR break; } case DT_B16: { - ptr = helperGetTensorPointerRO<__nv_bfloat16>( + ptr = helperGetTensorPointerRO<__ff_bfloat16>( region, req, fid, ctx, runtime); break; } @@ -341,7 +341,7 @@ GenericTensorAccessorW break; } case DT_B16: { - ptr = helperGetTensorPointerWO<__nv_bfloat16>( + ptr = helperGetTensorPointerWO<__ff_bfloat16>( region, req, fid, ctx, runtime); break; } @@ -392,7 +392,7 @@ GenericTensorAccessorW break; } case DT_B16: { - ptr = helperGetTensorPointerRW<__nv_bfloat16>( + ptr = helperGetTensorPointerRW<__ff_bfloat16>( region, req, fid, ctx, runtime); break; } diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 764c938dc3..232d711d82 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -214,7 +214,7 @@ size_t data_type_size(DataType type) { case DT_HALF: return sizeof(half); case DT_B16: - return sizeof(__nv_bfloat16); + return sizeof(__ff_bfloat16); case DT_FLOAT: return sizeof(float); case DT_DOUBLE: diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index fb94135c8f..00a6730de0 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -299,6 +299,33 @@ __host__ void checkCUDA(hipHostFree(host_ptr)); } +template <> +__host__ void save_tensor(hip_bfloat16 const *ptr, + size_t num_elements, + char const *file_name) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hip_bfloat16 *host_ptr; + checkCUDA(hipHostMalloc(&host_ptr, + sizeof(hip_bfloat16) * num_elements, + hipHostMallocPortable | hipHostMallocMapped)); + checkCUDA(hipMemcpyAsync(host_ptr, + ptr, + sizeof(hip_bfloat16) * num_elements, + hipMemcpyDeviceToHost, + stream)); + checkCUDA(hipDeviceSynchronize()); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + assert(tensor_file != NULL); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%.9f, ", (float)host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(hipHostFree(host_ptr)); +} + template <> __host__ void save_tensor(int32_t const *ptr, size_t num_elements, @@ -489,6 +516,8 @@ miopenDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { case DT_HALF: return miopenHalf; + case DT_B16: + return miopenBFloat16; case DT_FLOAT: return miopenFloat; case DT_DOUBLE: @@ -510,6 +539,10 @@ hipblasDatatype_t ff_to_cuda_datatype(DataType type) { return HIPBLAS_R_64F; case DT_INT32: return HIPBLAS_R_32I; + case DT_B16: + return HIPBLAS_R_16B; + case DT_HALF: + return HIPBLAS_R_16F; default: assert(false && "Unspoorted cuda data type"); } @@ -520,6 +553,8 @@ ncclDataType_t ff_to_nccl_datatype(DataType type) { switch (type) { case DT_HALF: return ncclHalf; + case DT_B16: + return ncclBfloat16; case DT_FLOAT: return ncclFloat; case DT_DOUBLE: @@ -540,6 +575,9 @@ void handle_unimplemented_hip_kernel(OperatorType op_type) { template __global__ void assign_kernel(half *ptr, coord_t size, half value); +template __global__ void assign_kernel(hip_bfloat16 *ptr, + coord_t size, + hip_bfloat16 value); template __global__ void assign_kernel(float *ptr, coord_t size, float value); template __global__ void @@ -609,7 +647,9 @@ template __host__ void save_tensor(int64_t const *ptr, char const *file_name); template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); - +template __host__ void save_tensor(hip_bfloat16 const *ptr, + size_t rect, + char const *file_name); template __host__ float *download_tensor(float const *ptr, size_t 
num_elements); template __host__ half *download_tensor(half const *ptr, diff --git a/src/runtime/initializer_kernel.cpp b/src/runtime/initializer_kernel.cpp index 1005d93cec..2e55591729 100644 --- a/src/runtime/initializer_kernel.cpp +++ b/src/runtime/initializer_kernel.cpp @@ -259,6 +259,17 @@ void ZeroInitializer::init_task(Task const *task, w, domain.get_volume(), 0.0f); + } else if (meta->data_types[i] == DT_B16) { + hip_bfloat16 *w = helperGetTensorPointerWO( + regions[i], task->regions[i], FID_DATA, ctx, runtime); + hipLaunchKernelGGL(HIP_KERNEL_NAME(assign_kernel), + GET_BLOCKS(domain.get_volume()), + CUDA_NUM_THREADS, + 0, + stream, + w, + domain.get_volume(), + 0.0f); } else if (meta->data_types[i] == DT_INT32) { int32_t *w = helperGetTensorPointerWO( regions[i], task->regions[i], FID_DATA, ctx, runtime); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index f31be1bcdb..f184a41934 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1948,7 +1948,7 @@ void FFModel::map_tensor_with_dim2(ParallelTensor tensor, allocator.allocate_field(sizeof(half), FID_DATA); break; case DT_B16: - allocator.allocate_field(sizeof(__nv_bfloat16), FID_DATA); + allocator.allocate_field(sizeof(__ff_bfloat16), FID_DATA); break; case DT_FLOAT: allocator.allocate_field(sizeof(float), FID_DATA); diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index e329593970..d3cc8a0cb1 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -847,10 +847,10 @@ template bool ParallelTensorBase::get_tensor(FFModel const *ff, half *data, bool get_gradients); -template bool ParallelTensorBase::set_tensor<__nv_bfloat16>( - FFModel const *ff, std::vector const &dims, __nv_bfloat16 const *data); -template bool ParallelTensorBase::get_tensor<__nv_bfloat16>(FFModel const *ff, - __nv_bfloat16 *data, +template bool ParallelTensorBase::set_tensor<__ff_bfloat16>( + FFModel const *ff, std::vector const &dims, __ff_bfloat16 const *data); +template bool ParallelTensorBase::get_tensor<__ff_bfloat16>(FFModel const *ff, + __ff_bfloat16 *data, bool get_gradients); template bool ParallelTensorBase::set_tensor(FFModel const *ff, From bc150e360cc8de0feb7d45208bd47e306bc54b30 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 16 Jan 2024 11:04:40 -0500 Subject: [PATCH 06/11] . 
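Refactor the weight loader: load_from_file becomes a free template function in
file_loader.cc (the FileDataLoader member declaration is commented out in
file_loader.h), incr_decoding.cc gets formatting cleanups, and
LLAMA::create_llama_model takes a bool use_full_precision parameter instead of
a DataType.

Below is a minimal usage sketch of the free load_from_file, as it could be
called from inside file_loader.cc; it is illustrative only (the element count
and file path are placeholders) and assumes the binary file stores exactly
`size` values of the requested element type:

    #include <string>
    #include <vector>

    // Illustrative values; real sizes and paths come from the model config.
    size_t num_elements = 4096;
    std::string path = "/path/to/weights/tok_embeddings.weight";
    std::vector<float> host_buf(num_elements);
    // Reads num_elements floats from the binary file into the host buffer.
    load_from_file<float>(host_buf.data(), num_elements, path);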
--- inference/file_loader.cc | 4 +--- inference/file_loader.h | 10 +++++----- inference/incr_decoding/incr_decoding.cc | 14 ++++---------- inference/models/llama.cc | 6 +++--- 4 files changed, 13 insertions(+), 21 deletions(-) diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 60b2c80b06..257ebfc52b 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -668,9 +668,7 @@ void load_attention_weights_v2(DT *ptr, } template -void FileDataLoader::load_from_file(DT *ptr, - size_t size, - std::string filepath) { +void load_from_file(DT *ptr, size_t size, std::string filepath) { std::ifstream in(filepath, std::ios::in | std::ios::binary); if (!in.good()) { std::cout << "Could not open file: " << filepath << std::endl; diff --git a/inference/file_loader.h b/inference/file_loader.h index 579c6d4239..5da2d02a60 100644 --- a/inference/file_loader.h +++ b/inference/file_loader.h @@ -50,12 +50,12 @@ class FileDataLoader { ParallelTensor position_pt, int max_seq_length, int offset); - template - void load_from_file(DT *ptr, size_t size, std::string filepath); + // template + // void load_from_file(DT *ptr, size_t size, std::string filepath); + - private : int num_heads, - num_kv_heads, - tensor_parallelism_degree; +private: + int num_heads, num_kv_heads, tensor_parallelism_degree; size_t hidden_dim, qkv_inner_dim; std::string prompts_filepath; std::string weights_folder; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index e11eb14113..06e32c4855 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -235,17 +235,11 @@ void FlexFlow::top_level_task(Task const *task, generationConfig, data_type); } else if (model_type == ModelType::OPT) { - OPT::create_opt_model(model, - config_filepath, - weights_filepath, - INC_DECODING_MODE, - data_type); + OPT::create_opt_model( + model, config_filepath, weights_filepath, INC_DECODING_MODE, data_type); } else if (model_type == ModelType::FALCON) { - FALCON::create_falcon_model(model, - config_filepath, - weights_filepath, - INC_DECODING_MODE, - data_type); + FALCON::create_falcon_model( + model, config_filepath, weights_filepath, INC_DECODING_MODE, data_type); } else if (model_type == ModelType::STARCODER) { STARCODER::create_starcoder_model(model, config_filepath, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 40075bcdbf..e54d6d8811 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -25,7 +25,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string const &weight_file_path, InferenceMode mode, GenerationConfig generation_config, - DataType data_type) { + bool use_full_precision) { // do not apply cpu offload in beam search model. LLAMAConfig llama_config(model_config_file_path); llama_config.print(); @@ -54,7 +54,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.vocab_size, llama_config.hidden_size, AGGR_MODE_NONE, - data_type, + use_full_precision ? 
DT_FLOAT : DT_HALF, NULL, embed_init, "tok_embeddings"); @@ -276,7 +276,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, false); + fileloader.load_weights(&ff, use_full_precision); std::cout << "------load weight finished----------" << std::endl; // init operators From b56ce9053db84e2f4eed9755c2b4251a5fe135cc Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 16 Jan 2024 11:11:38 -0500 Subject: [PATCH 07/11] . --- inference/models/starcoder.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 5714ca1369..e683376e47 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -26,7 +26,7 @@ void STARCODER::create_starcoder_model( std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - DataType data_type) { + bool use_full_precision) { // do not apply cpu offload in beam search model. STARCODERConfig startcoder_config(model_config_file_path); startcoder_config.print(); @@ -62,7 +62,7 @@ void STARCODER::create_starcoder_model( startcoder_config.vocab_size, startcoder_config.hidden_size, AGGR_MODE_NONE, - data_type, + use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, "transformer_wte"); @@ -72,7 +72,7 @@ void STARCODER::create_starcoder_model( startcoder_config.max_position_embeddings, startcoder_config.hidden_size, AGGR_MODE_NONE, - data_type, + use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, "transformer_wpe"); @@ -232,7 +232,7 @@ void STARCODER::create_starcoder_model( startcoder_config.hidden_size / startcoder_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, false); + fileloader.load_weights(&ff, use_full_precision); std::cout << "------load weight finished----------" << std::endl; // init operators From d664239a6eae4fc083bb0e34cac9f6b4d368edcd Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 17 Jan 2024 10:18:32 -0500 Subject: [PATCH 08/11] fix --- include/flexflow/ffconst.h | 2 +- include/flexflow/utils/file_loader.h | 9 --------- inference/incr_decoding/incr_decoding.cc | 4 +++- inference/models/falcon.cc | 2 +- inference/models/falcon.h | 2 +- inference/models/llama.cc | 6 +++--- inference/models/llama.h | 2 +- inference/models/mpt.cc | 2 +- inference/models/mpt.h | 2 +- inference/models/opt.cc | 2 +- inference/models/opt.h | 2 +- inference/models/starcoder.cc | 8 ++++---- inference/models/starcoder.h | 2 +- inference/spec_infer/spec_infer.cc | 2 +- inference/utils/download_hf_model.py | 4 ++-- python/flexflow/serve/serve.py | 6 +++--- python/flexflow/type.py | 2 +- src/ops/add_bias_residual_layer_norm.cu | 2 +- src/ops/arg_topk.cpp | 2 +- src/ops/arg_topk.cu | 2 +- src/ops/argmax.cpp | 4 ++-- src/ops/argmax.cu | 6 +++--- src/ops/fused.cpp | 6 +++--- src/ops/fused.cu | 6 +++--- src/ops/inc_multihead_self_attention.cpp | 2 +- src/ops/inc_multihead_self_attention.cu | 10 +++++----- src/ops/kernels/element_binary_kernels.cpp | 2 +- src/ops/kernels/element_binary_kernels.cu | 2 +- src/ops/kernels/embedding_kernels.cpp | 8 ++++---- src/ops/kernels/embedding_kernels.cu | 8 ++++---- src/ops/kernels/linear_kernels.cpp | 4 ++-- src/ops/kernels/linear_kernels.cu | 8 ++++---- src/ops/kernels/residual_rms_norm_kernels.cu | 2 +- src/ops/kernels/rms_norm_kernels.cpp | 2 +- src/ops/kernels/rms_norm_kernels.cu | 2 +- src/ops/layer_norm.cpp | 2 +- 
src/ops/layer_norm.cu | 2 +- src/ops/linear.cc | 6 +++--- src/ops/residual_layer_norm.cpp | 2 +- src/ops/residual_layer_norm.cu | 2 +- src/ops/sampling.cu | 4 ++-- src/ops/sigmoid_silu_multi.cpp | 2 +- src/ops/sigmoid_silu_multi.cu | 2 +- src/ops/softmax.cc | 6 +++--- src/ops/spec_inc_multihead_self_attention.cpp | 2 +- src/ops/spec_inc_multihead_self_attention.cu | 4 ++-- src/ops/tree_inc_multihead_self_attention.cpp | 2 +- src/ops/tree_inc_multihead_self_attention.cu | 2 +- src/parallel_ops/combine.cc | 2 +- src/parallel_ops/reduction.cc | 2 +- src/parallel_ops/replicate.cc | 2 +- src/runtime/accessor.cc | 10 +++++----- src/runtime/cuda_helper.cu | 6 +++--- src/runtime/ffconst_utils.cc | 2 +- src/runtime/file_loader.cc | 2 +- src/runtime/hip_helper.cpp | 6 +++--- src/runtime/initializer_kernel.cpp | 2 +- src/runtime/initializer_kernel.cu | 2 +- src/runtime/model.cc | 2 +- src/runtime/operator.cc | 6 +++--- 60 files changed, 106 insertions(+), 113 deletions(-) diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 9613a041f7..49d5f4d1e4 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -35,7 +35,7 @@ enum DataType { DT_DOUBLE = 45, DT_INT4 = 46, DT_INT8 = 47, - DT_B16 = 48, + DT_BF16 = 48, DT_NONE = 49, }; diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h index 262bb42e72..39b46955b5 100644 --- a/include/flexflow/utils/file_loader.h +++ b/include/flexflow/utils/file_loader.h @@ -38,18 +38,10 @@ class FileDataLoader { template void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); -<<<<<<< HEAD:inference/file_loader.h void load_single_weight_tensor_b16(FFModel *ff, Layer *l, int weight_idx); - void load_quantization_weight(FFModel *ff, - Layer *l, - int weight_idx, - bool use_full_precision); - void load_weights(FFModel *ff, bool use_full_precision); -======= void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx); void load_weights(FFModel *ff); ->>>>>>> origin/inference:include/flexflow/utils/file_loader.h void load_positions(FFModel *ff, Tensor pt, @@ -59,7 +51,6 @@ class FileDataLoader { // template // void load_from_file(DT *ptr, size_t size, std::string filepath); - private: int num_heads, num_kv_heads, tensor_parallelism_degree; size_t hidden_dim, qkv_inner_dim; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 4dd44aa5cd..96017008db 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -81,7 +81,7 @@ void parse_input_args(char **argv, } if (!strcmp(argv[i], "--use-bfloat16-precision")) { use_bfloat16_precision = true; - data_type = DT_B16; + data_type = DT_BF16; continue; } // verbose logging to stdout @@ -171,6 +171,8 @@ void FlexFlow::top_level_task(Task const *task, {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); std::string tokenizer_filepath = join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + + // bfloat16 shares same weight file with float32 std::string weights_filepath = join_path( {file_paths.cache_folder_path, "weights", diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 6ba950b984..5396b32fbd 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -248,7 +248,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.hidden_size, falcon_config.hidden_size / falcon_config.n_head, ff.config.tensor_parallelism_degree, - use_full_precision); + true); 
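  // Note: `true` replaces the use_full_precision flag here; as the comment
  // added in incr_decoding.cc in this patch explains, bfloat16 shares the
  // same weight files as float32, so the loader is always pointed at the
  // full-precision dump and presumably converts to the model's data type at
  // load time.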
InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); diff --git a/inference/models/falcon.h b/inference/models/falcon.h index 26f0a094a1..7477b3523d 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -94,7 +94,7 @@ class FALCON { std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - DataType DataType); + DataType data_type); }; }; // namespace FlexFlow diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 14b8c31fa1..fcd5c64bfe 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -25,7 +25,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string const &weight_file_path, InferenceMode mode, GenerationConfig generation_config, - bool use_full_precision) { + DataType data_type) { // do not apply cpu offload in beam search model. LLAMAConfig llama_config(model_config_file_path); llama_config.print(); @@ -55,7 +55,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.vocab_size, llama_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "tok_embeddings"); @@ -273,7 +273,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree, - use_full_precision); + true); InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); diff --git a/inference/models/llama.h b/inference/models/llama.h index 9530b25061..b0f8c6b207 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -83,7 +83,7 @@ class LLAMA { std::string const &weight_file_path, InferenceMode mode, GenerationConfig generation_config, - DataType DataType); + DataType data_type); }; }; // namespace FlexFlow diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 6b23a4ede9..c5572deb47 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -255,7 +255,7 @@ void MPT::create_mpt_model(FFModel &ff, mpt_config.hidden_size, mpt_config.hidden_size / mpt_config.n_heads, ff.config.tensor_parallelism_degree, - use_full_precision); + true); InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); diff --git a/inference/models/mpt.h b/inference/models/mpt.h index a396814f58..510eaae5b0 100644 --- a/inference/models/mpt.h +++ b/inference/models/mpt.h @@ -70,7 +70,7 @@ class MPT { std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - DataType DataType); + DataType data_type); }; }; // namespace FlexFlow diff --git a/inference/models/opt.cc b/inference/models/opt.cc index a861953a69..a185af8b38 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -263,7 +263,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.hidden_size, opt_config.hidden_size / opt_config.num_attention_heads, ff.config.tensor_parallelism_degree, - use_full_precision); + true); InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); diff --git a/inference/models/opt.h b/inference/models/opt.h index 238ae19d4a..cc1cdf576b 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -97,7 +97,7 @@ class OPT { std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - DataType DataType); + DataType data_type); }; 
}; // namespace FlexFlow diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 2327c86119..71dbfb8fbb 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -26,7 +26,7 @@ void STARCODER::create_starcoder_model( std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - bool use_full_precision) { + DataType data_type) { // do not apply cpu offload in beam search model. STARCODERConfig startcoder_config(model_config_file_path); startcoder_config.print(); @@ -63,7 +63,7 @@ void STARCODER::create_starcoder_model( startcoder_config.vocab_size, startcoder_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "transformer_wte"); @@ -73,7 +73,7 @@ void STARCODER::create_starcoder_model( startcoder_config.max_position_embeddings, startcoder_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "transformer_wpe"); @@ -230,7 +230,7 @@ void STARCODER::create_starcoder_model( startcoder_config.hidden_size, startcoder_config.hidden_size / startcoder_config.num_attention_heads, ff.config.tensor_parallelism_degree, - use_full_precision); + true); im->register_model_weights_loader(&ff, fileloader); #ifdef DEADCODE // Compile the model diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h index d35ae78435..fce7a27467 100644 --- a/inference/models/starcoder.h +++ b/inference/models/starcoder.h @@ -71,7 +71,7 @@ class STARCODER { std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - DataType DataType); + DataType data_type); }; }; // namespace FlexFlow diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index e0299527bb..d37b46d5ae 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -105,7 +105,7 @@ void parse_input_args(char **argv, } if (!strcmp(argv[i], "--use-bfloat16-precision")) { use_bfloat16_precision = true; - data_type = DT_B16; + data_type = DT_BF16; } // verbose logging to stdout if (!strcmp(argv[i], "--verbose")) { diff --git a/inference/utils/download_hf_model.py b/inference/utils/download_hf_model.py index f158d4e480..2a23527fb7 100644 --- a/inference/utils/download_hf_model.py +++ b/inference/utils/download_hf_model.py @@ -45,9 +45,9 @@ def main(args): elif args.half_precision_only: data_types = ff.DataType.DT_HALF elif args.bfloat16_precision_only: - data_types = ff.DataType.DT_B16 + data_types = ff.DataType.DT_BF16 else: - data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF, ff.DataType.DT_B16) + data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF, ff.DataType.DT_BF16) for model_name in args.model_names: for data_type in data_types: diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index b8afb285ad..3231b222b4 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -113,7 +113,7 @@ def __init__( self.config_class, ) = self.__get_ff_model_type() self.data_type = data_type - assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT or self.data_type == DataType.DT_B16 + assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT or self.data_type == DataType.DT_BF16 self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache self.output_file = output_file @@ -177,7 +177,7 @@ def 
download_hf_weights_if_needed(self): torch.set_default_tensor_type(torch.HalfTensor) elif self.data_type == DataType.DT_FLOAT: torch.set_default_tensor_type(torch.FloatTensor) - elif self.data_type == DataType.DT_B16: + elif self.data_type == DataType.DT_BF16: torch.set_default_tensor_type(torch.BFloat16Tensor) else: assert False, "Data type not yet supported -- cannot download weights!" @@ -189,7 +189,7 @@ def download_hf_weights_if_needed(self): self.model_name.lower(), "full-precision" if self.data_type == DataType.DT_FLOAT - else "bfloat16-precision" if self.data_type == DataType.DT_B16 + else "bfloat16-precision" if self.data_type == DataType.DT_BF16 else "half-precision", ) if self.refresh_cache: diff --git a/python/flexflow/type.py b/python/flexflow/type.py index df113438e8..04de9fa7d6 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -35,7 +35,7 @@ class DataType(Enum): DT_HALF = 43 DT_FLOAT = 44 DT_DOUBLE = 45 - DT_B16 = 48 + DT_BF16 = 48 DT_NONE = 49 diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index b137d6faef..8b372875ea 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -234,7 +234,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { AddBiasResidualLayerNorm::inference_kernel<__nv_bfloat16>( m, attn_bias_dim, diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp index 17cc7534ad..54166ab8d2 100644 --- a/src/ops/arg_topk.cpp +++ b/src/ops/arg_topk.cpp @@ -515,7 +515,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, m->sorted, m->speculative_decoding ? bc : nullptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { ArgTopK::forward_kernel(m, input.get_bfloat16_ptr(), m->speculative_decoding ? probs.get_float_ptr() diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 7f43aba948..671e9aeb25 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -520,7 +520,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, m->sorted, m->speculative_decoding ? bc : nullptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { ArgTopK::forward_kernel(m, input.get_bfloat16_ptr(), m->speculative_decoding ? 
probs.get_float_ptr() diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 7437b8b969..1c94585ba1 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -466,7 +466,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, length, batch_size, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { ArgMax::forward_kernel(m, input.get_bfloat16_ptr(), indices.get_int32_ptr(), @@ -501,7 +501,7 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, : OpMeta(handler, op) { DataType data_type = op->data_type; size_t prob_size = batch_size; - assert(data_type == DT_FLOAT || data_type == DT_HALF || data_type == DT_B16); + assert(data_type == DT_FLOAT || data_type == DT_HALF || data_type == DT_BF16); size_t total_size = prob_size * sizeof(float); gpu_mem_allocator.create_legion_instance(reserveInst, total_size); probs = gpu_mem_allocator.allocate_instance(prob_size); diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 32cd872a55..aebafc2fff 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -123,7 +123,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, length, batch_size, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { ArgMax::forward_kernel<__nv_bfloat16>( m, input.get_bfloat16_ptr(), @@ -163,7 +163,7 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, size_t d_offsets_size = batch_size; size_t prob_size = batch_size; - assert(data_type == DT_FLOAT || data_type == DT_HALF || data_type == DT_B16); + assert(data_type == DT_FLOAT || data_type == DT_HALF || data_type == DT_BF16); size_t total_size = d_offsets_size * sizeof(int) + (data_type == DT_FLOAT @@ -214,7 +214,7 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, d_offsets, d_offsets + 1, stream)); - } else if (data_type == DT_B16) { + } else if (data_type == DT_BF16) { checkCUDA(cub::DeviceSegmentedReduce::ArgMax( d_temp_storage, temp_storage_bytes, diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index af98f153e2..2daa48c3d0 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -428,7 +428,7 @@ __host__ void FusedOp::forward_task(Task const *task, m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); - } else if (m->input_type == DT_B16) { + } else if (m->input_type == DT_BF16) { Kernels::Softmax::forward_kernel_wrapper( m, my_input_accessor[0].get_bfloat16_ptr(), @@ -820,7 +820,7 @@ __host__ void my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr(), my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_B16) { + } else if (m->data_type == DT_BF16) { ElementUnary::forward_kernel_wrapper( m, my_input_accessor[0].get_bfloat16_ptr(), @@ -1050,7 +1050,7 @@ __host__ void m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); - } else if (m->input_type == DT_B16) { + } else if (m->input_type == DT_BF16) { Kernels::Softmax::forward_kernel_wrapper( m, my_input_accessor[0].get_bfloat16_ptr(), diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 77e12828a1..16b9a2b782 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -442,7 +442,7 @@ __host__ void FusedOp::forward_task(Task const *task, m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); - } else if (m->input_type == DT_B16) { + } else if (m->input_type == DT_BF16) { Kernels::Softmax::forward_kernel_wrapper( m, my_input_accessor[0].get_bfloat16_ptr(), @@ -855,7 +855,7 @@ __host__ void my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr(), 
my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_B16) { + } else if (m->data_type == DT_BF16) { ElementUnary::forward_kernel_wrapper( m, my_input_accessor[0].get_bfloat16_ptr(), @@ -1087,7 +1087,7 @@ __host__ void m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); - } else if (m->input_type == DT_B16) { + } else if (m->input_type == DT_BF16) { Kernels::Softmax::forward_kernel_wrapper( m, my_input_accessor[0].get_bfloat16_ptr(), diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 4d63ba628d..ca83e7196a 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -802,7 +802,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { if (m->offload) { pre_build_weight_kernel(m, weight, input.data_type, stream); } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index af77ac9f6d..9b4cffad4c 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -506,7 +506,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { compute_type = CUBLAS_COMPUTE_32F; } #endif @@ -640,7 +640,7 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else cudaDataType_t compute_type = cublas_data_type; - if (m->output_type[0] == DT_B16) { + if (m->output_type[0] == DT_BF16) { compute_type = CUDA_R_32F; } #endif @@ -799,7 +799,7 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, m->weightSize, cudaMemcpyHostToDevice, stream); - } else if (data_type == DT_B16) { + } else if (data_type == DT_BF16) { cudaMemcpyAsync(m->weight_ptr, weight.get_bfloat16_ptr(), m->weightSize, @@ -927,7 +927,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { compute_type = CUBLAS_COMPUTE_32F; } #endif @@ -1202,7 +1202,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { if (m->offload) { pre_build_weight_kernel<__nv_bfloat16>( m, weight, input.data_type, stream); diff --git a/src/ops/kernels/element_binary_kernels.cpp b/src/ops/kernels/element_binary_kernels.cpp index 05467182e7..a353ab76ea 100644 --- a/src/ops/kernels/element_binary_kernels.cpp +++ b/src/ops/kernels/element_binary_kernels.cpp @@ -91,7 +91,7 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, in2.get_float_ptr(), out.get_float_ptr(), stream); - } else if (out.data_type == DT_B16) { + } else if (out.data_type == DT_BF16) { Internal::forward_kernel(m, in1.get_bfloat16_ptr(), in2.get_bfloat16_ptr(), diff --git a/src/ops/kernels/element_binary_kernels.cu b/src/ops/kernels/element_binary_kernels.cu index 26d9ea9cc9..a762f17774 100644 --- a/src/ops/kernels/element_binary_kernels.cu +++ 
b/src/ops/kernels/element_binary_kernels.cu @@ -105,7 +105,7 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, in2.get_float_ptr(), out.get_float_ptr(), stream); - } else if (out.data_type == DT_B16) { + } else if (out.data_type == DT_BF16) { Internal::forward_kernel( m, in1.get_bfloat16_ptr(), in2.get_bfloat16_ptr(), out.get_bfloat16_ptr(), stream); } else { diff --git a/src/ops/kernels/embedding_kernels.cpp b/src/ops/kernels/embedding_kernels.cpp index 7384f4e7e6..7612abd0f0 100644 --- a/src/ops/kernels/embedding_kernels.cpp +++ b/src/ops/kernels/embedding_kernels.cpp @@ -70,7 +70,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (weight.data_type == DT_B16) { + } else if (weight.data_type == DT_BF16) { Internal::forward_kernel(input.get_int32_ptr(), output.get_bfloat16_ptr(), weight.get_bfloat16_ptr(), @@ -114,7 +114,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (weight.data_type == DT_B16) { + } else if (weight.data_type == DT_BF16) { Internal::forward_kernel(input.get_int64_ptr(), output.get_bfloat16_ptr(), weight.get_bfloat16_ptr(), @@ -182,7 +182,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { Internal::backward_kernel(input.get_int32_ptr(), output.get_bfloat16_ptr(), weight_grad.get_bfloat16_ptr(), @@ -226,7 +226,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { Internal::backward_kernel(input.get_int64_ptr(), output.get_bfloat16_ptr(), weight_grad.get_bfloat16_ptr(), diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 8e40f4bf95..911aa5c45d 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -70,7 +70,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (weight.data_type == DT_B16) { + } else if (weight.data_type == DT_BF16) { Internal::forward_kernel(input.get_int32_ptr(), output.get_bfloat16_ptr(), weight.get_bfloat16_ptr(), @@ -114,7 +114,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (weight.data_type == DT_B16) { + } else if (weight.data_type == DT_BF16) { Internal::forward_kernel(input.get_int64_ptr(), output.get_bfloat16_ptr(), weight.get_bfloat16_ptr(), @@ -181,7 +181,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { Internal::backward_kernel(input.get_int32_ptr(), output.get_bfloat16_ptr(), weight_grad.get_bfloat16_ptr(), @@ -225,7 +225,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { Internal::backward_kernel(input.get_int64_ptr(), output.get_bfloat16_ptr(), weight_grad.get_bfloat16_ptr(), diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 3cd9862bba..1004fe2767 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -124,7 +124,7 @@ void forward_kernel_wrapper(LinearMeta const 
*m, out_dim, batch_size, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { Internal::forward_kernel(m, input_ptr, output_ptr, @@ -199,7 +199,7 @@ void backward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { Internal::backward_kernel(m, input_ptr, input_grad_ptr, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index f31fa112c0..77ed3b9a9e 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -58,7 +58,7 @@ LinearMeta::LinearMeta(FFHandler handler, min(CUDA_NUM_THREADS, parallelism), 0, stream>>>((half *)one_ptr, batch_size); - } else if (data_type == DT_B16) { + } else if (data_type == DT_BF16) { Kernels::Linear::Internal:: build_one_ptr<<input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { Internal::forward_kernel<__nv_bfloat16>(m, input_ptr, output_ptr, @@ -232,7 +232,7 @@ void backward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { Internal::backward_kernel<__nv_bfloat16>(m, input_ptr, input_grad_ptr, @@ -362,7 +362,7 @@ void forward_kernel(LinearMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { compute_type = CUBLAS_COMPUTE_32F; } #endif diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 16a113c9cb..46fc43fe53 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -204,7 +204,7 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, residual_output.get_float_ptr(), output.get_float_ptr(), stream); - } else if (output.data_type == DT_B16) { + } else if (output.data_type == DT_BF16) { forward_kernel(m, input1.get_bfloat16_ptr(), input2.get_bfloat16_ptr(), diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index c81e73dd77..45ff40f0d0 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -190,7 +190,7 @@ void forward_kernel_wrapper(RMSNormMeta const *m, weight.get_float_ptr(), output.get_float_ptr(), stream); - } else if (output.data_type == DT_B16) { + } else if (output.data_type == DT_BF16) { forward_kernel(m, input.get_bfloat16_ptr(), weight.get_bfloat16_ptr(), diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 19661724d2..115f290f14 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -246,7 +246,7 @@ void forward_kernel_wrapper(RMSNormMeta const *m, weight.get_float_ptr(), output.get_float_ptr(), stream); - } else if (output.data_type == DT_B16) { + } else if (output.data_type == DT_BF16) { forward_kernel(m, input.get_bfloat16_ptr(), weight.get_bfloat16_ptr(), diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index ff420901e4..f675899fcb 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -182,7 +182,7 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, gamma.get_half_ptr(), m->use_bias ? 
beta.get_half_ptr() : nullptr, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { LayerNorm::forward_kernel( m, input.get_bfloat16_ptr(), diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index e239023768..d6a67682e9 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -273,7 +273,7 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { LayerNorm::forward_kernel<__nv_bfloat16>( m, input.get_bfloat16_ptr(), diff --git a/src/ops/linear.cc b/src/ops/linear.cc index b33851d7be..29e06d3948 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -437,7 +437,7 @@ OpMeta *Linear::init_task(Task const *task, return init_task_with_dim( \ task, regions, ctx, runtime); \ } \ - } else if (output.data_type == DT_B16) { \ + } else if (output.data_type == DT_BF16) { \ if (linear->quantization_type != DT_NONE) { \ return init_task_with_dim<__ff_bfloat16, char, DIM>( \ task, regions, ctx, runtime); \ @@ -712,7 +712,7 @@ void Linear::forward_task(Task const *task, return forward_task_with_dim( \ task, regions, ctx, runtime); \ } \ - } else if (m->output_type[0] == DT_B16) { \ + } else if (m->output_type[0] == DT_BF16) { \ if (m->quantization_type != DT_NONE) { \ return forward_task_with_dim<__ff_bfloat16, char, DIM>( \ task, regions, ctx, runtime); \ @@ -875,7 +875,7 @@ void Linear::backward_task(Task const *task, return backward_task_with_dim(task, regions, ctx, runtime); \ } else if (m->output_type[0] == DT_FLOAT) { \ return backward_task_with_dim(task, regions, ctx, runtime); \ - } else if (m->output_type[0] == DT_B16) { \ + } else if (m->output_type[0] == DT_BF16) { \ return backward_task_with_dim<__ff_bfloat16, DIM>( \ task, regions, ctx, runtime); \ } else { \ diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index 09823adf27..290d8cb3d9 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -230,7 +230,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { ResidualLayerNorm::inference_kernel( m, input.get_bfloat16_ptr(), diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index fc0bd5d5a2..2f310c3c2c 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -225,7 +225,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { ResidualLayerNorm::inference_kernel<__nv_bfloat16>( m, input.get_bfloat16_ptr(), diff --git a/src/ops/sampling.cu b/src/ops/sampling.cu index d4044a9cc3..96438fa93a 100644 --- a/src/ops/sampling.cu +++ b/src/ops/sampling.cu @@ -182,7 +182,7 @@ void Sampling::forward_kernel_wrapper(SamplingMeta const *m, length, batch_size, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { Sampling::forward_kernel<__nv_bfloat16>(m, input.get_bfloat16_ptr(), indices.get_int32_ptr(), @@ -278,7 +278,7 @@ SamplingMeta::SamplingMeta(FFHandler handler, 0, // begin_bit data_type_size(data_type) * 8, // end_bit = sizeof(KeyT) * 8 stream)); - } else if (data_type == DT_B16) { + } else if (data_type == DT_BF16) { checkCUDA(cub::DeviceSegmentedRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index b446ed0065..b45625d204 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -101,7 +101,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( input1.get_half_ptr(), input2.get_half_ptr(), output.get_half_ptr()); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { SigmoidSiluMultiKernel<<input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { SigmoidSiluMultiKernel<<output_type == DT_FLOAT) { forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); - } else if (m->output_type == DT_B16) { + } else if (m->output_type == DT_BF16) { forward_kernel_wrapper( m, input.get_bfloat16_ptr(), output.get_bfloat16_ptr()); } else { @@ -369,7 +369,7 @@ void Softmax::backward_task(Task const *task, return backward_task_with_dim(task, regions, ctx, runtime); \ } else if (m->output_type == DT_FLOAT) { \ return backward_task_with_dim(task, regions, ctx, runtime); \ - } else if (m->output_type == DT_B16) { \ + } else if (m->output_type == DT_BF16) { \ return backward_task_with_dim<__ff_bfloat16, DIM>( \ task, regions, ctx, runtime); \ } else { \ @@ -435,7 +435,7 @@ void Softmax::inference_task(Task const *task, forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); } else if (m->output_type == DT_FLOAT) { forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); - } else if (m->output_type == DT_B16) { + } else if (m->output_type == DT_BF16) { forward_kernel_wrapper( m, input.get_bfloat16_ptr(), output.get_bfloat16_ptr()); } else { diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index a7b8d4535c..ca0d607ecd 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -562,7 +562,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { float const *bias_ptr = use_bias ? 
bias.get_bfloat16_ptr() : static_cast(nullptr); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 885a09c980..0d3e3a1250 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -480,7 +480,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { compute_type = CUBLAS_COMPUTE_32F; } #endif @@ -792,7 +792,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { __nv_bfloat16 const *bias_ptr = use_bias ? bias.get_bfloat16_ptr() : static_cast<__nv_bfloat16 const *>(nullptr); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 4f399aafc5..754ae24793 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -570,7 +570,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { if (m->offload) { pre_build_weight_kernel(m, weight, input.data_type, stream); } diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 1863dde1e7..2b50e93a02 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -1003,7 +1003,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { if (m->offload) { pre_build_weight_kernel<__nv_bfloat16>( m, weight, input.data_type, stream); diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 8bd963b3d3..ddbafc23c9 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -365,7 +365,7 @@ void Combine::forward_task(Task const *task, DataType data_type = m->input_type[0]; if (data_type == DT_HALF) { forward_task_with_type(task, regions, ctx, runtime); - } else if (data_type == DT_B16) { + } else if (data_type == DT_BF16) { forward_task_with_type<__ff_bfloat16>(task, regions, ctx, runtime); } else if (data_type == DT_FLOAT) { forward_task_with_type(task, regions, ctx, runtime); diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index bd7cb4085b..dd2d4a54c2 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -380,7 +380,7 @@ void Reduction::forward_task(Task const *task, output.get_float_ptr(), num_elements, num_replicas); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { forward_kernel<__ff_bfloat16>(input.get_bfloat16_ptr(), output.get_bfloat16_ptr(), num_elements, diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 18e4d26f26..c3821a7d74 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -373,7 +373,7 @@ void Replicate::forward_task(Task const *task, forward_kernel(input.get_float_ptr(), output.get_float_ptr(), input_domain.get_volume()); - } else if (input.data_type == DT_B16) { + } else if 
(input.data_type == DT_BF16) { forward_kernel<__ff_bfloat16>(input.get_bfloat16_ptr(), output.get_bfloat16_ptr(), input_domain.get_volume()); diff --git a/src/runtime/accessor.cc b/src/runtime/accessor.cc index 30aef93df9..a4bd0c11a2 100644 --- a/src/runtime/accessor.cc +++ b/src/runtime/accessor.cc @@ -78,7 +78,7 @@ half const *GenericTensorAccessorR::get_half_ptr() const { } __ff_bfloat16 const *GenericTensorAccessorR::get_bfloat16_ptr() const { - if (data_type == DT_B16) { + if (data_type == DT_BF16) { return static_cast<__ff_bfloat16 const *>(ptr); } else { assert(false && "Invalid Accessor Type"); @@ -175,7 +175,7 @@ half *GenericTensorAccessorW::get_half_ptr() const { } __ff_bfloat16 *GenericTensorAccessorW::get_bfloat16_ptr() const { - if (data_type == DT_B16) { + if (data_type == DT_BF16) { return static_cast<__ff_bfloat16 *>(ptr); } else { assert(false && "Invalid Accessor Type"); @@ -289,7 +289,7 @@ GenericTensorAccessorR ptr = helperGetTensorPointerRO(region, req, fid, ctx, runtime); break; } - case DT_B16: { + case DT_BF16: { ptr = helperGetTensorPointerRO<__ff_bfloat16>( region, req, fid, ctx, runtime); break; @@ -340,7 +340,7 @@ GenericTensorAccessorW ptr = helperGetTensorPointerWO(region, req, fid, ctx, runtime); break; } - case DT_B16: { + case DT_BF16: { ptr = helperGetTensorPointerWO<__ff_bfloat16>( region, req, fid, ctx, runtime); break; @@ -391,7 +391,7 @@ GenericTensorAccessorW ptr = helperGetTensorPointerRW(region, req, fid, ctx, runtime); break; } - case DT_B16: { + case DT_BF16: { ptr = helperGetTensorPointerRW<__ff_bfloat16>( region, req, fid, ctx, runtime); break; diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 91d139a414..46ab6b42da 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -546,7 +546,7 @@ cudnnDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { case DT_HALF: return CUDNN_DATA_HALF; - case DT_B16: + case DT_BF16: return CUDNN_DATA_BFLOAT16; case DT_FLOAT: return CUDNN_DATA_FLOAT; @@ -564,7 +564,7 @@ cudaDataType_t ff_to_cuda_datatype(DataType type) { switch (type) { case DT_HALF: return CUDA_R_16F; - case DT_B16: + case DT_BF16: return CUDA_R_16BF; case DT_FLOAT: return CUDA_R_32F; @@ -583,7 +583,7 @@ ncclDataType_t ff_to_nccl_datatype(DataType type) { switch (type) { case DT_HALF: return ncclHalf; - case DT_B16: + case DT_BF16: return ncclBfloat16; case DT_FLOAT: return ncclFloat; diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 232d711d82..8c130fcfb1 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -213,7 +213,7 @@ size_t data_type_size(DataType type) { switch (type) { case DT_HALF: return sizeof(half); - case DT_B16: + case DT_BF16: return sizeof(__ff_bfloat16); case DT_FLOAT: return sizeof(float); diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 3a765dbeb1..6f5864acff 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -1137,7 +1137,7 @@ void FileDataLoader::load_weights(FFModel *ff) { case DT_FLOAT: load_single_weight_tensor(ff, l, i); break; - case DT_B16: + case DT_BF16: load_single_weight_tensor_b16(ff, l, i); break; case DT_INT4: diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 00a6730de0..ab1f9d6ece 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -516,7 +516,7 @@ miopenDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { case DT_HALF: return miopenHalf; - case DT_B16: + case DT_BF16: return miopenBFloat16; 
case DT_FLOAT: return miopenFloat; @@ -539,7 +539,7 @@ hipblasDatatype_t ff_to_cuda_datatype(DataType type) { return HIPBLAS_R_64F; case DT_INT32: return HIPBLAS_R_32I; - case DT_B16: + case DT_BF16: return HIPBLAS_R_16B; case DT_HALF: return HIPBLAS_R_16F; @@ -553,7 +553,7 @@ ncclDataType_t ff_to_nccl_datatype(DataType type) { switch (type) { case DT_HALF: return ncclHalf; - case DT_B16: + case DT_BF16: return ncclBfloat16; case DT_FLOAT: return ncclFloat; diff --git a/src/runtime/initializer_kernel.cpp b/src/runtime/initializer_kernel.cpp index 2e55591729..659c8a4d2c 100644 --- a/src/runtime/initializer_kernel.cpp +++ b/src/runtime/initializer_kernel.cpp @@ -259,7 +259,7 @@ void ZeroInitializer::init_task(Task const *task, w, domain.get_volume(), 0.0f); - } else if (meta->data_types[i] == DT_B16) { + } else if (meta->data_types[i] == DT_BF16) { hip_bfloat16 *w = helperGetTensorPointerWO( regions[i], task->regions[i], FID_DATA, ctx, runtime); hipLaunchKernelGGL(HIP_KERNEL_NAME(assign_kernel), diff --git a/src/runtime/initializer_kernel.cu b/src/runtime/initializer_kernel.cu index b6b9e29b8b..4c114f5de6 100644 --- a/src/runtime/initializer_kernel.cu +++ b/src/runtime/initializer_kernel.cu @@ -235,7 +235,7 @@ void ZeroInitializer::init_task(Task const *task, assign_kernel <<>>( w, domain.get_volume(), 0.0f); - } else if (meta->data_types[i] == DT_B16) { + } else if (meta->data_types[i] == DT_BF16) { __nv_bfloat16 *w = helperGetTensorPointerWO<__nv_bfloat16>( regions[i], task->regions[i], FID_DATA, ctx, runtime); assign_kernel<__nv_bfloat16> diff --git a/src/runtime/model.cc b/src/runtime/model.cc index acaee6b7e4..a826bb5c34 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1947,7 +1947,7 @@ void FFModel::map_tensor_with_dim2(ParallelTensor tensor, case DT_HALF: allocator.allocate_field(sizeof(half), FID_DATA); break; - case DT_B16: + case DT_BF16: allocator.allocate_field(sizeof(__ff_bfloat16), FID_DATA); break; case DT_FLOAT: diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index 6349bfd1a5..3cb71f910e 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -62,7 +62,7 @@ void Op::save_inference_tensors_to_file( save_tensor(input_tensors[i].get_half_ptr(), input_tensors[i].domain.get_volume(), filename.c_str()); - } else if (input_tensors[i].data_type == DT_B16) { + } else if (input_tensors[i].data_type == DT_BF16) { save_tensor(input_tensors[i].get_bfloat16_ptr(), input_tensors[i].domain.get_volume(), filename.c_str()); @@ -90,7 +90,7 @@ void Op::save_inference_tensors_to_file( save_tensor(weight_tensors[i].get_half_ptr(), weight_tensors[i].domain.get_volume(), filename.c_str()); - } else if (weight_tensors[i].data_type == DT_B16) { + } else if (weight_tensors[i].data_type == DT_BF16) { save_tensor(weight_tensors[i].get_bfloat16_ptr(), weight_tensors[i].domain.get_volume(), filename.c_str()); @@ -118,7 +118,7 @@ void Op::save_inference_tensors_to_file( save_tensor(output_tensors[i].get_half_ptr(), output_tensors[i].domain.get_volume(), filename.c_str()); - } else if (output_tensors[i].data_type == DT_B16) { + } else if (output_tensors[i].data_type == DT_BF16) { save_tensor(output_tensors[i].get_bfloat16_ptr(), output_tensors[i].domain.get_volume(), filename.c_str()); From 5f2afc67e91f076ba6bc213225ce7f682b1c1100 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 17 Jan 2024 10:40:25 -0500 Subject: [PATCH 09/11] . 
--- inference/incr_decoding/incr_decoding.cc | 2 +- inference/models/opt.cc | 17 ++++++++--------- tests/inference/huggingface_inference.py | 9 ++++++++- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 96017008db..75f5a7b2bc 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -172,7 +172,7 @@ void FlexFlow::top_level_task(Task const *task, std::string tokenizer_filepath = join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); - // bfloat16 shares same weight file with float32 + // bfloat16 shares same weight file with float32 std::string weights_filepath = join_path( {file_paths.cache_folder_path, "weights", diff --git a/inference/models/opt.cc b/inference/models/opt.cc index a185af8b38..23f910315e 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -63,15 +63,14 @@ void OPT::create_opt_model(FFModel &ff, embed_init, "embed_tokens"); - Tensor positional_embedding = - ff.embedding(position_input, - opt_config.max_position_embeddings, - opt_config.hidden_size, - AGGR_MODE_NONE, - data_type, - NULL, - embed_init, - "embed_positions"); + Tensor positional_embedding = ff.embedding(position_input, + opt_config.max_position_embeddings, + opt_config.hidden_size, + AGGR_MODE_NONE, + data_type, + NULL, + embed_init, + "embed_positions"); Tensor fc2 = nullptr, added = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 5b533bf3c0..4a8b5920ea 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -26,6 +26,9 @@ def main(): parser.add_argument( "--use-full-precision", action="store_true", help="Use full precision" ) + parser.add_argument( + "--use-bfloat16-precision", action="store_true", help="Use bf16 precision" + ) parser.add_argument("--do-sample", action="store_true", help="Use sampling") parser.add_argument("--gpu", action="store_true", help="Run on GPU") args = parser.parse_args() @@ -47,7 +50,11 @@ def main(): return # Set default tensor type depending on argument indicating the float type to use - if not args.use_full_precision: + if args.use_full_precision: + torch.set_default_tensor_type(torch.FloatTensor) + elif args.use_bfloat16_precision: + torch.set_default_tensor_type(torch.BFloat16Tensor) + else: torch.set_default_tensor_type(torch.HalfTensor) # Run huggingface model From f458f3c8a1085572eb6e6081e1e30c07e0134398 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 17 Jan 2024 10:48:56 -0500 Subject: [PATCH 10/11] format --- inference/spec_infer/spec_infer.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index d37b46d5ae..9fe686f8e7 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -300,7 +300,8 @@ void FlexFlow::top_level_task(Task const *task, max_tokens_per_batch, max_sequence_length); - get_model_meta(file_paths, model_metadata, use_full_precision, use_bfloat16_precision); + get_model_meta( + file_paths, model_metadata, use_full_precision, use_bfloat16_precision); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == From 6fcfbad172ecee05c5f2aa39962dc9fe93a9ef8f Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 19 Jan 2024 07:49:53 -0500 
Subject: [PATCH 11/11] fix --- src/ops/add_bias_residual_layer_norm.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index 8b372875ea..e503c2603d 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -138,9 +138,9 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { int64_t const index = i * N + j; const T_ACC gamma_v = - gamma == nullptr ? T_ACC(1) : static_cast<T_ACC>(gamma[j]); + gamma == nullptr ? T_ACC(1.0f) : static_cast<T_ACC>(gamma[j]); const T_ACC beta_v = - beta == nullptr ? T_ACC(0) : static_cast<T_ACC>(beta[j]); + beta == nullptr ? T_ACC(0.0f) : static_cast<T_ACC>(beta[j]); Y[index] = (static_cast<T_ACC>(X[index]) - static_cast<T_ACC>(mean[i])) * static_cast<T_ACC>(rstd[i]) * gamma_v + beta_v;
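
To make the pattern in this final hunk easier to follow in isolation, here is a minimal standalone CUDA sketch of the same accumulate-and-cast idea. It is illustrative only: the kernel name (affine_normalize), the float mean/rstd buffers, and the one-block-per-row launch are assumptions of this sketch, not FlexFlow's LayerNormFusedForwardKernel or its launch configuration.

#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cstdint>

// Hypothetical reduced kernel: applies precomputed per-row mean/rstd and the
// optional gamma/beta affine step, accumulating in T_ACC (float by default).
// One block per row; threads stride across the N columns of that row.
template <typename T, typename T_ACC = float>
__global__ void affine_normalize(int64_t N,
                                 T const *X,
                                 float const *mean,
                                 float const *rstd,
                                 T const *gamma, // may be nullptr
                                 T const *beta,  // may be nullptr
                                 T *Y) {
  int64_t const i = blockIdx.x;
  for (int64_t j = threadIdx.x; j < N; j += blockDim.x) {
    int64_t const index = i * N + j;
    // Float literals for the defaults, following the hunk above.
    T_ACC const gamma_v =
        gamma == nullptr ? T_ACC(1.0f) : static_cast<T_ACC>(gamma[j]);
    T_ACC const beta_v =
        beta == nullptr ? T_ACC(0.0f) : static_cast<T_ACC>(beta[j]);
    T_ACC const x = static_cast<T_ACC>(X[index]);
    // Normalize in T_ACC; cast back to the storage type only at the end.
    Y[index] = static_cast<T>((x - static_cast<T_ACC>(mean[i])) *
                                  static_cast<T_ACC>(rstd[i]) * gamma_v +
                              beta_v);
  }
}

// Possible launch, one block per row of a [rows, N] tensor:
//   affine_normalize<__nv_bfloat16>
//       <<<rows, 256, 0, stream>>>(N, X, mean, rstd, gamma, beta, Y);

Keeping the accumulator a plain float and converting only at the load/store boundaries is what lets one template body serve float, half, and __nv_bfloat16 storage types alike.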