From e767ef75fecc874afecd0f932df5266152068fef Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 6 Jan 2024 15:25:00 -0500 Subject: [PATCH 01/11] init --- include/flexflow/accessor.h | 5 + include/flexflow/ffconst.h | 7 + .../inc_multihead_self_attention_utils.cuh | 114 +++++++++++++ inference/incr_decoding/incr_decoding.cc | 7 +- inference/models/llama.cc | 6 +- inference/models/llama.h | 2 +- inference/spec_infer/spec_infer.cc | 5 +- inference/utils/download_hf_model.py | 9 +- python/flexflow/serve/models/llama.py | 2 +- python/flexflow/serve/serve.py | 5 +- python/flexflow/type.py | 1 + src/ops/add_bias_residual_layer_norm.cu | 22 ++- src/ops/arg_topk.cu | 14 +- src/ops/argmax.cu | 35 +++- src/ops/element_unary.cu | 7 + src/ops/fused.cu | 16 ++ src/ops/inc_multihead_self_attention.cu | 45 ++++++ src/ops/kernels/decompress_kernels.cu | 17 ++ src/ops/kernels/element_binary_kernels.cu | 3 + src/ops/kernels/embedding_kernels.cu | 152 ++++++++++++++++++ src/ops/kernels/linear_kernels.cu | 29 ++++ src/ops/kernels/residual_rms_norm_kernels.cu | 10 +- src/ops/kernels/rms_norm_kernels.cu | 12 +- src/ops/kernels/softmax.cu | 11 +- src/ops/layer_norm.cu | 10 +- src/ops/linear.cc | 27 +++- src/ops/residual_layer_norm.cu | 13 +- src/ops/sampling.cu | 25 ++- src/ops/sigmoid_silu_multi.cu | 8 + src/ops/softmax.cc | 15 +- src/ops/spec_inc_multihead_self_attention.cu | 12 ++ src/ops/tree_inc_multihead_self_attention.cu | 19 +++ src/parallel_ops/combine.cc | 4 +- src/parallel_ops/kernels/combine_kernels.cu | 5 + src/parallel_ops/kernels/reduction_kernels.cu | 13 ++ src/parallel_ops/kernels/replicate_kernels.cu | 3 + src/parallel_ops/reduction.cc | 7 +- src/parallel_ops/replicate.cc | 6 +- src/runtime/accessor.cc | 33 ++++ src/runtime/cuda_helper.cu | 40 +++++ src/runtime/ffconst_utils.cc | 2 + src/runtime/initializer_kernel.cu | 6 + src/runtime/model.cc | 3 + src/runtime/operator.cc | 12 ++ 44 files changed, 761 insertions(+), 38 deletions(-) diff --git a/include/flexflow/accessor.h b/include/flexflow/accessor.h index 65ab33b513..26ed006646 100644 --- a/include/flexflow/accessor.h +++ b/include/flexflow/accessor.h @@ -5,10 +5,13 @@ #if defined(FF_USE_CUDA) #include +#include #elif defined(FF_USE_HIP_CUDA) #include +#include #elif defined(FF_USE_HIP_ROCM) #include +#include #endif // using namespace Legion; @@ -61,6 +64,7 @@ class GenericTensorAccessorW { float *get_float_ptr() const; double *get_double_ptr() const; half *get_half_ptr() const; + __nv_bfloat16 *get_bfloat16_ptr() const; char *get_byte_ptr() const; DataType data_type; Legion::Domain domain; @@ -80,6 +84,7 @@ class GenericTensorAccessorR { float const *get_float_ptr() const; double const *get_double_ptr() const; half const *get_half_ptr() const; + __nv_bfloat16 const *get_bfloat16_ptr() const; char const *get_byte_ptr() const; DataType data_type; Legion::Domain domain; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 512645e624..9613a041f7 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -35,6 +35,7 @@ enum DataType { DT_DOUBLE = 45, DT_INT4 = 46, DT_INT8 = 47, + DT_B16 = 48, DT_NONE = 49, }; @@ -269,4 +270,10 @@ enum { PARALLEL_TENSOR_GUID_FIRST_VALID = 4000000, NODE_GUID_FIRST_VALID = 5000000, }; + +enum InferencePrecision { + INFERENCE_FLOAT = 800, + INFERENCE_HALF = 801, + INFERENCE_BFLOAT16 = 802, +}; #endif // _FLEXFLOW_CONST_H_ diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index 
d1e0e050b2..dfc99bc612 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -23,6 +23,25 @@ struct half8 { half c; half d; }; + +struct __nv_bfloat164 { + __nv_bfloat16 x; + __nv_bfloat16 y; + __nv_bfloat16 z; + __nv_bfloat16 w; +}; + +struct __nv_bfloat168 { + __nv_bfloat16 x; + __nv_bfloat16 y; + __nv_bfloat16 z; + __nv_bfloat16 w; + __nv_bfloat16 a; + __nv_bfloat16 b; + __nv_bfloat16 c; + __nv_bfloat16 d; +}; + struct float8 { float x; float y; @@ -61,6 +80,18 @@ template <> struct VEC_K { using Type = half4; }; +template <> +struct VEC_K<__nv_bfloat16, 1> { + using Type = __nv_bfloat16; +}; +template <> +struct VEC_K<__nv_bfloat16, 2> { + using Type = __nv_bfloat162; +}; +template <> +struct VEC_K<__nv_bfloat16, 4> { + using Type = __nv_bfloat164; +}; // data type for QK production template @@ -95,6 +126,23 @@ struct Vec_fp32_ { using Type = float8; }; +template <> +struct Vec_fp32_<__nv_bfloat16> { + using Type = float; +}; +template <> +struct Vec_fp32_<__nv_bfloat162> { + using Type = float2; +}; +template <> +struct Vec_fp32_<__nv_bfloat164> { + using Type = float4; +}; +template <> +struct Vec_fp32_<__nv_bfloat168> { + using Type = float8; +}; + template struct VEC_V {}; template <> @@ -105,6 +153,10 @@ template <> struct VEC_V { using Type = half8; }; +template <> +struct VEC_V<__nv_bfloat16> { + using Type = __nv_bfloat168; +}; ////////////////data structures half/////////////// @@ -331,6 +383,42 @@ inline __device__ float8 cast_to_float(half8 u) { return tmp; } +inline __device__ float cast_to_float(__nv_bfloat16 u) { + return __bfloat162float(u); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 cast_to_float(__nv_bfloat162 u) { + float2 tmp; + tmp.x = __bfloat162float(u.x); + tmp.y = __bfloat162float(u.y); + return tmp; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 cast_to_float(__nv_bfloat164 u) { + float4 tmp; + tmp.x = __bfloat162float(u.x); + tmp.y = __bfloat162float(u.y); + tmp.z = __bfloat162float(u.z); + tmp.w = __bfloat162float(u.w); + return tmp; +} +inline __device__ float8 cast_to_float(__nv_bfloat168 u) { + float8 tmp; + tmp.x = __bfloat162float(u.x); + tmp.y = __bfloat162float(u.y); + tmp.z = __bfloat162float(u.z); + tmp.w = __bfloat162float(u.w); + tmp.a = __bfloat162float(u.a); + tmp.b = __bfloat162float(u.b); + tmp.c = __bfloat162float(u.c); + tmp.d = __bfloat162float(u.d); + return tmp; +} + inline __device__ void convert_from_float(float4 &dst, float4 src) { dst = src; } @@ -369,6 +457,32 @@ inline __device__ void convert_from_float(half &dst, float src) { dst = __float2half(src); } +// +inline __device__ void convert_from_float(__nv_bfloat164 &dst, float4 src) { + dst.x = __float2bfloat16(src.x); + dst.y = __float2bfloat16(src.y); + dst.z = __float2bfloat16(src.z); + dst.w = __float2bfloat16(src.w); +} + +inline __device__ void convert_from_float(__nv_bfloat168 &dst, float8 src) { + dst.x = __float2bfloat16(src.x); + dst.y = __float2bfloat16(src.y); + dst.z = __float2bfloat16(src.z); + dst.w = __float2bfloat16(src.w); + dst.a = __float2bfloat16(src.a); + dst.b = __float2bfloat16(src.b); + dst.c = __float2bfloat16(src.c); + dst.d = __float2bfloat16(src.d); +} +inline __device__ void convert_from_float(__nv_bfloat162 &dst, float2 src) { + dst.x = __float2bfloat16(src.x); + dst.y = 
__float2bfloat16(src.y); +} +inline __device__ void convert_from_float(__nv_bfloat16 &dst, float src) { + dst = __float2bfloat16(src); +} + //////////////////////////////////////utils/////////////////////////////////////////////// template diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index c3f9052305..61652b3ae3 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -39,6 +39,7 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, std::string &llm_model_name, + DataType &data_type, bool &use_full_precision, bool &verbose, bool &do_sample, @@ -72,7 +73,7 @@ void parse_input_args(char **argv, continue; } if (!strcmp(argv[i], "--use-full-precision")) { - use_full_precision = true; + data_type = DT_FLOAT; continue; } // verbose logging to stdout @@ -125,6 +126,7 @@ void FlexFlow::top_level_task(Task const *task, } FilePaths file_paths; std::string llm_model_name; + DataType data_type = DT_B16; bool use_full_precision = false; bool verbose = false; bool do_sample = false; @@ -141,6 +143,7 @@ void FlexFlow::top_level_task(Task const *task, argc, file_paths, llm_model_name, + data_type, use_full_precision, verbose, do_sample, @@ -219,7 +222,7 @@ void FlexFlow::top_level_task(Task const *task, weights_filepath, INC_DECODING_MODE, generationConfig, - use_full_precision); + data_type); } else if (model_type == ModelType::OPT) { OPT::create_opt_model(model, config_filepath, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e54d6d8811..f4655b1750 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -25,7 +25,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string const &weight_file_path, InferenceMode mode, GenerationConfig generation_config, - bool use_full_precision) { + DataType data_type) { // do not apply cpu offload in beam search model. LLAMAConfig llama_config(model_config_file_path); llama_config.print(); @@ -54,7 +54,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.vocab_size, llama_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? 
DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "tok_embeddings"); @@ -276,7 +276,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); + fileloader.load_weights(&ff, data_type); std::cout << "------load weight finished----------" << std::endl; // init operators diff --git a/inference/models/llama.h b/inference/models/llama.h index 8ecfcd7155..6c72b432ad 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -83,7 +83,7 @@ class LLAMA { std::string const &weight_file_path, InferenceMode mode, GenerationConfig generation_config, - bool use_full_precision = false); + DataType DataType); }; }; // namespace FlexFlow diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index b369a13c1d..8d3d23ae4e 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -264,6 +264,7 @@ void FlexFlow::top_level_task(Task const *task, FFConfig ffconfig; FilePaths file_paths; ModelMeta model_metadata; + DataType data_type = DT_B16; bool use_full_precision = false; bool verbose = false; int max_requests_per_batch = 16; @@ -313,7 +314,7 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.llm_weights_path, TREE_VERIFY_MODE, generationConfig, - use_full_precision); + data_type); } else if (model_metadata.llm_model_type == ModelType::OPT) { OPT::create_opt_model(tree_model, model_metadata.llm_model_config_path, @@ -357,7 +358,7 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.ssm_model_weights_paths[ssm_id], BEAM_SEARCH_MODE, generationConfig, - use_full_precision); + data_type); } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) { OPT::create_opt_model(beam_model, model_metadata.ssm_model_config_paths[ssm_id], diff --git a/inference/utils/download_hf_model.py b/inference/utils/download_hf_model.py index 03fc8e1633..f158d4e480 100644 --- a/inference/utils/download_hf_model.py +++ b/inference/utils/download_hf_model.py @@ -30,6 +30,11 @@ def parse_args(): action="store_true", help="Only download the half precision version of the weights", ) + group.add_argument( + "--bfloat16-precision-only", + action="store_true", + help="Only download the bfloat16 precision version of the weights", + ) args = parser.parse_args() return args @@ -39,8 +44,10 @@ def main(args): data_types = ff.DataType.DT_FLOAT elif args.half_precision_only: data_types = ff.DataType.DT_HALF + elif args.bfloat16_precision_only: + data_types = ff.DataType.DT_B16 else: - data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) + data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF, ff.DataType.DT_B16) for model_name in args.model_names: for data_type in data_types: diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 7ba0e78a37..f37a14ed9e 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -15,7 +15,7 @@ from flexflow.core import * from .base import FlexFlowModel import random - +import torch class LLAMAConfig: def __init__(self, hf_config): diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 55601f957e..5c1709a7ff 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -113,7 +113,7 @@ def __init__( self.config_class, ) = self.__get_ff_model_type() self.data_type = data_type - assert self.data_type == DataType.DT_HALF or 
self.data_type == DataType.DT_FLOAT + assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT or self.data_type == DataType.DT_B16 self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache self.output_file = output_file @@ -172,6 +172,8 @@ def download_hf_weights_if_needed(self): torch.set_default_tensor_type(torch.HalfTensor) elif self.data_type == DataType.DT_FLOAT: torch.set_default_tensor_type(torch.FloatTensor) + elif self.data_type == DataType.DT_B16: + torch.set_default_tensor_type(torch.BFloat16Tensor) else: assert False, "Data type not yet supported -- cannot download weights!" @@ -182,6 +184,7 @@ def download_hf_weights_if_needed(self): self.model_name.lower(), "full-precision" if self.data_type == DataType.DT_FLOAT + else "bfloat16-precision" if self.data_type == DataType.DT_B16 else "half-precision", ) if self.refresh_cache: diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 994a85f57e..df113438e8 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -35,6 +35,7 @@ class DataType(Enum): DT_HALF = 43 DT_FLOAT = 44 DT_DOUBLE = 45 + DT_B16 = 48 DT_NONE = 49 diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index ceb1a6514e..b137d6faef 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -107,13 +107,13 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, T *Y) { __shared__ float m_shared[C10_WARP_SIZE]; __shared__ float v_shared[C10_WARP_SIZE]; - const int64_t i = blockIdx.x; + int64_t const i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDABlockReduceNumThreads)) { - const int64_t index = i * N + j; - const int64_t bias_idx = index % attn_bias_dim; + int64_t const index = i * N + j; + int64_t const bias_idx = index % attn_bias_dim; X[index] = input_ptr[index] + attn_bias_ptr[bias_idx] + residual_ptr[index]; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); @@ -136,7 +136,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, using T_ACC = T; for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); const T_ACC beta_v = @@ -234,6 +234,20 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, stream); + } else if (m->input_type[0] == DT_B16) { + AddBiasResidualLayerNorm::inference_kernel<__nv_bfloat16>( + m, + attn_bias_dim, + residual_volume, + input.get_bfloat16_ptr(), + attn_bias.get_bfloat16_ptr(), + residual.get_bfloat16_ptr(), + added_output.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + m->elementwise_affine ? gamma.get_bfloat16_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_bfloat16_ptr() + : nullptr, + stream); } else { assert(false && "unsupport datatype in layernorm"); } diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 5b7978812c..7f43aba948 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -520,7 +520,19 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, m->sorted, m->speculative_decoding ? 
bc : nullptr, stream); - } else { + } else if (input.data_type == DT_B16) { + ArgTopK::forward_kernel(m, + input.get_bfloat16_ptr(), + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, + indices.get_int32_ptr(), + batch_size, + length, + k, + m->sorted, + m->speculative_decoding ? bc : nullptr, + stream); + }else { assert(false && "Unsupported data type"); } diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 05c84719c1..32cd872a55 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -123,6 +123,16 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, length, batch_size, stream); + } else if (input.data_type == DT_B16) { + ArgMax::forward_kernel<__nv_bfloat16>( + m, + input.get_bfloat16_ptr(), + indices.get_int32_ptr(), + m->probs, + m->beam_search ? parent.get_int32_ptr() : nullptr, + length, + batch_size, + stream); } else { assert(false && "Unsupported data type"); } @@ -153,20 +163,27 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, size_t d_offsets_size = batch_size; size_t prob_size = batch_size; - assert(data_type == DT_FLOAT || data_type == DT_HALF); + assert(data_type == DT_FLOAT || data_type == DT_HALF || data_type == DT_B16); size_t total_size = d_offsets_size * sizeof(int) + (data_type == DT_FLOAT ? sizeof(cub::KeyValuePair) * batch_size - : sizeof(cub::KeyValuePair) * batch_size) + + : (data_type == DT_HALF + ? sizeof(cub::KeyValuePair) * batch_size + : sizeof(cub::KeyValuePair) * + batch_size)) + prob_size * sizeof(float); gpu_mem_allocator.create_legion_instance(reserveInst, total_size); d_offsets = gpu_mem_allocator.allocate_instance(d_offsets_size); d_out = data_type == DT_FLOAT ? gpu_mem_allocator.allocate_instance_untyped( batch_size * sizeof(cub::KeyValuePair)) - : gpu_mem_allocator.allocate_instance_untyped( - batch_size * sizeof(cub::KeyValuePair)); + : (data_type == DT_HALF + ? 
gpu_mem_allocator.allocate_instance_untyped( + batch_size * sizeof(cub::KeyValuePair)) + : gpu_mem_allocator.allocate_instance_untyped( + batch_size * + sizeof(cub::KeyValuePair))); probs = gpu_mem_allocator.allocate_instance(prob_size); // init offset int parallelism = total_ele; @@ -197,6 +214,16 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, d_offsets, d_offsets + 1, stream)); + } else if (data_type == DT_B16) { + checkCUDA(cub::DeviceSegmentedReduce::ArgMax( + d_temp_storage, + temp_storage_bytes, + input.get_bfloat16_ptr(), + static_cast *>(d_out), + batch_size, + d_offsets, + d_offsets + 1, + stream)); } gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index c7f5e90f4c..ed4fb15e18 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -307,6 +307,12 @@ template void float const *input_ptr, float *output_ptr, size_t num_elements); +template void + ElementUnary::forward_kernel_wrapper<__nv_bfloat16>(ElementUnaryMeta const *m, + __nv_bfloat16 const *input_ptr, + __nv_bfloat16 *output_ptr, + size_t num_elements); + template void ElementUnary::forward_kernel_wrapper(ElementUnaryMeta const *m, double const *input_ptr, @@ -329,6 +335,7 @@ template void float const *output_ptr, float const *output_grad_ptr, size_t num_elements); + template void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, double const *input_ptr, diff --git a/src/ops/fused.cu b/src/ops/fused.cu index c6ba0b04c5..77e12828a1 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -442,6 +442,11 @@ __host__ void FusedOp::forward_task(Task const *task, m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); + } else if (m->input_type == DT_B16) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_bfloat16_ptr(), + my_output_accessor[0].get_bfloat16_ptr()); } break; } @@ -850,6 +855,12 @@ __host__ void my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr(), my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_B16) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_bfloat16_ptr(), + my_output_accessor[0].get_bfloat16_ptr(), + my_input_accessor[0].domain.get_volume()); } else { assert(false && "Unsupported data type in ElementUnary forward"); } @@ -1076,6 +1087,11 @@ __host__ void m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); + } else if (m->input_type == DT_B16) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_bfloat16_ptr(), + my_output_accessor[0].get_bfloat16_ptr()); } break; } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 42933cee27..4e8f97650a 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -792,6 +792,12 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, m->weightSize, cudaMemcpyHostToDevice, stream); + } else if (data_type == DT_B16) { + cudaMemcpyAsync(m->weight_ptr, + weight.get_bfloat16_ptr(), + m->weightSize, + cudaMemcpyHostToDevice, + stream); } else { assert(false); } @@ -1185,6 +1191,22 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); + }else if (input.data_type == DT_B16) { + if (m->offload) { + pre_build_weight_kernel<__nv_bfloat16>(m, weight, input.data_type, stream); + } + __nv_bfloat16 const *bias_ptr = + use_bias ? 
bias.get_bfloat16_ptr() : static_cast<__nv_bfloat16 const *>(nullptr); + Kernels::IncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_bfloat16_ptr(), + m->offload ? static_cast<__nv_bfloat16 *>(m->weight_ptr) + : weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } @@ -1493,6 +1515,12 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( DataType data_type, cudaStream_t stream); +template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel<__nv_bfloat16>( + IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream); + template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -1513,6 +1541,16 @@ template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( int num_tokens, cudaStream_t stream); +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias<__nv_bfloat16>( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + __nv_bfloat16 *output_ptr, + __nv_bfloat16 const *weight_ptr, + __nv_bfloat16 const *bias_ptr, + int num_tokens, + cudaStream_t stream); + template void Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( IncMultiHeadSelfAttentionMeta const *m, @@ -1526,4 +1564,11 @@ template void BatchConfig const *bc, half *output_ptr, cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation<__nv_bfloat16>( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + __nv_bfloat16 *output_ptr, + cudaStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/kernels/decompress_kernels.cu b/src/ops/kernels/decompress_kernels.cu index 2e02ce1eec..2165967477 100644 --- a/src/ops/kernels/decompress_kernels.cu +++ b/src/ops/kernels/decompress_kernels.cu @@ -209,10 +209,14 @@ template __global__ void decompress_int4_general_weights( char const *input_weight_ptr, float *weight_ptr, int in_dim, int valueSize); template __global__ void decompress_int4_general_weights( char const *input_weight_ptr, half *weight_ptr, int in_dim, int valueSize); +template __global__ void decompress_int4_general_weights<__nv_bfloat16>( + char const *input_weight_ptr, __nv_bfloat16 *weight_ptr, int in_dim, int valueSize); template __global__ void decompress_int8_general_weights( char const *input_weight_ptr, float *weight_ptr, int in_dim, int valueSize); template __global__ void decompress_int8_general_weights( char const *input_weight_ptr, half *weight_ptr, int in_dim, int valueSize); +template __global__ void decompress_int8_general_weights<__nv_bfloat16>( + char const *input_weight_ptr, __nv_bfloat16 *weight_ptr, int in_dim, int valueSize); template __global__ void decompress_int4_attention_weights(char *input_weight_ptr, float *weight_ptr, @@ -226,6 +230,12 @@ template __global__ void int qProjSize, int qSize, int num_heads); +template __global__ void + decompress_int4_attention_weights<__nv_bfloat16>(char *input_weight_ptr, + __nv_bfloat16 *weight_ptr, + int qProjSize, + int qSize, + int num_heads); template __global__ void decompress_int8_attention_weights(char *input_weight_ptr, @@ -240,6 +250,13 @@ template __global__ void int qProjSize, int qSize, int num_heads); + +template __global__ void + decompress_int8_attention_weights<__nv_bfloat16>(char *input_weight_ptr, + __nv_bfloat16 *weight_ptr, + int qProjSize, + int qSize, 
+ int num_heads); // template // void decompress_weight_bias(T1 *input_weight_ptr, // T2 *weight_ptr, diff --git a/src/ops/kernels/element_binary_kernels.cu b/src/ops/kernels/element_binary_kernels.cu index 42b31a664a..26d9ea9cc9 100644 --- a/src/ops/kernels/element_binary_kernels.cu +++ b/src/ops/kernels/element_binary_kernels.cu @@ -105,6 +105,9 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, in2.get_float_ptr(), out.get_float_ptr(), stream); + } else if (out.data_type == DT_B16) { + Internal::forward_kernel( + m, in1.get_bfloat16_ptr(), in2.get_bfloat16_ptr(), out.get_bfloat16_ptr(), stream); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 22d8161ff1..8e40f4bf95 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -70,6 +70,16 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (weight.data_type == DT_B16) { + Internal::forward_kernel(input.get_int32_ptr(), + output.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -104,6 +114,16 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (weight.data_type == DT_B16) { + Internal::forward_kernel(input.get_int64_ptr(), + output.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -161,6 +181,16 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (m->output_type[0] == DT_B16) { + Internal::backward_kernel(input.get_int32_ptr(), + output.get_bfloat16_ptr(), + weight_grad.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -195,6 +225,16 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (m->output_type[0] == DT_B16) { + Internal::backward_kernel(input.get_int64_ptr(), + output.get_bfloat16_ptr(), + weight_grad.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -322,6 +362,50 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } +template <> +__global__ void + embed_backward_no_aggr(int const *input, + __nv_bfloat16 const *output, + __nv_bfloat16 *embed, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + int wordIdx = input[idx]; +#if __CUDA_ARCH__ >= 700 + atomicAdd(embed + wordIdx * out_dim + off, output[i]); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += output[i]; +#endif + } +} + +template <> +__global__ void + embed_backward_no_aggr(int64_t const *input, + __nv_bfloat16 const *output, + __nv_bfloat16 *embed, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + int64_t wordIdx = input[idx]; +#if __CUDA_ARCH__ >= 700 
+ atomicAdd(embed + wordIdx * out_dim + off, output[i]); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += output[i]; +#endif + } +} + template __global__ void embed_backward_with_aggr(TI const *input, TD const *output, @@ -416,6 +500,74 @@ __global__ void embed_backward_with_aggr(int64_t const *input, } } +template <> +__global__ void + embed_backward_with_aggr(int const *input, + __nv_bfloat16 const *output, + __nv_bfloat16 *embed, + int out_dim, + int in_dim, + int batch_size, + AggrMode aggr) { + __nv_bfloat16 scale = 1.0f / in_dim; + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + __nv_bfloat16 gradient; + if (aggr == AGGR_MODE_SUM) { + gradient = output[i]; + } else { + assert(aggr == AGGR_MODE_AVG); + gradient = output[i] * scale; + } + for (int j = 0; j < in_dim; j++) { + int wordIdx = input[idx * in_dim + j]; +#if __CUDA_ARCH__ >= 700 + atomicAdd(embed + wordIdx * out_dim + off, gradient); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += gradient; +#endif + } + } +} + +template <> +__global__ void embed_backward_with_aggr( + int64_t const *input, + __nv_bfloat16 const *output, + __nv_bfloat16 *embed, + int out_dim, + int in_dim, + int batch_size, + AggrMode aggr) { + __nv_bfloat16 scale = 1.0f / in_dim; + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + __nv_bfloat16 gradient; + if (aggr == AGGR_MODE_SUM) { + gradient = output[i]; + } else { + assert(aggr == AGGR_MODE_AVG); + gradient = output[i] * scale; + } + for (int j = 0; j < in_dim; j++) { + int64_t wordIdx = input[idx * in_dim + j]; +#if __CUDA_ARCH__ >= 700 + atomicAdd(embed + wordIdx * out_dim + off, gradient); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += gradient; +#endif + } + } +} + /*static*/ template void forward_kernel(TI const *input_ptr, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 9373c2fb2f..87a884cb99 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -58,6 +58,12 @@ LinearMeta::LinearMeta(FFHandler handler, min(CUDA_NUM_THREADS, parallelism), 0, stream>>>((half *)one_ptr, batch_size); + } else if (data_type == DT_B16) { + Kernels::Linear::Internal:: + build_one_ptr<<>>((__nv_bfloat16 *)one_ptr, batch_size); } // Allocate descriptors @@ -152,6 +158,16 @@ void forward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); + } else if (m->input_type[0] == DT_B16) { + Internal::forward_kernel<__nv_bfloat16>(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); } if (m->profiling) { @@ -216,6 +232,19 @@ void backward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); + } else if (m->input_type[0] == DT_B16) { + Internal::backward_kernel<__nv_bfloat16>(m, + input_ptr, + input_grad_ptr, + output_ptr, + output_grad_ptr, + kernel_ptr, + kernel_grad_ptr, + bias_grad_ptr, + in_dim, + out_dim, + batch_size, + stream); } if (m->profiling) { diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 17ac14449b..16a113c9cb 100644 --- 
a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -132,7 +132,7 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, using T_ACC = T; for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); output[index] = Y[index] * weights[index % N]; } @@ -204,6 +204,14 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, residual_output.get_float_ptr(), output.get_float_ptr(), stream); + } else if (output.data_type == DT_B16) { + forward_kernel(m, + input1.get_bfloat16_ptr(), + input2.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + residual_output.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + stream); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 7c9f4a9f98..67a0b156b5 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -137,9 +137,9 @@ __global__ void template __global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { using T_ACC = T; - const int64_t i = blockIdx.x; + int64_t const i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; Y[index] = static_cast(X[index]) * static_cast(rstd[i]); } } @@ -186,7 +186,7 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, using T_ACC = T; for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; Y[index] = static_cast(X[index]) * static_cast(rms[i]); output[index] = Y[index] * weights[index % N]; } @@ -246,6 +246,12 @@ void forward_kernel_wrapper(RMSNormMeta const *m, weight.get_float_ptr(), output.get_float_ptr(), stream); + } else if (output.data_type == DT_B16) { + forward_kernel(m, + input.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + stream); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index e47006cc9d..2418f457dd 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -105,7 +105,10 @@ template void forward_kernel_wrapper(SoftmaxMeta const *m, template void forward_kernel_wrapper(SoftmaxMeta const *m, half const *input_ptr, half *output_ptr); - +template void + forward_kernel_wrapper<__nv_bfloat16>(SoftmaxMeta const *m, + __nv_bfloat16 const *input_ptr, + __nv_bfloat16 *output_ptr); template void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, @@ -114,6 +117,12 @@ template void backward_kernel_wrapper(SoftmaxMeta const *m, half *input_grad_ptr, half const *output_grad_ptr, size_t num_elements); + +template void + backward_kernel_wrapper<__nv_bfloat16>(SoftmaxMeta const *m, + __nv_bfloat16 *input_grad_ptr, + __nv_bfloat16 const *output_grad_ptr, + size_t num_elements); namespace Internal { template void forward_kernel(SoftmaxMeta const *m, diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 44979c48fe..e239023768 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -273,7 +273,15 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, stream); - } else { + } else if (m->input_type[0] == DT_B16) { + LayerNorm::forward_kernel<__nv_bfloat16>( + m, + input.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + m->elementwise_affine ? gamma.get_bfloat16_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_bfloat16_ptr() : nullptr, + stream); + }else { assert(false && "unsupport datatype in layernorm"); } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 63b26bfe7d..8cdc392c96 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -33,7 +33,7 @@ using namespace FlexFlow::Kernels::Linear; static constexpr int KERNEL_IDX = 0; static constexpr int BIAS_IDX = 1; -Tensor FFModel::dense(const Tensor input, +Tensor FFModel::dense(Tensor const input, int outDim, ActiMode activation, bool use_bias, @@ -157,7 +157,7 @@ Op *Linear::create_operator_from_layer( Linear::Linear(FFModel &model, Linear const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights) : Linear(model, other.layer_guid, @@ -194,7 +194,7 @@ Linear::Linear(FFModel &model, Linear::Linear(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int out_dim, ActiMode _activation, RegularizerMode _kernel_reg_type, @@ -437,6 +437,14 @@ OpMeta *Linear::init_task(Task const *task, return init_task_with_dim( \ task, regions, ctx, runtime); \ } \ + } else if (output.data_type == DT_B16) { \ + if (linear->quantization_type != DT_NONE) { \ + return init_task_with_dim<__nv_bfloat16, char, DIM>( \ + task, regions, ctx, runtime); \ + } else { \ + return init_task_with_dim<__nv_bfloat16, __nv_bfloat16, DIM>( \ + task, regions, ctx, runtime); \ + } \ } else { \ assert(false && "Unsupported data type"); \ } @@ -704,6 +712,14 @@ void Linear::forward_task(Task const *task, return forward_task_with_dim( \ task, regions, ctx, runtime); \ } \ + } else if (m->output_type[0] == DT_B16) { \ + if (m->quantization_type != DT_NONE) { \ + return forward_task_with_dim<__nv_bfloat16, char, DIM>( \ + task, regions, ctx, runtime); \ + } else { \ + return forward_task_with_dim<__nv_bfloat16, __nv_bfloat16, DIM>( \ + task, regions, ctx, runtime); \ + } \ } else { \ assert(false && "Unsupported data type"); \ } @@ -859,6 +875,9 @@ void Linear::backward_task(Task const *task, return backward_task_with_dim(task, regions, ctx, runtime); \ } else if (m->output_type[0] == DT_FLOAT) { \ return backward_task_with_dim(task, regions, ctx, runtime); \ + } else if (m->output_type[0] == DT_B16) { \ + return backward_task_with_dim<__nv_bfloat16, DIM>( \ + task, regions, ctx, runtime); \ } else { \ assert(false && "Unsupported data type"); \ } @@ -1341,7 +1360,7 @@ bool LinearParams::is_valid(ParallelTensorShape const &input_shape) const { * It takes a the input tensor as a parameter, instead of the input's * ParallelTensorShape. */ -void LinearParams::solve_dims(const ParallelTensor input, +void LinearParams::solve_dims(ParallelTensor const input, ParallelDim output_dims[MAX_TENSOR_DIM], int *output_ndims, ParallelDim kernel_dims[MAX_TENSOR_DIM], diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index e5ebdce6ed..cbb7e241f3 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -225,7 +225,18 @@ void ResidualLayerNorm::inference_kernel_wrapper( m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, stream); - } else { + } else if (m->input_type[0] == DT_B16) { + ResidualLayerNorm::inference_kernel<__nv_bfloat16>( + m, + input.get_bfloat16_ptr(), + residual1.get_bfloat16_ptr(), + m->use_two_residuals ? residual2.get_bfloat16_ptr() : nullptr, + added_output.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + m->elementwise_affine ? gamma.get_bfloat16_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_bfloat16_ptr() : nullptr, + stream); + }else { assert(false && "unsupport datatype in layernorm"); } diff --git a/src/ops/sampling.cu b/src/ops/sampling.cu index 461d72ec71..d4044a9cc3 100644 --- a/src/ops/sampling.cu +++ b/src/ops/sampling.cu @@ -182,6 +182,14 @@ void Sampling::forward_kernel_wrapper(SamplingMeta const *m, length, batch_size, stream); + } else if (input.data_type == DT_B16) { + Sampling::forward_kernel<__nv_bfloat16>(m, + input.get_bfloat16_ptr(), + indices.get_int32_ptr(), + m->top_p, + length, + batch_size, + stream); } else { assert(false && "Unsupported data type"); } @@ -270,7 +278,22 @@ SamplingMeta::SamplingMeta(FFHandler handler, 0, // begin_bit data_type_size(data_type) * 8, // end_bit = sizeof(KeyT) * 8 stream)); - } else { + } else if (data_type == DT_B16) { + checkCUDA(cub::DeviceSegmentedRadixSort::SortPairsDescending( + d_temp_storage, + temp_storage_bytes, + input.get_bfloat16_ptr(), + input.get_bfloat16_ptr(), + idx, + idx, + total_ele, + batch_size, + begin_offset, + end_offset + 1, + 0, // begin_bit + data_type_size(data_type) * 8, // end_bit = sizeof(KeyT) * 8 + stream)); + }else { assert(false && "input type in float and half"); } diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 590b641b5a..6373aaaae5 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -80,6 +80,14 @@ void SigmoidSiluMulti::inference_kernel_wrapper( input1.get_half_ptr(), input2.get_half_ptr(), output.get_half_ptr()); + } else if (m->input_type[0] == DT_B16) { + SigmoidSiluMultiKernel<<>>(input1.domain.get_volume(), + input1.get_bfloat16_ptr(), + input2.get_bfloat16_ptr(), + output.get_bfloat16_ptr()); } else { assert(false && "unsupport datatype in SigmoidSiluMulti"); } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index ba0a1288d6..101eff9417 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -52,7 +52,7 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(const Tensor _input, +Tensor FFModel::softmax(Tensor const _input, int dim, DataType data_type, char const *name) { @@ -93,7 +93,7 @@ Op *Softmax::create_operator_from_layer( } Softmax::Softmax(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, int _dim, char const *name) : Op(model, @@ -117,7 +117,7 @@ Softmax::Softmax(FFModel &model, Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) : Softmax(model, input, params.dim, name) {} @@ -319,6 +319,9 @@ void Softmax::forward_task(Task const *task, forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); } else if (m->output_type == DT_FLOAT) { forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); + } else if (m->output_type == DT_B16) { + forward_kernel_wrapper( + m, input.get_bfloat16_ptr(), output.get_bfloat16_ptr()); } else { assert(false && "Unsupported data type"); } @@ -366,6 +369,9 @@ void Softmax::backward_task(Task const *task, return backward_task_with_dim(task, regions, 
ctx, runtime); \ } else if (m->output_type == DT_FLOAT) { \ return backward_task_with_dim(task, regions, ctx, runtime); \ + } else if (m->output_type == DT_B16) { \ + return backward_task_with_dim<__nv_bfloat16, DIM>( \ + task, regions, ctx, runtime); \ } else { \ assert(false && "Unsupported data type"); \ } @@ -429,6 +435,9 @@ void Softmax::inference_task(Task const *task, forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); } else if (m->output_type == DT_FLOAT) { forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); + } else if (m->output_type == DT_B16) { + forward_kernel_wrapper( + m, input.get_bfloat16_ptr(), output.get_bfloat16_ptr()); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 2d80ed2221..0b978c23a6 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -789,6 +789,18 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); + }else if (input.data_type == DT_B16) { + __nv_bfloat16 const *bias_ptr = + use_bias ? bias.get_bfloat16_ptr() : static_cast<__nv_bfloat16 const *>(nullptr); + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + m, + bc, + shard_id, + input.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index fc86e1498e..1863dde1e7 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -1003,6 +1003,25 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); + } else if (input.data_type == DT_B16) { + if (m->offload) { + pre_build_weight_kernel<__nv_bfloat16>( + m, weight, input.data_type, stream); + } + + __nv_bfloat16 const *bias_ptr = + use_bias ? bias.get_bfloat16_ptr() + : static_cast<__nv_bfloat16 const *>(nullptr); + Kernels::TreeIncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_bfloat16_ptr(), + m->offload ? 
static_cast<__nv_bfloat16 *>(m->weight_ptr) + : weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 7c266c5392..ffe57006fc 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -72,7 +72,7 @@ Combine::Combine(FFModel &model, name) {} Combine::Combine(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, int _combine_legion_dim, int _combine_degree, char const *name) @@ -365,6 +365,8 @@ void Combine::forward_task(Task const *task, DataType data_type = m->input_type[0]; if (data_type == DT_HALF) { forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_B16) { + forward_task_with_type<__nv_bfloat16>(task, regions, ctx, runtime); } else if (data_type == DT_FLOAT) { forward_task_with_type(task, regions, ctx, runtime); } else if (data_type == DT_DOUBLE) { diff --git a/src/parallel_ops/kernels/combine_kernels.cu b/src/parallel_ops/kernels/combine_kernels.cu index 1ab79a7944..ca9451ed64 100644 --- a/src/parallel_ops/kernels/combine_kernels.cu +++ b/src/parallel_ops/kernels/combine_kernels.cu @@ -50,6 +50,11 @@ template void forward_kernel(half const *input_ptr, template void forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements); + +template void forward_kernel<__nv_bfloat16>(__nv_bfloat16 const *input_ptr, + __nv_bfloat16 *output_ptr, + size_t num_elements); + template void forward_kernel(double const *input_ptr, double *output_ptr, size_t num_elements); diff --git a/src/parallel_ops/kernels/reduction_kernels.cu b/src/parallel_ops/kernels/reduction_kernels.cu index 34ae8007da..67547c6d17 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cu +++ b/src/parallel_ops/kernels/reduction_kernels.cu @@ -71,6 +71,13 @@ template __global__ void reduction_forward_kernel(half const *input_ptr, half *output_ptr, size_t num_elements, size_t num_replicas); + +template __global__ void + reduction_forward_kernel<__nv_bfloat16>(__nv_bfloat16 const *input_ptr, + __nv_bfloat16 *output_ptr, + size_t num_elements, + size_t num_replicas); + template void forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements, @@ -79,6 +86,12 @@ template void forward_kernel(half const *input_ptr, half *output_ptr, size_t num_elements, size_t num_replicas); + +template void forward_kernel<__nv_bfloat16>(__nv_bfloat16 const *input_ptr, + __nv_bfloat16 *output_ptr, + size_t num_elements, + size_t num_replicas); + template void backward_kernel(float const *output_grad_ptr, float *input_grad_ptr, size_t num_elements); diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index 35bc109bd3..573de9ae2e 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -66,6 +66,9 @@ template void forward_kernel(float const *input_ptr, template void forward_kernel(half const *input_ptr, half *output_ptr, size_t num_elements); +template void forward_kernel<__nv_bfloat16>(__nv_bfloat16 const *input_ptr, + __nv_bfloat16 *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(float const *input_ptr, float *output_ptr, diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index 5dca591328..d53cae330b 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -60,7 +60,7 @@ ReductionParams 
Reduction::get_params() const { } Reduction::Reduction(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, int _reduction_legion_dim, int _reduction_degree, char const *name) @@ -380,6 +380,11 @@ void Reduction::forward_task(Task const *task, output.get_float_ptr(), num_elements, num_replicas); + } else if (input.data_type == DT_B16) { + forward_kernel<__nv_bfloat16>(input.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + num_elements, + num_replicas); } else { assert(false && "Unspported data type"); } diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 20face74e8..d9f7b2263b 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -59,7 +59,7 @@ ReplicateParams Replicate::get_params() const { } Replicate::Replicate(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, int _replicate_legion_dim, int _replicate_degree, char const *name) @@ -373,6 +373,10 @@ void Replicate::forward_task(Task const *task, forward_kernel(input.get_float_ptr(), output.get_float_ptr(), input_domain.get_volume()); + } else if (input.data_type == DT_B16) { + forward_kernel<__nv_bfloat16>(input.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + input_domain.get_volume()); } else { assert(false && "Unspported data type"); } diff --git a/src/runtime/accessor.cc b/src/runtime/accessor.cc index d3b94bf14a..9bb6c08eea 100644 --- a/src/runtime/accessor.cc +++ b/src/runtime/accessor.cc @@ -77,6 +77,15 @@ half const *GenericTensorAccessorR::get_half_ptr() const { } } +__nv_bfloat16 const *GenericTensorAccessorR::get_bfloat16_ptr() const { + if (data_type == DT_B16) { + return static_cast<__nv_bfloat16 const *>(ptr); + } else { + assert(false && "Invalid Accessor Type"); + return static_cast<__nv_bfloat16 const *>(nullptr); + } +} + char const *GenericTensorAccessorR::get_byte_ptr() const { if (data_type == DT_INT4 || data_type == DT_INT8) { return static_cast(ptr); @@ -165,6 +174,15 @@ half *GenericTensorAccessorW::get_half_ptr() const { } } +__nv_bfloat16 *GenericTensorAccessorW::get_bfloat16_ptr() const { + if (data_type == DT_B16) { + return static_cast<__nv_bfloat16 *>(ptr); + } else { + assert(false && "Invalid Accessor Type"); + return static_cast<__nv_bfloat16 *>(nullptr); + } +} + char *GenericTensorAccessorW::get_byte_ptr() const { if (data_type == DT_INT4 || data_type == DT_INT8) { return static_cast(ptr); @@ -271,6 +289,11 @@ GenericTensorAccessorR ptr = helperGetTensorPointerRO(region, req, fid, ctx, runtime); break; } + case DT_B16: { + ptr = helperGetTensorPointerRO<__nv_bfloat16>( + region, req, fid, ctx, runtime); + break; + } case DT_FLOAT: { ptr = helperGetTensorPointerRO(region, req, fid, ctx, runtime); break; @@ -317,6 +340,11 @@ GenericTensorAccessorW ptr = helperGetTensorPointerWO(region, req, fid, ctx, runtime); break; } + case DT_B16: { + ptr = helperGetTensorPointerWO<__nv_bfloat16>( + region, req, fid, ctx, runtime); + break; + } case DT_FLOAT: { ptr = helperGetTensorPointerWO(region, req, fid, ctx, runtime); break; @@ -363,6 +391,11 @@ GenericTensorAccessorW ptr = helperGetTensorPointerRW(region, req, fid, ctx, runtime); break; } + case DT_B16: { + ptr = helperGetTensorPointerRW<__nv_bfloat16>( + region, req, fid, ctx, runtime); + break; + } case DT_FLOAT: { ptr = helperGetTensorPointerRW(region, req, fid, ctx, runtime); break; diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index fa6bf55fe5..91d139a414 100644 --- a/src/runtime/cuda_helper.cu +++ 
b/src/runtime/cuda_helper.cu @@ -320,6 +320,33 @@ __host__ void checkCUDA(cudaFreeHost(host_ptr)); } +template <> +__host__ void save_tensor(__nv_bfloat16 const *ptr, + size_t num_elements, + char const *file_name) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + __nv_bfloat16 *host_ptr; + checkCUDA(cudaHostAlloc(&host_ptr, + sizeof(__nv_bfloat16) * num_elements, + cudaHostAllocPortable | cudaHostAllocMapped)); + checkCUDA(cudaMemcpyAsync(host_ptr, + ptr, + sizeof(__nv_bfloat16) * num_elements, + cudaMemcpyDeviceToHost, + stream)); + checkCUDA(cudaDeviceSynchronize()); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + assert(tensor_file != NULL); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%.9f, ", (float)host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(cudaFreeHost(host_ptr)); +} + template <> __host__ void save_tensor(int32_t const *ptr, size_t num_elements, @@ -519,6 +546,8 @@ cudnnDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { case DT_HALF: return CUDNN_DATA_HALF; + case DT_B16: + return CUDNN_DATA_BFLOAT16; case DT_FLOAT: return CUDNN_DATA_FLOAT; case DT_DOUBLE: @@ -535,6 +564,8 @@ cudaDataType_t ff_to_cuda_datatype(DataType type) { switch (type) { case DT_HALF: return CUDA_R_16F; + case DT_B16: + return CUDA_R_16BF; case DT_FLOAT: return CUDA_R_32F; case DT_DOUBLE: @@ -552,6 +583,8 @@ ncclDataType_t ff_to_nccl_datatype(DataType type) { switch (type) { case DT_HALF: return ncclHalf; + case DT_B16: + return ncclBfloat16; case DT_FLOAT: return ncclFloat; case DT_DOUBLE: @@ -595,6 +628,9 @@ cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type) { template __global__ void assign_kernel(half *ptr, coord_t size, half value); +template __global__ void assign_kernel<__nv_bfloat16>(__nv_bfloat16 *ptr, + coord_t size, + __nv_bfloat16 value); template __global__ void assign_kernel(float *ptr, coord_t size, float value); template __global__ void @@ -700,6 +736,10 @@ template __host__ void save_tensor(int64_t const *ptr, template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); +template __host__ void save_tensor<__nv_bfloat16>(__nv_bfloat16 const *ptr, + size_t rect, + char const *file_name); + template __host__ float *download_tensor(float const *ptr, size_t num_elements); template __host__ half *download_tensor(half const *ptr, diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index c7b6e1257a..764c938dc3 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -213,6 +213,8 @@ size_t data_type_size(DataType type) { switch (type) { case DT_HALF: return sizeof(half); + case DT_B16: + return sizeof(__nv_bfloat16); case DT_FLOAT: return sizeof(float); case DT_DOUBLE: diff --git a/src/runtime/initializer_kernel.cu b/src/runtime/initializer_kernel.cu index b6629ec90b..b6b9e29b8b 100644 --- a/src/runtime/initializer_kernel.cu +++ b/src/runtime/initializer_kernel.cu @@ -235,6 +235,12 @@ void ZeroInitializer::init_task(Task const *task, assign_kernel <<>>( w, domain.get_volume(), 0.0f); + } else if (meta->data_types[i] == DT_B16) { + __nv_bfloat16 *w = helperGetTensorPointerWO<__nv_bfloat16>( + regions[i], task->regions[i], FID_DATA, ctx, runtime); + assign_kernel<__nv_bfloat16> + <<>>( + w, domain.get_volume(), 0.0f); } else if (meta->data_types[i] == DT_FLOAT) { float *w = helperGetTensorPointerWO( regions[i], task->regions[i], FID_DATA, ctx, runtime); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 76bed36bda..f31be1bcdb 
100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1947,6 +1947,9 @@ void FFModel::map_tensor_with_dim2(ParallelTensor tensor, case DT_HALF: allocator.allocate_field(sizeof(half), FID_DATA); break; + case DT_B16: + allocator.allocate_field(sizeof(__nv_bfloat16), FID_DATA); + break; case DT_FLOAT: allocator.allocate_field(sizeof(float), FID_DATA); break; diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index 0b3813f41c..6349bfd1a5 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -62,6 +62,10 @@ void Op::save_inference_tensors_to_file( save_tensor(input_tensors[i].get_half_ptr(), input_tensors[i].domain.get_volume(), filename.c_str()); + } else if (input_tensors[i].data_type == DT_B16) { + save_tensor(input_tensors[i].get_bfloat16_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); } else if (input_tensors[i].data_type == DT_INT32) { save_tensor(input_tensors[i].get_int32_ptr(), input_tensors[i].domain.get_volume(), @@ -86,6 +90,10 @@ void Op::save_inference_tensors_to_file( save_tensor(weight_tensors[i].get_half_ptr(), weight_tensors[i].domain.get_volume(), filename.c_str()); + } else if (weight_tensors[i].data_type == DT_B16) { + save_tensor(weight_tensors[i].get_bfloat16_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); } else if (weight_tensors[i].data_type == DT_INT32) { save_tensor(weight_tensors[i].get_int32_ptr(), weight_tensors[i].domain.get_volume(), @@ -110,6 +118,10 @@ void Op::save_inference_tensors_to_file( save_tensor(output_tensors[i].get_half_ptr(), output_tensors[i].domain.get_volume(), filename.c_str()); + } else if (output_tensors[i].data_type == DT_B16) { + save_tensor(output_tensors[i].get_bfloat16_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); } else if (output_tensors[i].data_type == DT_INT32) { save_tensor(output_tensors[i].get_int32_ptr(), output_tensors[i].domain.get_volume(), From 73569cb9dcb26e63b6a71f3d3d655f5238538465 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 7 Jan 2024 00:48:40 -0500 Subject: [PATCH 02/11] . 
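This commit adds bfloat16 loading paths to the inference FileLoader (load_from_file_b16, load_attention_weights_v2_b16, load_attention_bias_v2_b16). The weight files on disk stay in float32; the bf16 path reads them as float and narrows each value with __float2bfloat16 while copying into the destination buffer. A minimal host-side sketch of that conversion pattern follows (hypothetical names, not the PR's exact code; assumes cuda_bf16.h from CUDA 11+ is on the include path):

    #include <cuda_bf16.h>   // __nv_bfloat16, __float2bfloat16
    #include <cassert>
    #include <cstddef>
    #include <fstream>
    #include <string>
    #include <vector>

    // Read `count` float32 values from `path` and store them as bfloat16 in `dst`.
    void load_bf16_weights(__nv_bfloat16 *dst, size_t count, std::string const &path) {
      std::ifstream in(path, std::ios::in | std::ios::binary);
      assert(in.good() && "incorrect weight file path");
      std::vector<float> host(count);
      in.read(reinterpret_cast<char *>(host.data()), sizeof(float) * count);
      assert(static_cast<size_t>(in.gcount()) == sizeof(float) * count);
      for (size_t i = 0; i < count; ++i) {
        dst[i] = __float2bfloat16(host[i]); // fp32 -> bf16 narrowing conversion
      }
    }

The attention weight and bias loaders below follow the same fp32-on-disk, bf16-in-memory convention, converting element by element per head/partition.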
--- inference/file_loader.cc | 336 ++++++++++++++++++++++++++++++++++++++- inference/file_loader.h | 9 +- 2 files changed, 342 insertions(+), 3 deletions(-) diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 7c6870d439..9666a60358 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -125,6 +125,335 @@ void load_attention_weights_multi_query(DT *ptr, } } +///////////////////////bfloat16 function/////////////////////// + +void load_from_file_b16(__nv_bfloat16 *ptr, size_t size, std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector host_array(size); + size_t loaded_data_size = sizeof(float) * size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != loaded_data_size) { + std::cout << "load weight data error " << in_get_size << ", " + << loaded_data_size << ", " << sizeof(float) << ", file path: " << filepath << std::endl; + assert(false); + } + assert(size == host_array.size()); + + // normal + long data_index = 0; + for (auto v : host_array) { + ptr[data_index++] = __float2bfloat16(v); + } + in.close(); +} + +void load_attention_weights_v2_b16(__nv_bfloat16 *ptr, + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weights_folder, + size_t volume, + int tensor_parallelism_degree) { + // layers_0_attention_wq_weight + // layers_0_self_attn_q_proj_weight + std::string q_file = layer_name + "_wq_weight"; + std::string k_file = layer_name + "_wk_weight"; + std::string v_file = layer_name + "_wv_weight"; + std::string o_file = layer_name + "_wo_weight"; + std::vector weight_filenames = {q_file, k_file, v_file}; + int file_index = 0; + + int base_index = 0; + size_t single_proj_size = + hidden_dim * + qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + size_t one_weight_file_size = + num_heads * single_proj_size; // size of each of Q/K/V/O for all heads + + size_t q_size = one_weight_file_size, o_size = one_weight_file_size; + size_t k_size = single_proj_size * num_kv_heads, + v_size = single_proj_size * num_kv_heads; + + size_t k_replicate_size = one_weight_file_size; + size_t v_replicate_size = one_weight_file_size; + + int replicate_num = num_heads / num_kv_heads; + + // stride for q, k, v, o + size_t stride_size = (q_size + v_replicate_size + k_replicate_size + o_size) / + tensor_parallelism_degree; + for (auto filename : weight_filenames) { + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + + int data_index = 0; + size_t partial_size = (file_index == 0 || file_index == 3) + ? 
one_weight_file_size + : single_proj_size * num_kv_heads; + size_t one_partition_size = + one_weight_file_size / tensor_parallelism_degree; + + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << weight_filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector host_array(partial_size); + size_t loaded_data_size = sizeof(float) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load attention data error " << in_get_size << ", " + << loaded_data_size << ", " << file_index << ", " + << weight_filepath << "\n"; + assert(false && "data size mismatch"); + } + // wq, wk, wo + if (file_index == 0) { + for (int i = 0; i < tensor_parallelism_degree; i++) { + for (int j = 0; j < one_partition_size; j++) { + ptr[base_index + i * stride_size + j] = + __float2bfloat16(host_array.at(data_index++)); + } + } + } else { + for (int i = 0; i < num_heads; i++) { + int kv_idx = i / (num_heads / num_kv_heads); + int head_idx = i % (num_heads / tensor_parallelism_degree); + int tp_idx = (i / (num_heads / tensor_parallelism_degree)); + for (int j = 0; j < single_proj_size; j++) { + ptr[base_index + tp_idx * stride_size + single_proj_size * head_idx + + j] = + __float2bfloat16(host_array.at(kv_idx * single_proj_size + j)); + } + } + } + + // assert(data_index == partial_size); + base_index += one_partition_size; + file_index++; + } + assert(base_index == (q_size + k_replicate_size + v_replicate_size) / + tensor_parallelism_degree); + + { + std::cout << "Loading weight file " << o_file << std::endl; + std::string weight_filepath = join_path({weights_folder, o_file}); + + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << weight_filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector host_array(one_weight_file_size); + size_t loaded_data_size = sizeof(float) * one_weight_file_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load data error" << std::endl; + assert(false); + } + assert(one_weight_file_size == host_array.size()); + int data_index = 0; + + int one_partition_size = + qkv_inner_dim * (num_heads / tensor_parallelism_degree); + for (int i = 0; i < one_weight_file_size; i++) { + int part_idx = (i / one_partition_size) % tensor_parallelism_degree; + int block_num = (i / one_partition_size); + int offset = block_num / tensor_parallelism_degree * one_partition_size + + (i % one_partition_size); + ptr[base_index + part_idx * stride_size + offset] = + __float2bfloat16(host_array.at(data_index++)); + } + + in.close(); + + assert(data_index == one_weight_file_size); + } +} + +void load_attention_bias_v2_b16(__nv_bfloat16 *ptr, + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + bool final_bias, + std::string layer_name, + std::string weights_folder) { + std::string q_file = layer_name + "_wq_bias"; + std::string k_file = layer_name + "_wk_bias"; + std::string v_file = layer_name + "_wv_bias"; + std::vector bias_files = {q_file, k_file, v_file}; + if (final_bias) { + std::string o_file = layer_name + "_wo_bias"; + bias_files.push_back(o_file); + } + 
+ int file_index = 0; + + // now only opt use this. + // assert(num_heads == num_kv_heads); + int idx = 0; + + for (auto filename : bias_files) { + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + + int n_heads = file_index == 0 ? num_heads : num_kv_heads; + + int replicate_num = num_heads / num_kv_heads; + + size_t qkv_partial_size = qkv_inner_dim * n_heads; + size_t qkv_replicate_size = qkv_inner_dim * num_heads; + size_t out_partial_size = hidden_dim; + size_t partial_size = + (file_index < 3) ? qkv_partial_size : out_partial_size; + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + assert(in.good() && "incorrect bias file path"); + std::vector host_array(partial_size); + size_t loaded_data_size = sizeof(float) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + printf( + "load bias data error: in_get_size (%lu) != loaded_data_size (%lu)\n", + in_get_size, + loaded_data_size); + assert(false); + } + assert(partial_size == host_array.size()); + + size_t data_index = 0; + + // q, o + if (file_index == 0 || file_index == 3) { + for (int i = 0; i < partial_size; i++) { + ptr[idx + i] = __float2bfloat16(host_array.at(data_index)); + data_index++; + } + } else { + // k, v + for (int i = 0; i < partial_size; i++) { + for (int j = 0; j < replicate_num; j++) { + ptr[idx + j * partial_size + i] = + __float2bfloat16(host_array.at(data_index)); + } + data_index++; + } + } + + file_index++; + idx += qkv_replicate_size; + + in.close(); + } +} + +void FileDataLoader::load_single_weight_tensor_b16(FFModel *ff, + Layer *l, + int weight_idx) { + Tensor weight = l->weights[weight_idx]; + + + // Create a buffer to store weight data from the file + size_t volume = 1; + std::vector dims_vec; + for (int i = 0; i < weight->num_dims; i++) { + dims_vec.push_back(weight->dims[i]); + volume *= weight->dims[i]; + } + assert(data_type_size(weight->data_type) == sizeof(__nv_bfloat16)); + __nv_bfloat16 *data = (__nv_bfloat16 *)malloc(sizeof(__nv_bfloat16) * volume); + + std::string weight_filename = removeGuidOperatorName(std::string(l->name)); + + if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { + if (weight_filename.find("self_attention") != std::string::npos) { + load_attention_weights_multi_query( + data, weight_filename, weights_folder, hidden_dim, num_heads); + } else if (weight_filename.find("attention") != std::string::npos && + weight_filename.rfind("attention") == + weight_filename.length() - strlen("attention")) { + if (weight_idx == 0) { + load_attention_weights_v2_b16(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); + } else { + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2_b16(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + final_bias, + weight_filename, + weights_folder); + } + + } else { + assert(false); + } + } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { + assert(weight_idx >= 0 || weight_idx <= 2); + weight_filename += (weight_idx == 0) + ? "_attn_bias" + : ((weight_idx == 1) ? 
"_weight" : "_bias"); + std::cout << "Loading weight file " << weight_filename << std::endl; + std::string weight_filepath = join_path({weights_folder, weight_filename}); + load_from_file_b16(data, volume, weight_filepath); + } else { + // default op + assert(weight_idx == 0 || weight_idx == 1); + // handle exception + if (weight_filename != "embed_tokens_weight_lm_head") { + weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + } + std::cout << "Loading weight file " << weight_filename << std::endl; + std::string weight_filepath = join_path({weights_folder, weight_filename}); + load_from_file_b16(data, volume, weight_filepath); + } + + // Copy the weight data from the buffer to the weight's ParallelTensor + ParallelTensor weight_pt; + ff->get_parallel_tensor_from_tensor(weight, weight_pt); + weight_pt->set_tensor<__nv_bfloat16>(ff, dims_vec, data); + + // Free buffer memory + delete data; +} + +///////////////////////////bdlot16 functions//////////// + template void load_attention_bias_v2(DT *ptr, int num_heads, @@ -339,7 +668,9 @@ void load_attention_weights_v2(DT *ptr, } template -void load_from_file(DT *ptr, size_t size, std::string filepath) { +void FileDataLoader::load_from_file(DT *ptr, + size_t size, + std::string filepath) { std::ifstream in(filepath, std::ios::in | std::ios::binary); if (!in.good()) { std::cout << "Could not open file: " << filepath << std::endl; @@ -806,6 +1137,9 @@ void FileDataLoader::load_weights(FFModel *ff, bool use_full_precision) { case DT_FLOAT: load_single_weight_tensor(ff, l, i); break; + case DT_B16: + load_single_weight_tensor_b16(ff, l, i); + break; case DT_INT4: case DT_INT8: // load weights in quantization diff --git a/inference/file_loader.h b/inference/file_loader.h index 6f01a79b80..579c6d4239 100644 --- a/inference/file_loader.h +++ b/inference/file_loader.h @@ -37,6 +37,8 @@ class FileDataLoader { template void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); + void load_single_weight_tensor_b16(FFModel *ff, Layer *l, int weight_idx); + void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx, @@ -48,9 +50,12 @@ class FileDataLoader { ParallelTensor position_pt, int max_seq_length, int offset); + template + void load_from_file(DT *ptr, size_t size, std::string filepath); -private: - int num_heads, num_kv_heads, tensor_parallelism_degree; + private : int num_heads, + num_kv_heads, + tensor_parallelism_degree; size_t hidden_dim, qkv_inner_dim; std::string prompts_filepath; std::string weights_folder; From 0f8337e41282cab284418bef06f9100cdd1010f3 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 7 Jan 2024 14:53:26 -0500 Subject: [PATCH 03/11] fix --- inference/file_loader.cc | 7 ++- inference/incr_decoding/incr_decoding.cc | 31 ++++++---- inference/models/falcon.cc | 6 +- inference/models/falcon.h | 2 +- inference/models/llama.cc | 2 +- inference/models/mpt.cc | 6 +- inference/models/mpt.h | 2 +- inference/models/opt.cc | 8 +-- inference/models/opt.h | 2 +- inference/models/starcoder.cc | 8 +-- inference/models/starcoder.h | 2 +- inference/spec_infer/spec_infer.cc | 52 ++++++++++------ src/ops/inc_multihead_self_attention.cu | 63 ++++++++++++-------- src/ops/kernels/linear_kernels.cu | 42 ++++++------- src/ops/spec_inc_multihead_self_attention.cu | 4 +- src/runtime/parallel_tensor.cc | 6 ++ 16 files changed, 145 insertions(+), 98 deletions(-) diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 9666a60358..9f9bce56b7 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ 
-142,7 +142,8 @@ void load_from_file_b16(__nv_bfloat16 *ptr, size_t size, std::string filepath) { size_t in_get_size = in.gcount(); if (in_get_size != loaded_data_size) { std::cout << "load weight data error " << in_get_size << ", " - << loaded_data_size << ", " << sizeof(float) << ", file path: " << filepath << std::endl; + << loaded_data_size << ", " << sizeof(float) + << ", file path: " << filepath << std::endl; assert(false); } assert(size == host_array.size()); @@ -373,7 +374,6 @@ void FileDataLoader::load_single_weight_tensor_b16(FFModel *ff, Layer *l, int weight_idx) { Tensor weight = l->weights[weight_idx]; - // Create a buffer to store weight data from the file size_t volume = 1; @@ -685,7 +685,8 @@ void FileDataLoader::load_from_file(DT *ptr, size_t in_get_size = in.gcount(); if (in_get_size != loaded_data_size) { std::cout << "load weight data error " << in_get_size << ", " - << loaded_data_size << ", " << sizeof(DT) << std::endl; + << loaded_data_size << ", " << sizeof(DT) + << ", filepath: " << filepath << std::endl; assert(false); } assert(size == host_array.size()); diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 61652b3ae3..e11eb14113 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -41,6 +41,7 @@ void parse_input_args(char **argv, std::string &llm_model_name, DataType &data_type, bool &use_full_precision, + bool &use_bfloat16_precision, bool &verbose, bool &do_sample, float &temperature, @@ -73,9 +74,15 @@ void parse_input_args(char **argv, continue; } if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; data_type = DT_FLOAT; continue; } + if (!strcmp(argv[i], "--use-bfloat16-precision")) { + use_bfloat16_precision = true; + data_type = DT_B16; + continue; + } // verbose logging to stdout if (!strcmp(argv[i], "--verbose")) { verbose = true; @@ -126,8 +133,9 @@ void FlexFlow::top_level_task(Task const *task, } FilePaths file_paths; std::string llm_model_name; - DataType data_type = DT_B16; + DataType data_type = DT_HALF; bool use_full_precision = false; + bool use_bfloat16_precision = false; bool verbose = false; bool do_sample = false; float temperature = 0.0f; @@ -145,6 +153,7 @@ void FlexFlow::top_level_task(Task const *task, llm_model_name, data_type, use_full_precision, + use_bfloat16_precision, verbose, do_sample, temperature, @@ -161,11 +170,13 @@ void FlexFlow::top_level_task(Task const *task, {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); std::string tokenizer_filepath = join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); - std::string weights_filepath = - join_path({file_paths.cache_folder_path, - "weights", - llm_model_name, - use_full_precision ? "full-precision" : "half-precision"}); + std::string weights_filepath = join_path( + {file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision + ? "full-precision" + : (use_bfloat16_precision ? "full-precision" : "half-precision")}); std::ifstream config_file_handle(config_filepath); if (!config_file_handle.good()) { std::cout << "Model config file " << config_filepath << " not found." 
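With the flag handling above, both --use-full-precision (DT_FLOAT) and --use-bfloat16-precision (DT_B16) resolve to the "full-precision" weights folder, since the bfloat16 path down-converts the float32 files at load time; only the default DT_HALF path reads "half-precision". A hypothetical helper that captures just that folder selection (not code from this patch) might look like:

#include "flexflow/ffconst.h"
#include <string>

// Illustrative only: bfloat16 shares the fp32 weight files and converts on load.
std::string weights_subfolder(DataType dt) {
  switch (dt) {
    case DT_FLOAT:
    case DT_B16:
      return "full-precision";
    default:
      return "half-precision";
  }
}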
@@ -228,27 +239,27 @@ void FlexFlow::top_level_task(Task const *task, config_filepath, weights_filepath, INC_DECODING_MODE, - use_full_precision); + data_type); } else if (model_type == ModelType::FALCON) { FALCON::create_falcon_model(model, config_filepath, weights_filepath, INC_DECODING_MODE, - use_full_precision); + data_type); } else if (model_type == ModelType::STARCODER) { STARCODER::create_starcoder_model(model, config_filepath, weights_filepath, INC_DECODING_MODE, generationConfig, - use_full_precision); + data_type); } else if (model_type == ModelType::MPT) { MPT::create_mpt_model(model, config_filepath, weights_filepath, INC_DECODING_MODE, generationConfig, - use_full_precision); + data_type); } else { assert(false && "unknow model type"); } diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 999ca37037..998abd7ae6 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -24,7 +24,7 @@ void FALCON::create_falcon_model(FFModel &ff, std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - bool use_full_precision) { + DataType data_type) { FalconConfig falcon_config(model_config_file_path); falcon_config.print(); @@ -53,7 +53,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.vocab_size, falcon_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "word_embeddings"); @@ -251,7 +251,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.hidden_size / falcon_config.n_head, ff.config.tensor_parallelism_degree); std::cout << "------load weights ----------" << std::endl; - fileloader.load_weights(&ff, use_full_precision); + fileloader.load_weights(&ff, false); std::cout << "------load weight finished----------" << std::endl; // init operators diff --git a/inference/models/falcon.h b/inference/models/falcon.h index 01226a30dc..b87d88fd0f 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -94,7 +94,7 @@ class FALCON { std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - bool use_full_precision = false); + DataType DataType); }; }; // namespace FlexFlow diff --git a/inference/models/llama.cc b/inference/models/llama.cc index f4655b1750..40075bcdbf 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -276,7 +276,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, data_type); + fileloader.load_weights(&ff, false); std::cout << "------load weight finished----------" << std::endl; // init operators diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 3df67b264c..74a275699c 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -25,7 +25,7 @@ void MPT::create_mpt_model(FFModel &ff, std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - bool use_full_precision) { + DataType data_type) { MPTConfig mpt_config(model_config_file_path); mpt_config.print(); @@ -54,7 +54,7 @@ void MPT::create_mpt_model(FFModel &ff, mpt_config.vocab_size, mpt_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? 
DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "transformer_wte"); @@ -257,7 +257,7 @@ void MPT::create_mpt_model(FFModel &ff, mpt_config.hidden_size, mpt_config.hidden_size / mpt_config.n_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); + fileloader.load_weights(&ff, false); im->init_operators_inference(&ff); } diff --git a/inference/models/mpt.h b/inference/models/mpt.h index 1969cd9c89..24c2ab9999 100644 --- a/inference/models/mpt.h +++ b/inference/models/mpt.h @@ -70,7 +70,7 @@ class MPT { std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - bool use_full_precision = false); + DataType DataType); }; }; // namespace FlexFlow diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 0279f83239..3c771fb005 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -24,7 +24,7 @@ void OPT::create_opt_model(FFModel &ff, std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - bool use_full_precision) { + DataType data_type) { OPTConfig opt_config(model_config_file_path); opt_config.print(); @@ -57,7 +57,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.vocab_size, opt_config.word_embed_proj_dim, AGGR_MODE_NONE, - use_full_precision ? DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "embed_tokens"); @@ -67,7 +67,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.max_position_embeddings, opt_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "embed_positions"); @@ -267,7 +267,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.hidden_size / opt_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); + fileloader.load_weights(&ff, false); std::cout << "------finished loading weights----------" << std::endl; im->init_operators_inference(&ff); } diff --git a/inference/models/opt.h b/inference/models/opt.h index 1ffe096bca..365cc3d512 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -97,7 +97,7 @@ class OPT { std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - bool use_full_precision = false); + DataType DataType); }; }; // namespace FlexFlow diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index e683376e47..5714ca1369 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -26,7 +26,7 @@ void STARCODER::create_starcoder_model( std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - bool use_full_precision) { + DataType data_type) { // do not apply cpu offload in beam search model. STARCODERConfig startcoder_config(model_config_file_path); startcoder_config.print(); @@ -62,7 +62,7 @@ void STARCODER::create_starcoder_model( startcoder_config.vocab_size, startcoder_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "transformer_wte"); @@ -72,7 +72,7 @@ void STARCODER::create_starcoder_model( startcoder_config.max_position_embeddings, startcoder_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? 
DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "transformer_wpe"); @@ -232,7 +232,7 @@ void STARCODER::create_starcoder_model( startcoder_config.hidden_size / startcoder_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); + fileloader.load_weights(&ff, false); std::cout << "------load weight finished----------" << std::endl; // init operators diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h index bc113e4d52..c37b7af5af 100644 --- a/inference/models/starcoder.h +++ b/inference/models/starcoder.h @@ -71,7 +71,7 @@ class STARCODER { std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - bool use_full_precision = false); + DataType DataType); }; }; // namespace FlexFlow diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 8d3d23ae4e..585a4a7ee7 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -57,7 +57,9 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, ModelNames &model_names, + DataType &data_type, bool &use_full_precision, + bool &use_bfloat16_precision, bool &verbose, int &max_requests_per_batch, int &max_tokens_per_batch, @@ -99,6 +101,10 @@ void parse_input_args(char **argv, use_full_precision = true; continue; } + if (!strcmp(argv[i], "--use-bfloat16-precision")) { + use_bfloat16_precision = true; + data_type = DT_B16; + } // verbose logging to stdout if (!strcmp(argv[i], "--verbose")) { verbose = true; @@ -129,7 +135,8 @@ void parse_input_args(char **argv, void get_model_meta(FilePaths &file_paths, ModelMeta &model_metadata, - bool use_full_precision) { + bool use_full_precision, + bool use_bfloat16_precision) { if (model_metadata.model_names.llm_model_name.empty() || model_metadata.model_names.ssm_model_names.size() == 0) { assert(false && "SpecInfer needs at least one LLM and one SSM for " @@ -144,11 +151,13 @@ void get_model_meta(FilePaths &file_paths, join_path({file_paths.cache_folder_path, "tokenizers", model_metadata.model_names.llm_model_name}); - model_metadata.llm_weights_path = - join_path({file_paths.cache_folder_path, - "weights", - model_metadata.model_names.llm_model_name, - use_full_precision ? "full-precision" : "half-precision"}); + model_metadata.llm_weights_path = join_path( + {file_paths.cache_folder_path, + "weights", + model_metadata.model_names.llm_model_name, + use_full_precision + ? "full-precision" + : (use_bfloat16_precision ? "full-precision" : "half-precision")}); std::ifstream llm_config_file_handle(model_metadata.llm_model_config_path); if (!llm_config_file_handle.good()) { @@ -195,11 +204,13 @@ void get_model_meta(FilePaths &file_paths, "config.json"}); std::string ssm_tokenizer_path = join_path({file_paths.cache_folder_path, "tokenizers", ssm_model_name}); - std::string ssm_weights_path = - join_path({file_paths.cache_folder_path, - "weights", - ssm_model_name, - use_full_precision ? "full-precision" : "half-precision"}); + std::string ssm_weights_path = join_path( + {file_paths.cache_folder_path, + "weights", + ssm_model_name, + use_full_precision + ? "full-precision" + : (use_bfloat16_precision ? 
"full-precision" : "half-precision")}); std::ifstream ssm_config_file_handle(ssm_config_path); if (!ssm_config_file_handle.good()) { @@ -264,8 +275,9 @@ void FlexFlow::top_level_task(Task const *task, FFConfig ffconfig; FilePaths file_paths; ModelMeta model_metadata; - DataType data_type = DT_B16; + DataType data_type = DT_HALF; bool use_full_precision = false; + bool use_bfloat16_precision = false; bool verbose = false; int max_requests_per_batch = 16; int max_tokens_per_batch = 256; @@ -278,13 +290,15 @@ void FlexFlow::top_level_task(Task const *task, argc, file_paths, model_metadata.model_names, + data_type, use_full_precision, + use_bfloat16_precision, verbose, max_requests_per_batch, max_tokens_per_batch, max_sequence_length); - get_model_meta(file_paths, model_metadata, use_full_precision); + get_model_meta(file_paths, model_metadata, use_full_precision, use_bfloat16_precision); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == @@ -320,20 +334,20 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.llm_model_config_path, model_metadata.llm_weights_path, TREE_VERIFY_MODE, - use_full_precision); + data_type); } else if (model_metadata.llm_model_type == ModelType::FALCON) { FALCON::create_falcon_model(tree_model, model_metadata.llm_model_config_path, model_metadata.llm_weights_path, TREE_VERIFY_MODE, - use_full_precision); + data_type); } else if (model_metadata.llm_model_type == ModelType::MPT) { MPT::create_mpt_model(tree_model, model_metadata.llm_model_config_path, model_metadata.llm_weights_path, TREE_VERIFY_MODE, generationConfig, - use_full_precision); + data_type); } else { assert(false && "Invalid LLM model type passed (or no type was passed)."); } @@ -364,21 +378,21 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], BEAM_SEARCH_MODE, - use_full_precision); + data_type); } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::FALCON) { FALCON::create_falcon_model( beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], BEAM_SEARCH_MODE, - use_full_precision); + data_type); } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::MPT) { MPT::create_mpt_model(beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], BEAM_SEARCH_MODE, generationConfig, - use_full_precision); + data_type); } else { assert(false && "Invalid SSM model type passed."); } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 4e8f97650a..37ee080eec 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -506,12 +506,14 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } else if (m->output_type[0] == DT_B16) { + compute_type = CUBLAS_COMPUTE_32F; } #endif // Step 1: Compute QKV projections { - DT alpha = 1.0f, beta = 0.0f; + float alpha = 1.0f, beta = 0.0f; // after transpositions int m_q = m->qProjSize * m->num_q_heads; int m_k = m->kProjSize * m->num_q_heads; @@ -637,10 +639,13 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else cudaDataType_t compute_type = cublas_data_type; + if (m->output_type[0] 
== DT_B16) { + compute_type = CUDA_R_32F; + } #endif // Project to output, save result directly on output tensor { - DT alpha = 1.0f, beta = 0.0f; + float alpha = 1.0f, beta = 0.0f; // after transpositions int m_ = m->oProjSize; int k = m->vProjSize * m->num_q_heads; @@ -920,6 +925,8 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + }else if (m->output_type[0] == DT_B16) { + compute_type = CUBLAS_COMPUTE_32F; } #endif // int num_requests = bc->num_active_requests(); @@ -944,7 +951,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // Step 1: compute query-key product QK.T/sqrt(d_k) { // Scale by sqrt(d_k) as per the original attention paper - DT alpha = 1.0f, beta = 0.0f; + float alpha = 1.0f, beta = 0.0f; if (*m->qk_prod_scaling) { alpha = static_cast
(1.0f / sqrt(m->kProjSize)); } @@ -1075,7 +1082,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ // softmax(QK.T/sqrt(d_k)).T { - DT alpha = 1.0f, beta = 0.0f; + float alpha = 1.0f, beta = 0.0f; // after transpositions int m_ = m->vProjSize; int n = num_new_tokens; @@ -1191,12 +1198,14 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - }else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_B16) { if (m->offload) { - pre_build_weight_kernel<__nv_bfloat16>(m, weight, input.data_type, stream); + pre_build_weight_kernel<__nv_bfloat16>( + m, weight, input.data_type, stream); } __nv_bfloat16 const *bias_ptr = - use_bias ? bias.get_bfloat16_ptr() : static_cast<__nv_bfloat16 const *>(nullptr); + use_bias ? bias.get_bfloat16_ptr() + : static_cast<__nv_bfloat16 const *>(nullptr); Kernels::IncMultiHeadAttention::inference_kernel( m, bc, @@ -1515,11 +1524,12 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( DataType data_type, cudaStream_t stream); -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel<__nv_bfloat16>( - IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - cudaStream_t stream); +template void + Kernels::IncMultiHeadAttention::pre_build_weight_kernel<__nv_bfloat16>( + IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream); template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( IncMultiHeadSelfAttentionMeta const *m, @@ -1541,15 +1551,16 @@ template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( int num_tokens, cudaStream_t stream); -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias<__nv_bfloat16>( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - __nv_bfloat16 *output_ptr, - __nv_bfloat16 const *weight_ptr, - __nv_bfloat16 const *bias_ptr, - int num_tokens, - cudaStream_t stream); +template void + Kernels::IncMultiHeadAttention::compute_o_prod_bias<__nv_bfloat16>( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + __nv_bfloat16 *output_ptr, + __nv_bfloat16 const *weight_ptr, + __nv_bfloat16 const *bias_ptr, + int num_tokens, + cudaStream_t stream); template void Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( @@ -1566,9 +1577,9 @@ template void cudaStream_t stream); template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation<__nv_bfloat16>( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - __nv_bfloat16 *output_ptr, - cudaStream_t stream); + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation< + __nv_bfloat16>(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + __nv_bfloat16 *output_ptr, + cudaStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 87a884cb99..db75a5bf32 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -160,14 +160,14 @@ void forward_kernel_wrapper(LinearMeta const *m, stream); } else if (m->input_type[0] == DT_B16) { Internal::forward_kernel<__nv_bfloat16>(m, - input_ptr, - output_ptr, - weight_ptr, - bias_ptr, - in_dim, - out_dim, - batch_size, - stream); + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, 
+ batch_size, + stream); } if (m->profiling) { @@ -234,17 +234,17 @@ void backward_kernel_wrapper(LinearMeta const *m, stream); } else if (m->input_type[0] == DT_B16) { Internal::backward_kernel<__nv_bfloat16>(m, - input_ptr, - input_grad_ptr, - output_ptr, - output_grad_ptr, - kernel_ptr, - kernel_grad_ptr, - bias_grad_ptr, - in_dim, - out_dim, - batch_size, - stream); + input_ptr, + input_grad_ptr, + output_ptr, + output_grad_ptr, + kernel_ptr, + kernel_grad_ptr, + bias_grad_ptr, + in_dim, + out_dim, + batch_size, + stream); } if (m->profiling) { @@ -333,7 +333,7 @@ void forward_kernel(LinearMeta const *m, } checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; + float alpha = 1.0f, beta = 0.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = m->offload ? ff_to_cuda_datatype(m->weight_ptr_type) @@ -349,6 +349,8 @@ void forward_kernel(LinearMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } else if (m->output_type[0] == DT_B16) { + compute_type = CUBLAS_COMPUTE_32F; } #endif checkCUDA(cublasGemmEx(m->handle.blas, diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 0b978c23a6..cc1a08ab9b 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -480,6 +480,8 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + }else if (m->output_type[0] == DT_B16) { + compute_type = CUBLAS_COMPUTE_32F; } #endif // int num_requests = bc->num_active_requests(); @@ -533,7 +535,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, int strideC = num_new_tokens * total_tokens; // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; + float alpha = 1.0f, beta = 0.0f; if (*m->qk_prod_scaling) { alpha = static_cast
(1.0f / sqrt(m->kProjSize)); } diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 8f1be15fd1..e329593970 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -847,6 +847,12 @@ template bool ParallelTensorBase::get_tensor(FFModel const *ff, half *data, bool get_gradients); +template bool ParallelTensorBase::set_tensor<__nv_bfloat16>( + FFModel const *ff, std::vector const &dims, __nv_bfloat16 const *data); +template bool ParallelTensorBase::get_tensor<__nv_bfloat16>(FFModel const *ff, + __nv_bfloat16 *data, + bool get_gradients); + template bool ParallelTensorBase::set_tensor(FFModel const *ff, std::vector const &dims, char const *data); From badbf53daa6a64b3e2b8ca362240e074f8310072 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 7 Jan 2024 15:02:03 -0500 Subject: [PATCH 04/11] . --- inference/spec_infer/spec_infer.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 585a4a7ee7..b5bd63d4b7 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -99,6 +99,7 @@ void parse_input_args(char **argv, } if (!strcmp(argv[i], "--use-full-precision")) { use_full_precision = true; + data_type = DT_FLOAT; continue; } if (!strcmp(argv[i], "--use-bfloat16-precision")) { From fbdaba21951e8b508af6328112d654ec326ed863 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 15 Jan 2024 11:22:08 -0500 Subject: [PATCH 05/11] hip --- include/flexflow/accessor.h | 12 +- include/flexflow/utils/cuda_helper.h | 14 ++ inference/file_loader.cc | 12 +- src/ops/arg_topk.cpp | 12 ++ src/ops/argmax.cpp | 12 +- src/ops/element_unary.cpp | 6 + src/ops/fused.cpp | 16 ++ src/ops/inc_multihead_self_attention.cpp | 24 +++ src/ops/inc_multihead_self_attention.cu | 14 +- src/ops/kernels/decompress_kernels.cpp | 23 ++- src/ops/kernels/element_binary_kernels.cpp | 20 ++- src/ops/kernels/embedding_kernels.cpp | 154 +++++++++++++++++- src/ops/kernels/linear_kernels.cpp | 23 +++ src/ops/kernels/linear_kernels.cu | 3 +- src/ops/kernels/rms_norm_kernels.cpp | 6 + src/ops/kernels/rms_norm_kernels.cu | 2 +- src/ops/kernels/softmax.cpp | 11 +- src/ops/layer_norm.cpp | 9 + src/ops/linear.cc | 16 +- src/ops/residual_layer_norm.cpp | 12 ++ src/ops/residual_layer_norm.cu | 5 +- src/ops/sigmoid_silu_multi.cpp | 8 + src/ops/softmax.cc | 8 +- src/ops/spec_inc_multihead_self_attention.cpp | 13 ++ src/ops/spec_inc_multihead_self_attention.cu | 10 +- src/ops/tree_inc_multihead_self_attention.cpp | 18 ++ src/parallel_ops/combine.cc | 4 +- src/parallel_ops/kernels/combine_kernels.cpp | 3 + .../kernels/reduction_kernels.cpp | 10 ++ .../kernels/replicate_kernels.cpp | 3 + src/parallel_ops/reduction.cc | 4 +- src/parallel_ops/replicate.cc | 4 +- src/runtime/accessor.cc | 18 +- src/runtime/ffconst_utils.cc | 2 +- src/runtime/hip_helper.cpp | 42 ++++- src/runtime/initializer_kernel.cpp | 11 ++ src/runtime/model.cc | 2 +- src/runtime/parallel_tensor.cc | 8 +- 38 files changed, 511 insertions(+), 63 deletions(-) diff --git a/include/flexflow/accessor.h b/include/flexflow/accessor.h index 26ed006646..121140c926 100644 --- a/include/flexflow/accessor.h +++ b/include/flexflow/accessor.h @@ -8,7 +8,7 @@ #include #elif defined(FF_USE_HIP_CUDA) #include -#include +#include #elif defined(FF_USE_HIP_ROCM) #include #include @@ -18,6 +18,12 @@ namespace FlexFlow { +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +typedef __nv_bfloat16 __ff_bfloat16; +#elif defined(FF_USE_HIP_ROCM) +typedef 
hip_bfloat16 __ff_bfloat16; +#endif + template using AccessorRO = Legion::FieldAccessor>; @@ -64,7 +70,7 @@ class GenericTensorAccessorW { float *get_float_ptr() const; double *get_double_ptr() const; half *get_half_ptr() const; - __nv_bfloat16 *get_bfloat16_ptr() const; + __ff_bfloat16 *get_bfloat16_ptr() const; char *get_byte_ptr() const; DataType data_type; Legion::Domain domain; @@ -84,7 +90,7 @@ class GenericTensorAccessorR { float const *get_float_ptr() const; double const *get_double_ptr() const; half const *get_half_ptr() const; - __nv_bfloat16 const *get_bfloat16_ptr() const; + __ff_bfloat16 const *get_bfloat16_ptr() const; char const *get_byte_ptr() const; DataType data_type; Legion::Domain domain; diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index f8bf67b3e1..f3038860d7 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -161,6 +161,20 @@ T *download_tensor(T const *ptr, size_t num_elements); template bool download_tensor(T const *ptr, T *dst, size_t num_elements); +// data type for cublasgemm +template +struct cublasAlphaBetaType { + using type = float; // default +}; +template <> +struct cublasAlphaBetaType { + using type = half; +}; +template <> +struct cublasAlphaBetaType<__nv_bfloat16> { + using type = float; +}; + cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain, DataType data_type = DT_FLOAT); diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 9f9bce56b7..60b2c80b06 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -127,7 +127,7 @@ void load_attention_weights_multi_query(DT *ptr, ///////////////////////bfloat16 function/////////////////////// -void load_from_file_b16(__nv_bfloat16 *ptr, size_t size, std::string filepath) { +void load_from_file_b16(__ff_bfloat16 *ptr, size_t size, std::string filepath) { std::ifstream in(filepath, std::ios::in | std::ios::binary); if (!in.good()) { std::cout << "Could not open file: " << filepath << std::endl; @@ -156,7 +156,7 @@ void load_from_file_b16(__nv_bfloat16 *ptr, size_t size, std::string filepath) { in.close(); } -void load_attention_weights_v2_b16(__nv_bfloat16 *ptr, +void load_attention_weights_v2_b16(__ff_bfloat16 *ptr, int num_heads, int num_kv_heads, size_t hidden_dim, @@ -290,7 +290,7 @@ void load_attention_weights_v2_b16(__nv_bfloat16 *ptr, } } -void load_attention_bias_v2_b16(__nv_bfloat16 *ptr, +void load_attention_bias_v2_b16(__ff_bfloat16 *ptr, int num_heads, int num_kv_heads, size_t hidden_dim, @@ -382,8 +382,8 @@ void FileDataLoader::load_single_weight_tensor_b16(FFModel *ff, dims_vec.push_back(weight->dims[i]); volume *= weight->dims[i]; } - assert(data_type_size(weight->data_type) == sizeof(__nv_bfloat16)); - __nv_bfloat16 *data = (__nv_bfloat16 *)malloc(sizeof(__nv_bfloat16) * volume); + assert(data_type_size(weight->data_type) == sizeof(__ff_bfloat16)); + __ff_bfloat16 *data = (__ff_bfloat16 *)malloc(sizeof(__ff_bfloat16) * volume); std::string weight_filename = removeGuidOperatorName(std::string(l->name)); @@ -446,7 +446,7 @@ void FileDataLoader::load_single_weight_tensor_b16(FFModel *ff, // Copy the weight data from the buffer to the weight's ParallelTensor ParallelTensor weight_pt; ff->get_parallel_tensor_from_tensor(weight, weight_pt); - weight_pt->set_tensor<__nv_bfloat16>(ff, dims_vec, data); + weight_pt->set_tensor<__ff_bfloat16>(ff, dims_vec, data); // Free buffer memory delete data; diff --git a/src/ops/arg_topk.cpp 
b/src/ops/arg_topk.cpp index f431d3d4bf..17cc7534ad 100644 --- a/src/ops/arg_topk.cpp +++ b/src/ops/arg_topk.cpp @@ -515,6 +515,18 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, m->sorted, m->speculative_decoding ? bc : nullptr, stream); + } else if (input.data_type == DT_B16) { + ArgTopK::forward_kernel(m, + input.get_bfloat16_ptr(), + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, + indices.get_int32_ptr(), + batch_size, + length, + k, + m->sorted, + m->speculative_decoding ? bc : nullptr, + stream); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 8a1cf0b3b0..7437b8b969 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -466,6 +466,16 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, length, batch_size, stream); + } else if (input.data_type == DT_B16) { + ArgMax::forward_kernel(m, + input.get_bfloat16_ptr(), + indices.get_int32_ptr(), + m->probs, + m->beam_search ? parent.get_int32_ptr() + : nullptr, + length, + batch_size, + stream); } else { assert(false && "Unsupported data type"); } @@ -491,7 +501,7 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, : OpMeta(handler, op) { DataType data_type = op->data_type; size_t prob_size = batch_size; - assert(data_type == DT_FLOAT || data_type == DT_HALF); + assert(data_type == DT_FLOAT || data_type == DT_HALF || data_type == DT_B16); size_t total_size = prob_size * sizeof(float); gpu_mem_allocator.create_legion_instance(reserveInst, total_size); probs = gpu_mem_allocator.allocate_instance(prob_size); diff --git a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index e20200420f..06a4d3c36a 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -314,6 +314,12 @@ template void int64_t *output_ptr, size_t num_elements); +template void ElementUnary::forward_kernel_wrapper( + ElementUnaryMeta const *m, + hip_bfloat16 const *input_ptr, + hip_bfloat16 *output_ptr, + size_t num_elements); + template void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, float const *input_ptr, diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 3282bc57d9..af98f153e2 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -428,6 +428,11 @@ __host__ void FusedOp::forward_task(Task const *task, m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); + } else if (m->input_type == DT_B16) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_bfloat16_ptr(), + my_output_accessor[0].get_bfloat16_ptr()); } break; } @@ -815,6 +820,12 @@ __host__ void my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr(), my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_B16) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_bfloat16_ptr(), + my_output_accessor[0].get_bfloat16_ptr(), + my_input_accessor[0].domain.get_volume()); } else { assert(false && "Unsupported data type in ElementUnary forward"); } @@ -1039,6 +1050,11 @@ __host__ void m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); + } else if (m->input_type == DT_B16) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_bfloat16_ptr(), + my_output_accessor[0].get_bfloat16_ptr()); } break; } diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index d60386f927..4d63ba628d 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp 
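The cublasAlphaBetaType trait added to include/flexflow/utils/cuda_helper.h above exists because cublasGemmEx expects alpha and beta in the compute type: half GEMMs run with CUBLAS_COMPUTE_16F and half scalars, while the float and bfloat16 paths accumulate in 32-bit and take float scalars. The sketch below restates that pairing in a self-contained form; the trait is redefined locally and the matrix arguments are placeholders, so it illustrates the call-site pattern rather than reproducing the patch's code.

#include <cublas_v2.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <type_traits>

// Local restatement of the trait: float is the default scalar type, which also
// covers __nv_bfloat16; only half GEMMs use half alpha/beta.
template <typename DT>
struct AlphaBetaType { using type = float; };
template <>
struct AlphaBetaType<half> { using type = half; };

template <typename DT>
void gemm_sketch(cublasHandle_t blas, DT const *A, DT const *B, DT *C,
                 int m, int n, int k, cudaDataType_t dt) {
  // The patch picks CUBLAS_COMPUTE_32F_FAST_16F for float and CUBLAS_COMPUTE_32F
  // for bfloat16; plain 32F compute is used here to keep the sketch short.
  cublasComputeType_t compute =
      std::is_same<DT, half>::value ? CUBLAS_COMPUTE_16F : CUBLAS_COMPUTE_32F;
  typename AlphaBetaType<DT>::type alpha = 1.0f, beta = 0.0f;
  cublasGemmEx(blas, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
               &alpha, A, dt, m, B, dt, k, &beta, C, dt, m,
               compute, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}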
@@ -802,6 +802,23 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); + } else if (input.data_type == DT_B16) { + if (m->offload) { + pre_build_weight_kernel(m, weight, input.data_type, stream); + } + hip_bfloat16 const *bias_ptr = + use_bias ? bias.get_bfloat16_ptr() + : static_cast(nullptr); + Kernels::IncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_bfloat16_ptr(), + m->offload ? static_cast(m->weight_ptr) + : weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } @@ -1098,4 +1115,11 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( DataType data_type, hipStream_t stream); +template void + Kernels::IncMultiHeadAttention::pre_build_weight_kernel( + IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream); + }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 37ee080eec..af77ac9f6d 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -513,7 +513,8 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, // Step 1: Compute QKV projections { - float alpha = 1.0f, beta = 0.0f; + typename cublasAlphaBetaType
<DT>::type alpha = 1.0; + typename cublasAlphaBetaType<DT>
::type beta = 0.0; // after transpositions int m_q = m->qProjSize * m->num_q_heads; int m_k = m->kProjSize * m->num_q_heads; @@ -645,7 +646,8 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, #endif // Project to output, save result directly on output tensor { - float alpha = 1.0f, beta = 0.0f; + typename cublasAlphaBetaType
<DT>::type alpha = 1.0; + typename cublasAlphaBetaType<DT>
::type beta = 0.0; // after transpositions int m_ = m->oProjSize; int k = m->vProjSize * m->num_q_heads; @@ -925,7 +927,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - }else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_B16) { compute_type = CUBLAS_COMPUTE_32F; } #endif @@ -951,7 +953,8 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // Step 1: compute query-key product QK.T/sqrt(d_k) { // Scale by sqrt(d_k) as per the original attention paper - float alpha = 1.0f, beta = 0.0f; + typename cublasAlphaBetaType
<DT>::type alpha = 1.0; + typename cublasAlphaBetaType<DT>
::type beta = 0.0; if (*m->qk_prod_scaling) { alpha = static_cast<DT>
(1.0f / sqrt(m->kProjSize)); } @@ -1082,7 +1085,8 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ // softmax(QK.T/sqrt(d_k)).T { - float alpha = 1.0f, beta = 0.0f; + typename cublasAlphaBetaType
<DT>::type alpha = 1.0; + typename cublasAlphaBetaType<DT>
::type beta = 0.0; // after transpositions int m_ = m->vProjSize; int n = num_new_tokens; diff --git a/src/ops/kernels/decompress_kernels.cpp b/src/ops/kernels/decompress_kernels.cpp index 22bf93d449..5127e0842e 100644 --- a/src/ops/kernels/decompress_kernels.cpp +++ b/src/ops/kernels/decompress_kernels.cpp @@ -54,10 +54,20 @@ template __global__ void decompress_int4_general_weights( char const *input_weight_ptr, float *weight_ptr, int in_dim, int valueSize); template __global__ void decompress_int4_general_weights( char const *input_weight_ptr, half *weight_ptr, int in_dim, int valueSize); +template __global__ void + decompress_int4_general_weights(char const *input_weight_ptr, + hip_bfloat16 *weight_ptr, + int in_dim, + int valueSize); template __global__ void decompress_int8_general_weights( char const *input_weight_ptr, float *weight_ptr, int in_dim, int valueSize); template __global__ void decompress_int8_general_weights( char const *input_weight_ptr, half *weight_ptr, int in_dim, int valueSize); +template __global__ void + decompress_int8_general_weights(char const *input_weight_ptr, + hip_bfloat16 *weight_ptr, + int in_dim, + int valueSize); template __global__ void decompress_int4_attention_weights(char *input_weight_ptr, float *weight_ptr, @@ -71,7 +81,12 @@ template __global__ void int qProjSize, int qSize, int num_heads); - +template __global__ void + decompress_int4_attention_weights(char *input_weight_ptr, + hip_bfloat16 *weight_ptr, + int qProjSize, + int qSize, + int num_heads); template __global__ void decompress_int8_attention_weights(char *input_weight_ptr, float *weight_ptr, @@ -86,5 +101,11 @@ template __global__ void int qSize, int num_heads); +template __global__ void + decompress_int8_attention_weights(char *input_weight_ptr, + hip_bfloat16 *weight_ptr, + int qProjSize, + int qSize, + int num_heads); } // namespace Kernels }; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/kernels/element_binary_kernels.cpp b/src/ops/kernels/element_binary_kernels.cpp index a65372de85..05467182e7 100644 --- a/src/ops/kernels/element_binary_kernels.cpp +++ b/src/ops/kernels/element_binary_kernels.cpp @@ -82,8 +82,24 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, } // print_tensor(in1_ptr, in1_domain.get_volume(), "input1:"); // print_tensor(in2_ptr, in2_domain.get_volume(), "input2:"); - Internal::forward_kernel( - m, in1.get_float_ptr(), in2.get_float_ptr(), out.get_float_ptr(), stream); + if (out.data_type == DT_HALF) { + Internal::forward_kernel( + m, in1.get_half_ptr(), in2.get_half_ptr(), out.get_half_ptr(), stream); + } else if (out.data_type == DT_FLOAT) { + Internal::forward_kernel(m, + in1.get_float_ptr(), + in2.get_float_ptr(), + out.get_float_ptr(), + stream); + } else if (out.data_type == DT_B16) { + Internal::forward_kernel(m, + in1.get_bfloat16_ptr(), + in2.get_bfloat16_ptr(), + out.get_bfloat16_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } // print_tensor(out_ptr, in1_domain.get_volume(), "output:"); if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); diff --git a/src/ops/kernels/embedding_kernels.cpp b/src/ops/kernels/embedding_kernels.cpp index ee4a6fcea1..7384f4e7e6 100644 --- a/src/ops/kernels/embedding_kernels.cpp +++ b/src/ops/kernels/embedding_kernels.cpp @@ -60,7 +60,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (weight.data_type == DT_HALF) { + } else if (weight.data_type == DT_DOUBLE) { 
Internal::forward_kernel(input.get_int32_ptr(), output.get_double_ptr(), weight.get_double_ptr(), @@ -70,6 +70,16 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (weight.data_type == DT_B16) { + Internal::forward_kernel(input.get_int32_ptr(), + output.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -104,6 +114,16 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (weight.data_type == DT_B16) { + Internal::forward_kernel(input.get_int64_ptr(), + output.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -162,6 +182,16 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (m->output_type[0] == DT_B16) { + Internal::backward_kernel(input.get_int32_ptr(), + output.get_bfloat16_ptr(), + weight_grad.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -196,6 +226,16 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); + } else if (m->output_type[0] == DT_B16) { + Internal::backward_kernel(input.get_int64_ptr(), + output.get_bfloat16_ptr(), + weight_grad.get_bfloat16_ptr(), + in_dim, + out_dim, + batch_size, + m->aggr, + output.domain.get_volume(), + stream); } else { assert(false && "Unsupported DataType in Embedding"); } @@ -332,6 +372,50 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } +template <> +__global__ void + embed_backward_no_aggr(int const *input, + hip_bfloat16 const *output, + hip_bfloat16 *embed, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + int wordIdx = input[idx]; +#if __CUDA_ARCH__ >= 700 + atomicAdd(embed + wordIdx * out_dim + off, output[i]); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += output[i]; +#endif + } +} + +template <> +__global__ void + embed_backward_no_aggr(int64_t const *input, + hip_bfloat16 const *output, + hip_bfloat16 *embed, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + int64_t wordIdx = input[idx]; +#if __CUDA_ARCH__ >= 700 + atomicAdd(embed + wordIdx * out_dim + off, output[i]); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += output[i]; +#endif + } +} + template __global__ void embed_backward_with_aggr(TI const *input, TD const *output, @@ -426,6 +510,74 @@ __global__ void embed_backward_with_aggr(int64_t const *input, } } +template <> +__global__ void + embed_backward_with_aggr(int const *input, + hip_bfloat16 const *output, + hip_bfloat16 *embed, + int out_dim, + int in_dim, + int batch_size, + AggrMode aggr) { + hip_bfloat16 scale = 1.0f / in_dim; + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + hip_bfloat16 gradient; + 
if (aggr == AGGR_MODE_SUM) { + gradient = output[i]; + } else { + assert(aggr == AGGR_MODE_AVG); + gradient = output[i] * scale; + } + for (int j = 0; j < in_dim; j++) { + int wordIdx = input[idx * in_dim + j]; +#if __CUDA_ARCH__ >= 700 + atomicAdd(embed + wordIdx * out_dim + off, gradient); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += gradient; +#endif + } + } +} + +template <> +__global__ void + embed_backward_with_aggr(int64_t const *input, + hip_bfloat16 const *output, + hip_bfloat16 *embed, + int out_dim, + int in_dim, + int batch_size, + AggrMode aggr) { + hip_bfloat16 scale = 1.0f / in_dim; + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + hip_bfloat16 gradient; + if (aggr == AGGR_MODE_SUM) { + gradient = output[i]; + } else { + assert(aggr == AGGR_MODE_AVG); + gradient = output[i] * scale; + } + for (int j = 0; j < in_dim; j++) { + int64_t wordIdx = input[idx * in_dim + j]; +#if __CUDA_ARCH__ >= 700 + atomicAdd(embed + wordIdx * out_dim + off, gradient); +#else + assert(false); + // TODO: this implementation may result in race condition + // so we use an assertion failure to warn users + embed[wordIdx * out_dim + off] += gradient; +#endif + } + } +} + /*static*/ template void forward_kernel(TI const *input_ptr, diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 072eb5e96b..3cd9862bba 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -124,6 +124,16 @@ void forward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); + } else if (m->input_type[0] == DT_B16) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); } if (m->profiling) { @@ -189,6 +199,19 @@ void backward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); + } else if (m->input_type[0] == DT_B16) { + Internal::backward_kernel(m, + input_ptr, + input_grad_ptr, + output_ptr, + output_grad_ptr, + kernel_ptr, + kernel_grad_ptr, + bias_grad_ptr, + in_dim, + out_dim, + batch_size, + stream); } if (m->profiling) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index db75a5bf32..ad416b433c 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -333,7 +333,8 @@ void forward_kernel(LinearMeta const *m, } checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - float alpha = 1.0f, beta = 0.0f; + typename cublasAlphaBetaType
<DT>::type alpha = 1.0; + typename cublasAlphaBetaType<DT>
::type beta = 0.0; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = m->offload ? ff_to_cuda_datatype(m->weight_ptr_type) diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 24ab7051e6..c81e73dd77 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -190,6 +190,12 @@ void forward_kernel_wrapper(RMSNormMeta const *m, weight.get_float_ptr(), output.get_float_ptr(), stream); + } else if (output.data_type == DT_B16) { + forward_kernel(m, + input.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + stream); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 67a0b156b5..19661724d2 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -137,7 +137,7 @@ __global__ void template __global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { using T_ACC = T; - int64_t const i = blockIdx.x; + const int64_t i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; Y[index] = static_cast(X[index]) * static_cast(rstd[i]); diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index 89c9f14a01..e785d404d7 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -107,7 +107,10 @@ template void forward_kernel_wrapper(SoftmaxMeta const *m, template void forward_kernel_wrapper(SoftmaxMeta const *m, half const *input_ptr, half *output_ptr); - +template void + forward_kernel_wrapper(SoftmaxMeta const *m, + hip_bfloat16 const *input_ptr, + hip_bfloat16 *output_ptr); template void backward_kernel_wrapper(SoftmaxMeta const *m, float *input_grad_ptr, float const *output_grad_ptr, @@ -116,7 +119,11 @@ template void backward_kernel_wrapper(SoftmaxMeta const *m, half *input_grad_ptr, half const *output_grad_ptr, size_t num_elements); - +template void + backward_kernel_wrapper(SoftmaxMeta const *m, + hip_bfloat16 *input_grad_ptr, + hip_bfloat16 const *output_grad_ptr, + size_t num_elements); namespace Internal { template void forward_kernel(SoftmaxMeta const *m, diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 07dbdb3dfb..ff420901e4 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -182,6 +182,15 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, gamma.get_half_ptr(), m->use_bias ? beta.get_half_ptr() : nullptr, stream); + } else if (m->input_type[0] == DT_B16) { + LayerNorm::forward_kernel( + m, + input.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + m->elementwise_affine ? gamma.get_bfloat16_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_bfloat16_ptr() + : nullptr, + stream); } else { assert(false && "unsupport datatype in layernorm"); } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 8cdc392c96..cce2fb9c9e 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -157,7 +157,7 @@ Op *Linear::create_operator_from_layer( Linear::Linear(FFModel &model, Linear const &other, - ParallelTensor const input, + const ParallelTensor input, bool allocate_weights) : Linear(model, other.layer_guid, @@ -175,7 +175,7 @@ Linear::Linear(FFModel &model, Linear::Linear(FFModel &model, LinearParams const ¶ms, - ParallelTensor const input, + const ParallelTensor input, char const *name, bool allocate_weights) : Linear(model, @@ -439,10 +439,10 @@ OpMeta *Linear::init_task(Task const *task, } \ } else if (output.data_type == DT_B16) { \ if (linear->quantization_type != DT_NONE) { \ - return init_task_with_dim<__nv_bfloat16, char, DIM>( \ + return init_task_with_dim<__ff_bfloat16, char, DIM>( \ task, regions, ctx, runtime); \ } else { \ - return init_task_with_dim<__nv_bfloat16, __nv_bfloat16, DIM>( \ + return init_task_with_dim<__ff_bfloat16, __ff_bfloat16, DIM>( \ task, regions, ctx, runtime); \ } \ } else { \ @@ -714,10 +714,10 @@ void Linear::forward_task(Task const *task, } \ } else if (m->output_type[0] == DT_B16) { \ if (m->quantization_type != DT_NONE) { \ - return forward_task_with_dim<__nv_bfloat16, char, DIM>( \ + return forward_task_with_dim<__ff_bfloat16, char, DIM>( \ task, regions, ctx, runtime); \ } else { \ - return forward_task_with_dim<__nv_bfloat16, __nv_bfloat16, DIM>( \ + return forward_task_with_dim<__ff_bfloat16, __ff_bfloat16, DIM>( \ task, regions, ctx, runtime); \ } \ } else { \ @@ -876,7 +876,7 @@ void Linear::backward_task(Task const *task, } else if (m->output_type[0] == DT_FLOAT) { \ return backward_task_with_dim(task, regions, ctx, runtime); \ } else if (m->output_type[0] == DT_B16) { \ - return backward_task_with_dim<__nv_bfloat16, DIM>( \ + return backward_task_with_dim<__ff_bfloat16, DIM>( \ task, regions, ctx, runtime); \ } else { \ assert(false && "Unsupported data type"); \ @@ -1360,7 +1360,7 @@ bool LinearParams::is_valid(ParallelTensorShape const &input_shape) const { * It takes a the input tensor as a parameter, instead of the input's * ParallelTensorShape. */ -void LinearParams::solve_dims(ParallelTensor const input, +void LinearParams::solve_dims(const ParallelTensor input, ParallelDim output_dims[MAX_TENSOR_DIM], int *output_ndims, ParallelDim kernel_dims[MAX_TENSOR_DIM], diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index f1b7a537b0..09823adf27 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -230,6 +230,18 @@ void ResidualLayerNorm::inference_kernel_wrapper( m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, stream); + } else if (m->input_type[0] == DT_B16) { + ResidualLayerNorm::inference_kernel( + m, + input.get_bfloat16_ptr(), + residual1.get_bfloat16_ptr(), + m->use_two_residuals ? residual2.get_bfloat16_ptr() : nullptr, + added_output.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + m->elementwise_affine ? gamma.get_bfloat16_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_bfloat16_ptr() + : nullptr, + stream); } else { assert(false && "unsupport datatype in layernorm"); } diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index cbb7e241f3..fc0bd5d5a2 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -234,9 +234,10 @@ void ResidualLayerNorm::inference_kernel_wrapper( added_output.get_bfloat16_ptr(), output.get_bfloat16_ptr(), m->elementwise_affine ? gamma.get_bfloat16_ptr() : nullptr, - (m->elementwise_affine && m->use_bias) ? beta.get_bfloat16_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_bfloat16_ptr() + : nullptr, stream); - }else { + } else { assert(false && "unsupport datatype in layernorm"); } diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index 7b7f30a288..b446ed0065 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -101,6 +101,14 @@ void SigmoidSiluMulti::inference_kernel_wrapper( input1.get_half_ptr(), input2.get_half_ptr(), output.get_half_ptr()); + } else if (m->input_type[0] == DT_B16) { + SigmoidSiluMultiKernel<<>>(input1.domain.get_volume(), + input1.get_bfloat16_ptr(), + input2.get_bfloat16_ptr(), + output.get_bfloat16_ptr()); } else { assert(false && "unsupport datatype in SigmoidSiluMulti"); } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 101eff9417..87598de634 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -52,7 +52,7 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(Tensor const _input, +Tensor FFModel::softmax(const Tensor _input, int dim, DataType data_type, char const *name) { @@ -93,7 +93,7 @@ Op *Softmax::create_operator_from_layer( } Softmax::Softmax(FFModel &model, - ParallelTensor const _input, + const ParallelTensor _input, int _dim, char const *name) : Op(model, @@ -117,7 +117,7 @@ Softmax::Softmax(FFModel &model, Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, - ParallelTensor const input, + const ParallelTensor input, char const *name) : Softmax(model, input, params.dim, name) {} @@ -370,7 +370,7 @@ void Softmax::backward_task(Task const *task, } else if (m->output_type == DT_FLOAT) { \ return backward_task_with_dim(task, regions, ctx, runtime); \ } else if (m->output_type == DT_B16) { \ - return backward_task_with_dim<__nv_bfloat16, DIM>( \ + return backward_task_with_dim<__ff_bfloat16, DIM>( \ task, regions, ctx, runtime); \ } else { \ assert(false && "Unsupported data type"); \ diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index b1687d12a2..a7b8d4535c 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -562,6 +562,19 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); + } else if (input.data_type == DT_B16) { + float const *bias_ptr = use_bias + ? 
bias.get_bfloat16_ptr() + : static_cast(nullptr); + Kernels::SpecIncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_bfloat16_ptr(), + weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index cc1a08ab9b..885a09c980 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -480,7 +480,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - }else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_B16) { compute_type = CUBLAS_COMPUTE_32F; } #endif @@ -535,7 +535,8 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, int strideC = num_new_tokens * total_tokens; // a flag of using this scaling alpha - float alpha = 1.0f, beta = 0.0f; + typename cublasAlphaBetaType
<DT>::type alpha = 1.0; + typename cublasAlphaBetaType<DT>
::type beta = 0.0; if (*m->qk_prod_scaling) { alpha = static_cast
(1.0f / sqrt(m->kProjSize)); } @@ -791,9 +792,10 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - }else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_B16) { __nv_bfloat16 const *bias_ptr = - use_bias ? bias.get_bfloat16_ptr() : static_cast<__nv_bfloat16 const *>(nullptr); + use_bias ? bias.get_bfloat16_ptr() + : static_cast<__nv_bfloat16 const *>(nullptr); Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( m, bc, diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 26291fb3b4..4f399aafc5 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -570,6 +570,24 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); + } else if (input.data_type == DT_B16) { + if (m->offload) { + pre_build_weight_kernel(m, weight, input.data_type, stream); + } + + hip_bfloat16 const *bias_ptr = + use_bias ? bias.get_bfloat16_ptr() + : static_cast(nullptr); + Kernels::TreeIncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_bfloat16_ptr(), + m->offload ? static_cast(m->weight_ptr) + : weight.get_bfloat16_ptr(), + output.get_bfloat16_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index ffe57006fc..8bd963b3d3 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -72,7 +72,7 @@ Combine::Combine(FFModel &model, name) {} Combine::Combine(FFModel &model, - ParallelTensor const _input, + const ParallelTensor _input, int _combine_legion_dim, int _combine_degree, char const *name) @@ -366,7 +366,7 @@ void Combine::forward_task(Task const *task, if (data_type == DT_HALF) { forward_task_with_type(task, regions, ctx, runtime); } else if (data_type == DT_B16) { - forward_task_with_type<__nv_bfloat16>(task, regions, ctx, runtime); + forward_task_with_type<__ff_bfloat16>(task, regions, ctx, runtime); } else if (data_type == DT_FLOAT) { forward_task_with_type(task, regions, ctx, runtime); } else if (data_type == DT_DOUBLE) { diff --git a/src/parallel_ops/kernels/combine_kernels.cpp b/src/parallel_ops/kernels/combine_kernels.cpp index d6e9568223..b593a92a7d 100644 --- a/src/parallel_ops/kernels/combine_kernels.cpp +++ b/src/parallel_ops/kernels/combine_kernels.cpp @@ -57,6 +57,9 @@ template void forward_kernel(half const *input_ptr, template void forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements); +template void forward_kernel(hip_bfloat16 const *input_ptr, + hip_bfloat16 *output_ptr, + size_t num_elements); template void forward_kernel(double const *input_ptr, double *output_ptr, size_t num_elements); diff --git a/src/parallel_ops/kernels/reduction_kernels.cpp b/src/parallel_ops/kernels/reduction_kernels.cpp index 2a3fe5cca1..ade5d9b402 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cpp +++ b/src/parallel_ops/kernels/reduction_kernels.cpp @@ -78,6 +78,12 @@ template __global__ void reduction_forward_kernel(half const *input_ptr, half *output_ptr, size_t num_elements, size_t num_replicas); +template __global__ void + reduction_forward_kernel(hip_bfloat16 const *input_ptr, + hip_bfloat16 *output_ptr, + size_t num_elements, + size_t num_replicas); + template void forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements, @@ -86,6 +92,10 @@ template void 
forward_kernel(half const *input_ptr, half *output_ptr, size_t num_elements, size_t num_replicas); +template void forward_kernel(hip_bfloat16 const *input_ptr, + hip_bfloat16 *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(float const *output_grad_ptr, float *input_grad_ptr, size_t num_elements); diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index 1647f014be..7d6b2fc63a 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -73,6 +73,9 @@ template void forward_kernel(float const *input_ptr, template void forward_kernel(half const *input_ptr, half *output_ptr, size_t num_elements); +template void forward_kernel(hip_bfloat16 const *input_ptr, + hip_bfloat16 *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(float const *input_ptr, float *output_ptr, diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index d53cae330b..bd7cb4085b 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -60,7 +60,7 @@ ReductionParams Reduction::get_params() const { } Reduction::Reduction(FFModel &model, - ParallelTensor const _input, + const ParallelTensor _input, int _reduction_legion_dim, int _reduction_degree, char const *name) @@ -381,7 +381,7 @@ void Reduction::forward_task(Task const *task, num_elements, num_replicas); } else if (input.data_type == DT_B16) { - forward_kernel<__nv_bfloat16>(input.get_bfloat16_ptr(), + forward_kernel<__ff_bfloat16>(input.get_bfloat16_ptr(), output.get_bfloat16_ptr(), num_elements, num_replicas); diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index d9f7b2263b..18e4d26f26 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -59,7 +59,7 @@ ReplicateParams Replicate::get_params() const { } Replicate::Replicate(FFModel &model, - ParallelTensor const _input, + const ParallelTensor _input, int _replicate_legion_dim, int _replicate_degree, char const *name) @@ -374,7 +374,7 @@ void Replicate::forward_task(Task const *task, output.get_float_ptr(), input_domain.get_volume()); } else if (input.data_type == DT_B16) { - forward_kernel<__nv_bfloat16>(input.get_bfloat16_ptr(), + forward_kernel<__ff_bfloat16>(input.get_bfloat16_ptr(), output.get_bfloat16_ptr(), input_domain.get_volume()); } else { diff --git a/src/runtime/accessor.cc b/src/runtime/accessor.cc index 9bb6c08eea..30aef93df9 100644 --- a/src/runtime/accessor.cc +++ b/src/runtime/accessor.cc @@ -77,12 +77,12 @@ half const *GenericTensorAccessorR::get_half_ptr() const { } } -__nv_bfloat16 const *GenericTensorAccessorR::get_bfloat16_ptr() const { +__ff_bfloat16 const *GenericTensorAccessorR::get_bfloat16_ptr() const { if (data_type == DT_B16) { - return static_cast<__nv_bfloat16 const *>(ptr); + return static_cast<__ff_bfloat16 const *>(ptr); } else { assert(false && "Invalid Accessor Type"); - return static_cast<__nv_bfloat16 const *>(nullptr); + return static_cast<__ff_bfloat16 const *>(nullptr); } } @@ -174,12 +174,12 @@ half *GenericTensorAccessorW::get_half_ptr() const { } } -__nv_bfloat16 *GenericTensorAccessorW::get_bfloat16_ptr() const { +__ff_bfloat16 *GenericTensorAccessorW::get_bfloat16_ptr() const { if (data_type == DT_B16) { - return static_cast<__nv_bfloat16 *>(ptr); + return static_cast<__ff_bfloat16 *>(ptr); } else { assert(false && "Invalid Accessor Type"); - return static_cast<__nv_bfloat16 *>(nullptr); + 
return static_cast<__ff_bfloat16 *>(nullptr); } } @@ -290,7 +290,7 @@ GenericTensorAccessorR break; } case DT_B16: { - ptr = helperGetTensorPointerRO<__nv_bfloat16>( + ptr = helperGetTensorPointerRO<__ff_bfloat16>( region, req, fid, ctx, runtime); break; } @@ -341,7 +341,7 @@ GenericTensorAccessorW break; } case DT_B16: { - ptr = helperGetTensorPointerWO<__nv_bfloat16>( + ptr = helperGetTensorPointerWO<__ff_bfloat16>( region, req, fid, ctx, runtime); break; } @@ -392,7 +392,7 @@ GenericTensorAccessorW break; } case DT_B16: { - ptr = helperGetTensorPointerRW<__nv_bfloat16>( + ptr = helperGetTensorPointerRW<__ff_bfloat16>( region, req, fid, ctx, runtime); break; } diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 764c938dc3..232d711d82 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -214,7 +214,7 @@ size_t data_type_size(DataType type) { case DT_HALF: return sizeof(half); case DT_B16: - return sizeof(__nv_bfloat16); + return sizeof(__ff_bfloat16); case DT_FLOAT: return sizeof(float); case DT_DOUBLE: diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index fb94135c8f..00a6730de0 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -299,6 +299,33 @@ __host__ void checkCUDA(hipHostFree(host_ptr)); } +template <> +__host__ void save_tensor(hip_bfloat16 const *ptr, + size_t num_elements, + char const *file_name) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hip_bfloat16 *host_ptr; + checkCUDA(hipHostMalloc(&host_ptr, + sizeof(hip_bfloat16) * num_elements, + hipHostMallocPortable | hipHostMallocMapped)); + checkCUDA(hipMemcpyAsync(host_ptr, + ptr, + sizeof(hip_bfloat16) * num_elements, + hipMemcpyDeviceToHost, + stream)); + checkCUDA(hipDeviceSynchronize()); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + assert(tensor_file != NULL); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%.9f, ", (float)host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(hipHostFree(host_ptr)); +} + template <> __host__ void save_tensor(int32_t const *ptr, size_t num_elements, @@ -489,6 +516,8 @@ miopenDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { case DT_HALF: return miopenHalf; + case DT_B16: + return miopenBFloat16; case DT_FLOAT: return miopenFloat; case DT_DOUBLE: @@ -510,6 +539,10 @@ hipblasDatatype_t ff_to_cuda_datatype(DataType type) { return HIPBLAS_R_64F; case DT_INT32: return HIPBLAS_R_32I; + case DT_B16: + return HIPBLAS_R_16B; + case DT_HALF: + return HIPBLAS_R_16F; default: assert(false && "Unspoorted cuda data type"); } @@ -520,6 +553,8 @@ ncclDataType_t ff_to_nccl_datatype(DataType type) { switch (type) { case DT_HALF: return ncclHalf; + case DT_B16: + return ncclBfloat16; case DT_FLOAT: return ncclFloat; case DT_DOUBLE: @@ -540,6 +575,9 @@ void handle_unimplemented_hip_kernel(OperatorType op_type) { template __global__ void assign_kernel(half *ptr, coord_t size, half value); +template __global__ void assign_kernel(hip_bfloat16 *ptr, + coord_t size, + hip_bfloat16 value); template __global__ void assign_kernel(float *ptr, coord_t size, float value); template __global__ void @@ -609,7 +647,9 @@ template __host__ void save_tensor(int64_t const *ptr, char const *file_name); template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); - +template __host__ void save_tensor(hip_bfloat16 const *ptr, + size_t rect, + char const *file_name); template __host__ float *download_tensor(float const *ptr, size_t 
num_elements); template __host__ half *download_tensor(half const *ptr, diff --git a/src/runtime/initializer_kernel.cpp b/src/runtime/initializer_kernel.cpp index 1005d93cec..2e55591729 100644 --- a/src/runtime/initializer_kernel.cpp +++ b/src/runtime/initializer_kernel.cpp @@ -259,6 +259,17 @@ void ZeroInitializer::init_task(Task const *task, w, domain.get_volume(), 0.0f); + } else if (meta->data_types[i] == DT_B16) { + hip_bfloat16 *w = helperGetTensorPointerWO( + regions[i], task->regions[i], FID_DATA, ctx, runtime); + hipLaunchKernelGGL(HIP_KERNEL_NAME(assign_kernel), + GET_BLOCKS(domain.get_volume()), + CUDA_NUM_THREADS, + 0, + stream, + w, + domain.get_volume(), + 0.0f); } else if (meta->data_types[i] == DT_INT32) { int32_t *w = helperGetTensorPointerWO( regions[i], task->regions[i], FID_DATA, ctx, runtime); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index f31be1bcdb..f184a41934 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1948,7 +1948,7 @@ void FFModel::map_tensor_with_dim2(ParallelTensor tensor, allocator.allocate_field(sizeof(half), FID_DATA); break; case DT_B16: - allocator.allocate_field(sizeof(__nv_bfloat16), FID_DATA); + allocator.allocate_field(sizeof(__ff_bfloat16), FID_DATA); break; case DT_FLOAT: allocator.allocate_field(sizeof(float), FID_DATA); diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index e329593970..d3cc8a0cb1 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -847,10 +847,10 @@ template bool ParallelTensorBase::get_tensor(FFModel const *ff, half *data, bool get_gradients); -template bool ParallelTensorBase::set_tensor<__nv_bfloat16>( - FFModel const *ff, std::vector const &dims, __nv_bfloat16 const *data); -template bool ParallelTensorBase::get_tensor<__nv_bfloat16>(FFModel const *ff, - __nv_bfloat16 *data, +template bool ParallelTensorBase::set_tensor<__ff_bfloat16>( + FFModel const *ff, std::vector const &dims, __ff_bfloat16 const *data); +template bool ParallelTensorBase::get_tensor<__ff_bfloat16>(FFModel const *ff, + __ff_bfloat16 *data, bool get_gradients); template bool ParallelTensorBase::set_tensor(FFModel const *ff, From bc150e360cc8de0feb7d45208bd47e306bc54b30 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 16 Jan 2024 11:04:40 -0500 Subject: [PATCH 06/11] . 
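Refactor the weight loader: load_from_file becomes a free template function in
file_loader.cc (the FileDataLoader member declaration is commented out in
file_loader.h), incr_decoding.cc gets formatting cleanups, and
LLAMA::create_llama_model takes a bool use_full_precision parameter instead of
a DataType.

Below is a minimal usage sketch of the free load_from_file, as it could be
called from inside file_loader.cc; it is illustrative only (the element count
and file path are placeholders) and assumes the binary file stores exactly
`size` values of the requested element type:

    #include <string>
    #include <vector>

    // Illustrative values; real sizes and paths come from the model config.
    size_t num_elements = 4096;
    std::string path = "/path/to/weights/tok_embeddings.weight";
    std::vector<float> host_buf(num_elements);
    // Reads num_elements floats from the binary file into the host buffer.
    load_from_file<float>(host_buf.data(), num_elements, path);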
--- inference/file_loader.cc | 4 +--- inference/file_loader.h | 10 +++++----- inference/incr_decoding/incr_decoding.cc | 14 ++++---------- inference/models/llama.cc | 6 +++--- 4 files changed, 13 insertions(+), 21 deletions(-) diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 60b2c80b06..257ebfc52b 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -668,9 +668,7 @@ void load_attention_weights_v2(DT *ptr, } template -void FileDataLoader::load_from_file(DT *ptr, - size_t size, - std::string filepath) { +void load_from_file(DT *ptr, size_t size, std::string filepath) { std::ifstream in(filepath, std::ios::in | std::ios::binary); if (!in.good()) { std::cout << "Could not open file: " << filepath << std::endl; diff --git a/inference/file_loader.h b/inference/file_loader.h index 579c6d4239..5da2d02a60 100644 --- a/inference/file_loader.h +++ b/inference/file_loader.h @@ -50,12 +50,12 @@ class FileDataLoader { ParallelTensor position_pt, int max_seq_length, int offset); - template - void load_from_file(DT *ptr, size_t size, std::string filepath); + // template + // void load_from_file(DT *ptr, size_t size, std::string filepath); + - private : int num_heads, - num_kv_heads, - tensor_parallelism_degree; +private: + int num_heads, num_kv_heads, tensor_parallelism_degree; size_t hidden_dim, qkv_inner_dim; std::string prompts_filepath; std::string weights_folder; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index e11eb14113..06e32c4855 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -235,17 +235,11 @@ void FlexFlow::top_level_task(Task const *task, generationConfig, data_type); } else if (model_type == ModelType::OPT) { - OPT::create_opt_model(model, - config_filepath, - weights_filepath, - INC_DECODING_MODE, - data_type); + OPT::create_opt_model( + model, config_filepath, weights_filepath, INC_DECODING_MODE, data_type); } else if (model_type == ModelType::FALCON) { - FALCON::create_falcon_model(model, - config_filepath, - weights_filepath, - INC_DECODING_MODE, - data_type); + FALCON::create_falcon_model( + model, config_filepath, weights_filepath, INC_DECODING_MODE, data_type); } else if (model_type == ModelType::STARCODER) { STARCODER::create_starcoder_model(model, config_filepath, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 40075bcdbf..e54d6d8811 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -25,7 +25,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string const &weight_file_path, InferenceMode mode, GenerationConfig generation_config, - DataType data_type) { + bool use_full_precision) { // do not apply cpu offload in beam search model. LLAMAConfig llama_config(model_config_file_path); llama_config.print(); @@ -54,7 +54,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.vocab_size, llama_config.hidden_size, AGGR_MODE_NONE, - data_type, + use_full_precision ? 
DT_FLOAT : DT_HALF, NULL, embed_init, "tok_embeddings"); @@ -276,7 +276,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, false); + fileloader.load_weights(&ff, use_full_precision); std::cout << "------load weight finished----------" << std::endl; // init operators From b56ce9053db84e2f4eed9755c2b4251a5fe135cc Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 16 Jan 2024 11:11:38 -0500 Subject: [PATCH 07/11] . --- inference/models/starcoder.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 5714ca1369..e683376e47 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -26,7 +26,7 @@ void STARCODER::create_starcoder_model( std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - DataType data_type) { + bool use_full_precision) { // do not apply cpu offload in beam search model. STARCODERConfig startcoder_config(model_config_file_path); startcoder_config.print(); @@ -62,7 +62,7 @@ void STARCODER::create_starcoder_model( startcoder_config.vocab_size, startcoder_config.hidden_size, AGGR_MODE_NONE, - data_type, + use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, "transformer_wte"); @@ -72,7 +72,7 @@ void STARCODER::create_starcoder_model( startcoder_config.max_position_embeddings, startcoder_config.hidden_size, AGGR_MODE_NONE, - data_type, + use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, "transformer_wpe"); @@ -232,7 +232,7 @@ void STARCODER::create_starcoder_model( startcoder_config.hidden_size / startcoder_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, false); + fileloader.load_weights(&ff, use_full_precision); std::cout << "------load weight finished----------" << std::endl; // init operators From d664239a6eae4fc083bb0e34cac9f6b4d368edcd Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 17 Jan 2024 10:18:32 -0500 Subject: [PATCH 08/11] fix --- include/flexflow/ffconst.h | 2 +- include/flexflow/utils/file_loader.h | 9 --------- inference/incr_decoding/incr_decoding.cc | 4 +++- inference/models/falcon.cc | 2 +- inference/models/falcon.h | 2 +- inference/models/llama.cc | 6 +++--- inference/models/llama.h | 2 +- inference/models/mpt.cc | 2 +- inference/models/mpt.h | 2 +- inference/models/opt.cc | 2 +- inference/models/opt.h | 2 +- inference/models/starcoder.cc | 8 ++++---- inference/models/starcoder.h | 2 +- inference/spec_infer/spec_infer.cc | 2 +- inference/utils/download_hf_model.py | 4 ++-- python/flexflow/serve/serve.py | 6 +++--- python/flexflow/type.py | 2 +- src/ops/add_bias_residual_layer_norm.cu | 2 +- src/ops/arg_topk.cpp | 2 +- src/ops/arg_topk.cu | 2 +- src/ops/argmax.cpp | 4 ++-- src/ops/argmax.cu | 6 +++--- src/ops/fused.cpp | 6 +++--- src/ops/fused.cu | 6 +++--- src/ops/inc_multihead_self_attention.cpp | 2 +- src/ops/inc_multihead_self_attention.cu | 10 +++++----- src/ops/kernels/element_binary_kernels.cpp | 2 +- src/ops/kernels/element_binary_kernels.cu | 2 +- src/ops/kernels/embedding_kernels.cpp | 8 ++++---- src/ops/kernels/embedding_kernels.cu | 8 ++++---- src/ops/kernels/linear_kernels.cpp | 4 ++-- src/ops/kernels/linear_kernels.cu | 8 ++++---- src/ops/kernels/residual_rms_norm_kernels.cu | 2 +- src/ops/kernels/rms_norm_kernels.cpp | 2 +- src/ops/kernels/rms_norm_kernels.cu | 2 +- src/ops/layer_norm.cpp | 2 +- 
src/ops/layer_norm.cu | 2 +- src/ops/linear.cc | 6 +++--- src/ops/residual_layer_norm.cpp | 2 +- src/ops/residual_layer_norm.cu | 2 +- src/ops/sampling.cu | 4 ++-- src/ops/sigmoid_silu_multi.cpp | 2 +- src/ops/sigmoid_silu_multi.cu | 2 +- src/ops/softmax.cc | 6 +++--- src/ops/spec_inc_multihead_self_attention.cpp | 2 +- src/ops/spec_inc_multihead_self_attention.cu | 4 ++-- src/ops/tree_inc_multihead_self_attention.cpp | 2 +- src/ops/tree_inc_multihead_self_attention.cu | 2 +- src/parallel_ops/combine.cc | 2 +- src/parallel_ops/reduction.cc | 2 +- src/parallel_ops/replicate.cc | 2 +- src/runtime/accessor.cc | 10 +++++----- src/runtime/cuda_helper.cu | 6 +++--- src/runtime/ffconst_utils.cc | 2 +- src/runtime/file_loader.cc | 2 +- src/runtime/hip_helper.cpp | 6 +++--- src/runtime/initializer_kernel.cpp | 2 +- src/runtime/initializer_kernel.cu | 2 +- src/runtime/model.cc | 2 +- src/runtime/operator.cc | 6 +++--- 60 files changed, 106 insertions(+), 113 deletions(-) diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 9613a041f7..49d5f4d1e4 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -35,7 +35,7 @@ enum DataType { DT_DOUBLE = 45, DT_INT4 = 46, DT_INT8 = 47, - DT_B16 = 48, + DT_BF16 = 48, DT_NONE = 49, }; diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h index 262bb42e72..39b46955b5 100644 --- a/include/flexflow/utils/file_loader.h +++ b/include/flexflow/utils/file_loader.h @@ -38,18 +38,10 @@ class FileDataLoader { template void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); -<<<<<<< HEAD:inference/file_loader.h void load_single_weight_tensor_b16(FFModel *ff, Layer *l, int weight_idx); - void load_quantization_weight(FFModel *ff, - Layer *l, - int weight_idx, - bool use_full_precision); - void load_weights(FFModel *ff, bool use_full_precision); -======= void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx); void load_weights(FFModel *ff); ->>>>>>> origin/inference:include/flexflow/utils/file_loader.h void load_positions(FFModel *ff, Tensor pt, @@ -59,7 +51,6 @@ class FileDataLoader { // template // void load_from_file(DT *ptr, size_t size, std::string filepath); - private: int num_heads, num_kv_heads, tensor_parallelism_degree; size_t hidden_dim, qkv_inner_dim; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 4dd44aa5cd..96017008db 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -81,7 +81,7 @@ void parse_input_args(char **argv, } if (!strcmp(argv[i], "--use-bfloat16-precision")) { use_bfloat16_precision = true; - data_type = DT_B16; + data_type = DT_BF16; continue; } // verbose logging to stdout @@ -171,6 +171,8 @@ void FlexFlow::top_level_task(Task const *task, {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); std::string tokenizer_filepath = join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + + // bfloat16 shares same weight file with float32 std::string weights_filepath = join_path( {file_paths.cache_folder_path, "weights", diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 6ba950b984..5396b32fbd 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -248,7 +248,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.hidden_size, falcon_config.hidden_size / falcon_config.n_head, ff.config.tensor_parallelism_degree, - use_full_precision); + true); 
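  // Note: `true` replaces the use_full_precision flag here; as the comment
  // added in incr_decoding.cc in this patch explains, bfloat16 shares the
  // same weight files as float32, so the loader is always pointed at the
  // full-precision dump and presumably converts to the model's data type at
  // load time.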
InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); diff --git a/inference/models/falcon.h b/inference/models/falcon.h index 26f0a094a1..7477b3523d 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -94,7 +94,7 @@ class FALCON { std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - DataType DataType); + DataType data_type); }; }; // namespace FlexFlow diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 14b8c31fa1..fcd5c64bfe 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -25,7 +25,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string const &weight_file_path, InferenceMode mode, GenerationConfig generation_config, - bool use_full_precision) { + DataType data_type) { // do not apply cpu offload in beam search model. LLAMAConfig llama_config(model_config_file_path); llama_config.print(); @@ -55,7 +55,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.vocab_size, llama_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "tok_embeddings"); @@ -273,7 +273,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree, - use_full_precision); + true); InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); diff --git a/inference/models/llama.h b/inference/models/llama.h index 9530b25061..b0f8c6b207 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -83,7 +83,7 @@ class LLAMA { std::string const &weight_file_path, InferenceMode mode, GenerationConfig generation_config, - DataType DataType); + DataType data_type); }; }; // namespace FlexFlow diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 6b23a4ede9..c5572deb47 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -255,7 +255,7 @@ void MPT::create_mpt_model(FFModel &ff, mpt_config.hidden_size, mpt_config.hidden_size / mpt_config.n_heads, ff.config.tensor_parallelism_degree, - use_full_precision); + true); InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); diff --git a/inference/models/mpt.h b/inference/models/mpt.h index a396814f58..510eaae5b0 100644 --- a/inference/models/mpt.h +++ b/inference/models/mpt.h @@ -70,7 +70,7 @@ class MPT { std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - DataType DataType); + DataType data_type); }; }; // namespace FlexFlow diff --git a/inference/models/opt.cc b/inference/models/opt.cc index a861953a69..a185af8b38 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -263,7 +263,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.hidden_size, opt_config.hidden_size / opt_config.num_attention_heads, ff.config.tensor_parallelism_degree, - use_full_precision); + true); InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); diff --git a/inference/models/opt.h b/inference/models/opt.h index 238ae19d4a..cc1cdf576b 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -97,7 +97,7 @@ class OPT { std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - DataType DataType); + DataType data_type); }; 
}; // namespace FlexFlow diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 2327c86119..71dbfb8fbb 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -26,7 +26,7 @@ void STARCODER::create_starcoder_model( std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - bool use_full_precision) { + DataType data_type) { // do not apply cpu offload in beam search model. STARCODERConfig startcoder_config(model_config_file_path); startcoder_config.print(); @@ -63,7 +63,7 @@ void STARCODER::create_starcoder_model( startcoder_config.vocab_size, startcoder_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "transformer_wte"); @@ -73,7 +73,7 @@ void STARCODER::create_starcoder_model( startcoder_config.max_position_embeddings, startcoder_config.hidden_size, AGGR_MODE_NONE, - use_full_precision ? DT_FLOAT : DT_HALF, + data_type, NULL, embed_init, "transformer_wpe"); @@ -230,7 +230,7 @@ void STARCODER::create_starcoder_model( startcoder_config.hidden_size, startcoder_config.hidden_size / startcoder_config.num_attention_heads, ff.config.tensor_parallelism_degree, - use_full_precision); + true); im->register_model_weights_loader(&ff, fileloader); #ifdef DEADCODE // Compile the model diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h index d35ae78435..fce7a27467 100644 --- a/inference/models/starcoder.h +++ b/inference/models/starcoder.h @@ -71,7 +71,7 @@ class STARCODER { std::string const &weight_file_path, InferenceMode mode, GenerationConfig generationConfig, - DataType DataType); + DataType data_type); }; }; // namespace FlexFlow diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index e0299527bb..d37b46d5ae 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -105,7 +105,7 @@ void parse_input_args(char **argv, } if (!strcmp(argv[i], "--use-bfloat16-precision")) { use_bfloat16_precision = true; - data_type = DT_B16; + data_type = DT_BF16; } // verbose logging to stdout if (!strcmp(argv[i], "--verbose")) { diff --git a/inference/utils/download_hf_model.py b/inference/utils/download_hf_model.py index f158d4e480..2a23527fb7 100644 --- a/inference/utils/download_hf_model.py +++ b/inference/utils/download_hf_model.py @@ -45,9 +45,9 @@ def main(args): elif args.half_precision_only: data_types = ff.DataType.DT_HALF elif args.bfloat16_precision_only: - data_types = ff.DataType.DT_B16 + data_types = ff.DataType.DT_BF16 else: - data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF, ff.DataType.DT_B16) + data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF, ff.DataType.DT_BF16) for model_name in args.model_names: for data_type in data_types: diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index b8afb285ad..3231b222b4 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -113,7 +113,7 @@ def __init__( self.config_class, ) = self.__get_ff_model_type() self.data_type = data_type - assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT or self.data_type == DataType.DT_B16 + assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT or self.data_type == DataType.DT_BF16 self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache self.output_file = output_file @@ -177,7 +177,7 @@ def 
download_hf_weights_if_needed(self): torch.set_default_tensor_type(torch.HalfTensor) elif self.data_type == DataType.DT_FLOAT: torch.set_default_tensor_type(torch.FloatTensor) - elif self.data_type == DataType.DT_B16: + elif self.data_type == DataType.DT_BF16: torch.set_default_tensor_type(torch.BFloat16Tensor) else: assert False, "Data type not yet supported -- cannot download weights!" @@ -189,7 +189,7 @@ def download_hf_weights_if_needed(self): self.model_name.lower(), "full-precision" if self.data_type == DataType.DT_FLOAT - else "bfloat16-precision" if self.data_type == DataType.DT_B16 + else "bfloat16-precision" if self.data_type == DataType.DT_BF16 else "half-precision", ) if self.refresh_cache: diff --git a/python/flexflow/type.py b/python/flexflow/type.py index df113438e8..04de9fa7d6 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -35,7 +35,7 @@ class DataType(Enum): DT_HALF = 43 DT_FLOAT = 44 DT_DOUBLE = 45 - DT_B16 = 48 + DT_BF16 = 48 DT_NONE = 49 diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index b137d6faef..8b372875ea 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -234,7 +234,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { AddBiasResidualLayerNorm::inference_kernel<__nv_bfloat16>( m, attn_bias_dim, diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp index 17cc7534ad..54166ab8d2 100644 --- a/src/ops/arg_topk.cpp +++ b/src/ops/arg_topk.cpp @@ -515,7 +515,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, m->sorted, m->speculative_decoding ? bc : nullptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { ArgTopK::forward_kernel(m, input.get_bfloat16_ptr(), m->speculative_decoding ? probs.get_float_ptr() diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 7f43aba948..671e9aeb25 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -520,7 +520,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, m->sorted, m->speculative_decoding ? bc : nullptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { ArgTopK::forward_kernel(m, input.get_bfloat16_ptr(), m->speculative_decoding ? 
probs.get_float_ptr() diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 7437b8b969..1c94585ba1 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -466,7 +466,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, length, batch_size, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { ArgMax::forward_kernel(m, input.get_bfloat16_ptr(), indices.get_int32_ptr(), @@ -501,7 +501,7 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, : OpMeta(handler, op) { DataType data_type = op->data_type; size_t prob_size = batch_size; - assert(data_type == DT_FLOAT || data_type == DT_HALF || data_type == DT_B16); + assert(data_type == DT_FLOAT || data_type == DT_HALF || data_type == DT_BF16); size_t total_size = prob_size * sizeof(float); gpu_mem_allocator.create_legion_instance(reserveInst, total_size); probs = gpu_mem_allocator.allocate_instance(prob_size); diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 32cd872a55..aebafc2fff 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -123,7 +123,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, length, batch_size, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { ArgMax::forward_kernel<__nv_bfloat16>( m, input.get_bfloat16_ptr(), @@ -163,7 +163,7 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, size_t d_offsets_size = batch_size; size_t prob_size = batch_size; - assert(data_type == DT_FLOAT || data_type == DT_HALF || data_type == DT_B16); + assert(data_type == DT_FLOAT || data_type == DT_HALF || data_type == DT_BF16); size_t total_size = d_offsets_size * sizeof(int) + (data_type == DT_FLOAT @@ -214,7 +214,7 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, d_offsets, d_offsets + 1, stream)); - } else if (data_type == DT_B16) { + } else if (data_type == DT_BF16) { checkCUDA(cub::DeviceSegmentedReduce::ArgMax( d_temp_storage, temp_storage_bytes, diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index af98f153e2..2daa48c3d0 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -428,7 +428,7 @@ __host__ void FusedOp::forward_task(Task const *task, m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); - } else if (m->input_type == DT_B16) { + } else if (m->input_type == DT_BF16) { Kernels::Softmax::forward_kernel_wrapper( m, my_input_accessor[0].get_bfloat16_ptr(), @@ -820,7 +820,7 @@ __host__ void my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr(), my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_B16) { + } else if (m->data_type == DT_BF16) { ElementUnary::forward_kernel_wrapper( m, my_input_accessor[0].get_bfloat16_ptr(), @@ -1050,7 +1050,7 @@ __host__ void m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); - } else if (m->input_type == DT_B16) { + } else if (m->input_type == DT_BF16) { Kernels::Softmax::forward_kernel_wrapper( m, my_input_accessor[0].get_bfloat16_ptr(), diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 77e12828a1..16b9a2b782 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -442,7 +442,7 @@ __host__ void FusedOp::forward_task(Task const *task, m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); - } else if (m->input_type == DT_B16) { + } else if (m->input_type == DT_BF16) { Kernels::Softmax::forward_kernel_wrapper( m, my_input_accessor[0].get_bfloat16_ptr(), @@ -855,7 +855,7 @@ __host__ void my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr(), 
my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_B16) { + } else if (m->data_type == DT_BF16) { ElementUnary::forward_kernel_wrapper( m, my_input_accessor[0].get_bfloat16_ptr(), @@ -1087,7 +1087,7 @@ __host__ void m, my_input_accessor[0].get_float_ptr(), my_output_accessor[0].get_float_ptr()); - } else if (m->input_type == DT_B16) { + } else if (m->input_type == DT_BF16) { Kernels::Softmax::forward_kernel_wrapper( m, my_input_accessor[0].get_bfloat16_ptr(), diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 4d63ba628d..ca83e7196a 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -802,7 +802,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { if (m->offload) { pre_build_weight_kernel(m, weight, input.data_type, stream); } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index af77ac9f6d..9b4cffad4c 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -506,7 +506,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { compute_type = CUBLAS_COMPUTE_32F; } #endif @@ -640,7 +640,7 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else cudaDataType_t compute_type = cublas_data_type; - if (m->output_type[0] == DT_B16) { + if (m->output_type[0] == DT_BF16) { compute_type = CUDA_R_32F; } #endif @@ -799,7 +799,7 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, m->weightSize, cudaMemcpyHostToDevice, stream); - } else if (data_type == DT_B16) { + } else if (data_type == DT_BF16) { cudaMemcpyAsync(m->weight_ptr, weight.get_bfloat16_ptr(), m->weightSize, @@ -927,7 +927,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { compute_type = CUBLAS_COMPUTE_32F; } #endif @@ -1202,7 +1202,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { if (m->offload) { pre_build_weight_kernel<__nv_bfloat16>( m, weight, input.data_type, stream); diff --git a/src/ops/kernels/element_binary_kernels.cpp b/src/ops/kernels/element_binary_kernels.cpp index 05467182e7..a353ab76ea 100644 --- a/src/ops/kernels/element_binary_kernels.cpp +++ b/src/ops/kernels/element_binary_kernels.cpp @@ -91,7 +91,7 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, in2.get_float_ptr(), out.get_float_ptr(), stream); - } else if (out.data_type == DT_B16) { + } else if (out.data_type == DT_BF16) { Internal::forward_kernel(m, in1.get_bfloat16_ptr(), in2.get_bfloat16_ptr(), diff --git a/src/ops/kernels/element_binary_kernels.cu b/src/ops/kernels/element_binary_kernels.cu index 26d9ea9cc9..a762f17774 100644 --- a/src/ops/kernels/element_binary_kernels.cu +++ 
b/src/ops/kernels/element_binary_kernels.cu @@ -105,7 +105,7 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, in2.get_float_ptr(), out.get_float_ptr(), stream); - } else if (out.data_type == DT_B16) { + } else if (out.data_type == DT_BF16) { Internal::forward_kernel( m, in1.get_bfloat16_ptr(), in2.get_bfloat16_ptr(), out.get_bfloat16_ptr(), stream); } else { diff --git a/src/ops/kernels/embedding_kernels.cpp b/src/ops/kernels/embedding_kernels.cpp index 7384f4e7e6..7612abd0f0 100644 --- a/src/ops/kernels/embedding_kernels.cpp +++ b/src/ops/kernels/embedding_kernels.cpp @@ -70,7 +70,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (weight.data_type == DT_B16) { + } else if (weight.data_type == DT_BF16) { Internal::forward_kernel(input.get_int32_ptr(), output.get_bfloat16_ptr(), weight.get_bfloat16_ptr(), @@ -114,7 +114,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (weight.data_type == DT_B16) { + } else if (weight.data_type == DT_BF16) { Internal::forward_kernel(input.get_int64_ptr(), output.get_bfloat16_ptr(), weight.get_bfloat16_ptr(), @@ -182,7 +182,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { Internal::backward_kernel(input.get_int32_ptr(), output.get_bfloat16_ptr(), weight_grad.get_bfloat16_ptr(), @@ -226,7 +226,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { Internal::backward_kernel(input.get_int64_ptr(), output.get_bfloat16_ptr(), weight_grad.get_bfloat16_ptr(), diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 8e40f4bf95..911aa5c45d 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -70,7 +70,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (weight.data_type == DT_B16) { + } else if (weight.data_type == DT_BF16) { Internal::forward_kernel(input.get_int32_ptr(), output.get_bfloat16_ptr(), weight.get_bfloat16_ptr(), @@ -114,7 +114,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (weight.data_type == DT_B16) { + } else if (weight.data_type == DT_BF16) { Internal::forward_kernel(input.get_int64_ptr(), output.get_bfloat16_ptr(), weight.get_bfloat16_ptr(), @@ -181,7 +181,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { Internal::backward_kernel(input.get_int32_ptr(), output.get_bfloat16_ptr(), weight_grad.get_bfloat16_ptr(), @@ -225,7 +225,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { Internal::backward_kernel(input.get_int64_ptr(), output.get_bfloat16_ptr(), weight_grad.get_bfloat16_ptr(), diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 3cd9862bba..1004fe2767 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -124,7 +124,7 @@ void forward_kernel_wrapper(LinearMeta const 
*m, out_dim, batch_size, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { Internal::forward_kernel(m, input_ptr, output_ptr, @@ -199,7 +199,7 @@ void backward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { Internal::backward_kernel(m, input_ptr, input_grad_ptr, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index f31fa112c0..77ed3b9a9e 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -58,7 +58,7 @@ LinearMeta::LinearMeta(FFHandler handler, min(CUDA_NUM_THREADS, parallelism), 0, stream>>>((half *)one_ptr, batch_size); - } else if (data_type == DT_B16) { + } else if (data_type == DT_BF16) { Kernels::Linear::Internal:: build_one_ptr<<input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { Internal::forward_kernel<__nv_bfloat16>(m, input_ptr, output_ptr, @@ -232,7 +232,7 @@ void backward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { Internal::backward_kernel<__nv_bfloat16>(m, input_ptr, input_grad_ptr, @@ -362,7 +362,7 @@ void forward_kernel(LinearMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { compute_type = CUBLAS_COMPUTE_32F; } #endif diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 16a113c9cb..46fc43fe53 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -204,7 +204,7 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, residual_output.get_float_ptr(), output.get_float_ptr(), stream); - } else if (output.data_type == DT_B16) { + } else if (output.data_type == DT_BF16) { forward_kernel(m, input1.get_bfloat16_ptr(), input2.get_bfloat16_ptr(), diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index c81e73dd77..45ff40f0d0 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -190,7 +190,7 @@ void forward_kernel_wrapper(RMSNormMeta const *m, weight.get_float_ptr(), output.get_float_ptr(), stream); - } else if (output.data_type == DT_B16) { + } else if (output.data_type == DT_BF16) { forward_kernel(m, input.get_bfloat16_ptr(), weight.get_bfloat16_ptr(), diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 19661724d2..115f290f14 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -246,7 +246,7 @@ void forward_kernel_wrapper(RMSNormMeta const *m, weight.get_float_ptr(), output.get_float_ptr(), stream); - } else if (output.data_type == DT_B16) { + } else if (output.data_type == DT_BF16) { forward_kernel(m, input.get_bfloat16_ptr(), weight.get_bfloat16_ptr(), diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index ff420901e4..f675899fcb 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -182,7 +182,7 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, gamma.get_half_ptr(), m->use_bias ? 
beta.get_half_ptr() : nullptr, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { LayerNorm::forward_kernel( m, input.get_bfloat16_ptr(), diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index e239023768..d6a67682e9 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -273,7 +273,7 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { LayerNorm::forward_kernel<__nv_bfloat16>( m, input.get_bfloat16_ptr(), diff --git a/src/ops/linear.cc b/src/ops/linear.cc index b33851d7be..29e06d3948 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -437,7 +437,7 @@ OpMeta *Linear::init_task(Task const *task, return init_task_with_dim( \ task, regions, ctx, runtime); \ } \ - } else if (output.data_type == DT_B16) { \ + } else if (output.data_type == DT_BF16) { \ if (linear->quantization_type != DT_NONE) { \ return init_task_with_dim<__ff_bfloat16, char, DIM>( \ task, regions, ctx, runtime); \ @@ -712,7 +712,7 @@ void Linear::forward_task(Task const *task, return forward_task_with_dim( \ task, regions, ctx, runtime); \ } \ - } else if (m->output_type[0] == DT_B16) { \ + } else if (m->output_type[0] == DT_BF16) { \ if (m->quantization_type != DT_NONE) { \ return forward_task_with_dim<__ff_bfloat16, char, DIM>( \ task, regions, ctx, runtime); \ @@ -875,7 +875,7 @@ void Linear::backward_task(Task const *task, return backward_task_with_dim(task, regions, ctx, runtime); \ } else if (m->output_type[0] == DT_FLOAT) { \ return backward_task_with_dim(task, regions, ctx, runtime); \ - } else if (m->output_type[0] == DT_B16) { \ + } else if (m->output_type[0] == DT_BF16) { \ return backward_task_with_dim<__ff_bfloat16, DIM>( \ task, regions, ctx, runtime); \ } else { \ diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index 09823adf27..290d8cb3d9 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -230,7 +230,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { ResidualLayerNorm::inference_kernel( m, input.get_bfloat16_ptr(), diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index fc0bd5d5a2..2f310c3c2c 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -225,7 +225,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( m->elementwise_affine ? gamma.get_half_ptr() : nullptr, (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, stream); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { ResidualLayerNorm::inference_kernel<__nv_bfloat16>( m, input.get_bfloat16_ptr(), diff --git a/src/ops/sampling.cu b/src/ops/sampling.cu index d4044a9cc3..96438fa93a 100644 --- a/src/ops/sampling.cu +++ b/src/ops/sampling.cu @@ -182,7 +182,7 @@ void Sampling::forward_kernel_wrapper(SamplingMeta const *m, length, batch_size, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { Sampling::forward_kernel<__nv_bfloat16>(m, input.get_bfloat16_ptr(), indices.get_int32_ptr(), @@ -278,7 +278,7 @@ SamplingMeta::SamplingMeta(FFHandler handler, 0, // begin_bit data_type_size(data_type) * 8, // end_bit = sizeof(KeyT) * 8 stream)); - } else if (data_type == DT_B16) { + } else if (data_type == DT_BF16) { checkCUDA(cub::DeviceSegmentedRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index b446ed0065..b45625d204 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -101,7 +101,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( input1.get_half_ptr(), input2.get_half_ptr(), output.get_half_ptr()); - } else if (m->input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { SigmoidSiluMultiKernel<<input_type[0] == DT_B16) { + } else if (m->input_type[0] == DT_BF16) { SigmoidSiluMultiKernel<<output_type == DT_FLOAT) { forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); - } else if (m->output_type == DT_B16) { + } else if (m->output_type == DT_BF16) { forward_kernel_wrapper( m, input.get_bfloat16_ptr(), output.get_bfloat16_ptr()); } else { @@ -369,7 +369,7 @@ void Softmax::backward_task(Task const *task, return backward_task_with_dim(task, regions, ctx, runtime); \ } else if (m->output_type == DT_FLOAT) { \ return backward_task_with_dim(task, regions, ctx, runtime); \ - } else if (m->output_type == DT_B16) { \ + } else if (m->output_type == DT_BF16) { \ return backward_task_with_dim<__ff_bfloat16, DIM>( \ task, regions, ctx, runtime); \ } else { \ @@ -435,7 +435,7 @@ void Softmax::inference_task(Task const *task, forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); } else if (m->output_type == DT_FLOAT) { forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); - } else if (m->output_type == DT_B16) { + } else if (m->output_type == DT_BF16) { forward_kernel_wrapper( m, input.get_bfloat16_ptr(), output.get_bfloat16_ptr()); } else { diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index a7b8d4535c..ca0d607ecd 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -562,7 +562,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { float const *bias_ptr = use_bias ? 
bias.get_bfloat16_ptr() : static_cast(nullptr); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 885a09c980..0d3e3a1250 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -480,7 +480,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; if (m->output_type[0] == DT_FLOAT) { compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } else if (m->output_type[0] == DT_B16) { + } else if (m->output_type[0] == DT_BF16) { compute_type = CUBLAS_COMPUTE_32F; } #endif @@ -792,7 +792,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { __nv_bfloat16 const *bias_ptr = use_bias ? bias.get_bfloat16_ptr() : static_cast<__nv_bfloat16 const *>(nullptr); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 4f399aafc5..754ae24793 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -570,7 +570,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { if (m->offload) { pre_build_weight_kernel(m, weight, input.data_type, stream); } diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 1863dde1e7..2b50e93a02 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -1003,7 +1003,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( output.get_float_ptr(), bias_ptr, stream); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { if (m->offload) { pre_build_weight_kernel<__nv_bfloat16>( m, weight, input.data_type, stream); diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 8bd963b3d3..ddbafc23c9 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -365,7 +365,7 @@ void Combine::forward_task(Task const *task, DataType data_type = m->input_type[0]; if (data_type == DT_HALF) { forward_task_with_type(task, regions, ctx, runtime); - } else if (data_type == DT_B16) { + } else if (data_type == DT_BF16) { forward_task_with_type<__ff_bfloat16>(task, regions, ctx, runtime); } else if (data_type == DT_FLOAT) { forward_task_with_type(task, regions, ctx, runtime); diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index bd7cb4085b..dd2d4a54c2 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -380,7 +380,7 @@ void Reduction::forward_task(Task const *task, output.get_float_ptr(), num_elements, num_replicas); - } else if (input.data_type == DT_B16) { + } else if (input.data_type == DT_BF16) { forward_kernel<__ff_bfloat16>(input.get_bfloat16_ptr(), output.get_bfloat16_ptr(), num_elements, diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 18e4d26f26..c3821a7d74 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -373,7 +373,7 @@ void Replicate::forward_task(Task const *task, forward_kernel(input.get_float_ptr(), output.get_float_ptr(), input_domain.get_volume()); - } else if (input.data_type == DT_B16) { + } else if 
(input.data_type == DT_BF16) { forward_kernel<__ff_bfloat16>(input.get_bfloat16_ptr(), output.get_bfloat16_ptr(), input_domain.get_volume()); diff --git a/src/runtime/accessor.cc b/src/runtime/accessor.cc index 30aef93df9..a4bd0c11a2 100644 --- a/src/runtime/accessor.cc +++ b/src/runtime/accessor.cc @@ -78,7 +78,7 @@ half const *GenericTensorAccessorR::get_half_ptr() const { } __ff_bfloat16 const *GenericTensorAccessorR::get_bfloat16_ptr() const { - if (data_type == DT_B16) { + if (data_type == DT_BF16) { return static_cast<__ff_bfloat16 const *>(ptr); } else { assert(false && "Invalid Accessor Type"); @@ -175,7 +175,7 @@ half *GenericTensorAccessorW::get_half_ptr() const { } __ff_bfloat16 *GenericTensorAccessorW::get_bfloat16_ptr() const { - if (data_type == DT_B16) { + if (data_type == DT_BF16) { return static_cast<__ff_bfloat16 *>(ptr); } else { assert(false && "Invalid Accessor Type"); @@ -289,7 +289,7 @@ GenericTensorAccessorR ptr = helperGetTensorPointerRO(region, req, fid, ctx, runtime); break; } - case DT_B16: { + case DT_BF16: { ptr = helperGetTensorPointerRO<__ff_bfloat16>( region, req, fid, ctx, runtime); break; @@ -340,7 +340,7 @@ GenericTensorAccessorW ptr = helperGetTensorPointerWO(region, req, fid, ctx, runtime); break; } - case DT_B16: { + case DT_BF16: { ptr = helperGetTensorPointerWO<__ff_bfloat16>( region, req, fid, ctx, runtime); break; @@ -391,7 +391,7 @@ GenericTensorAccessorW ptr = helperGetTensorPointerRW(region, req, fid, ctx, runtime); break; } - case DT_B16: { + case DT_BF16: { ptr = helperGetTensorPointerRW<__ff_bfloat16>( region, req, fid, ctx, runtime); break; diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 91d139a414..46ab6b42da 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -546,7 +546,7 @@ cudnnDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { case DT_HALF: return CUDNN_DATA_HALF; - case DT_B16: + case DT_BF16: return CUDNN_DATA_BFLOAT16; case DT_FLOAT: return CUDNN_DATA_FLOAT; @@ -564,7 +564,7 @@ cudaDataType_t ff_to_cuda_datatype(DataType type) { switch (type) { case DT_HALF: return CUDA_R_16F; - case DT_B16: + case DT_BF16: return CUDA_R_16BF; case DT_FLOAT: return CUDA_R_32F; @@ -583,7 +583,7 @@ ncclDataType_t ff_to_nccl_datatype(DataType type) { switch (type) { case DT_HALF: return ncclHalf; - case DT_B16: + case DT_BF16: return ncclBfloat16; case DT_FLOAT: return ncclFloat; diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 232d711d82..8c130fcfb1 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -213,7 +213,7 @@ size_t data_type_size(DataType type) { switch (type) { case DT_HALF: return sizeof(half); - case DT_B16: + case DT_BF16: return sizeof(__ff_bfloat16); case DT_FLOAT: return sizeof(float); diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 3a765dbeb1..6f5864acff 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -1137,7 +1137,7 @@ void FileDataLoader::load_weights(FFModel *ff) { case DT_FLOAT: load_single_weight_tensor(ff, l, i); break; - case DT_B16: + case DT_BF16: load_single_weight_tensor_b16(ff, l, i); break; case DT_INT4: diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 00a6730de0..ab1f9d6ece 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -516,7 +516,7 @@ miopenDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { case DT_HALF: return miopenHalf; - case DT_B16: + case DT_BF16: return miopenBFloat16; 
case DT_FLOAT: return miopenFloat; @@ -539,7 +539,7 @@ hipblasDatatype_t ff_to_cuda_datatype(DataType type) { return HIPBLAS_R_64F; case DT_INT32: return HIPBLAS_R_32I; - case DT_B16: + case DT_BF16: return HIPBLAS_R_16B; case DT_HALF: return HIPBLAS_R_16F; @@ -553,7 +553,7 @@ ncclDataType_t ff_to_nccl_datatype(DataType type) { switch (type) { case DT_HALF: return ncclHalf; - case DT_B16: + case DT_BF16: return ncclBfloat16; case DT_FLOAT: return ncclFloat; diff --git a/src/runtime/initializer_kernel.cpp b/src/runtime/initializer_kernel.cpp index 2e55591729..659c8a4d2c 100644 --- a/src/runtime/initializer_kernel.cpp +++ b/src/runtime/initializer_kernel.cpp @@ -259,7 +259,7 @@ void ZeroInitializer::init_task(Task const *task, w, domain.get_volume(), 0.0f); - } else if (meta->data_types[i] == DT_B16) { + } else if (meta->data_types[i] == DT_BF16) { hip_bfloat16 *w = helperGetTensorPointerWO( regions[i], task->regions[i], FID_DATA, ctx, runtime); hipLaunchKernelGGL(HIP_KERNEL_NAME(assign_kernel), diff --git a/src/runtime/initializer_kernel.cu b/src/runtime/initializer_kernel.cu index b6b9e29b8b..4c114f5de6 100644 --- a/src/runtime/initializer_kernel.cu +++ b/src/runtime/initializer_kernel.cu @@ -235,7 +235,7 @@ void ZeroInitializer::init_task(Task const *task, assign_kernel <<>>( w, domain.get_volume(), 0.0f); - } else if (meta->data_types[i] == DT_B16) { + } else if (meta->data_types[i] == DT_BF16) { __nv_bfloat16 *w = helperGetTensorPointerWO<__nv_bfloat16>( regions[i], task->regions[i], FID_DATA, ctx, runtime); assign_kernel<__nv_bfloat16> diff --git a/src/runtime/model.cc b/src/runtime/model.cc index acaee6b7e4..a826bb5c34 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1947,7 +1947,7 @@ void FFModel::map_tensor_with_dim2(ParallelTensor tensor, case DT_HALF: allocator.allocate_field(sizeof(half), FID_DATA); break; - case DT_B16: + case DT_BF16: allocator.allocate_field(sizeof(__ff_bfloat16), FID_DATA); break; case DT_FLOAT: diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index 6349bfd1a5..3cb71f910e 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -62,7 +62,7 @@ void Op::save_inference_tensors_to_file( save_tensor(input_tensors[i].get_half_ptr(), input_tensors[i].domain.get_volume(), filename.c_str()); - } else if (input_tensors[i].data_type == DT_B16) { + } else if (input_tensors[i].data_type == DT_BF16) { save_tensor(input_tensors[i].get_bfloat16_ptr(), input_tensors[i].domain.get_volume(), filename.c_str()); @@ -90,7 +90,7 @@ void Op::save_inference_tensors_to_file( save_tensor(weight_tensors[i].get_half_ptr(), weight_tensors[i].domain.get_volume(), filename.c_str()); - } else if (weight_tensors[i].data_type == DT_B16) { + } else if (weight_tensors[i].data_type == DT_BF16) { save_tensor(weight_tensors[i].get_bfloat16_ptr(), weight_tensors[i].domain.get_volume(), filename.c_str()); @@ -118,7 +118,7 @@ void Op::save_inference_tensors_to_file( save_tensor(output_tensors[i].get_half_ptr(), output_tensors[i].domain.get_volume(), filename.c_str()); - } else if (output_tensors[i].data_type == DT_B16) { + } else if (output_tensors[i].data_type == DT_BF16) { save_tensor(output_tensors[i].get_bfloat16_ptr(), output_tensors[i].domain.get_volume(), filename.c_str()); From 5f2afc67e91f076ba6bc213225ce7f682b1c1100 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 17 Jan 2024 10:40:25 -0500 Subject: [PATCH 09/11] . 
--- inference/incr_decoding/incr_decoding.cc | 2 +- inference/models/opt.cc | 17 ++++++++--------- tests/inference/huggingface_inference.py | 9 ++++++++- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 96017008db..75f5a7b2bc 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -172,7 +172,7 @@ void FlexFlow::top_level_task(Task const *task, std::string tokenizer_filepath = join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); - // bfloat16 shares same weight file with float32 + // bfloat16 shares same weight file with float32 std::string weights_filepath = join_path( {file_paths.cache_folder_path, "weights", diff --git a/inference/models/opt.cc b/inference/models/opt.cc index a185af8b38..23f910315e 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -63,15 +63,14 @@ void OPT::create_opt_model(FFModel &ff, embed_init, "embed_tokens"); - Tensor positional_embedding = - ff.embedding(position_input, - opt_config.max_position_embeddings, - opt_config.hidden_size, - AGGR_MODE_NONE, - data_type, - NULL, - embed_init, - "embed_positions"); + Tensor positional_embedding = ff.embedding(position_input, + opt_config.max_position_embeddings, + opt_config.hidden_size, + AGGR_MODE_NONE, + data_type, + NULL, + embed_init, + "embed_positions"); Tensor fc2 = nullptr, added = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 5b533bf3c0..4a8b5920ea 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -26,6 +26,9 @@ def main(): parser.add_argument( "--use-full-precision", action="store_true", help="Use full precision" ) + parser.add_argument( + "--use-bfloat16-precision", action="store_true", help="Use bf16 precision" + ) parser.add_argument("--do-sample", action="store_true", help="Use sampling") parser.add_argument("--gpu", action="store_true", help="Run on GPU") args = parser.parse_args() @@ -47,7 +50,11 @@ def main(): return # Set default tensor type depending on argument indicating the float type to use - if not args.use_full_precision: + if args.use_full_precision: + torch.set_default_tensor_type(torch.FloatTensor) + elif args.use_bfloat16_precision: + torch.set_default_tensor_type(torch.BFloat16Tensor) + else: torch.set_default_tensor_type(torch.HalfTensor) # Run huggingface model From f458f3c8a1085572eb6e6081e1e30c07e0134398 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 17 Jan 2024 10:48:56 -0500 Subject: [PATCH 10/11] format --- inference/spec_infer/spec_infer.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index d37b46d5ae..9fe686f8e7 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -300,7 +300,8 @@ void FlexFlow::top_level_task(Task const *task, max_tokens_per_batch, max_sequence_length); - get_model_meta(file_paths, model_metadata, use_full_precision, use_bfloat16_precision); + get_model_meta( + file_paths, model_metadata, use_full_precision, use_bfloat16_precision); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == From 6fcfbad172ecee05c5f2aa39962dc9fe93a9ef8f Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 19 Jan 2024 07:49:53 -0500 
Subject: [PATCH 11/11] fix --- src/ops/add_bias_residual_layer_norm.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index 8b372875ea..e503c2603d 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -138,9 +138,9 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { int64_t const index = i * N + j; const T_ACC gamma_v = - gamma == nullptr ? T_ACC(1) : static_cast<T_ACC>(gamma[j]); + gamma == nullptr ? T_ACC(1.0f) : static_cast<T_ACC>(gamma[j]); const T_ACC beta_v = - beta == nullptr ? T_ACC(0) : static_cast<T_ACC>(beta[j]); + beta == nullptr ? T_ACC(0.0f) : static_cast<T_ACC>(beta[j]); Y[index] = (static_cast<T_ACC>(X[index]) - static_cast<T_ACC>(mean[i])) * static_cast<T_ACC>(rstd[i]) * gamma_v + beta_v;
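
To make the pattern in this final hunk easier to follow in isolation, here is a minimal standalone CUDA sketch of the same accumulate-and-cast idea. It is illustrative only: the kernel name (affine_normalize), the float mean/rstd buffers, and the one-block-per-row launch are assumptions of this sketch, not FlexFlow's LayerNormFusedForwardKernel or its launch configuration.

#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cstdint>

// Hypothetical reduced kernel: applies precomputed per-row mean/rstd and the
// optional gamma/beta affine step, accumulating in T_ACC (float by default).
// One block per row; threads stride across the N columns of that row.
template <typename T, typename T_ACC = float>
__global__ void affine_normalize(int64_t N,
                                 T const *X,
                                 float const *mean,
                                 float const *rstd,
                                 T const *gamma, // may be nullptr
                                 T const *beta,  // may be nullptr
                                 T *Y) {
  int64_t const i = blockIdx.x;
  for (int64_t j = threadIdx.x; j < N; j += blockDim.x) {
    int64_t const index = i * N + j;
    // Float literals for the defaults, following the hunk above.
    T_ACC const gamma_v =
        gamma == nullptr ? T_ACC(1.0f) : static_cast<T_ACC>(gamma[j]);
    T_ACC const beta_v =
        beta == nullptr ? T_ACC(0.0f) : static_cast<T_ACC>(beta[j]);
    T_ACC const x = static_cast<T_ACC>(X[index]);
    // Normalize in T_ACC; cast back to the storage type only at the end.
    Y[index] = static_cast<T>((x - static_cast<T_ACC>(mean[i])) *
                                  static_cast<T_ACC>(rstd[i]) * gamma_v +
                              beta_v);
  }
}

// Possible launch, one block per row of a [rows, N] tensor:
//   affine_normalize<__nv_bfloat16>
//       <<<rows, 256, 0, stream>>>(N, X, mean, rstd, gamma, beta, Y);

Keeping the accumulator a plain float and converting only at the load/store boundaries is what lets one template body serve float, half, and __nv_bfloat16 storage types alike.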