diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index 37cc986f5e..d60386f927 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -257,10 +257,11 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
   DT alpha = 1.0f, beta = 0.0f;
   assert(m->qSize == m->vSize && m->qSize == m->kSize);
   hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]);
-#if CUDA_VERSION >= 11000
-  // TODO: currently set the default to HIPBLAS_COMPUTE_16F for best performance
-  cublasComputeType_t compute_type = HIPBLAS_COMPUTE_16F;
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
+  hipblasDatatype_t compute_type = hipblas_data_type;
 #else
+  // TODO: currently use the hipblas_data_type
+  // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
   hipblasDatatype_t compute_type = hipblas_data_type;
 #endif
   // Compute (W^T)x matmul: einsum(ijkl,im->jmkl)
@@ -509,10 +510,11 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m,
   hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]);
   miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]);
   assert(data_type_size(m->output_type[0]) == sizeof(DT));
-#if CUDA_VERSION >= 11000
-  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
+  hipblasDatatype_t compute_type = hipblas_data_type;
 #else
+  // TODO: currently use the hipblas_data_type
+  // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
   hipblasDatatype_t compute_type = hipblas_data_type;
 #endif
   // int num_requests = bc->num_active_requests();
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index e5a1f4f2fc..20f7d64936 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -507,11 +507,16 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
   DT alpha = 1.0f, beta = 0.0f;
   assert(m->qSize == m->vSize && m->qSize == m->kSize);
   cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
-#if CUDA_VERSION >= 11000
-  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
-#else
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
   cudaDataType_t compute_type = cublas_data_type;
+#else
+  // For best performance, set the default cublas compute type to
+  // CUBLAS_COMPUTE_16F for half precision and to
+  // CUBLAS_COMPUTE_32F_FAST_16F for full precision
+  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+  if (m->output_type[0] == DT_FLOAT) {
+    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
+  }
 #endif
   // Compute (W^T)x matmul: einsum(ijkl,im->jmkl)
   // Weights: qSize x qProjSize x 3 x num_q_heads
@@ -905,11 +910,16 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m,
   cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
   cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
   assert(data_type_size(m->output_type[0]) == sizeof(DT));
-#if CUDA_VERSION >= 11000
-  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
-#else
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
   cudaDataType_t compute_type = cublas_data_type;
+#else
+  // For best performance, set the default cublas compute type to
+  // CUBLAS_COMPUTE_16F for half precision and to
+  // CUBLAS_COMPUTE_32F_FAST_16F for full precision
+  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+  if (m->output_type[0] == DT_FLOAT) {
+    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
+  }
 #endif
   // int num_requests = bc->num_active_requests();
   int num_tokens = bc->num_active_tokens();
diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp
index 231ca0f3d7..072eb5e96b 100644
--- a/src/ops/kernels/linear_kernels.cpp
+++ b/src/ops/kernels/linear_kernels.cpp
@@ -241,11 +241,12 @@ void forward_kernel(LinearMeta const *m,
   hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]);
   hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]);
   hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]);
-#if CUDA_VERSION >= 11000
-  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
+  hipblasDatatype_t compute_type = output_type;
 #else
-  hipblasDatatype_t compute_type = input_type;
+  // TODO: currently use the output_type
+  // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+  hipblasDatatype_t compute_type = output_type;
 #endif
   checkCUDA(hipblasGemmEx(m->handle.blas,
                           HIPBLAS_OP_T,
@@ -337,11 +338,12 @@ void backward_kernel(LinearMeta const *m,
   hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]);
   hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]);
   hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]);
-#if CUDA_VERSION >= 11000
-  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
+  hipblasDatatype_t compute_type = output_type;
 #else
-  hipblasDatatype_t compute_type = HIPBLAS_R_32F;
+  // TODO: currently use output_type
+  // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+  hipblasDatatype_t compute_type = output_type;
 #endif
   int output_size = out_dim * batch_size;
   if (m->activation == AC_MODE_RELU) {
diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu
index 8a93357dcf..9373c2fb2f 100644
--- a/src/ops/kernels/linear_kernels.cu
+++ b/src/ops/kernels/linear_kernels.cu
@@ -311,11 +311,16 @@ void forward_kernel(LinearMeta const *m,
                                  : ff_to_cuda_datatype(m->weight_type[0]);
   cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]);
   assert(input_type == weight_type && weight_type == output_type);
-#if CUDA_VERSION >= 11000
-  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
+  cudaDataType_t compute_type = cublas_data_type;
 #else
-  cudaDataType_t compute_type = input_type;
+  // For best performance, set the default cublas compute type to
+  // CUBLAS_COMPUTE_16F for half precision and to
+  // CUBLAS_COMPUTE_32F_FAST_16F for full precision
+  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+  if (m->output_type[0] == DT_FLOAT) {
+    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
+  }
 #endif
   checkCUDA(cublasGemmEx(m->handle.blas,
                          CUBLAS_OP_T,
@@ -401,11 +406,16 @@ void backward_kernel(LinearMeta const *m,
   cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]);
   cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]);
   cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]);
-#if CUDA_VERSION >= 11000
-  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
+  cudaDataType_t compute_type = cublas_data_type;
 #else
-  cudaDataType_t compute_type = CUDA_R_32F;
+  // For best performance, set the default cublas compute type to
+  // CUBLAS_COMPUTE_16F for half precision and to
+  // CUBLAS_COMPUTE_32F_FAST_16F for full precision
+  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+  if (m->output_type[0] == DT_FLOAT) {
+    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
+  }
 #endif
   int output_size = out_dim * batch_size;
   if (m->activation == AC_MODE_RELU) {
diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp
index 1d81ae0c11..b1687d12a2 100644
--- a/src/ops/spec_inc_multihead_self_attention.cpp
+++ b/src/ops/spec_inc_multihead_self_attention.cpp
@@ -200,10 +200,11 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
   hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]);
   miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]);
   assert(data_type_size(m->output_type[0]) == sizeof(DT));
-#if CUDA_VERSION >= 11000
-  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
+  hipblasDatatype_t compute_type = hipblas_data_type;
 #else
+  // TODO: currently use the hipblas_data_type
+  // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
   hipblasDatatype_t compute_type = hipblas_data_type;
 #endif
   // int num_requests = bc->num_active_requests();
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 9adc7468bb..6dad1c6de9 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -216,11 +216,16 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m,
   cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
   cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
   assert(data_type_size(m->output_type[0]) == sizeof(DT));
-#if CUDA_VERSION >= 11000
-  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
-#else
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
   cudaDataType_t compute_type = cublas_data_type;
+#else
+  // For best performance, set the default cublas compute type to
+  // CUBLAS_COMPUTE_16F for half precision and to
+  // CUBLAS_COMPUTE_32F_FAST_16F for full precision
+  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+  if (m->output_type[0] == DT_FLOAT) {
+    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
+  }
 #endif
   // int num_requests = bc->num_active_requests();
   int num_tokens = bc->num_active_tokens();
diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp
index 1d9ebf67e0..26291fb3b4 100644
--- a/src/ops/tree_inc_multihead_self_attention.cpp
+++ b/src/ops/tree_inc_multihead_self_attention.cpp
@@ -157,10 +157,11 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
   hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]);
   miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]);
   assert(data_type_size(m->output_type[0]) == sizeof(DT));
-#if CUDA_VERSION >= 11000
-  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
+  hipblasDatatype_t compute_type = hipblas_data_type;
 #else
+  // TODO: currently use the hipblas_data_type
+  // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
   hipblasDatatype_t compute_type = hipblas_data_type;
 #endif
   // int num_requests = bc->num_active_requests();
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 1851d850b5..bc7d1017b7 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -430,11 +430,16 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
   cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
   cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
   assert(data_type_size(m->output_type[0]) == sizeof(DT));
-#if CUDA_VERSION >= 11000
-  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
-#else
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
   cudaDataType_t compute_type = cublas_data_type;
+#else
+  // For best performance, set the default cublas compute type to
+  // CUBLAS_COMPUTE_16F for half precision and to
+  // CUBLAS_COMPUTE_32F_FAST_16F for full precision
+  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+  if (m->output_type[0] == DT_FLOAT) {
+    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
+  }
 #endif
   // int num_requests = bc->num_active_requests();
   int processed_tokens_in_batch = 0;
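
Every CUDA hunk above repeats the same compute-type selection, so the change can be read as one pattern: default to CUBLAS_COMPUTE_16F, but switch to CUBLAS_COMPUTE_32F_FAST_16F when the operator's output is full precision. The sketch below is illustrative only and is not part of the patch: the helper name select_cublas_compute_type is hypothetical, the local DataType enum stands in for FlexFlow's, and it assumes CUDA 11.0 or newer (where cublasComputeType_t exists).

// Illustrative sketch, not part of the patch: factors out the compute-type
// choice that the diff inlines at each cublasGemmEx call site.
#include <cublas_v2.h>

enum DataType { DT_HALF, DT_FLOAT };  // stand-in for FlexFlow's DataType enum

// Default to half-precision math; for full-precision operators keep FP32
// accumulation while still allowing FP16 tensor-core math.
inline cublasComputeType_t select_cublas_compute_type(DataType output_type) {
  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
  if (output_type == DT_FLOAT) {
    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
  }
  return compute_type;
}

With a helper like this, each #else branch in the .cu files could reduce to a single call such as select_cublas_compute_type(m->output_type[0]); the HIP (.cpp) files keep passing the hipBLAS data type directly, as their TODO comments note.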