Half precision support #1257

Open — wants to merge 62 commits into base: develop

Commits (62):
4ed36a8
only can compile cuda/omp
yhmtsai Nov 29, 2022
ddc7c16
next_precision to itself when complex only float, double add empty co…
yhmtsai Jan 5, 2023
501e6c7
can compile with cuda/omp/ref (without test)
yhmtsai Jan 8, 2023
957d29c
compile for cuda/sycl/test/mpi (hip needs trick)
yhmtsai Jan 11, 2023
a0c389c
hip finally
yhmtsai Jan 12, 2023
fe5e491
fix the narrow issue and atomic support
yhmtsai Jan 12, 2023
6c3c12b
fixed more error
yhmtsai Jan 12, 2023
a0ee872
fix the op order and gdb
yhmtsai Jan 12, 2023
5acbf27
add the rand template not_implemented
yhmtsai Jan 12, 2023
b171312
this version can compile/run complex<half> on cuda114
yhmtsai Jan 12, 2023
6c17701
does not work for the other executor
yhmtsai Jan 14, 2023
cdbf0a0
fix complex issue and sqrt issue
yhmtsai Feb 6, 2023
209c799
try fix the compilation issue from MSVC and MacOS
yhmtsai Feb 6, 2023
75b54fa
move the half to public and use sycl::half for dpcpp
yhmtsai Feb 7, 2023
48ea338
limit the next precision in test and benchmark
yhmtsai Feb 7, 2023
fdcc066
allow disable half operation
yhmtsai Feb 7, 2023
f0a8a07
fix macro
yhmtsai Feb 8, 2023
f041b4a
clean and refine the code
yhmtsai Feb 8, 2023
3154a04
move half.hpp out of type.hpp
yhmtsai Feb 8, 2023
58784ab
enable half for testing
yhmtsai Feb 8, 2023
9b2465b
__habs is added in cuda10.2
yhmtsai Feb 8, 2023
e2a6c9a
fix nullptr and missing instantiation.
yhmtsai Feb 9, 2023
51cf597
fix missing device_type and ptr_param
yhmtsai Mar 23, 2023
0a42796
update rounding
yhmtsai Mar 25, 2023
e3b81df
do not use distribution with half
yhmtsai Mar 27, 2023
c9fd747
WIP fix half of failed test
yhmtsai Mar 27, 2023
684cadb
fix/skip half test and fix numeric_limit on device
yhmtsai Jun 13, 2023
60767ed
mkl csr does not support half
yhmtsai Jun 21, 2023
d65255a
add half to batch_vector
yhmtsai Sep 7, 2023
5c0454f
fix hip thrust complex op, avoid const in nvhpc, reduce job in windows
yhmtsai Sep 12, 2023
da15916
fix nvc++ atomic, dpcpp half
yhmtsai Sep 13, 2023
5f9e3ff
make half test optional
yhmtsai Sep 14, 2023
fe45560
nvhpc optimization/computation error workaround
yhmtsai Sep 15, 2023
c7f0d2a
some math func is not defined if nvhpc is for host
yhmtsai Sep 29, 2023
710e037
add half spmv benchmark (with cusparse for cuda)
yhmtsai Sep 30, 2023
34845f3
fixes batched support for half
MarcelKoch Oct 24, 2023
48afbb5
generate PTX load/stores for half
MarcelKoch Nov 3, 2023
a51f136
fix mc64 for half
MarcelKoch Dec 12, 2023
60123dc
fix hip memory.hip.hpp for half
MarcelKoch Dec 19, 2023
8f1e28f
WIP: can compile but three tests are still failed
yhmtsai Apr 20, 2024
6dbd616
fix config, ambiguous namespace, and batch
yhmtsai Jul 3, 2024
cd270e1
update format
yhmtsai Jul 3, 2024
69d5b59
check the failed tests
yhmtsai Sep 17, 2024
57fc170
fix windows and icpx
yhmtsai Sep 18, 2024
18e825f
hip does not support atomic on 16 bits
yhmtsai Sep 18, 2024
825f76f
fix batch
yhmtsai Sep 18, 2024
81d63ac
add miss instantiation
yhmtsai Sep 19, 2024
2a6d382
update documentation, remove half.hpp
yhmtsai Sep 24, 2024
8731fc3
put function in gko not std
yhmtsai Sep 24, 2024
64406f3
fix after rebase
yhmtsai Oct 2, 2024
baa95f7
hip does not support 16bit shuffle
yhmtsai Oct 3, 2024
4bb8093
merge two #if block
yhmtsai Oct 7, 2024
c539398
do not use attributes in sqrt and abs
yhmtsai Oct 8, 2024
d0e2446
make half constexpr
yhmtsai Oct 8, 2024
0d777df
isolate half out of device completely
yhmtsai Oct 16, 2024
56e2af8
bits constexpr construct half and make numeric_limit in half
yhmtsai Oct 18, 2024
6cc26d7
refine the code and fix error without half
yhmtsai Oct 21, 2024
3d15350
reduce abs/sqrt location
yhmtsai Oct 21, 2024
c4697a5
move the math function to math
yhmtsai Oct 22, 2024
377432a
nohalf
yhmtsai Oct 22, 2024
e806a0a
cbgmres without half
yhmtsai Oct 23, 2024
3e49252
direct without half
yhmtsai Oct 23, 2024
6 changes: 6 additions & 0 deletions CMakeLists.txt
@@ -32,6 +32,12 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF)
option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF)
option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
option(GINKGO_ENABLE_HALF "Enable the use of half precision" ON)
# We do not support MSVC. SYCL will come later
if(MSVC OR GINKGO_BUILD_SYCL)
message(STATUS "HALF is not supported in MSVC, and later support in SYCL")
set(GINKGO_ENABLE_HALF OFF CACHE BOOL "Enable the use of half precision" FORCE)
endif()
option(GINKGO_SKIP_DEPENDENCY_UPDATE
"Do not update dependencies each time the project is rebuilt" ON)
option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." OFF)
15 changes: 14 additions & 1 deletion accessor/cuda_helper.hpp
@@ -17,7 +17,15 @@
#include "utils.hpp"


struct __half;


Review comment (Member) on lines +20 to +22:

There are some ABI/name mangling differences between struct and class in MSVC - are you sure this will be defined as a struct always?

Also for cleaner headers/exports, maybe we should make this conditional on CUDA compilation?

namespace gko {


class half;


namespace acc {
namespace detail {

@@ -27,6 +35,11 @@ struct cuda_type {
using type = T;
};

template <>
struct cuda_type<gko::half> {
using type = __half;
};

yhmtsai marked this conversation as resolved.
// Unpack cv and reference / pointer qualifiers
template <typename T>
struct cuda_type<const T> {
@@ -57,7 +70,7 @@ struct cuda_type<T&&> {
// Transform std::complex to thrust::complex
template <typename T>
struct cuda_type<std::complex<T>> {
using type = thrust::complex<T>;
using type = thrust::complex<typename cuda_type<T>::type>;
};


14 changes: 13 additions & 1 deletion accessor/hip_helper.hpp
@@ -17,7 +17,15 @@
#include "utils.hpp"


struct __half;


namespace gko {


class half;


namespace acc {
namespace detail {

@@ -53,11 +61,15 @@
using type = typename hip_type<T>::type&&;
};

template <>
struct hip_type<gko::half> {
using type = __half;
};
yhmtsai marked this conversation as resolved.

// Transform std::complex to thrust::complex
template <typename T>
struct hip_type<std::complex<T>> {
using type = thrust::complex<T>;
using type = thrust::complex<typename hip_type<T>::type>;
};


6 changes: 4 additions & 2 deletions accessor/reference_helper.hpp
@@ -14,8 +14,10 @@

// CUDA TOOLKIT < 11 does not support constexpr in combination with
// thrust::complex, which is why constexpr is only present in later versions
#if defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && \
(__CUDACC_VER_MAJOR__ < 11)
// TODO: NVC++ constexpr
Review comment (Member):

Is this TODO addressed with the constexpr PR?

#if (defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && \
(__CUDACC_VER_MAJOR__ < 11)) || \
(defined(__NVCOMPILER) && GINKGO_ENABLE_HALF)

#define GKO_ACC_ENABLE_REFERENCE_CONSTEXPR

22 changes: 14 additions & 8 deletions benchmark/CMakeLists.txt
@@ -77,17 +77,20 @@ function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def ty
target_compile_definitions("${name}" PRIVATE "${macro_def}")
ginkgo_benchmark_add_tuning_maybe("${name}")
if("${use_lib_linops}")
if (GINKGO_BUILD_CUDA)
if(GINKGO_BUILD_CUDA)
target_compile_definitions("${name}" PRIVATE HAS_CUDA=1)
target_link_libraries("${name}" cusparse_linops_${type})
endif()
if (GINKGO_BUILD_HIP)
target_compile_definitions("${name}" PRIVATE HAS_HIP=1)
target_link_libraries("${name}" hipsparse_linops_${type})
endif()
if (GINKGO_BUILD_SYCL)
target_compile_definitions("${name}" PRIVATE HAS_DPCPP=1)
target_link_libraries("${name}" onemkl_linops_${type})
# only cuda supports half currently
if(NOT ("${type}" STREQUAL "h"))
if (GINKGO_BUILD_HIP)
target_compile_definitions("${name}" PRIVATE HAS_HIP=1)
target_link_libraries("${name}" hipsparse_linops_${type})
endif()
if (GINKGO_BUILD_SYCL)
target_compile_definitions("${name}" PRIVATE HAS_DPCPP=1)
target_link_libraries("${name}" onemkl_linops_${type})
endif()
endif()
endif()
endfunction(ginkgo_add_single_benchmark_executable)
@@ -117,6 +120,9 @@ if (GINKGO_BUILD_CUDA)
ginkgo_benchmark_cusparse_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION)
ginkgo_benchmark_cusparse_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION)
ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION)
if (GINKGO_ENABLE_HALF)
ginkgo_benchmark_cusparse_linops(h GKO_BENCHMARK_USE_HALF_PRECISION)
endif()
add_library(cuda_timer utils/cuda_timer.cpp)
target_link_libraries(cuda_timer PRIVATE ginkgo CUDA::cudart)
ginkgo_compile_features(cuda_timer)
17 changes: 13 additions & 4 deletions benchmark/run_all_benchmarks.sh
@@ -110,9 +110,11 @@ elif [ "${BENCHMARK_PRECISION}" == "dcomplex" ]; then
BENCH_SUFFIX="_dcomplex"
elif [ "${BENCHMARK_PRECISION}" == "scomplex" ]; then
BENCH_SUFFIX="_scomplex"
elif [ "${BENCHMARK_PRECISION}" == "half" ]; then
BENCH_SUFFIX="_half"
else
echo "BENCHMARK_PRECISION is set to the not supported \"${BENCHMARK_PRECISION}\"." 1>&2
echo "Currently supported values: \"double\", \"single\", \"dcomplex\" and \"scomplex\"" 1>&2
echo "Currently supported values: \"double\", \"single\", \"half\", \"dcomplex\" and \"scomplex\"" 1>&2
exit 1
fi

@@ -216,9 +218,16 @@ keep_latest() {
compute_matrix_statistics() {
[ "${DRY_RUN}" == "true" ] && return
cp "$1" "$1.imd" # make sure we're not losing the original input
./matrix_statistics/matrix_statistics${BENCH_SUFFIX} \
--backup="$1.bkp" --double_buffer="$1.bkp2" \
<"$1.imd" 2>&1 >"$1"
if [ "${BENCH_SUFFIX}" == "_half" ]; then
# half precision benchmark still uses single for statistics
./matrix_statistics/matrix_statistics_single \
--backup="$1.bkp" --double_buffer="$1.bkp2" \
<"$1.imd" 2>&1 >"$1"
else
./matrix_statistics/matrix_statistics${BENCH_SUFFIX} \
--backup="$1.bkp" --double_buffer="$1.bkp2" \
<"$1.imd" 2>&1 >"$1"
fi
keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
}

5 changes: 5 additions & 0 deletions benchmark/spmv/CMakeLists.txt
@@ -1,4 +1,9 @@
ginkgo_add_typed_benchmark_executables(spmv "YES" spmv.cpp)
# TODO: move to all benchmark
if (GINKGO_ENABLE_HALF)
ginkgo_add_single_benchmark_executable(
"spmv_half" "YES" "GKO_BENCHMARK_USE_HALF_PRECISION" "h" spmv.cpp)
endif()
Review comment (Member) on lines +2 to +6:

Why didn't you move it to benchmark/CMakeLists.txt? Is this still a TODO?

if(GINKGO_BUILD_MPI)
add_subdirectory(distributed)
endif()
4 changes: 3 additions & 1 deletion benchmark/spmv/spmv_common.hpp
@@ -118,7 +118,9 @@ struct SpmvBenchmark : Benchmark<spmv_benchmark_state<Generator>> {
exec->synchronize();
auto max_relative_norm2 =
compute_max_relative_norm2(x_clone.get(), state.answer.get());
format_case["max_relative_norm2"] = max_relative_norm2;
format_case["max_relative_norm2"] =
static_cast<typename gko::detail::arth_type<rc_etype>::type>(
max_relative_norm2);
}

IterationControl ic{timer};
Expand Down
40 changes: 25 additions & 15 deletions benchmark/utils/cuda_linops.cpp
@@ -527,14 +527,19 @@ class CusparseHybrid
((CUDA_VERSION >= 10020) && !(defined(_WIN32) || defined(__CYGWIN__)))


// cuSPARSE does not support 16 bit compute for full 16 bit floating point
// input. Also, the scalar must be the compute type, i.e. float.
template <typename ValueType>
void cusparse_generic_spmv(std::shared_ptr<const gko::CudaExecutor> gpu_exec,
const cusparseSpMatDescr_t mat,
const gko::array<ValueType>& scalars,
const gko::LinOp* b, gko::LinOp* x,
cusparseOperation_t trans, cusparseSpMVAlg_t alg)
void cusparse_generic_spmv(
std::shared_ptr<const gko::CudaExecutor> gpu_exec,
const cusparseSpMatDescr_t mat,
const gko::array<typename gko::detail::arth_type<ValueType>::type>& scalars,
const gko::LinOp* b, gko::LinOp* x, cusparseOperation_t trans,
cusparseSpMVAlg_t alg)
{
cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();
cudaDataType_t compute_value = gko::kernels::cuda::cuda_data_type<
typename gko::detail::arth_type<ValueType>::type>();
using gko::kernels::cuda::as_culibs_type;
auto dense_b = gko::as<gko::matrix::Dense<ValueType>>(b);
auto dense_x = gko::as<gko::matrix::Dense<ValueType>>(x);
@@ -553,13 +558,14 @@
gko::size_type buffer_size = 0;
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV_bufferSize(
gpu_exec->get_sparselib_handle(), trans, &scalars.get_const_data()[0],
mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg,
mat, vecb, &scalars.get_const_data()[1], vecx, compute_value, alg,
&buffer_size));
gko::array<char> buffer_array(gpu_exec, buffer_size);
auto dbuffer = buffer_array.get_data();
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV(
gpu_exec->get_sparselib_handle(), trans, &scalars.get_const_data()[0],
mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, dbuffer));
mat, vecb, &scalars.get_const_data()[1], vecx, compute_value, alg,
dbuffer));
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecx));
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecb));
}
@@ -638,8 +644,8 @@ class CusparseGenericCsr
protected:
void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override
{
cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_,
Alg);
cusparse_generic_spmv<ValueType>(this->get_gpu_exec(), mat_, scalars, b,
x, trans_, Alg);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
@@ -655,9 +661,11 @@ class CusparseGenericCsr
{}

private:
using compute_type = typename gko::detail::arth_type<ValueType>::type;
// Contains {alpha, beta}
gko::array<ValueType> scalars{
this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
gko::array<compute_type> scalars{
this->get_executor(),
{gko::one<compute_type>(), gko::zero<compute_type>()}};
std::shared_ptr<csr> csr_;
cusparseOperation_t trans_;
cusparseSpMatDescr_t mat_;
@@ -730,8 +738,8 @@ class CusparseGenericCoo
protected:
void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override
{
cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_,
default_csr_alg);
cusparse_generic_spmv<ValueType>(this->get_gpu_exec(), mat_, scalars, b,
x, trans_, default_csr_alg);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
@@ -746,9 +754,11 @@
{}

private:
using compute_type = typename gko::detail::arth_type<ValueType>::type;
// Contains {alpha, beta}
gko::array<ValueType> scalars{
this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
gko::array<compute_type> scalars{
this->get_executor(),
{gko::one<compute_type>(), gko::zero<compute_type>()}};
std::shared_ptr<coo> coo_;
cusparseOperation_t trans_;
cusparseSpMatDescr_t mat_;
5 changes: 1 addition & 4 deletions benchmark/utils/generator.hpp
@@ -132,10 +132,7 @@ struct DefaultSystemGenerator {
{
auto res = Vec::create(exec);
res->read(gko::matrix_data<ValueType, itype>(
size,
std::uniform_real_distribution<gko::remove_complex<ValueType>>(-1.0,
1.0),
get_engine()));
size, std::uniform_real_distribution<>(-1.0, 1.0), get_engine()));
Review comment (Member):

This will potentially give us conversion warnings in the future, but I'm generally fine with it

Reply (Member Author):

I think it is handled in get_rand_value.

return res;
}

36 changes: 35 additions & 1 deletion benchmark/utils/types.hpp
@@ -17,7 +17,8 @@ using itype = gko::int32;
#if defined(GKO_BENCHMARK_USE_DOUBLE_PRECISION) || \
defined(GKO_BENCHMARK_USE_SINGLE_PRECISION) || \
defined(GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) || \
defined(GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION)
defined(GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) || \
defined(GKO_BENCHMARK_USE_HALF_PRECISION)
// separate ifdefs to catch duplicate definitions
#ifdef GKO_BENCHMARK_USE_DOUBLE_PRECISION
using etype = double;
@@ -31,11 +32,44 @@ using etype = std::complex<double>;
#ifdef GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION
using etype = std::complex<float>;
#endif
#ifdef GKO_BENCHMARK_USE_HALF_PRECISION
#include <ginkgo/core/base/half.hpp>
using etype = gko::half;
#endif
#else // default to double precision
using etype = double;
#endif

using rc_etype = gko::remove_complex<etype>;


namespace detail {


// singly linked list of all our supported precisions
template <typename T>
struct next_precision_impl {};
Review comment (Member):

This would probably also be a good time to implement prev_precision?


template <>
struct next_precision_impl<float> {
using type = double;
};

template <>
struct next_precision_impl<double> {
using type = float;
};


template <typename T>
struct next_precision_impl<std::complex<T>> {
using type = std::complex<typename next_precision_impl<T>::type>;
};


} // namespace detail

template <typename T>
using next_precision = typename detail::next_precision_impl<T>::type;
MarcelKoch marked this conversation as resolved.

#endif // GKO_BENCHMARK_UTILS_TYPES_HPP_
3 changes: 3 additions & 0 deletions cmake/get_info.cmake
@@ -204,11 +204,14 @@ if(TARGET hwloc)
ginkgo_print_variable(${detailed_log} "HWLOC_LIBRARIES")
ginkgo_print_variable(${detailed_log} "HWLOC_INCLUDE_DIRS")
endif()
ginkgo_print_variable(${minimal_log} "GINKGO_ENABLE_HALF")
ginkgo_print_variable(${detailed_log} "GINKGO_ENABLE_HALF")
ginkgo_print_module_footer(${detailed_log} "")

ginkgo_print_generic_header(${detailed_log} " Extensions:")
ginkgo_print_variable(${detailed_log} "GINKGO_EXTENSION_KOKKOS_CHECK_TYPE_ALIGNMENT")


_minimal(
"
--\n-- Detailed information (More compiler flags, module configuration) can be found in detailed.log
10 changes: 10 additions & 0 deletions common/cuda_hip/base/device_matrix_data_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <thrust/sort.h>
#include <thrust/tuple.h>

#include "common/cuda_hip/base/math.hpp"
#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/base/types.hpp"

@@ -22,6 +23,15 @@
namespace components {


// __half != only in __device__
// Although gko::is_nonzero is constexpr, it still shows calling __device__ in
// __host__
template <typename T>
GKO_INLINE __device__ constexpr bool is_nonzero(T value)
{
return value != zero<T>();
}

template <typename ValueType, typename IndexType>
void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
array<ValueType>& values, array<IndexType>& row_idxs,