From af95620257e5dcfa395f71179dddd4fff8f6a0f6 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 23 Oct 2024 18:02:36 +0200 Subject: [PATCH 01/69] instantiation/testing/next/prev/stub type definition --- core/base/mixed_precision_types.hpp | 151 +++++++++++++++++++++++ core/device_hooks/common_kernels.inc.cpp | 63 +++++++++- core/test/utils.hpp | 48 ++++++- include/ginkgo/core/base/math.hpp | 45 +++++++ include/ginkgo/core/base/types.hpp | 116 +++++++++++++++++ 5 files changed, 418 insertions(+), 5 deletions(-) diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index d9747e5cad8..5ef5de94e34 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -7,23 +7,44 @@ #include +#include #include #ifdef GINKGO_MIXED_PRECISION + #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \ template _macro(float, float, float, __VA_ARGS__); \ template _macro(float, float, double, __VA_ARGS__); \ template _macro(float, double, float, __VA_ARGS__); \ template _macro(float, double, double, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(_macro, \ + ...) \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, __VA_ARGS__); \ + GKO_ADAPT_HF(template _macro(float, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(float, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(float, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(float, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(float, double, half, __VA_ARGS__)) + #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \ template _macro(double, float, float, __VA_ARGS__); \ template _macro(double, float, double, __VA_ARGS__); \ template _macro(double, double, float, __VA_ARGS__); \ template _macro(double, double, double, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(_macro, \ + ...) 
\ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, __VA_ARGS__); \ + GKO_ADAPT_HF(template _macro(double, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(double, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(double, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(double, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(double, double, half, __VA_ARGS__)) + + #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ @@ -33,6 +54,19 @@ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(_macro, \ + ...) \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) \ template _macro(std::complex, std::complex, \ @@ -44,22 +78,95 @@ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(_macro, \ + ...) 
\ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(_macro, \ + ...) \ + GKO_ADAPT_HF(template _macro(half, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, float, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, float, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, double, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, double, double, __VA_ARGS__)) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(_macro, \ + ...) 
\ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) + #else #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \ template _macro(float, float, float, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(_macro, \ + ...) \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, __VA_ARGS__) + #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \ template _macro(double, double, double, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(_macro, \ + ...) \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, __VA_ARGS__) + #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(_macro, \ + ...) \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__) + #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(_macro, \ + ...) 
\ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(_macro, \ + ...) \ + GKO_ADAPT_HF(template _macro(half, half, half, __VA_ARGS__)) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(_macro, \ + ...) \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) + + #endif @@ -69,11 +176,27 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_WITH_HALF(_macro, ...) \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF(_macro, \ + __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF(_macro, \ + __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF(_macro, \ + __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF(_macro, \ + __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF(_macro, \ + __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF(_macro, \ + __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int32); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int64) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_WITH_HALF(_macro, int32); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_WITH_HALF(_macro, int64) #ifdef GINKGO_MIXED_PRECISION #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ @@ -85,12 +208,36 @@ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_WITH_HALF(_macro, ...) 
\ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, __VA_ARGS__); \ + GKO_ADAPT_HF(template _macro(half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + template _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + __VA_ARGS__)) #else #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ template _macro(float, float, __VA_ARGS__); \ template _macro(double, double, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_WITH_HALF(_macro, ...) 
\ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, __VA_ARGS__); \ + GKO_ADAPT_HF(template _macro(half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + template _macro(std::complex, std::complex, __VA_ARGS__)) #endif @@ -98,5 +245,9 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, int32); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, int64) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF( \ + _macro) \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_WITH_HALF(_macro, int32); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2_WITH_HALF(_macro, int64) #endif // GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_ diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 98d85b2b6d2..6ffeb1c5f71 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -79,26 +79,37 @@ #define GKO_STUB(_macro) _macro GKO_NOT_COMPILED(GKO_HOOK_MODULE) -#define GKO_STUB_VALUE_CONVERSION(_macro) \ - template \ - _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ - GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) #define GKO_STUB_NON_COMPLEX_VALUE_TYPE(_macro) \ template \ _macro(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) +#define GKO_STUB_NON_COMPLEX_VALUE_TYPE_WITH_HALF(_macro) \ + template \ + _macro(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(_macro) + #define GKO_STUB_VALUE_TYPE(_macro) \ template \ _macro(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) +#define GKO_STUB_VALUE_TYPE_WITH_HALF(_macro) \ + template \ + _macro(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(_macro) + #define GKO_STUB_VALUE_AND_SCALAR_TYPE(_macro) \ template \ _macro(ValueType, ScalarType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ 
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) +#define GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(_macro) \ + template \ + _macro(ValueType, ScalarType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(_macro) + #define GKO_STUB_INDEX_TYPE(_macro) \ template \ _macro(IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ @@ -114,16 +125,31 @@ _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) +#define GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) \ + template \ + _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) + #define GKO_STUB_VALUE_AND_INDEX_TYPE(_macro) \ template \ _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) +#define GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) \ + template \ + _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) + #define GKO_STUB_VALUE_AND_INT32_TYPE(_macro) \ template \ _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) +#define GKO_STUB_VALUE_AND_INT32_TYPE_WITH_HALF(_macro) \ + template \ + _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(_macro) + #define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ template \ @@ -131,6 +157,13 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) +#define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) \ + template \ + _macro(InputValueType, MatrixValueType, OutputValueType, IndexType) \ + GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) + #define 
GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2(_macro) \ template \ @@ -138,6 +171,13 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(_macro) +#define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(_macro) \ + template \ + _macro(InputValueType, OutputValueType, IndexType) \ + GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF(_macro) + #define GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ template \ @@ -150,16 +190,31 @@ _macro(IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(_macro) +#define GKO_STUB_TEMPLATE_TYPE_WITH_HALF(_macro) \ + template \ + _macro(IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ + GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(_macro) + #define GKO_STUB_VALUE_CONVERSION(_macro) \ template \ _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) +#define GKO_STUB_VALUE_CONVERSION_WITH_HALF(_macro) \ + template \ + _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(_macro) + #define GKO_STUB_VALUE_CONVERSION_OR_COPY(_macro) \ template \ _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) +#define GKO_STUB_VALUE_CONVERSION_OR_COPY_WITH_HALF(_macro) \ + template \ + _macro(SourceType, TargetType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_WITH_HALF(_macro) + #define GKO_STUB_CB_GMRES(_macro) \ template \ _macro(ValueType, ValueTypeKrylovBases) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ diff --git a/core/test/utils.hpp b/core/test/utils.hpp index eee2900d731..ab9326400e0 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -15,6 +15,7 @@ #include +#include #include #include #include @@ -327,10 +328,25 @@ using RealValueTypes = ::testing::Types; #endif +using 
RealValueTypesWithHalf = ::testing::Types< +#if GINKGO_ENABLE_HALF + gko::half, +#endif +#if !GINKGO_DPCPP_SINGLE_MODE + double, +#endif + float>; + using ComplexValueTypes = add_inner_wrapper_t; +using ComplexValueTypesWithHalf = + add_inner_wrapper_t; + using ValueTypes = merge_type_list_t; +using ValueTypesWithHalf = + merge_type_list_t; + using IndexTypes = ::testing::Types; using IntegerTypes = merge_type_list_t>; @@ -341,22 +357,44 @@ using LocalGlobalIndexTypes = using PODTypes = merge_type_list_t; +using PODTypesWithHalf = + merge_type_list_t; + using ComplexAndPODTypes = merge_type_list_t; +using ComplexAndPODTypesWithHalf = + merge_type_list_t; + using ValueIndexTypes = cartesian_type_product_t; +using ValueIndexTypesWithHalf = + cartesian_type_product_t; + using RealValueIndexTypes = cartesian_type_product_t; +using RealValueIndexTypesWithHalf = + cartesian_type_product_t; + using ComplexValueIndexTypes = cartesian_type_product_t; +using ComplexValueIndexTypesWithHalf = + cartesian_type_product_t; + using TwoValueIndexType = add_to_cartesian_type_product_t< merge_type_list_t< cartesian_type_product_t, cartesian_type_product_t>, IndexTypes>; +using TwoValueIndexTypeWithHalf = add_to_cartesian_type_product_t< + merge_type_list_t, + cartesian_type_product_t>, + IndexTypes>; + using ValueLocalGlobalIndexTypes = add_to_cartesian_type_product_left_t; @@ -365,7 +403,6 @@ template struct reduction_factor { using nc_output = remove_complex; using nc_precision = remove_complex; - static const nc_output value; }; @@ -456,4 +493,13 @@ struct TupleTypenameNameGenerator { }; +#define SKIP_IF_HALF(type) \ + if (std::is_same, gko::half>::value) { \ + GTEST_SKIP() << "Skip due to half mode"; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + + #endif // GKO_CORE_TEST_UTILS_HPP_ diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 5e15bb05d6a..73da407194e 100644 --- 
a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -383,6 +383,31 @@ struct next_precision_impl> { }; +template +struct next_precision_with_half_impl {}; + + +template <> +struct next_precision_with_half_impl { + using type = float; +}; + +template <> +struct next_precision_with_half_impl { + using type = double; +}; + +template <> +struct next_precision_with_half_impl { + using type = gko::half; +}; + +template +struct next_precision_with_half_impl> { + using type = std::complex::type>; +}; + + template struct reduce_precision_impl { using type = T; @@ -477,6 +502,26 @@ using next_precision = typename detail::next_precision_impl::type; template using previous_precision = next_precision; +/** + * Obtains the next type in the singly-linked precision list with half. + */ +#if GINKGO_ENABLE_HALF +template +using next_precision_with_half = + typename detail::next_precision_with_half_impl::type; + +template +using previous_precision_with_half = + next_precision_with_half>; +#else +// fallback to float/double list +template +using next_precision_with_half = next_precision; + +template +using previous_precision_with_half = previous_precision; +#endif + /** * Obtains the next type in the hierarchy with lower precision than T. 
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 1d5963c0fe8..5e1fb2a14e3 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -17,6 +17,7 @@ #include #include +#include #include @@ -399,6 +400,17 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, _enable_macro(CudaExecutor, cuda) +// cuda half operation is supported from arch 5.3 +#if GINKGO_ENABLE_HALF && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530) +#define GKO_ADAPT_HF(_macro) _macro +#else +#define GKO_ADAPT_HF(_macro) \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") +#endif + + /** * Instantiates a template for each non-complex value type compiled by Ginkgo. * @@ -418,6 +430,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(double) #endif +#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF(_macro) \ + GKO_ADAPT_HF(template _macro(half)); \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) + /** * Instantiates a template for each value type compiled by Ginkgo. @@ -440,6 +456,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(std::complex) #endif +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(_macro) \ + GKO_ADAPT_HF(template _macro(half)); \ + GKO_ADAPT_HF(template _macro(std::complex)); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) + // Helper macro to make Windows builds work // In MSVC, __VA_ARGS__ behave like one argument by default. 
@@ -528,6 +549,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(std::complex, double) #endif +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF(_macro) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro); \ + GKO_ADAPT_HF(template _macro(half, half)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(template _macro(std::complex, half)) + /** * Instantiates a template for each index type compiled by Ginkgo. @@ -566,6 +593,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(float, int64); \ template _macro(double, int64) #endif +#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF( \ + _macro) \ + GKO_ADAPT_HF(template _macro(half, int32)); \ + GKO_ADAPT_HF(template _macro(half, int64)); \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) \ @@ -583,6 +615,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(std::complex, int32) #endif +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE_WITH_HALF(_macro) \ + GKO_ADAPT_HF(template _macro(half, int32)); \ + GKO_ADAPT_HF(template _macro(std::complex, int32)); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) + /** * Instantiates a template for each value and index type compiled by Ginkgo. 
@@ -610,6 +647,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(std::complex, int64) #endif +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) \ + GKO_ADAPT_HF(template _macro(half, int32)); \ + GKO_ADAPT_HF(template _macro(half, int64)); \ + GKO_ADAPT_HF(template _macro(std::complex, int32)); \ + GKO_ADAPT_HF(template _macro(std::complex, int64)); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) + /** * Instantiates a template for each non-complex value, local and global index @@ -643,6 +687,14 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(double, int64, int64) #endif +#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_WITH_HALF( \ + _macro) \ + GKO_ADAPT_HF(template _macro(half, int32, int32)); \ + GKO_ADAPT_HF(template _macro(half, int32, int64)); \ + GKO_ADAPT_HF(template _macro(half, int64, int64)); \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ + _macro) + /** * Instantiates a template for each value and index type compiled by Ginkgo. 
@@ -677,6 +729,16 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(std::complex, int64, int64) #endif +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_WITH_HALF( \ + _macro) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro); \ + GKO_ADAPT_HF(template _macro(half, int32, int32)); \ + GKO_ADAPT_HF(template _macro(half, int32, int64)); \ + GKO_ADAPT_HF(template _macro(half, int64, int64)); \ + GKO_ADAPT_HF(template _macro(std::complex, int32, int32)); \ + GKO_ADAPT_HF(template _macro(std::complex, int32, int64)); \ + GKO_ADAPT_HF(template _macro(std::complex, int64, int64)) + #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ @@ -732,6 +794,40 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(std::complex, std::complex) #endif +#if GINKGO_DPCPP_SINGLE_MODE +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(_macro) \ + GKO_ADAPT_HF(template <> _macro(half, double) GKO_NOT_IMPLEMENTED); \ + GKO_ADAPT_HF(template <> _macro(double, half) GKO_NOT_IMPLEMENTED); \ + GKO_ADAPT_HF(template _macro(float, half)); \ + GKO_ADAPT_HF(template _macro(half, float)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(template <> _macro(std::complex, std::complex) \ + GKO_NOT_IMPLEMENTED); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(template <> _macro(std::complex, std::complex) \ + GKO_NOT_IMPLEMENTED); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) +#else +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(_macro) \ + GKO_ADAPT_HF(template _macro(half, double)); \ + GKO_ADAPT_HF(template _macro(double, half)); \ + GKO_ADAPT_HF(template _macro(float, half)); \ + GKO_ADAPT_HF(template _macro(half, float)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(template 
_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) +#endif + +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_WITH_HALF(_macro) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF(_macro); \ + GKO_ADAPT_HF(template _macro(half, half)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + template _macro(float, float); \ + template _macro(double, double); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex) /** * Instantiates a template for each value type pair compiled by Ginkgo. @@ -749,6 +845,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR_WITH_HALF(_macro) \ + GKO_ADAPT_HF(template _macro(half, half)); \ + GKO_ADAPT_HF(template _macro(std::complex, half)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) /** * Instantiates a template for each combined value and index type compiled by @@ -771,6 +872,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) +#define GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE_WITH_HALF( \ + _macro) \ + GKO_ADAPT_HF(template _macro(half, half)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE(_macro) + /** * Instantiates a template for each value and index type compiled by Ginkgo. 
* @@ -789,6 +896,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(int32); \ template _macro(int64) +#define GKO_INSTANTIATE_FOR_EACH_POD_TYPE_WITH_HALF(_macro) \ + GKO_ADAPT_HF(template _macro(half)); \ + GKO_ADAPT_HF(template _macro(std::complex)); \ + GKO_INSTANTIATE_FOR_EACH_POD_TYPE(_macro) /** * Instantiates a template for each normal type @@ -803,6 +914,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(_macro); \ template _macro(gko::size_type) +#define GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(_macro) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(_macro); \ + GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(_macro); \ + template _macro(gko::size_type) + /** * Instantiates a template for int32 type. From 3d4673c3f65a44a0fbb0b09f9e1ccc6fd68c622b Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 23 Oct 2024 18:25:12 +0200 Subject: [PATCH 02/69] half option --- CMakeLists.txt | 6 ++++++ cmake/get_info.cmake | 2 +- include/ginkgo/config.hpp.in | 5 +++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4cad0e1bca4..c48d12989aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,12 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF) option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF) option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF) option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF) +option(GINKGO_ENABLE_HALF "Enable the use of half precision" ON) +# We do not support MSVC. 
SYCL will come later +if(MSVC OR GINKGO_BUILD_SYCL) + message(STATUS "HALF is not supported in MSVC, and later support in SYCL") + set(GINKGO_ENABLE_HALF OFF CACHE BOOL "Enable the use of half precision" FORCE) +endif() option(GINKGO_SKIP_DEPENDENCY_UPDATE "Do not update dependencies each time the project is rebuilt" ON) option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." OFF) diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 63f43c645f0..57816ab8008 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -130,7 +130,7 @@ foreach(log_type ${log_types}) "GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_SYCL") ginkgo_print_module_footer(${${log_type}} " Enabled features:") ginkgo_print_foreach_variable(${${log_type}} - "GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI") + "GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI;GINKGO_ENABLE_HALF") ginkgo_print_module_footer(${${log_type}} " Tests, benchmarks and examples:") ginkgo_print_foreach_variable(${${log_type}} "GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING") diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 1dfa6bc61bc..cf25dcd3c77 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -105,6 +105,11 @@ #define GKO_HAVE_HWLOC @GINKGO_HAVE_HWLOC@ // clang-format on +/* Is half operation available ? */ +// clang-format off +#cmakedefine01 GINKGO_ENABLE_HALF +// clang-format on + /* Do we need to use blocking communication in our SpMV? */ // clang-format off From 918f6263c43d8cf345e6bd141f0ee74b22f44ba6 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 23 Oct 2024 22:39:38 +0200 Subject: [PATCH 03/69] device type mapping --- common/cuda_hip/base/math.hpp | 124 +++++++++++++++++++++++++++++---- common/cuda_hip/base/types.hpp | 14 ++++ cuda/base/types.hpp | 1 - hip/base/types.hip.hpp | 1 - 4 files changed, 126 insertions(+), 14 deletions(-) diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index 8c655174524..7f0391d904c 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -11,6 +11,21 @@ #include +#ifdef GKO_COMPILING_CUDA + + +#include + + +#elif defined(GKO_COMPILING_HIP) + + +#include + + +#endif + + namespace gko { @@ -18,9 +33,35 @@ namespace gko { // __device__ function (even though it is constexpr) template struct device_numeric_limits { - static constexpr auto inf = std::numeric_limits::infinity(); - static constexpr auto max = std::numeric_limits::max(); - static constexpr auto min = std::numeric_limits::min(); + static constexpr auto inf() { return std::numeric_limits::infinity(); } + static constexpr auto max() { return std::numeric_limits::max(); } + static constexpr auto min() { return std::numeric_limits::min(); } +}; + +template <> +struct device_numeric_limits<__half> { + // from __half documentation, it accepts unsigned short + // __half and __half_raw does not have constexpr constructor + static GKO_ATTRIBUTES GKO_INLINE auto inf() + { + __half_raw bits; + bits.x = static_cast(0b0111110000000000u); + return __half{bits}; + } + + static GKO_ATTRIBUTES GKO_INLINE auto max() + { + __half_raw bits; + bits.x = static_cast(0b0111101111111111u); + return __half{bits}; + } + + static GKO_ATTRIBUTES GKO_INLINE auto min() + { + __half_raw bits; + bits.x = static_cast(0b0000010000000000u); + return __half{bits}; + } }; @@ -33,15 +74,6 @@ struct remove_complex_impl> { }; -template -struct is_complex_impl> - : public std::integral_constant {}; - - -template -struct is_complex_or_scalar_impl> : std::is_scalar {}; - - template struct 
truncate_type_impl> { using type = thrust::complex::type>; @@ -52,4 +84,72 @@ struct truncate_type_impl> { } // namespace gko +namespace thrust { + + +template <> +GKO_ATTRIBUTES GKO_INLINE complex<__half> sqrt<__half>(const complex<__half>& a) +{ + return sqrt(static_cast>(a)); +} + + +template <> +GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) +{ + return abs(static_cast>(z)); +} + + +} // namespace thrust + + +namespace gko { + + +// It is required by NVHPC 23.3, `isnan` is undefined when NVHPC is used as a +// host compiler. +#if defined(__CUDACC__) || defined(GKO_COMPILING_HIP) + +__device__ __forceinline__ bool is_nan(const __half& val) +{ + // from the cuda_fp16.hpp +#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) + return __hisnan(val); +#else + return isnan(static_cast(val)); +#endif +} + +__device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val) +{ + return is_nan(val.real()) || is_nan(val.imag()); +} + + +__device__ __forceinline__ __half abs(const __half& val) +{ +#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) + return __habs(val); +#else + return abs(static_cast(val)); +#endif +} + +__device__ __forceinline__ __half sqrt(const __half& val) +{ +#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) + return hsqrt(val); +#else + return sqrt(static_cast(val)); +#endif +} + + +#endif + + +} // namespace gko + + #endif // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_ diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp index 08f0516d691..42ca57eb0bf 100644 --- a/common/cuda_hip/base/types.hpp +++ b/common/cuda_hip/base/types.hpp @@ -14,5 +14,19 @@ #error "Executor definition missing" #endif +#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ + const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ + { \ + return thrust::complex{lhs} 
_op thrust::complex(rhs); \ + } + +THRUST_HALF_FRIEND_OPERATOR(+, +=) +THRUST_HALF_FRIEND_OPERATOR(-, -=) +THRUST_HALF_FRIEND_OPERATOR(*, *=) +THRUST_HALF_FRIEND_OPERATOR(/, /=) + +#undef THRUST_HALF_FRIEND_OPERATOR + #endif // GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_ diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 05f07ceb8dd..05b604923da 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -20,7 +20,6 @@ namespace gko { - namespace kernels { namespace cuda { namespace detail { diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index c3982b7562e..6b78cceea99 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -26,7 +26,6 @@ #include "common/cuda_hip/base/runtime.hpp" - namespace gko { namespace kernels { namespace hip { From 1ffd285f3d9b3afc87c7e91e9f20ffb7260a9079 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 28 Nov 2024 16:46:04 +0100 Subject: [PATCH 04/69] consider custom namespace for thrust::complex<__half> and benchmark --- benchmark/CMakeLists.txt | 6 ++++++ common/cuda_hip/base/math.hpp | 5 +++++ common/cuda_hip/base/thrust_macro.hpp | 22 ++++++++++++++++++++++ common/cuda_hip/base/types.hpp | 15 +++++++++------ 4 files changed, 42 insertions(+), 6 deletions(-) create mode 100644 common/cuda_hip/base/thrust_macro.hpp diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 55ed76d1613..c780a497c32 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -18,6 +18,9 @@ function(ginkgo_benchmark_cusparse_linops type def) PRIVATE $<$:--expt-relaxed-constexpr>) endif() + if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE) + target_compile_definitions(cusparse_linops_${type} PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko) + endif() # make the dependency public to catch issues target_compile_definitions(cusparse_linops_${type} PUBLIC ${def}) target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA) @@ -28,6 +31,9 @@ endfunction() function(ginkgo_benchmark_hipsparse_linops 
type def) add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp) set_source_files_properties(utils/hip_linops.hip.cpp PROPERTIES LANGUAGE HIP) + if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE) + target_compile_definitions(hipsparse_linops_${type} PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko) + endif() target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def}) target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP) target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS}) diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index 7f0391d904c..3d2975c1eee 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -26,6 +26,9 @@ #endif +#include "common/cuda_hip/base/thrust_macro.hpp" + + namespace gko { @@ -84,6 +87,7 @@ struct truncate_type_impl> { } // namespace gko +GKO_THRUST_NAEMSPACE_PREFIX namespace thrust { @@ -102,6 +106,7 @@ GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) } // namespace thrust +GKO_THRUST_NAEMSPACE_POSTFIX namespace gko { diff --git a/common/cuda_hip/base/thrust_macro.hpp b/common/cuda_hip/base/thrust_macro.hpp new file mode 100644 index 00000000000..c5e3fc40010 --- /dev/null +++ b/common/cuda_hip/base/thrust_macro.hpp @@ -0,0 +1,22 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_ + +// although thrust provides the similar thing, these macro are only defined when +// they supported. 
Thus, we need to provide our own macro to make it work with +// the old version +#ifdef THRUST_CUB_WRAPPED_NAMESPACE +#define GKO_THRUST_NAEMSPACE_PREFIX namespace THRUST_CUB_WRAPPED_NAMESPACE { +#define GKO_THRUST_NAEMSPACE_POSTFIX } +#define GKO_THRUST_QUALIFIER ::THRUST_CUB_WRAPPED_NAMESPACE::thrust +#else +#define GKO_THRUST_NAEMSPACE_PREFIX +#define GKO_THRUST_NAEMSPACE_POSTFIX +#define GKO_THRUST_QUALIFIER ::thrust +#endif // THRUST_CUB_WRAPPED_NAMESPACE + + +#endif // GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_ diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp index 42ca57eb0bf..e65b179ed68 100644 --- a/common/cuda_hip/base/types.hpp +++ b/common/cuda_hip/base/types.hpp @@ -5,7 +5,7 @@ #ifndef GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_ #define GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_ - +#include "common/cuda_hip/base/math.hpp" #if defined(GKO_COMPILING_CUDA) #include "cuda/base/types.hpp" #elif defined(GKO_COMPILING_HIP) @@ -14,11 +14,14 @@ #error "Executor definition missing" #endif -#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq) \ - GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ - const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ - { \ - return thrust::complex{lhs} _op thrust::complex(rhs); \ + +#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES GKO_INLINE GKO_THRUST_QUALIFIER::complex<__half> \ + operator _op(const GKO_THRUST_QUALIFIER::complex<__half> lhs, \ + const GKO_THRUST_QUALIFIER::complex<__half> rhs) \ + { \ + return GKO_THRUST_QUALIFIER::complex{ \ + lhs} _op GKO_THRUST_QUALIFIER::complex(rhs); \ } THRUST_HALF_FRIEND_OPERATOR(+, +=) From 96dd2da9d1457506dfdfd1060aa968f01d458e52 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 24 Oct 2024 01:11:29 +0200 Subject: [PATCH 05/69] atomic and cooperative_groups --- common/cuda_hip/components/atomic.hpp | 54 ++++++++++++++++++++++- hip/components/cooperative_groups.hip.hpp | 12 ++--- omp/components/atomic.hpp | 54 +++++++++++++++++++++-- 3 files changed, 108 insertions(+), 12 deletions(-) diff --git a/common/cuda_hip/components/atomic.hpp b/common/cuda_hip/components/atomic.hpp index aeb77d48c75..954bc7476ed 100644 --- a/common/cuda_hip/components/atomic.hpp +++ b/common/cuda_hip/components/atomic.hpp @@ -39,6 +39,7 @@ struct atomic_helper { }; +// TODO: consider it implemented by memcpy. template __forceinline__ __device__ ResultType reinterpret(ValueType val) { @@ -95,15 +96,64 @@ __forceinline__ __device__ ResultType reinterpret(ValueType val) } \ }; + +#define GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(CONVERTER_TYPE) \ + template \ + struct atomic_helper< \ + ValueType, \ + std::enable_if_t<(sizeof(ValueType) == sizeof(CONVERTER_TYPE))>> { \ + __forceinline__ __device__ static ValueType atomic_add( \ + ValueType* __restrict__ addr, ValueType val) \ + { \ + assert(false); \ + using c_type = CONVERTER_TYPE; \ + return atomic_wrapper( \ + addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \ + old = *c_addr; \ + *c_addr = reinterpret( \ + val + reinterpret(assumed)); \ + }); \ + } \ + __forceinline__ __device__ static ValueType atomic_max( \ + ValueType* __restrict__ addr, ValueType val) \ + { \ + assert(false); \ + using c_type = CONVERTER_TYPE; \ + return atomic_wrapper( \ + addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \ + if (reinterpret(assumed) < val) { \ + old = *c_addr; \ + *c_addr = reinterpret(assumed); \ + } \ + }); \ + } \ + \ + private: \ + template \ + __forceinline__ __device__ static ValueType atomic_wrapper( \ + ValueType* __restrict__ addr, Callable set_old) \ + { \ + CONVERTER_TYPE* address_as_converter = \ + reinterpret_cast(addr); \ + CONVERTER_TYPE old = *address_as_converter; \ + 
CONVERTER_TYPE assumed = old; \ + set_old(old, assumed, address_as_converter); \ + return reinterpret(old); \ + } \ + }; + // Support 64-bit ATOMIC_ADD and ATOMIC_MAX GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int); // Support 32-bit ATOMIC_ADD and ATOMIC_MAX GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); -#if defined(CUDA_VERSION) -// Support 16-bit ATOMIC_ADD and ATOMIC_MAX only on CUDA +#if defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700) +// Support 16-bit atomicCAS, atomicADD, and atomicMAX only on CUDA with CC +// >= 7.0 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int); +#else +GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(unsigned short int) #endif diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp index 36618bb7f3e..46c2fb195bc 100644 --- a/hip/components/cooperative_groups.hip.hpp +++ b/hip/components/cooperative_groups.hip.hpp @@ -306,7 +306,7 @@ class enable_extended_shuffle : public Group { SelectorType selector) const \ { \ return shuffle_impl( \ - [this](uint32 v, SelectorType s) { \ + [this](uint16 v, SelectorType s) { \ return static_cast(this)->_name(v, s); \ }, \ var, selector); \ @@ -326,12 +326,12 @@ class enable_extended_shuffle : public Group { shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var, SelectorType selector) { - static_assert(sizeof(ValueType) % sizeof(uint32) == 0, - "Unable to shuffle sizes which are not 4-byte multiples"); - constexpr auto value_size = sizeof(ValueType) / sizeof(uint32); + static_assert(sizeof(ValueType) % sizeof(uint16) == 0, + "Unable to shuffle sizes which are not 2-byte multiples"); + constexpr auto value_size = sizeof(ValueType) / sizeof(uint16); ValueType result; - auto var_array = reinterpret_cast(&var); - auto result_array = reinterpret_cast(&result); + auto var_array = reinterpret_cast(&var); + auto result_array = reinterpret_cast(&result); #pragma unroll for (std::size_t i = 0; i < value_size; ++i) { result_array[i] = 
intrinsic_shuffle(var_array[i], selector); diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp index c3580cd36bb..35b94a65fe5 100644 --- a/omp/components/atomic.hpp +++ b/omp/components/atomic.hpp @@ -8,6 +8,7 @@ #include +#include #include #include @@ -32,10 +33,55 @@ void atomic_add(ValueType& out, ValueType val) // The C++ standard explicitly allows casting complex* to double* // [complex.numbers.general] auto values = reinterpret_cast*>(&out); -#pragma omp atomic - values[0] += real(val); -#pragma omp atomic - values[1] += imag(val); + atomic_add(values[0], real(val)); + atomic_add(values[1], imag(val)); +} + + +template +inline ResultType copy_cast(const ValueType& val) +{ + static_assert( + sizeof(ValueType) == sizeof(ResultType) && + std::alignment_of_v == std::alignment_of_v, + "only copy the same alignment and size type"); + ResultType res; + std::memcpy(&res, &val, sizeof(ValueType)); + return res; +} + + +template <> +void atomic_add(half& out, half val) +{ +#ifdef __NVCOMPILER +// NVC++ uses atomic capture on uint16 leads the following error. +// use of undefined value '%L.B*' br label %L.B* !llvm.loop !*, !dbg !* +#pragma omp critical + { + out += val; + } +#else + static_assert( + sizeof(half) == sizeof(uint16_t) && + std::alignment_of_v == std::alignment_of_v, + "half does not fulfill the requirement of reinterpret_cast to half or " + "vice versa."); + // It is undefined behavior with reinterpret_cast, but we do not have any + // workaround when the #omp atomic does not support custom precision + uint16_t* address_as_converter = reinterpret_cast(&out); + uint16_t old = *address_as_converter; + uint16_t assumed; + do { + assumed = old; + auto answer = copy_cast(copy_cast(assumed) + val); +#pragma omp atomic capture + { + old = *address_as_converter; + *address_as_converter = (old == assumed) ? 
answer : old; + } + } while (assumed != old); +#endif } From ee0e55da9152017c2056f07e78128a563655674d Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 24 Oct 2024 01:16:50 +0200 Subject: [PATCH 06/69] fix math and device_numeric_limit --- common/cuda_hip/base/math.hpp | 11 ++ common/cuda_hip/components/merging.hpp | 4 +- .../factorization/par_ict_kernels.cpp | 4 +- .../factorization/par_ilut_select_kernels.hpp | 4 +- .../factorization/par_ilut_spgeam_kernels.cpp | 4 +- common/cuda_hip/reorder/rcm_kernels.cpp | 2 +- cuda/test/base/math.cu | 4 +- hip/test/base/math.hip.cpp | 4 +- include/ginkgo/core/base/math.hpp | 103 +++++------------- 9 files changed, 54 insertions(+), 86 deletions(-) diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index 3d2975c1eee..f83533d8f0d 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -83,6 +83,17 @@ struct truncate_type_impl> { }; +template +struct is_complex_impl> : public std::true_type {}; + +template <> +struct is_complex_or_scalar_impl<__half> : public std::true_type {}; + +template +struct is_complex_or_scalar_impl> + : public is_complex_or_scalar_impl {}; + + } // namespace detail } // namespace gko diff --git a/common/cuda_hip/components/merging.hpp b/common/cuda_hip/components/merging.hpp index ab070741fbd..b832a97176e 100644 --- a/common/cuda_hip/components/merging.hpp +++ b/common/cuda_hip/components/merging.hpp @@ -131,7 +131,7 @@ __forceinline__ __device__ void group_merge(const ValueType* __restrict__ a, IndexType a_begin{}; IndexType b_begin{}; auto lane = static_cast(group.thread_rank()); - auto sentinel = device_numeric_limits::max; + auto sentinel = device_numeric_limits::max(); auto a_cur = checked_load(a, a_begin + lane, a_size, sentinel); auto b_cur = checked_load(b, b_begin + lane, b_size, sentinel); for (IndexType c_begin{}; c_begin < c_size; c_begin += group_size) { @@ -240,7 +240,7 @@ __forceinline__ __device__ void sequential_merge( auto 
c_size = a_size + b_size; IndexType a_begin{}; IndexType b_begin{}; - auto sentinel = device_numeric_limits::max; + auto sentinel = device_numeric_limits::max(); auto a_cur = checked_load(a, a_begin, a_size, sentinel); auto b_cur = checked_load(b, b_begin, b_size, sentinel); for (IndexType c_begin{}; c_begin < c_size; c_begin++) { diff --git a/common/cuda_hip/factorization/par_ict_kernels.cpp b/common/cuda_hip/factorization/par_ict_kernels.cpp index 94aa5e5124e..3446f124123 100644 --- a/common/cuda_hip/factorization/par_ict_kernels.cpp +++ b/common/cuda_hip/factorization/par_ict_kernels.cpp @@ -128,7 +128,7 @@ __global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init( IndexType l_new_begin = l_new_row_ptrs[row]; - constexpr auto sentinel = device_numeric_limits::max; + constexpr auto sentinel = device_numeric_limits::max(); // load column indices and values for the first merge step auto a_col = checked_load(a_col_idxs, a_begin + lane, a_end, sentinel); auto a_val = checked_load(a_vals, a_begin + lane, a_end, zero()); @@ -456,4 +456,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace par_ict_factorization } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp index 6f5940c2b14..79a562ff834 100644 --- a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp +++ b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp @@ -254,7 +254,7 @@ __global__ __launch_bounds__(basecase_block_size) void basecase_select( const ValueType* __restrict__ input, IndexType size, IndexType rank, ValueType* __restrict__ out) { - constexpr auto sentinel = device_numeric_limits::inf; + constexpr auto sentinel = device_numeric_limits::inf(); ValueType local[basecase_local_size]; __shared__ ValueType sh_local[basecase_size]; for (int i = 0; i < 
basecase_local_size; ++i) { @@ -301,4 +301,4 @@ __global__ __launch_bounds__(config::warp_size) void find_bucket( } // namespace kernels } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_KERNELS_HIP_HPP_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_KERNELS_HIP_HPP_ diff --git a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp index 6cc77660394..a29cf6f2cb3 100644 --- a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp +++ b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp @@ -150,7 +150,7 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init( IndexType l_new_begin = l_new_row_ptrs[row]; IndexType u_new_begin = u_new_row_ptrs[row]; - constexpr auto sentinel = device_numeric_limits::max; + constexpr auto sentinel = device_numeric_limits::max(); // load column indices and values for the first merge step auto a_col = checked_load(a_col_idxs, a_begin + lane, a_end, sentinel); auto a_val = checked_load(a_vals, a_begin + lane, a_end, zero()); @@ -396,4 +396,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace par_ilut_factorization } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/reorder/rcm_kernels.cpp b/common/cuda_hip/reorder/rcm_kernels.cpp index 75050d3e977..2bb18cbdd22 100644 --- a/common/cuda_hip/reorder/rcm_kernels.cpp +++ b/common/cuda_hip/reorder/rcm_kernels.cpp @@ -525,7 +525,7 @@ __global__ __launch_bounds__(default_block_size) void ubfs_min_neighbor_kernel( const auto begin = row_ptrs[row]; const auto end = row_ptrs[row + 1]; const auto cur_level = node_levels[row]; - auto min_neighbor = device_numeric_limits::max; + auto min_neighbor = device_numeric_limits::max(); for (auto nz = begin; nz < end; nz++) { const auto col = col_idxs[nz]; const auto neighbor_level = 
node_levels[col]; diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu index d1d9373b0ef..1025c3cc489 100644 --- a/cuda/test/base/math.cu +++ b/cuda/test/base/math.cu @@ -26,7 +26,7 @@ namespace kernel { template __device__ bool test_real_is_finite_function(FuncType isfin) { - constexpr T inf = gko::device_numeric_limits::inf; + constexpr T inf = gko::device_numeric_limits::inf(); constexpr T quiet_nan = NAN; bool test_true{}; bool test_false{}; @@ -46,7 +46,7 @@ __device__ bool test_complex_is_finite_function(FuncType isfin) "Template type must be a complex type."); using T = gko::remove_complex; using c_type = gko::kernels::cuda::cuda_type; - constexpr T inf = gko::device_numeric_limits::inf; + constexpr T inf = gko::device_numeric_limits::inf(); constexpr T quiet_nan = NAN; bool test_true{}; bool test_false{}; diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp index f01b56739d9..f69ca804aa9 100644 --- a/hip/test/base/math.hip.cpp +++ b/hip/test/base/math.hip.cpp @@ -32,7 +32,7 @@ namespace kernel { template __device__ bool test_real_is_finite_function(FuncType isfin) { - constexpr T inf = gko::device_numeric_limits::inf; + constexpr T inf = gko::device_numeric_limits::inf(); constexpr T quiet_nan = NAN; bool test_true{}; bool test_false{}; @@ -52,7 +52,7 @@ __device__ bool test_complex_is_finite_function(FuncType isfin) "Template type must be a complex type."); using T = gko::remove_complex; using c_type = gko::kernels::hip::hip_type; - constexpr T inf = gko::device_numeric_limits::inf; + constexpr T inf = gko::device_numeric_limits::inf(); constexpr T quiet_nan = NAN; bool test_true{}; bool test_false{}; diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 73da407194e..e308b092ea6 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -21,79 +22,6 @@ namespace gko { -class half; - - -// HIP should not 
see std::abs or std::sqrt, we want the custom implementation. -// Hence, provide the using declaration only for some cases -namespace kernels { -namespace reference { - - -using std::abs; - - -using std::sqrt; - - -} // namespace reference -} // namespace kernels - - -namespace kernels { -namespace omp { - - -using std::abs; - - -using std::sqrt; - - -} // namespace omp -} // namespace kernels - - -namespace kernels { -namespace cuda { - - -using std::abs; - - -using std::sqrt; - - -} // namespace cuda -} // namespace kernels - - -namespace kernels { -namespace dpcpp { - - -using std::abs; - - -using std::sqrt; - - -} // namespace dpcpp -} // namespace kernels - - -namespace test { - - -using std::abs; - - -using std::sqrt; - - -} // namespace test - - // type manipulations @@ -706,6 +634,13 @@ GKO_INLINE constexpr T one() return T(1); } +template <> +GKO_INLINE constexpr half one() +{ + constexpr auto bits = static_cast(0b0'01111'0000000000u); + return half::create_from_bits(bits); +} + /** * Returns the multiplicative identity for T. @@ -983,6 +918,7 @@ GKO_INLINE constexpr auto squared_norm(const T& x) return real(conj(x) * x); } +using std::abs; /** * Returns the absolute value of the object. @@ -1008,6 +944,27 @@ abs(const T& x) return sqrt(squared_norm(x)); } +// increase the priority in function lookup +GKO_INLINE gko::half abs(const std::complex& x) +{ + // Using float abs not sqrt on norm to avoid overflow + return static_cast(abs(std::complex(x))); +} + + +using std::sqrt; + +GKO_INLINE gko::half sqrt(gko::half a) +{ + return gko::half(std::sqrt(float(a))); +} + +GKO_INLINE std::complex sqrt(std::complex a) +{ + return std::complex(sqrt(std::complex( + static_cast(a.real()), static_cast(a.imag())))); +} + /** * Returns the value of pi. From c3645784bc75861180f7bb521a37ab12d71fd0ae Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 24 Oct 2024 02:00:19 +0200 Subject: [PATCH 07/69] array operation in half --- .../components/absolute_array_kernels.cpp | 6 ++++-- .../unified/components/fill_array_kernels.cpp | 17 +++++++++++++---- .../components/precision_conversion_kernels.cpp | 3 ++- .../unified/components/reduce_array_kernels.cpp | 3 ++- core/base/array.cpp | 9 +++++---- core/base/segmented_array.cpp | 2 +- core/device_hooks/common_kernels.inc.cpp | 12 ++++++------ include/ginkgo/core/base/segmented_array.hpp | 7 ++++++- reference/components/absolute_array_kernels.cpp | 6 ++++-- reference/components/fill_array_kernels.cpp | 5 +++-- .../components/precision_conversion_kernels.cpp | 3 ++- reference/components/reduce_array_kernels.cpp | 3 ++- 12 files changed, 50 insertions(+), 26 deletions(-) diff --git a/common/unified/components/absolute_array_kernels.cpp b/common/unified/components/absolute_array_kernels.cpp index c9ab364353c..423fa234c39 100644 --- a/common/unified/components/absolute_array_kernels.cpp +++ b/common/unified/components/absolute_array_kernels.cpp @@ -23,7 +23,8 @@ void inplace_absolute_array(std::shared_ptr exec, data); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL); template @@ -37,7 +38,8 @@ void outplace_absolute_array(std::shared_ptr exec, n, in, out); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL); } // namespace components diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp index d78a6e9f346..3e87d782974 100644 --- a/common/unified/components/fill_array_kernels.cpp +++ b/common/unified/components/fill_array_kernels.cpp @@ -23,7 +23,7 @@ void fill_array(std::shared_ptr exec, ValueType* array, array, val); } 
-GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_FILL_ARRAY_KERNEL); template GKO_DECLARE_FILL_ARRAY_KERNEL(bool); @@ -32,11 +32,20 @@ void fill_seq_array(std::shared_ptr exec, ValueType* array, size_type n) { run_kernel( - exec, [] GKO_KERNEL(auto idx, auto array) { array[idx] = idx; }, n, - array); + exec, + [] GKO_KERNEL(auto idx, auto array) { + if constexpr (std::is_same_v, half>) { + // __half can not be from int64_t + array[idx] = static_cast(idx); + } else { + array[idx] = idx; + } + }, + n, array); } -GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF( + GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL); } // namespace components diff --git a/common/unified/components/precision_conversion_kernels.cpp b/common/unified/components/precision_conversion_kernels.cpp index 0402d9bef68..94a8d4e4d0f 100644 --- a/common/unified/components/precision_conversion_kernels.cpp +++ b/common/unified/components/precision_conversion_kernels.cpp @@ -23,7 +23,8 @@ void convert_precision(std::shared_ptr exec, size, in, out); } -GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF( + GKO_DECLARE_CONVERT_PRECISION_KERNEL); } // namespace components diff --git a/common/unified/components/reduce_array_kernels.cpp b/common/unified/components/reduce_array_kernels.cpp index bc8da6fa311..1e7d19264cd 100644 --- a/common/unified/components/reduce_array_kernels.cpp +++ b/common/unified/components/reduce_array_kernels.cpp @@ -34,7 +34,8 @@ void reduce_add_array(std::shared_ptr exec, arr, result); } -GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF( + GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL); } // namespace components diff --git a/core/base/array.cpp b/core/base/array.cpp index 
a41f7c07e55..7a98223a7b2 100644 --- a/core/base/array.cpp +++ b/core/base/array.cpp @@ -51,7 +51,8 @@ void convert_data(std::shared_ptr exec, size_type size, void convert_data(std::shared_ptr, size_type, \ const From*, To*) -GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_ARRAY_CONVERSION); +GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF( + GKO_DECLARE_ARRAY_CONVERSION); } // namespace detail @@ -88,19 +89,19 @@ ValueType reduce_add(const array& input_arr, #define GKO_DECLARE_ARRAY_FILL(_type) void array<_type>::fill(const _type value) -GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_FILL); +GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_ARRAY_FILL); #define GKO_DECLARE_ARRAY_REDUCE_ADD(_type) \ void reduce_add(const array<_type>& arr, array<_type>& value) -GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_REDUCE_ADD); +GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_ARRAY_REDUCE_ADD); #define GKO_DECLARE_ARRAY_REDUCE_ADD2(_type) \ _type reduce_add(const array<_type>& arr, const _type val) -GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_ARRAY_REDUCE_ADD2); +GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_ARRAY_REDUCE_ADD2); } // namespace gko diff --git a/core/base/segmented_array.cpp b/core/base/segmented_array.cpp index d113139f8e2..4c6356799f9 100644 --- a/core/base/segmented_array.cpp +++ b/core/base/segmented_array.cpp @@ -180,7 +180,7 @@ segmented_array& segmented_array::operator=(segmented_array&& other) #define GKO_DECLARE_SEGMENTED_ARRAY(_type) struct segmented_array<_type> -GKO_INSTANTIATE_FOR_EACH_POD_TYPE(GKO_DECLARE_SEGMENTED_ARRAY); +GKO_INSTANTIATE_FOR_EACH_POD_TYPE_WITH_HALF(GKO_DECLARE_SEGMENTED_ARRAY); } // namespace gko diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 6ffeb1c5f71..224aacc7369 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -238,19 +238,19 @@ namespace 
GKO_HOOK_MODULE { namespace components { -GKO_STUB_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL); +GKO_STUB_VALUE_CONVERSION_WITH_HALF(GKO_DECLARE_CONVERT_PRECISION_KERNEL); GKO_STUB_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_NONNEGATIVE_KERNEL); // explicitly instantiate for size_type, as this is // used in the SellP format template GKO_DECLARE_PREFIX_SUM_NONNEGATIVE_KERNEL(size_type); -GKO_STUB_TEMPLATE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); +GKO_STUB_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_FILL_ARRAY_KERNEL); template GKO_DECLARE_FILL_ARRAY_KERNEL(bool); -GKO_STUB_TEMPLATE_TYPE(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL); -GKO_STUB_TEMPLATE_TYPE(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL); +GKO_STUB_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL); +GKO_STUB_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE( diff --git a/include/ginkgo/core/base/segmented_array.hpp b/include/ginkgo/core/base/segmented_array.hpp index 49a7e6f9d38..b34605cc902 100644 --- a/include/ginkgo/core/base/segmented_array.hpp +++ b/include/ginkgo/core/base/segmented_array.hpp @@ -2,7 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#pragma once +#ifndef GKO_PUBLIC_CORE_BASE_SEGMENTED_ARRAY_HPP_ +#define GKO_PUBLIC_CORE_BASE_SEGMENTED_ARRAY_HPP_ + + #include #include @@ -183,3 +186,5 @@ class copy_back_deleter> } // namespace detail } // namespace gko + +#endif // GKO_PUBLIC_CORE_BASE_SEGMENTED_ARRAY_HPP_ diff --git a/reference/components/absolute_array_kernels.cpp b/reference/components/absolute_array_kernels.cpp index 964e1f80d6a..759caae894c 100644 --- 
a/reference/components/absolute_array_kernels.cpp +++ b/reference/components/absolute_array_kernels.cpp @@ -20,7 +20,8 @@ void inplace_absolute_array(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL); template @@ -33,7 +34,8 @@ void outplace_absolute_array(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL); } // namespace components diff --git a/reference/components/fill_array_kernels.cpp b/reference/components/fill_array_kernels.cpp index 1649aa87982..663ad8f5b6b 100644 --- a/reference/components/fill_array_kernels.cpp +++ b/reference/components/fill_array_kernels.cpp @@ -20,7 +20,7 @@ void fill_array(std::shared_ptr exec, ValueType* array, std::fill_n(array, n, val); } -GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_FILL_ARRAY_KERNEL); template GKO_DECLARE_FILL_ARRAY_KERNEL(bool); @@ -31,7 +31,8 @@ void fill_seq_array(std::shared_ptr exec, std::iota(array, array + n, 0); } -GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF( + GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL); } // namespace components diff --git a/reference/components/precision_conversion_kernels.cpp b/reference/components/precision_conversion_kernels.cpp index db12d9316ee..5ec37a1cd72 100644 --- a/reference/components/precision_conversion_kernels.cpp +++ b/reference/components/precision_conversion_kernels.cpp @@ -20,7 +20,8 @@ void convert_precision(std::shared_ptr exec, std::copy_n(in, size, out); } -GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_WITH_HALF( + 
GKO_DECLARE_CONVERT_PRECISION_KERNEL); } // namespace components diff --git a/reference/components/reduce_array_kernels.cpp b/reference/components/reduce_array_kernels.cpp index a70ef95a878..3c3c6f620ec 100644 --- a/reference/components/reduce_array_kernels.cpp +++ b/reference/components/reduce_array_kernels.cpp @@ -22,7 +22,8 @@ void reduce_add_array(std::shared_ptr exec, val.get_const_data()[0]); } -GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE_WITH_HALF( + GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL); } // namespace components From 7b98069316124814547e0a11681668fa0652631a Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 23 Oct 2024 18:30:29 +0200 Subject: [PATCH 08/69] matrix with half --- common/cuda_hip/matrix/coo_kernels.cpp | 10 +- .../matrix/csr_kernels.instantiate.cpp | 124 ++++--- .../cuda_hip/matrix/csr_kernels.template.cpp | 6 +- common/cuda_hip/matrix/dense_kernels.cpp | 33 +- common/cuda_hip/matrix/diagonal_kernels.cpp | 2 +- common/cuda_hip/matrix/ell_kernels.cpp | 4 +- .../matrix/fbcsr_kernels.instantiate.cpp | 21 +- .../matrix/fbcsr_kernels.template.cpp | 2 +- common/cuda_hip/matrix/sellp_kernels.cpp | 5 +- .../cuda_hip/matrix/sparsity_csr_kernels.cpp | 14 +- common/unified/matrix/coo_kernels.cpp | 4 +- common/unified/matrix/csr_kernels.cpp | 16 +- .../matrix/dense_kernels.instantiate.cpp | 100 ++--- common/unified/matrix/diagonal_kernels.cpp | 14 +- common/unified/matrix/ell_kernels.cpp | 13 +- common/unified/matrix/hybrid_kernels.cpp | 4 +- .../matrix/scaled_permutation_kernels.cpp | 4 +- common/unified/matrix/sellp_kernels.cpp | 10 +- .../unified/matrix/sparsity_csr_kernels.cpp | 6 +- core/device_hooks/common_kernels.inc.cpp | 351 +++++++++++------- core/matrix/coo.cpp | 29 +- core/matrix/csr.cpp | 29 +- core/matrix/dense.cpp | 38 +- core/matrix/diagonal.cpp | 30 +- core/matrix/ell.cpp | 30 +- core/matrix/fbcsr.cpp | 32 +- core/matrix/hybrid.cpp | 32 +- 
core/matrix/identity.cpp | 4 +- core/matrix/permutation.cpp | 7 +- core/matrix/row_gatherer.cpp | 13 +- core/matrix/scaled_permutation.cpp | 2 +- core/matrix/sellp.cpp | 33 +- core/matrix/sparsity_csr.cpp | 3 +- dpcpp/matrix/coo_kernels.dp.cpp | 10 +- dpcpp/matrix/csr_kernels.dp.cpp | 74 ++-- dpcpp/matrix/dense_kernels.dp.cpp | 86 +++-- dpcpp/matrix/diagonal_kernels.dp.cpp | 2 +- dpcpp/matrix/ell_kernels.dp.cpp | 4 +- dpcpp/matrix/fbcsr_kernels.dp.cpp | 21 +- dpcpp/matrix/sellp_kernels.dp.cpp | 5 +- dpcpp/matrix/sparsity_csr_kernels.dp.cpp | 14 +- .../ginkgo/core/base/precision_dispatch.hpp | 29 +- include/ginkgo/core/matrix/coo.hpp | 57 ++- include/ginkgo/core/matrix/csr.hpp | 73 ++-- include/ginkgo/core/matrix/dense.hpp | 34 +- include/ginkgo/core/matrix/diagonal.hpp | 36 +- include/ginkgo/core/matrix/ell.hpp | 57 ++- include/ginkgo/core/matrix/fbcsr.hpp | 61 ++- include/ginkgo/core/matrix/hybrid.hpp | 40 +- include/ginkgo/core/matrix/sellp.hpp | 57 ++- omp/matrix/coo_kernels.cpp | 10 +- omp/matrix/csr_kernels.cpp | 60 +-- omp/matrix/dense_kernels.cpp | 33 +- omp/matrix/diagonal_kernels.cpp | 2 +- omp/matrix/ell_kernels.cpp | 4 +- omp/matrix/fbcsr_kernels.cpp | 21 +- omp/matrix/sellp_kernels.cpp | 5 +- omp/matrix/sparsity_csr_kernels.cpp | 10 +- reference/matrix/coo_kernels.cpp | 14 +- reference/matrix/csr_kernels.cpp | 76 ++-- reference/matrix/dense_kernels.cpp | 132 ++++--- reference/matrix/diagonal_kernels.cpp | 16 +- reference/matrix/ell_kernels.cpp | 19 +- reference/matrix/fbcsr_kernels.cpp | 21 +- reference/matrix/hybrid_kernels.cpp | 4 +- .../matrix/scaled_permutation_kernels.cpp | 4 +- reference/matrix/sellp_kernels.cpp | 15 +- reference/matrix/sparsity_csr_kernels.cpp | 16 +- reference/test/base/combination.cpp | 13 +- reference/test/matrix/coo_kernels.cpp | 17 +- reference/test/matrix/csr_kernels.cpp | 8 +- reference/test/matrix/dense_kernels.cpp | 21 +- reference/test/matrix/diagonal_kernels.cpp | 15 +- reference/test/matrix/ell_kernels.cpp | 17 +- 
reference/test/matrix/fbcsr_kernels.cpp | 17 +- reference/test/matrix/hybrid_kernels.cpp | 17 +- reference/test/matrix/scaled_permutation.cpp | 3 +- reference/test/matrix/sellp_kernels.cpp | 26 +- test/matrix/fbcsr_kernels.cpp | 14 +- test/matrix/matrix.cpp | 10 +- 80 files changed, 1480 insertions(+), 845 deletions(-) diff --git a/common/cuda_hip/matrix/coo_kernels.cpp b/common/cuda_hip/matrix/coo_kernels.cpp index cffe18d981b..4609f9f7f95 100644 --- a/common/cuda_hip/matrix/coo_kernels.cpp +++ b/common/cuda_hip/matrix/coo_kernels.cpp @@ -238,7 +238,8 @@ void spmv(std::shared_ptr exec, spmv2(exec, a, b, c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_COO_SPMV_KERNEL); template @@ -253,7 +254,7 @@ void advanced_spmv(std::shared_ptr exec, advanced_spmv2(exec, alpha, a, b, c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL); @@ -295,7 +296,8 @@ void spmv2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_COO_SPMV2_KERNEL); template @@ -338,7 +340,7 @@ void advanced_spmv2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); diff --git a/common/cuda_hip/matrix/csr_kernels.instantiate.cpp b/common/cuda_hip/matrix/csr_kernels.instantiate.cpp index f62ca1c1815..2e28de95f5d 100644 --- a/common/cuda_hip/matrix/csr_kernels.instantiate.cpp +++ b/common/cuda_hip/matrix/csr_kernels.instantiate.cpp @@ -17,108 +17,136 @@ namespace csr { // begin -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); // split 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(GKO_DECLARE_CSR_SPMV_KERNEL, - int32); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF( + GKO_DECLARE_CSR_SPMV_KERNEL, int32); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(GKO_DECLARE_CSR_SPMV_KERNEL, - int32); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF( + GKO_DECLARE_CSR_SPMV_KERNEL, int32); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL, - int32); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF( + GKO_DECLARE_CSR_SPMV_KERNEL, int32); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL, - int32); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF( + GKO_DECLARE_CSR_SPMV_KERNEL, int32); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(GKO_DECLARE_CSR_SPMV_KERNEL, - int64); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF( + GKO_DECLARE_CSR_SPMV_KERNEL, int32); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(GKO_DECLARE_CSR_SPMV_KERNEL, - int64); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF( + GKO_DECLARE_CSR_SPMV_KERNEL, int32); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL, - int64); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF( + GKO_DECLARE_CSR_SPMV_KERNEL, int64); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL, - int64); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF( + GKO_DECLARE_CSR_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF( + GKO_DECLARE_CSR_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF( + GKO_DECLARE_CSR_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF( + GKO_DECLARE_CSR_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF( + 
GKO_DECLARE_CSR_SPMV_KERNEL, int64); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1_WITH_HALF( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2_WITH_HALF( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + 
GKO_DECLARE_CSR_TRANSPOSE_KERNEL); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_SPGEMM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_SPGEAM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL); 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_EXTRACT_DIAGONAL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); // end diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp index 909349ed7ab..f808e234670 100644 --- a/common/cuda_hip/matrix/csr_kernels.template.cpp +++ b/common/cuda_hip/matrix/csr_kernels.template.cpp @@ -278,7 +278,7 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_spmv( { using arithmetic_type = typename output_accessor::arithmetic_type; using 
output_type = typename output_accessor::storage_type; - const arithmetic_type scale_factor = alpha[0]; + const auto scale_factor = static_cast(alpha[0]); spmv_kernel(nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, c, [&scale_factor](const arithmetic_type& x) { return static_cast(scale_factor * x); @@ -486,7 +486,7 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_reduce( const IndexType* __restrict__ last_row, const MatrixValueType* __restrict__ alpha, acc::range c) { - const arithmetic_type alpha_val = alpha[0]; + const auto alpha_val = static_cast(alpha[0]); merge_path_reduce( nwarps, last_val, last_row, c, [&alpha_val](const arithmetic_type& x) { return alpha_val * x; }); @@ -1193,7 +1193,7 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( const auto i = base_i + lane; const auto col = i < row_len ? local_cols[i] - : device_numeric_limits::max; + : device_numeric_limits::max(); const auto rel_col = static_cast(col - min_col); const auto block = rel_col / bitmap_block_size; const auto col_in_block = rel_col % bitmap_block_size; diff --git a/common/cuda_hip/matrix/dense_kernels.cpp b/common/cuda_hip/matrix/dense_kernels.cpp index d8391ace023..d0d4985dd82 100644 --- a/common/cuda_hip/matrix/dense_kernels.cpp +++ b/common/cuda_hip/matrix/dense_kernels.cpp @@ -461,7 +461,7 @@ void convert_to_coo(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); @@ -491,7 +491,7 @@ void convert_to_csr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); @@ -521,7 +521,7 @@ void convert_to_ell(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); @@ -544,7 +544,7 @@ void 
convert_to_fbcsr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); @@ -565,7 +565,7 @@ void count_nonzero_blocks_per_row(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); @@ -598,7 +598,7 @@ void convert_to_hybrid(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL); @@ -629,7 +629,7 @@ void convert_to_sellp(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL); @@ -657,7 +657,7 @@ void convert_to_sparsity_csr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); @@ -681,7 +681,7 @@ void compute_dot_dispatch(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); @@ -706,7 +706,7 @@ void compute_conj_dot_dispatch(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); @@ -729,7 +729,7 @@ void compute_norm2_dispatch(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); @@ -760,7 +760,8 @@ void simple_apply(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); template @@ -787,7 +788,7 @@ 
void apply(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL); template @@ -812,7 +813,8 @@ void transpose(std::shared_ptr exec, } }; -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); template @@ -837,7 +839,8 @@ void conj_transpose(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); } // namespace dense diff --git a/common/cuda_hip/matrix/diagonal_kernels.cpp b/common/cuda_hip/matrix/diagonal_kernels.cpp index e12d3ed4f9f..baee454c36d 100644 --- a/common/cuda_hip/matrix/diagonal_kernels.cpp +++ b/common/cuda_hip/matrix/diagonal_kernels.cpp @@ -81,7 +81,7 @@ void apply_to_csr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL); diff --git a/common/cuda_hip/matrix/ell_kernels.cpp b/common/cuda_hip/matrix/ell_kernels.cpp index bfdd3f21e51..16371166662 100644 --- a/common/cuda_hip/matrix/ell_kernels.cpp +++ b/common/cuda_hip/matrix/ell_kernels.cpp @@ -354,7 +354,7 @@ void spmv(std::shared_ptr exec, b, c); } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_SPMV_KERNEL); @@ -388,7 +388,7 @@ void advanced_spmv(std::shared_ptr exec, b, c, alpha, beta); } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); diff --git a/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp b/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp index a3beaac4a85..a7a0263cd35 100644 --- 
a/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp +++ b/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp @@ -17,26 +17,27 @@ namespace fbcsr { // begin -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_FBCSR_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); // end diff --git a/common/cuda_hip/matrix/fbcsr_kernels.template.cpp b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp index 23f865b6ace..e10cf10b540 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.template.cpp +++ b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp @@ -564,7 +564,7 @@ void 
transpose_blocks_impl(syn::value_list, if (grid_dim > 0) { kernel::transpose_blocks <<get_stream()>>>( - nbnz, mat->get_values()); + nbnz, as_device_type(mat->get_values())); } } diff --git a/common/cuda_hip/matrix/sellp_kernels.cpp b/common/cuda_hip/matrix/sellp_kernels.cpp index 3e8fba395b3..4d37a0452a6 100644 --- a/common/cuda_hip/matrix/sellp_kernels.cpp +++ b/common/cuda_hip/matrix/sellp_kernels.cpp @@ -105,7 +105,8 @@ void spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SELLP_SPMV_KERNEL); template @@ -131,7 +132,7 @@ void advanced_spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); diff --git a/common/cuda_hip/matrix/sparsity_csr_kernels.cpp b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp index 269708e19ae..ddda357fa31 100644 --- a/common/cuda_hip/matrix/sparsity_csr_kernels.cpp +++ b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp @@ -72,11 +72,11 @@ __device__ void device_classical_spmv(const size_type num_rows, const auto subrow = thread::get_subwarp_num_flat(); const auto subid = subwarp_tile.thread_rank(); const IndexType column_id = blockIdx.y; - const arithmetic_type value = val[0]; + const auto value = static_cast(val[0]); auto row = thread::get_subwarp_id_flat(); for (; row < num_rows; row += subrow) { const auto ind_end = row_ptrs[row + 1]; - arithmetic_type temp_val = zero(); + auto temp_val = zero(); for (auto ind = row_ptrs[row] + subid; ind < ind_end; ind += subwarp_size) { temp_val += value * b(col_idxs[ind], column_id); @@ -138,7 +138,7 @@ void transpose(std::shared_ptr exec, matrix::SparsityCsr* trans) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL); @@ -246,7 
+246,7 @@ void spmv(std::shared_ptr exec, syn::value_list(), syn::type_list<>(), exec, a, b, c); } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); @@ -264,7 +264,7 @@ void advanced_spmv(std::shared_ptr exec, syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, beta); } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); @@ -297,7 +297,7 @@ void sort_by_column_index(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX); @@ -320,7 +320,7 @@ void is_sorted_by_column_index( cpu_array = gpu_array; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX); diff --git a/common/unified/matrix/coo_kernels.cpp b/common/unified/matrix/coo_kernels.cpp index ce13d7500ab..233dffc6f37 100644 --- a/common/unified/matrix/coo_kernels.cpp +++ b/common/unified/matrix/coo_kernels.cpp @@ -38,7 +38,7 @@ void extract_diagonal(std::shared_ptr exec, diag->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL); @@ -58,7 +58,7 @@ void fill_in_dense(std::shared_ptr exec, orig->get_const_row_idxs(), orig->get_const_col_idxs(), result); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL); diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index 5236c1c9da9..d5741bb3e1c 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp @@ -52,7 +52,7 @@ void 
inv_col_permute(std::shared_ptr exec, col_permuted->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL); @@ -86,7 +86,7 @@ void inv_col_scale_permute(std::shared_ptr exec, col_permuted->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL); @@ -102,7 +102,8 @@ void scale(std::shared_ptr exec, x->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_SCALE_KERNEL); template @@ -117,7 +118,8 @@ void inv_scale(std::shared_ptr exec, x->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_INV_SCALE_KERNEL); template @@ -152,7 +154,7 @@ void convert_to_sellp(std::shared_ptr exec, output->get_col_idxs(), output->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); @@ -183,7 +185,7 @@ void convert_to_ell(std::shared_ptr exec, output->get_col_idxs(), output->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); @@ -227,7 +229,7 @@ void convert_to_hybrid(std::shared_ptr exec, result->get_coo_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); diff --git a/common/unified/matrix/dense_kernels.instantiate.cpp b/common/unified/matrix/dense_kernels.instantiate.cpp index aca8ad5bec4..dcf48573fc6 100644 --- a/common/unified/matrix/dense_kernels.instantiate.cpp +++ 
b/common/unified/matrix/dense_kernels.instantiate.cpp @@ -12,87 +12,99 @@ namespace dense { // begin -GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY( +GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_WITH_HALF( GKO_DECLARE_DENSE_COPY_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_FILL_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_SCALE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF( GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF( GKO_DECLARE_DENSE_SUB_SCALED_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF( GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF( GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( 
GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MAKE_COMPLEX_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_REAL_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_IMAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF( GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + 
GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL); // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL); // end diff --git a/common/unified/matrix/diagonal_kernels.cpp b/common/unified/matrix/diagonal_kernels.cpp index dae037a5134..75960e800d7 100644 --- a/common/unified/matrix/diagonal_kernels.cpp +++ b/common/unified/matrix/diagonal_kernels.cpp @@ -36,7 +36,8 @@ void apply_to_dense(std::shared_ptr exec, b->get_size(), a->get_const_values(), b, c, inverse); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL); template @@ -53,7 +54,7 @@ void right_apply_to_dense(std::shared_ptr exec, b->get_size(), a->get_const_values(), b, c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_DENSE_KERNEL); @@ -74,7 +75,7 @@ void right_apply_to_csr(std::shared_ptr exec, 
c->get_const_col_idxs()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_CSR_KERNEL); @@ -95,7 +96,7 @@ void fill_in_matrix_data(std::shared_ptr exec, output->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DIAGONAL_FILL_IN_MATRIX_DATA_KERNEL); @@ -120,7 +121,7 @@ void convert_to_csr(std::shared_ptr exec, result->get_col_idxs(), result->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DIAGONAL_CONVERT_TO_CSR_KERNEL); @@ -137,7 +138,8 @@ void conj_transpose(std::shared_ptr exec, orig->get_size()[0], orig->get_const_values(), trans->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL); } // namespace diagonal diff --git a/common/unified/matrix/ell_kernels.cpp b/common/unified/matrix/ell_kernels.cpp index 6d23e08b68b..24fc90a888e 100644 --- a/common/unified/matrix/ell_kernels.cpp +++ b/common/unified/matrix/ell_kernels.cpp @@ -67,7 +67,7 @@ void fill_in_matrix_data(std::shared_ptr exec, output->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL); @@ -94,7 +94,7 @@ void fill_in_dense(std::shared_ptr exec, source->get_const_values(), result); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL); @@ -121,7 +121,8 @@ void copy(std::shared_ptr exec, result->get_col_idxs(), result->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COPY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_ELL_COPY_KERNEL); 
template @@ -150,7 +151,7 @@ void convert_to_csr(std::shared_ptr exec, result->get_col_idxs(), result->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL); @@ -172,7 +173,7 @@ void count_nonzeros_per_row(std::shared_ptr exec, static_cast(source->get_stride()), source->get_const_col_idxs()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL); @@ -198,7 +199,7 @@ void extract_diagonal(std::shared_ptr exec, orig->get_const_values(), diag->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL); diff --git a/common/unified/matrix/hybrid_kernels.cpp b/common/unified/matrix/hybrid_kernels.cpp index 8a21a2415f7..79a596febea 100644 --- a/common/unified/matrix/hybrid_kernels.cpp +++ b/common/unified/matrix/hybrid_kernels.cpp @@ -89,7 +89,7 @@ void fill_in_matrix_data(std::shared_ptr exec, result->get_coo_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL); @@ -150,7 +150,7 @@ void convert_to_csr(std::shared_ptr exec, coo_row_ptrs, result->get_col_idxs(), result->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); diff --git a/common/unified/matrix/scaled_permutation_kernels.cpp b/common/unified/matrix/scaled_permutation_kernels.cpp index 3eaab65e8e6..4cdc7974e50 100644 --- a/common/unified/matrix/scaled_permutation_kernels.cpp +++ b/common/unified/matrix/scaled_permutation_kernels.cpp @@ -32,7 +32,7 @@ void invert(std::shared_ptr exec, size, input_scale, input_permutation, output_scale, output_permutation); } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL); @@ -58,7 +58,7 @@ void compose(std::shared_ptr exec, output_permutation, output_scale); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL); diff --git a/common/unified/matrix/sellp_kernels.cpp b/common/unified/matrix/sellp_kernels.cpp index 93b71ff43f2..23bfe160a69 100644 --- a/common/unified/matrix/sellp_kernels.cpp +++ b/common/unified/matrix/sellp_kernels.cpp @@ -87,7 +87,7 @@ void fill_in_matrix_data(std::shared_ptr exec, output->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL); @@ -119,7 +119,7 @@ void fill_in_dense(std::shared_ptr exec, source->get_const_values(), result); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL); @@ -149,7 +149,7 @@ void count_nonzeros_per_row(std::shared_ptr exec, source->get_const_slice_sets(), source->get_const_col_idxs(), result); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL); @@ -183,7 +183,7 @@ void convert_to_csr(std::shared_ptr exec, result->get_col_idxs(), result->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); @@ -215,7 +215,7 @@ void extract_diagonal(std::shared_ptr exec, orig->get_const_values(), diag->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL); diff --git a/common/unified/matrix/sparsity_csr_kernels.cpp 
b/common/unified/matrix/sparsity_csr_kernels.cpp index c5a9c79a89b..b3f26358ad3 100644 --- a/common/unified/matrix/sparsity_csr_kernels.cpp +++ b/common/unified/matrix/sparsity_csr_kernels.cpp @@ -41,7 +41,7 @@ void fill_in_dense(std::shared_ptr exec, input->get_const_col_idxs(), input->get_const_value(), output); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL); @@ -70,7 +70,7 @@ void diagonal_element_prefix_sum( components::prefix_sum_nonnegative(exec, prefix_sum, num_rows + 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_DIAGONAL_ELEMENT_PREFIX_SUM_KERNEL); @@ -106,7 +106,7 @@ void remove_diagonal_elements(std::shared_ptr exec, matrix->get_col_idxs()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL); diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 224aacc7369..78b80ec2859 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -411,69 +411,93 @@ GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL); namespace dense { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -GKO_STUB_VALUE_CONVERSION_OR_COPY(GKO_DECLARE_DENSE_COPY_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL); -GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL); -GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_INV_SCALE_KERNEL); -GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); -GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_KERNEL); -GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL); 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL); +GKO_STUB_VALUE_CONVERSION_OR_COPY_WITH_HALF(GKO_DECLARE_DENSE_COPY_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_FILL_KERNEL); +GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SCALE_KERNEL); 
+GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(GKO_DECLARE_DENSE_INV_SCALE_KERNEL); +GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); +GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SUB_SCALED_KERNEL); +GKO_STUB_VALUE_AND_SCALAR_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); 
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); -GKO_STUB_VALUE_AND_INDEX_TYPE( +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); -GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2(GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); -GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2( +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); +GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF( + GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); +GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF( GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL); 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); 
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MAKE_COMPLEX_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_REAL_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_IMAG_KERNEL); } // namespace dense @@ -482,13 +506,17 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL); namespace diagonal { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_DENSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_CSR_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIAGONAL_CONVERT_TO_CSR_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIAGONAL_FILL_IN_MATRIX_DATA_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_DENSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_CSR_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DIAGONAL_CONVERT_TO_CSR_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DIAGONAL_FILL_IN_MATRIX_DATA_KERNEL); } // namespace diagonal @@ -675,17 +703,21 @@ GKO_STUB_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL); namespace sparsity_csr { -GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); -GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE( +GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); +GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( 
GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_DIAGONAL_ELEMENT_PREFIX_SUM_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX); -GKO_STUB_VALUE_AND_INDEX_TYPE( +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX); @@ -695,38 +727,54 @@ GKO_STUB_VALUE_AND_INDEX_TYPE( namespace csr { -GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPMV_KERNEL); -GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL); 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( +GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_SPMV_KERNEL); +GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_SPGEMM_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_SPGEAM_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); 
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL); 
GKO_STUB_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL); GKO_STUB_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); @@ -735,12 +783,14 @@ GKO_STUB_INDEX_TYPE(GKO_DECLARE_CSR_BENCHMARK_LOOKUP_KERNEL); template GKO_DECLARE_CSR_SCALE_KERNEL(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_SCALE_KERNEL); template GKO_DECLARE_CSR_INV_SCALE_KERNEL(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_INV_SCALE_KERNEL); } // namespace csr @@ -749,16 +799,20 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL); namespace fbcsr { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_SPMV_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + 
GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); } // namespace fbcsr @@ -767,12 +821,13 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); namespace coo { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_SPMV_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_SPMV2_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL); } // namespace coo @@ -781,15 +836,19 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL); namespace ell { -GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); -GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL); +GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_SPMV_KERNEL); +GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); 
+GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL); GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELL_COMPUTE_MAX_ROW_NNZ_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COPY_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_COPY_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL); } // namespace ell @@ -822,8 +881,10 @@ namespace hybrid { GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_COO_ROW_PTRS_KERNEL); GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); } // namespace hybrid @@ -842,8 +903,10 @@ GKO_STUB_INDEX_TYPE(GKO_DECLARE_PERMUTATION_COMPOSE_KERNEL); namespace scaled_permutation { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL); } // namespace scaled_permutation @@ -852,14 +915,18 @@ 
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL); namespace sellp { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SELLP_SPMV_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL); GKO_STUB_INDEX_TYPE(GKO_DECLARE_SELLP_COMPUTE_SLICE_SETS_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL); } // namespace sellp diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 1368dc261c3..7b3b3876295 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -214,7 +214,7 @@ void Coo::apply2_impl(const LinOp* alpha, const LinOp* b, template void Coo::convert_to( - Coo, IndexType>* result) const + Coo, IndexType>* result) const { result->values_ = this->values_; result->row_idxs_ = this->row_idxs_; @@ -225,12 +225,35 @@ void Coo::convert_to( template void Coo::move_to( - Coo, IndexType>* result) + Coo, IndexType>* result) { this->convert_to(result); } +#if GINKGO_ENABLE_HALF +template +void Coo::convert_to( + Coo>, + IndexType>* result) const +{ + result->values_ = this->values_; + result->row_idxs_ = 
this->row_idxs_; + result->col_idxs_ = this->col_idxs_; + result->set_size(this->get_size()); +} + + +template +void Coo::move_to( + Coo>, + IndexType>* result) +{ + this->convert_to(result); +} +#endif + + template void Coo::convert_to( Csr* result) const @@ -404,7 +427,7 @@ Coo::compute_absolute() const #define GKO_DECLARE_COO_MATRIX(ValueType, IndexType) \ class Coo -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_MATRIX); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_COO_MATRIX); } // namespace matrix diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 897eb1a48db..1bb3e778478 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -304,7 +304,7 @@ void Csr::apply_impl(const LinOp* alpha, const LinOp* b, template void Csr::convert_to( - Csr, IndexType>* result) const + Csr, IndexType>* result) const { result->values_ = this->values_; result->col_idxs_ = this->col_idxs_; @@ -316,11 +316,34 @@ void Csr::convert_to( template void Csr::move_to( - Csr, IndexType>* result) + Csr, IndexType>* result) { this->convert_to(result); } +#if GINKGO_ENABLE_HALF +template +void Csr::convert_to( + Csr>, + IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->set_size(this->get_size()); + convert_strategy_helper(result); +} + + +template +void Csr::move_to( + Csr>, + IndexType>* result) +{ + this->convert_to(result); +} +#endif + template void Csr::convert_to( @@ -1047,7 +1070,7 @@ void Csr::add_scaled_identity_impl(const LinOp* a, #define GKO_DECLARE_CSR_MATRIX(ValueType, IndexType) \ class Csr -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_MATRIX); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CSR_MATRIX); } // namespace matrix diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 367b0232969..071e689232e 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -582,7 
+582,7 @@ Dense::Dense(Dense&& other) : Dense(other.get_executor()) template void Dense::convert_to( - Dense>* result) const + Dense>* result) const { if (result->get_size() != this->get_size()) { result->set_size(this->get_size()); @@ -597,12 +597,41 @@ void Dense::convert_to( template -void Dense::move_to(Dense>* result) +void Dense::move_to( + Dense>* result) { this->convert_to(result); } +#if GINKGO_ENABLE_HALF +template +void Dense::convert_to( + Dense>>* + result) const +{ + if (result->get_size() != this->get_size()) { + result->set_size(this->get_size()); + result->stride_ = stride_; + result->values_.resize_and_reset(result->get_size()[0] * + result->stride_); + } + auto exec = this->get_executor(); + exec->run(dense::make_copy( + this, make_temporary_output_clone(exec, result).get())); +} + + +template +void Dense::move_to( + Dense>>* + result) +{ + this->convert_to(result); +} +#endif + + template template void Dense::convert_impl(Coo* result) const @@ -1519,7 +1548,8 @@ template void gather_mixed_real_complex(Function fn, LinOp* out) { #ifdef GINKGO_MIXED_PRECISION - run>(out, fn); + run, + next_precision_with_half>>(out, fn); #else precision_dispatch(fn, out); #endif @@ -2029,7 +2059,7 @@ Dense::Dense(std::shared_ptr exec, #define GKO_DECLARE_DENSE_MATRIX(_type) class Dense<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_MATRIX); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_MATRIX); } // namespace matrix diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index 1a442ffc789..85c5739b529 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -149,7 +149,7 @@ std::unique_ptr Diagonal::conj_transpose() const template void Diagonal::convert_to( - Diagonal>* result) const + Diagonal>* result) const { result->values_ = this->values_; result->set_size(this->get_size()); @@ -157,12 +157,34 @@ void Diagonal::convert_to( template -void Diagonal::move_to(Diagonal>* result) +void Diagonal::move_to( + 
Diagonal>* result) { this->convert_to(result); } +#if GINKGO_ENABLE_HALF +template +void Diagonal::convert_to( + Diagonal>>* + result) const +{ + result->values_ = this->values_; + result->set_size(this->get_size()); +} + + +template +void Diagonal::move_to( + Diagonal>>* + result) +{ + this->convert_to(result); +} +#endif + + template void Diagonal::convert_to(Csr* result) const { @@ -373,7 +395,7 @@ std::unique_ptr> Diagonal::create_const( #define GKO_DECLARE_DIAGONAL_MATRIX(value_type) class Diagonal -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_MATRIX); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_MATRIX); } // namespace matrix @@ -391,7 +413,7 @@ std::unique_ptr DiagonalExtractable::extract_diagonal_linop() #define GKO_DECLARE_DIAGONAL_EXTRACTABLE(value_type) \ std::unique_ptr \ DiagonalExtractable::extract_diagonal_linop() const -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_EXTRACTABLE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DIAGONAL_EXTRACTABLE); } // namespace gko diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index 600c2ceb9d2..eafd9fa9cad 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -154,7 +154,7 @@ void Ell::apply_impl(const LinOp* alpha, const LinOp* b, template void Ell::convert_to( - Ell, IndexType>* result) const + Ell, IndexType>* result) const { result->values_ = this->values_; result->col_idxs_ = this->col_idxs_; @@ -166,12 +166,36 @@ void Ell::convert_to( template void Ell::move_to( - Ell, IndexType>* result) + Ell, IndexType>* result) { this->convert_to(result); } +#if GINKGO_ENABLE_HALF +template +void Ell::convert_to( + Ell>, + IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->num_stored_elements_per_row_ = this->num_stored_elements_per_row_; + result->stride_ = this->stride_; + result->set_size(this->get_size()); +} + + +template +void Ell::move_to( + Ell>, + IndexType>* result) +{ + 
this->convert_to(result); +} +#endif + + template void Ell::convert_to(Dense* result) const { @@ -401,7 +425,7 @@ Ell::Ell(std::shared_ptr exec, #define GKO_DECLARE_ELL_MATRIX(ValueType, IndexType) \ class Ell -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_MATRIX); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ELL_MATRIX); } // namespace matrix diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index 8ed9b117280..f1612be10e0 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -145,7 +145,7 @@ void Fbcsr::apply_impl(const LinOp* alpha, const LinOp* b, template void Fbcsr::convert_to( - Fbcsr, IndexType>* result) const + Fbcsr, IndexType>* const result) const { result->values_ = this->values_; result->col_idxs_ = this->col_idxs_; @@ -158,12 +158,37 @@ void Fbcsr::convert_to( template void Fbcsr::move_to( - Fbcsr, IndexType>* result) + Fbcsr, IndexType>* const result) { this->convert_to(result); } +#if GINKGO_ENABLE_HALF +template +void Fbcsr::convert_to( + Fbcsr>, + IndexType>* const result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->set_size(this->get_size()); + // block sizes are immutable except for assignment/conversion + result->bs_ = this->bs_; +} + + +template +void Fbcsr::move_to( + Fbcsr>, + IndexType>* const result) +{ + this->convert_to(result); +} +#endif + + template void Fbcsr::convert_to(Dense* result) const { @@ -474,7 +499,8 @@ Fbcsr::Fbcsr(std::shared_ptr exec, #define GKO_DECLARE_FBCSR_MATRIX(ValueType, IndexType) \ class Fbcsr -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_MATRIX); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_FBCSR_MATRIX); } // namespace matrix diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index d450a0dfc35..72137558a10 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -203,7 +203,7 @@ void 
Hybrid::apply_impl(const LinOp* alpha, template void Hybrid::convert_to( - Hybrid, IndexType>* result) const + Hybrid, IndexType>* result) const { this->ell_->convert_to(result->ell_); this->coo_->convert_to(result->coo_); @@ -216,12 +216,37 @@ void Hybrid::convert_to( template void Hybrid::move_to( - Hybrid, IndexType>* result) + Hybrid, IndexType>* result) { this->convert_to(result); } +#if GINKGO_ENABLE_HALF +template +void Hybrid::convert_to( + Hybrid>, + IndexType>* result) const +{ + this->ell_->convert_to(result->ell_.get()); + this->coo_->convert_to(result->coo_.get()); + // TODO set strategy correctly + // There is no way to correctly clone the strategy like in + // Csr::convert_to + result->set_size(this->get_size()); +} + + +template +void Hybrid::move_to( + Hybrid>, + IndexType>* result) +{ + this->convert_to(result); +} +#endif + + template void Hybrid::convert_to(Dense* result) const { @@ -418,7 +443,8 @@ Hybrid::compute_absolute() const #define GKO_DECLARE_HYBRID_MATRIX(ValueType, IndexType) \ class Hybrid -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_MATRIX); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_HYBRID_MATRIX); } // namespace matrix diff --git a/core/matrix/identity.cpp b/core/matrix/identity.cpp index 7e035be82a3..ecd93b6f959 100644 --- a/core/matrix/identity.cpp +++ b/core/matrix/identity.cpp @@ -83,9 +83,9 @@ std::unique_ptr> Identity::create( #define GKO_DECLARE_IDENTITY_MATRIX(_type) class Identity<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDENTITY_MATRIX); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDENTITY_MATRIX); #define GKO_DECLARE_IDENTITY_FACTORY(_type) class IdentityFactory<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDENTITY_FACTORY); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDENTITY_FACTORY); } // namespace matrix diff --git a/core/matrix/permutation.cpp b/core/matrix/permutation.cpp index 0fe7ba2b2ce..b6b9ff2d7e4 100644 
--- a/core/matrix/permutation.cpp +++ b/core/matrix/permutation.cpp @@ -267,8 +267,11 @@ void dispatch_dense(const LinOp* op, Functor fn) { using matrix::Dense; using std::complex; - run, std::complex>(op, - fn); + run, +#endif + double, float, std::complex, std::complex>(op, fn); } diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp index fecc60a0ca9..56fcbf93d88 100644 --- a/core/matrix/row_gatherer.cpp +++ b/core/matrix/row_gatherer.cpp @@ -4,6 +4,7 @@ #include "ginkgo/core/matrix/row_gatherer.hpp" +#include #include #include "core/base/dispatch_helper.hpp" @@ -64,7 +65,11 @@ RowGatherer::create_const( template void RowGatherer::apply_impl(const LinOp* in, LinOp* out) const { - run, std::complex>( + run, +#endif + float, double, std::complex, std::complex>( in, [&](auto gather) { gather->row_gather(&row_idxs_, out); }); } @@ -72,7 +77,11 @@ template void RowGatherer::apply_impl(const LinOp* alpha, const LinOp* in, const LinOp* beta, LinOp* out) const { - run, std::complex>( + run, +#endif + float, double, std::complex, std::complex>( in, [&](auto gather) { gather->row_gather(alpha, &row_idxs_, beta, out); }); } diff --git a/core/matrix/scaled_permutation.cpp b/core/matrix/scaled_permutation.cpp index 0f295d6b5be..bbe353e543e 100644 --- a/core/matrix/scaled_permutation.cpp +++ b/core/matrix/scaled_permutation.cpp @@ -174,7 +174,7 @@ void ScaledPermutation::write( #define GKO_DECLARE_SCALED_PERMUTATION_MATRIX(ValueType, IndexType) \ class ScaledPermutation -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SCALED_PERMUTATION_MATRIX); diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index a4787e758bf..bd81b08bada 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -176,7 +176,7 @@ void Sellp::apply_impl(const LinOp* alpha, const LinOp* b, template void Sellp::convert_to( - Sellp, IndexType>* result) const + Sellp, IndexType>* result) const { 
result->values_ = this->values_; result->col_idxs_ = this->col_idxs_; @@ -190,12 +190,38 @@ void Sellp::convert_to( template void Sellp::move_to( - Sellp, IndexType>* result) + Sellp, IndexType>* result) { this->convert_to(result); } +#if GINKGO_ENABLE_HALF +template +void Sellp::convert_to( + Sellp>, + IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->slice_lengths_ = this->slice_lengths_; + result->slice_sets_ = this->slice_sets_; + result->slice_size_ = this->slice_size_; + result->stride_factor_ = this->stride_factor_; + result->set_size(this->get_size()); +} + + +template +void Sellp::move_to( + Sellp>, + IndexType>* result) +{ + this->convert_to(result); +} +#endif + + template void Sellp::convert_to(Dense* result) const { @@ -363,7 +389,8 @@ Sellp::compute_absolute() const #define GKO_DECLARE_SELLP_MATRIX(ValueType, IndexType) \ class Sellp -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_MATRIX); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SELLP_MATRIX); } // namespace matrix diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp index 9b8ea04da52..a4d8b2fa281 100644 --- a/core/matrix/sparsity_csr.cpp +++ b/core/matrix/sparsity_csr.cpp @@ -346,7 +346,8 @@ bool SparsityCsr::is_sorted_by_column_index() const #define GKO_DECLARE_SPARSITY_MATRIX(ValueType, IndexType) \ class SparsityCsr -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_MATRIX); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SPARSITY_MATRIX); } // namespace matrix diff --git a/dpcpp/matrix/coo_kernels.dp.cpp b/dpcpp/matrix/coo_kernels.dp.cpp index 595af92b33b..7e8a9acfac3 100644 --- a/dpcpp/matrix/coo_kernels.dp.cpp +++ b/dpcpp/matrix/coo_kernels.dp.cpp @@ -259,7 +259,8 @@ void spmv(std::shared_ptr exec, spmv2(exec, a, b, c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL); 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_COO_SPMV_KERNEL); template @@ -274,7 +275,7 @@ void advanced_spmv(std::shared_ptr exec, advanced_spmv2(exec, alpha, a, b, c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL); @@ -311,7 +312,8 @@ void spmv2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_COO_SPMV2_KERNEL); template @@ -350,7 +352,7 @@ void advanced_spmv2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 4dce0aa6ac2..efcb9b7f470 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -31,6 +31,7 @@ #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" +#include "dpcpp/base/onemkl_bindings.hpp" #include "dpcpp/components/atomic.dp.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" @@ -266,7 +267,7 @@ void abstract_spmv( { using arithmetic_type = typename output_accessor::arithmetic_type; using output_type = typename output_accessor::storage_type; - const arithmetic_type scale_factor = alpha[0]; + const auto scale_factor = static_cast(alpha[0]); spmv_kernel( nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, c, [&scale_factor](const arithmetic_type& x) { @@ -479,8 +480,8 @@ void abstract_merge_path_spmv( sycl::nd_item<3> item_ct1, IndexType* shared_row_ptrs) { using type = typename output_accessor::arithmetic_type; - const type alpha_val = alpha[0]; - const type beta_val = beta[0]; + const type alpha_val = static_cast(alpha[0]); + const type beta_val = static_cast(beta[0]); 
merge_path_spmv( num_rows, val, col_idxs, row_ptrs, srow, b, c, row_out, val_out, [&alpha_val](const type& x) { return alpha_val * x; }, @@ -566,7 +567,7 @@ void abstract_reduce( uninitialized_array& tmp_ind, uninitialized_array& tmp_val) { - const arithmetic_type alpha_val = alpha[0]; + const auto alpha_val = static_cast(alpha[0]); merge_path_reduce( nwarps, last_val, last_row, c, [&alpha_val](const arithmetic_type& x) { return alpha_val * x; }, @@ -694,8 +695,8 @@ void abstract_classical_spmv( acc::range c, sycl::nd_item<3> item_ct1) { using type = typename output_accessor::arithmetic_type; - const type alpha_val = alpha[0]; - const type beta_val = beta[0]; + const type alpha_val = static_cast(alpha[0]); + const type beta_val = static_cast(beta[0]); device_classical_spmv( num_rows, val, col_idxs, row_ptrs, b, c, [&alpha_val, &beta_val](const type& x, const type& y) { @@ -1393,8 +1394,9 @@ bool try_general_sparselib_spmv(std::shared_ptr exec, const ValueType host_beta, matrix::Dense* c) { - bool try_sparselib = !is_complex(); - if (try_sparselib) { + constexpr bool try_sparselib = + !is_complex() && !std::is_same::value; + if constexpr (try_sparselib) { oneapi::mkl::sparse::matrix_handle_t mat_handle; oneapi::mkl::sparse::init_matrix_handle(&mat_handle); oneapi::mkl::sparse::set_csr_data( @@ -1532,7 +1534,7 @@ void spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_SPMV_KERNEL); @@ -1604,7 +1606,7 @@ void advanced_spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); @@ -1684,7 +1686,7 @@ void calculate_nonzeros_per_row_in_span( row_nnz->get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL); @@ -1696,7 
+1698,7 @@ void calculate_nonzeros_per_row_in_index_set( const gko::index_set& col_index_set, IndexType* row_nnz) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL); @@ -1723,7 +1725,7 @@ void compute_submatrix(std::shared_ptr exec, result->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL); @@ -1735,7 +1737,7 @@ void compute_submatrix_from_index_set( const gko::index_set& col_index_set, matrix::Csr* result) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL); @@ -1997,7 +1999,8 @@ void spgemm(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_SPGEMM_KERNEL); template @@ -2130,7 +2133,7 @@ void advanced_spgemm(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); @@ -2216,7 +2219,8 @@ void spgeam(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_SPGEAM_KERNEL); template @@ -2237,7 +2241,7 @@ void fill_in_dense(std::shared_ptr exec, result->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); @@ -2247,7 +2251,7 @@ void convert_to_fbcsr(std::shared_ptr exec, array& row_ptrs, array& col_idxs, array& values) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); @@ -2310,7 +2314,8 @@ void transpose(std::shared_ptr exec, generic_transpose(exec, orig, trans); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_TRANSPOSE_KERNEL); template @@ -2321,7 +2326,7 @@ void conj_transpose(std::shared_ptr exec, generic_transpose(exec, orig, trans); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); @@ -2347,7 +2352,7 @@ void inv_symm_permute(std::shared_ptr exec, permuted->get_col_idxs(), permuted->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); @@ -2374,7 +2379,7 @@ void inv_nonsymm_permute(std::shared_ptr exec, permuted->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL); @@ -2400,7 +2405,7 @@ void row_permute(std::shared_ptr exec, row_permuted->get_col_idxs(), row_permuted->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); @@ -2426,7 +2431,7 @@ void inv_row_permute(std::shared_ptr exec, row_permuted->get_col_idxs(), row_permuted->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL); @@ -2452,7 +2457,7 @@ void inv_symm_scale_permute(std::shared_ptr exec, permuted->get_col_idxs(), permuted->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL); @@ -2482,7 +2487,7 @@ void 
inv_nonsymm_scale_permute(std::shared_ptr exec, permuted->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL); @@ -2508,7 +2513,7 @@ void row_scale_permute(std::shared_ptr exec, row_permuted->get_col_idxs(), row_permuted->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL); @@ -2534,7 +2539,7 @@ void inv_row_scale_permute(std::shared_ptr exec, row_permuted->get_col_idxs(), row_permuted->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL); @@ -2592,7 +2597,7 @@ void sort_by_column_index(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); @@ -2624,7 +2629,7 @@ void is_sorted_by_column_index( *is_sorted = get_element(is_sorted_device_array, 0); }; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); @@ -2648,7 +2653,8 @@ void extract_diagonal(std::shared_ptr exec, orig_row_ptrs, orig_col_idxs, diag_values); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_EXTRACT_DIAGONAL); template @@ -2672,7 +2678,7 @@ void check_diagonal_entries_exist(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST); @@ -2695,7 +2701,7 @@ void add_scaled_identity(std::shared_ptr exec, mtx->get_const_col_idxs(), mtx->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 04f3229eaed..c6eb163bc7d 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -177,7 +177,7 @@ void compute_dot_dispatch(std::shared_ptr exec, compute_dot(exec, x, y, result, tmp); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); @@ -192,7 +192,7 @@ void compute_conj_dot_dispatch(std::shared_ptr exec, compute_conj_dot(exec, x, y, result, tmp); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); @@ -206,7 +206,7 @@ void compute_norm2_dispatch(std::shared_ptr exec, compute_norm2(exec, x, result, tmp); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); @@ -217,21 +217,26 @@ void simple_apply(std::shared_ptr exec, matrix::Dense* c) { using namespace oneapi::mkl; - if (b->get_stride() != 0 && c->get_stride() != 0) { - if (a->get_size()[1] > 0) { - oneapi::mkl::blas::row_major::gemm( - *exec->get_queue(), transpose::nontrans, transpose::nontrans, - c->get_size()[0], c->get_size()[1], a->get_size()[1], - one(), a->get_const_values(), a->get_stride(), - b->get_const_values(), b->get_stride(), zero(), - c->get_values(), c->get_stride()); - } else { - dense::fill(exec, c, zero()); + if constexpr (onemkl::is_supported::value) { + if (b->get_stride() != 0 && c->get_stride() != 0) { + if (a->get_size()[1] > 0) { + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, + transpose::nontrans, c->get_size()[0], c->get_size()[1], + a->get_size()[1], one(), a->get_const_values(), + a->get_stride(), b->get_const_values(), b->get_stride(), + zero(), c->get_values(), 
c->get_stride()); + } else { + dense::fill(exec, c, zero()); + } } + } else { + GKO_NOT_IMPLEMENTED; } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); template @@ -241,23 +246,28 @@ void apply(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { using namespace oneapi::mkl; - if (b->get_stride() != 0 && c->get_stride() != 0) { - if (a->get_size()[1] > 0) { - oneapi::mkl::blas::row_major::gemm( - *exec->get_queue(), transpose::nontrans, transpose::nontrans, - c->get_size()[0], c->get_size()[1], a->get_size()[1], - exec->copy_val_to_host(alpha->get_const_values()), - a->get_const_values(), a->get_stride(), b->get_const_values(), - b->get_stride(), - exec->copy_val_to_host(beta->get_const_values()), - c->get_values(), c->get_stride()); - } else { - dense::scale(exec, beta, c); + if constexpr (onemkl::is_supported::value) { + if (b->get_stride() != 0 && c->get_stride() != 0) { + if (a->get_size()[1] > 0) { + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, + transpose::nontrans, c->get_size()[0], c->get_size()[1], + a->get_size()[1], + exec->copy_val_to_host(alpha->get_const_values()), + a->get_const_values(), a->get_stride(), + b->get_const_values(), b->get_stride(), + exec->copy_val_to_host(beta->get_const_values()), + c->get_values(), c->get_stride()); + } else { + dense::scale(exec, beta, c); + } } + } else { + GKO_NOT_IMPLEMENTED; } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL); template @@ -292,7 +302,7 @@ void convert_to_coo(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); @@ -326,7 +336,7 @@ void convert_to_csr(std::shared_ptr exec, }); } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); @@ -365,7 +375,7 @@ void convert_to_ell(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); @@ -375,7 +385,7 @@ void convert_to_fbcsr(std::shared_ptr exec, matrix::Fbcsr* result) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); @@ -385,7 +395,7 @@ void count_nonzero_blocks_per_row(std::shared_ptr exec, int bs, IndexType* result) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); @@ -441,7 +451,7 @@ void convert_to_hybrid(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL); @@ -484,7 +494,7 @@ void convert_to_sellp(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL); @@ -516,7 +526,7 @@ void convert_to_sparsity_csr(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); @@ -538,7 +548,8 @@ void transpose(std::shared_ptr exec, queue, orig, trans); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); template @@ -565,7 +576,8 @@ void conj_transpose(std::shared_ptr exec, trans->get_values(), trans->get_stride()); } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); } // namespace dense diff --git a/dpcpp/matrix/diagonal_kernels.dp.cpp b/dpcpp/matrix/diagonal_kernels.dp.cpp index 2b63138abbe..272a6dbd581 100644 --- a/dpcpp/matrix/diagonal_kernels.dp.cpp +++ b/dpcpp/matrix/diagonal_kernels.dp.cpp @@ -82,7 +82,7 @@ void apply_to_csr(std::shared_ptr exec, inverse); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL); diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index a97cb602d52..b33ed28b12d 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -415,7 +415,7 @@ void spmv(std::shared_ptr exec, exec, num_worker_per_row, a, b, c); } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_SPMV_KERNEL); @@ -451,7 +451,7 @@ void advanced_spmv(std::shared_ptr exec, exec, num_worker_per_row, a, b, c, alpha, beta); } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); diff --git a/dpcpp/matrix/fbcsr_kernels.dp.cpp b/dpcpp/matrix/fbcsr_kernels.dp.cpp index e9eb02f5fb2..7d53b862d67 100644 --- a/dpcpp/matrix/fbcsr_kernels.dp.cpp +++ b/dpcpp/matrix/fbcsr_kernels.dp.cpp @@ -32,7 +32,8 @@ void spmv(std::shared_ptr exec, const matrix::Dense* b, matrix::Dense* c) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_FBCSR_SPMV_KERNEL); template @@ -43,7 +44,7 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); @@ -54,7 +55,7 @@ void fill_in_matrix_data(std::shared_ptr exec, array& col_idxs, array& values) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL); @@ -63,7 +64,7 @@ void fill_in_dense(std::shared_ptr exec, const matrix::Fbcsr* source, matrix::Dense* result) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); @@ -73,7 +74,7 @@ void convert_to_csr(const std::shared_ptr exec, matrix::Csr* result) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); @@ -82,7 +83,7 @@ void transpose(std::shared_ptr exec, const matrix::Fbcsr* orig, matrix::Fbcsr* trans) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); @@ -92,7 +93,7 @@ void conj_transpose(std::shared_ptr exec, matrix::Fbcsr* trans) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); @@ -102,7 +103,7 @@ void is_sorted_by_column_index( const matrix::Fbcsr* to_check, bool* is_sorted) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); @@ -111,7 +112,7 @@ void sort_by_column_index(const std::shared_ptr exec, matrix::Fbcsr* to_sort) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX); @@ -120,7 +121,7 @@ void 
extract_diagonal(std::shared_ptr exec, const matrix::Fbcsr* orig, matrix::Diagonal* diag) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); diff --git a/dpcpp/matrix/sellp_kernels.dp.cpp b/dpcpp/matrix/sellp_kernels.dp.cpp index 9c0fe717e8a..e83e8f2ce1a 100644 --- a/dpcpp/matrix/sellp_kernels.dp.cpp +++ b/dpcpp/matrix/sellp_kernels.dp.cpp @@ -119,7 +119,8 @@ void spmv(std::shared_ptr exec, b->get_const_values(), c->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SELLP_SPMV_KERNEL); template @@ -142,7 +143,7 @@ void advanced_spmv(std::shared_ptr exec, beta->get_const_values(), c->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp index 66c57ac5b35..0e076794ac8 100644 --- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp +++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp @@ -57,11 +57,11 @@ void device_classical_spmv(const size_type num_rows, const auto subrow = thread::get_subwarp_num_flat(item_ct1); const auto subid = subgroup_tile.thread_rank(); const IndexType column_id = item_ct1.get_group(1); - const arithmetic_type value = static_cast(val[0]); + const auto value = static_cast(val[0]); auto row = thread::get_subwarp_id_flat(item_ct1); for (; row < num_rows; row += subrow) { const auto ind_end = row_ptrs[row + 1]; - arithmetic_type temp_val = zero(); + auto temp_val = zero(); for (auto ind = row_ptrs[row] + subid; ind < ind_end; ind += subgroup_size) { temp_val += value * b(col_idxs[ind], column_id); @@ -237,7 +237,7 @@ void spmv(std::shared_ptr exec, syn::value_list(), syn::type_list<>(), exec, a, b, c); } 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); @@ -255,7 +255,7 @@ void advanced_spmv(std::shared_ptr exec, syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, beta); } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); @@ -265,7 +265,7 @@ void transpose(std::shared_ptr exec, matrix::SparsityCsr* trans) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL); @@ -290,7 +290,7 @@ void sort_by_column_index(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX); @@ -324,7 +324,7 @@ void is_sorted_by_column_index( cpu_array = gpu_array; }; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX); diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index 8875b7d46f3..ad31a6b19e8 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -48,13 +48,15 @@ make_temporary_conversion(Ptr&& matrix) { using Pointee = detail::pointee; using Dense = matrix::Dense; - using NextDense = matrix::Dense>; + using NextDense = matrix::Dense>; + using NextNextDense = matrix::Dense< + next_precision_with_half>>; using MaybeConstDense = std::conditional_t::value, const Dense, Dense>; auto result = detail::temporary_conversion< - MaybeConstDense>::template create(matrix); + MaybeConstDense>::template create(matrix); if (!result) { - GKO_NOT_SUPPORTED(*matrix); + GKO_NOT_SUPPORTED(matrix); } return result; } @@ 
-226,23 +228,26 @@ void mixed_precision_dispatch(Function fn, const LinOp* in, LinOp* out) { #ifdef GINKGO_MIXED_PRECISION using fst_type = matrix::Dense; - using snd_type = matrix::Dense>; - if (auto dense_in = dynamic_cast(in)) { + using snd_type = matrix::Dense>; + using trd_type = matrix::Dense< + next_precision_with_half>>; + auto dispatch_out_vector = [&](auto dense_in) { if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); } else if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); - } else { - GKO_NOT_SUPPORTED(out); - } - } else if (auto dense_in = dynamic_cast(in)) { - if (auto dense_out = dynamic_cast(out)) { - fn(dense_in, dense_out); - } else if (auto dense_out = dynamic_cast(out)) { + } else if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); } else { GKO_NOT_SUPPORTED(out); } + }; + if (auto dense_in = dynamic_cast(in)) { + dispatch_out_vector(dense_in); + } else if (auto dense_in = dynamic_cast(in)) { + dispatch_out_vector(dense_in); + } else if (auto dense_in = dynamic_cast(in)) { + dispatch_out_vector(dense_in); } else { GKO_NOT_SUPPORTED(in); } diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index 9373107df69..a0edf5aa862 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -47,15 +47,21 @@ class Hybrid; * @ingroup LinOp */ template -class Coo : public EnableLinOp>, - public ConvertibleTo, IndexType>>, - public ConvertibleTo>, - public ConvertibleTo>, - public DiagonalExtractable, - public ReadableFromMatrixData, - public WritableToMatrixData, - public EnableAbsoluteComputation< - remove_complex>> { +class Coo + : public EnableLinOp>, + public ConvertibleTo, IndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Coo>, + IndexType>>, +#endif + public ConvertibleTo>, + public ConvertibleTo>, + public DiagonalExtractable, + public ReadableFromMatrixData, + public WritableToMatrixData, + public EnableAbsoluteComputation< + 
remove_complex>> { friend class EnablePolymorphicObject; friend class Csr; friend class Dense; @@ -66,8 +72,10 @@ class Coo : public EnableLinOp>, public: using EnableLinOp::convert_to; using EnableLinOp::move_to; - using ConvertibleTo, IndexType>>::convert_to; - using ConvertibleTo, IndexType>>::move_to; + using ConvertibleTo< + Coo, IndexType>>::convert_to; + using ConvertibleTo< + Coo, IndexType>>::move_to; using ConvertibleTo>::convert_to; using ConvertibleTo>::move_to; using ConvertibleTo>::convert_to; @@ -80,12 +88,33 @@ class Coo : public EnableLinOp>, using device_mat_data = device_matrix_data; using absolute_type = remove_complex; - friend class Coo, IndexType>; + friend class Coo, IndexType>; + + void convert_to(Coo, IndexType>* result) + const override; + + void move_to( + Coo, IndexType>* result) override; + +#if GINKGO_ENABLE_HALF + friend class Coo< + previous_precision_with_half>, + IndexType>; + using ConvertibleTo< + Coo>, + IndexType>>::convert_to; + using ConvertibleTo< + Coo>, + IndexType>>::move_to; void convert_to( - Coo, IndexType>* result) const override; + Coo>, + IndexType>* result) const override; - void move_to(Coo, IndexType>* result) override; + void move_to( + Coo>, + IndexType>* result) override; +#endif void convert_to(Csr* other) const override; diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index f27fe12a934..2f66683085f 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -98,23 +98,29 @@ void strategy_rebuild_helper(Csr* result); * @ingroup LinOp */ template -class Csr : public EnableLinOp>, - public ConvertibleTo, IndexType>>, - public ConvertibleTo>, - public ConvertibleTo>, - public ConvertibleTo>, - public ConvertibleTo>, - public ConvertibleTo>, - public ConvertibleTo>, - public ConvertibleTo>, - public DiagonalExtractable, - public ReadableFromMatrixData, - public WritableToMatrixData, - public Transposable, - public Permutable, - public 
EnableAbsoluteComputation< - remove_complex>>, - public ScaledIdentityAddable { +class Csr + : public EnableLinOp>, + public ConvertibleTo, IndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Csr>, + IndexType>>, +#endif + public ConvertibleTo>, + public ConvertibleTo>, + public ConvertibleTo>, + public ConvertibleTo>, + public ConvertibleTo>, + public ConvertibleTo>, + public ConvertibleTo>, + public DiagonalExtractable, + public ReadableFromMatrixData, + public WritableToMatrixData, + public Transposable, + public Permutable, + public EnableAbsoluteComputation< + remove_complex>>, + public ScaledIdentityAddable { friend class EnablePolymorphicObject; friend class Coo; friend class Dense; @@ -130,8 +136,10 @@ class Csr : public EnableLinOp>, public: using EnableLinOp::convert_to; using EnableLinOp::move_to; - using ConvertibleTo, IndexType>>::convert_to; - using ConvertibleTo, IndexType>>::move_to; + using ConvertibleTo< + Csr, IndexType>>::convert_to; + using ConvertibleTo< + Csr, IndexType>>::move_to; using ConvertibleTo>::convert_to; using ConvertibleTo>::move_to; using ConvertibleTo>::convert_to; @@ -688,12 +696,33 @@ class Csr : public EnableLinOp>, index_type max_length_per_row_; }; - friend class Csr, IndexType>; + friend class Csr, IndexType>; + + void convert_to(Csr, IndexType>* result) + const override; + + void move_to( + Csr, IndexType>* result) override; + +#if GINKGO_ENABLE_HALF + friend class Csr< + previous_precision_with_half>, + IndexType>; + using ConvertibleTo< + Csr>, + IndexType>>::convert_to; + using ConvertibleTo< + Csr>, + IndexType>>::move_to; void convert_to( - Csr, IndexType>* result) const override; + Csr>, + IndexType>* result) const override; - void move_to(Csr, IndexType>* result) override; + void move_to( + Csr>, + IndexType>* result) override; +#endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index bccd3adcd54..9ae96ca46d6 
100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -87,7 +87,11 @@ class SparsityCsr; template class Dense : public EnableLinOp>, - public ConvertibleTo>>, + public ConvertibleTo>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Dense>>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -135,8 +139,8 @@ class Dense public: using EnableLinOp::convert_to; using EnableLinOp::move_to; - using ConvertibleTo>>::convert_to; - using ConvertibleTo>>::move_to; + using ConvertibleTo>>::convert_to; + using ConvertibleTo>>::move_to; using ConvertibleTo>::convert_to; using ConvertibleTo>::move_to; using ConvertibleTo>::convert_to; @@ -276,11 +280,29 @@ class Dense return other->create_const_view_of_impl(); } - friend class Dense>; + friend class Dense>; - void convert_to(Dense>* result) const override; + void convert_to( + Dense>* result) const override; - void move_to(Dense>* result) override; + void move_to(Dense>* result) override; + +#if GINKGO_ENABLE_HALF + friend class Dense< + previous_precision_with_half>>; + using ConvertibleTo>>>::convert_to; + using ConvertibleTo>>>::move_to; + + void convert_to( + Dense>>* + result) const override; + + void move_to( + Dense>>* + result) override; +#endif void convert_to(Coo* result) const override; diff --git a/include/ginkgo/core/matrix/diagonal.hpp b/include/ginkgo/core/matrix/diagonal.hpp index 56906a4d96f..3b11399138b 100644 --- a/include/ginkgo/core/matrix/diagonal.hpp +++ b/include/ginkgo/core/matrix/diagonal.hpp @@ -41,7 +41,11 @@ class Diagonal : public EnableLinOp>, public ConvertibleTo>, public ConvertibleTo>, - public ConvertibleTo>>, + public ConvertibleTo>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo>>>, +#endif public Transposable, public WritableToMatrixData, public WritableToMatrixData, @@ -60,8 +64,9 @@ class Diagonal using ConvertibleTo>::move_to; using ConvertibleTo>::convert_to; using ConvertibleTo>::move_to; - using 
ConvertibleTo>>::convert_to; - using ConvertibleTo>>::move_to; + using ConvertibleTo< + Diagonal>>::convert_to; + using ConvertibleTo>>::move_to; using value_type = ValueType; using index_type = int64; @@ -71,15 +76,34 @@ class Diagonal using device_mat_data32 = device_matrix_data; using absolute_type = remove_complex; - friend class Diagonal>; + friend class Diagonal>; std::unique_ptr transpose() const override; std::unique_ptr conj_transpose() const override; - void convert_to(Diagonal>* result) const override; + void convert_to( + Diagonal>* result) const override; - void move_to(Diagonal>* result) override; + void move_to( + Diagonal>* result) override; + +#if GINKGO_ENABLE_HALF + friend class Diagonal< + previous_precision_with_half>>; + using ConvertibleTo>>>::convert_to; + using ConvertibleTo>>>::move_to; + + void convert_to( + Diagonal>>* + result) const override; + + void move_to( + Diagonal>>* + result) override; +#endif void convert_to(Csr* result) const override; diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp index 37f4c0e7f55..adbd3505855 100644 --- a/include/ginkgo/core/matrix/ell.hpp +++ b/include/ginkgo/core/matrix/ell.hpp @@ -49,28 +49,36 @@ class Hybrid; * @ingroup LinOp */ template -class Ell : public EnableLinOp>, - public ConvertibleTo, IndexType>>, - public ConvertibleTo>, - public ConvertibleTo>, - public DiagonalExtractable, - public ReadableFromMatrixData, - public WritableToMatrixData, - public EnableAbsoluteComputation< - remove_complex>> { +class Ell + : public EnableLinOp>, + public ConvertibleTo, IndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Ell>, + IndexType>>, +#endif + public ConvertibleTo>, + public ConvertibleTo>, + public DiagonalExtractable, + public ReadableFromMatrixData, + public WritableToMatrixData, + public EnableAbsoluteComputation< + remove_complex>> { friend class EnablePolymorphicObject; friend class Dense; friend class Coo; friend class Csr; friend class Ell, 
IndexType>; - friend class Ell, IndexType>; + friend class Ell, IndexType>; friend class Hybrid; public: using EnableLinOp::convert_to; using EnableLinOp::move_to; - using ConvertibleTo, IndexType>>::convert_to; - using ConvertibleTo, IndexType>>::move_to; + using ConvertibleTo< + Ell, IndexType>>::convert_to; + using ConvertibleTo< + Ell, IndexType>>::move_to; using ConvertibleTo>::convert_to; using ConvertibleTo>::move_to; using ConvertibleTo>::convert_to; @@ -83,10 +91,31 @@ class Ell : public EnableLinOp>, using device_mat_data = device_matrix_data; using absolute_type = remove_complex; + void convert_to(Ell, IndexType>* result) + const override; + + void move_to( + Ell, IndexType>* result) override; + +#if GINKGO_ENABLE_HALF + friend class Ell< + previous_precision_with_half>, + IndexType>; + using ConvertibleTo< + Ell>, + IndexType>>::convert_to; + using ConvertibleTo< + Ell>, + IndexType>>::move_to; + void convert_to( - Ell, IndexType>* result) const override; + Ell>, + IndexType>* result) const override; - void move_to(Ell, IndexType>* result) override; + void move_to( + Ell>, + IndexType>* result) override; +#endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp index ce327e7e8a0..283807b242c 100644 --- a/include/ginkgo/core/matrix/fbcsr.hpp +++ b/include/ginkgo/core/matrix/fbcsr.hpp @@ -96,17 +96,24 @@ inline IndexType get_num_blocks(const int block_size, const IndexType size) * @ingroup LinOp */ template -class Fbcsr : public EnableLinOp>, - public ConvertibleTo, IndexType>>, - public ConvertibleTo>, - public ConvertibleTo>, - public ConvertibleTo>, - public DiagonalExtractable, - public ReadableFromMatrixData, - public WritableToMatrixData, - public Transposable, - public EnableAbsoluteComputation< - remove_complex>> { +class Fbcsr + : public EnableLinOp>, + public ConvertibleTo< + Fbcsr, IndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Fbcsr>, + 
IndexType>>, +#endif + public ConvertibleTo>, + public ConvertibleTo>, + public ConvertibleTo>, + public DiagonalExtractable, + public ReadableFromMatrixData, + public WritableToMatrixData, + public Transposable, + public EnableAbsoluteComputation< + remove_complex>> { friend class EnablePolymorphicObject; friend class Csr; friend class Dense; @@ -136,8 +143,9 @@ class Fbcsr : public EnableLinOp>, using EnableLinOp>::convert_to; using ConvertibleTo< - Fbcsr, IndexType>>::convert_to; - using ConvertibleTo, IndexType>>::move_to; + Fbcsr, IndexType>>::convert_to; + using ConvertibleTo< + Fbcsr, IndexType>>::move_to; using ConvertibleTo>::convert_to; using ConvertibleTo>::move_to; using ConvertibleTo>::convert_to; @@ -145,12 +153,33 @@ class Fbcsr : public EnableLinOp>, using ConvertibleTo>::convert_to; using ConvertibleTo>::move_to; - friend class Fbcsr, IndexType>; + friend class Fbcsr, IndexType>; + + void convert_to(Fbcsr, IndexType>* + result) const override; + + void move_to( + Fbcsr, IndexType>* result) override; + +#if GINKGO_ENABLE_HALF + friend class Fbcsr< + previous_precision_with_half>, + IndexType>; + using ConvertibleTo< + Fbcsr>, + IndexType>>::convert_to; + using ConvertibleTo< + Fbcsr>, + IndexType>>::move_to; void convert_to( - Fbcsr, IndexType>* result) const override; + Fbcsr>, + IndexType>* result) const override; - void move_to(Fbcsr, IndexType>* result) override; + void move_to( + Fbcsr>, + IndexType>* result) override; +#endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index 5e995cb0ba0..24cb3ed26c7 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -41,7 +41,13 @@ class Csr; template class Hybrid : public EnableLinOp>, - public ConvertibleTo, IndexType>>, + public ConvertibleTo< + Hybrid, IndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Hybrid>, + IndexType>>, +#endif public ConvertibleTo>, 
public ConvertibleTo>, public DiagonalExtractable, @@ -59,8 +65,9 @@ class Hybrid using EnableLinOp::convert_to; using EnableLinOp::move_to; using ConvertibleTo< - Hybrid, IndexType>>::convert_to; - using ConvertibleTo, IndexType>>::move_to; + Hybrid, IndexType>>::convert_to; + using ConvertibleTo< + Hybrid, IndexType>>::move_to; using ConvertibleTo>::convert_to; using ConvertibleTo>::move_to; using ConvertibleTo>::convert_to; @@ -355,12 +362,33 @@ class Hybrid imbalance_bounded_limit strategy_; }; - friend class Hybrid, IndexType>; + friend class Hybrid, IndexType>; + + void convert_to(Hybrid, IndexType>* + result) const override; + + void move_to(Hybrid, IndexType>* result) + override; + +#if GINKGO_ENABLE_HALF + friend class Hybrid< + previous_precision_with_half>, + IndexType>; + using ConvertibleTo< + Hybrid>, + IndexType>>::convert_to; + using ConvertibleTo< + Hybrid>, + IndexType>>::move_to; void convert_to( - Hybrid, IndexType>* result) const override; + Hybrid>, + IndexType>* result) const override; - void move_to(Hybrid, IndexType>* result) override; + void move_to( + Hybrid>, + IndexType>* result) override; +#endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp index e6520324030..6140a832c85 100644 --- a/include/ginkgo/core/matrix/sellp.hpp +++ b/include/ginkgo/core/matrix/sellp.hpp @@ -40,15 +40,22 @@ class Csr; * @ingroup LinOp */ template -class Sellp : public EnableLinOp>, - public ConvertibleTo, IndexType>>, - public ConvertibleTo>, - public ConvertibleTo>, - public DiagonalExtractable, - public ReadableFromMatrixData, - public WritableToMatrixData, - public EnableAbsoluteComputation< - remove_complex>> { +class Sellp + : public EnableLinOp>, + public ConvertibleTo< + Sellp, IndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Sellp>, + IndexType>>, +#endif + public ConvertibleTo>, + public ConvertibleTo>, + public DiagonalExtractable, + public 
ReadableFromMatrixData, + public WritableToMatrixData, + public EnableAbsoluteComputation< + remove_complex>> { friend class EnablePolymorphicObject; friend class Dense; friend class Csr; @@ -58,8 +65,9 @@ class Sellp : public EnableLinOp>, using EnableLinOp::convert_to; using EnableLinOp::move_to; using ConvertibleTo< - Sellp, IndexType>>::convert_to; - using ConvertibleTo, IndexType>>::move_to; + Sellp, IndexType>>::convert_to; + using ConvertibleTo< + Sellp, IndexType>>::move_to; using ConvertibleTo>::convert_to; using ConvertibleTo>::move_to; using ConvertibleTo>::convert_to; @@ -72,12 +80,33 @@ class Sellp : public EnableLinOp>, using device_mat_data = device_matrix_data; using absolute_type = remove_complex; - friend class Sellp, IndexType>; + friend class Sellp, IndexType>; + + void convert_to(Sellp, IndexType>* + result) const override; + + void move_to( + Sellp, IndexType>* result) override; + +#if GINKGO_ENABLE_HALF + friend class Sellp< + previous_precision_with_half>, + IndexType>; + using ConvertibleTo< + Sellp>, + IndexType>>::convert_to; + using ConvertibleTo< + Sellp>, + IndexType>>::move_to; void convert_to( - Sellp, IndexType>* result) const override; + Sellp>, + IndexType>* result) const override; - void move_to(Sellp, IndexType>* result) override; + void move_to( + Sellp>, + IndexType>* result) override; +#endif void convert_to(Dense* other) const override; diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp index 021795d8e9c..6d4a46b7ed3 100644 --- a/omp/matrix/coo_kernels.cpp +++ b/omp/matrix/coo_kernels.cpp @@ -42,7 +42,8 @@ void spmv(std::shared_ptr exec, spmv2(exec, a, b, c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_COO_SPMV_KERNEL); template @@ -57,7 +58,7 @@ void advanced_spmv(std::shared_ptr exec, advanced_spmv2(exec, alpha, a, b, c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL); @@ -306,7 +307,8 @@ void spmv2(std::shared_ptr exec, generic_spmv2(exec, a, b, c, one()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_COO_SPMV2_KERNEL); template @@ -319,7 +321,7 @@ void advanced_spmv2(std::shared_ptr exec, generic_spmv2(exec, a, b, c, alpha->at(0, 0)); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 87b328b1093..d9c7b9840c1 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -77,7 +77,7 @@ void spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_SPMV_KERNEL); @@ -95,8 +95,8 @@ void advanced_spmv(std::shared_ptr exec, auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - arithmetic_type valpha = alpha->at(0, 0); - arithmetic_type vbeta = beta->at(0, 0); + auto valpha = static_cast(alpha->at(0, 0)); + auto vbeta = static_cast(beta->at(0, 0)); const auto a_vals = acc::helper::build_const_rrm_accessor(a); @@ -118,7 +118,7 @@ void advanced_spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); @@ -374,7 +374,8 @@ void spgemm(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_SPGEMM_KERNEL); template @@ -490,7 +491,7 @@ void advanced_spgemm(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); @@ -540,7 +541,8 @@ void spgeam(std::shared_ptr exec, [](IndexType, IndexType) {}); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_SPGEAM_KERNEL); template @@ -563,7 +565,7 @@ void fill_in_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); @@ -633,7 +635,7 @@ void convert_to_fbcsr(std::shared_ptr exec, std::copy(col_idx_vec.begin(), col_idx_vec.end(), col_idxs.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); @@ -692,7 +694,8 @@ void transpose(std::shared_ptr exec, [](const ValueType x) { return x; }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_TRANSPOSE_KERNEL); template @@ -704,7 +707,7 @@ void conj_transpose(std::shared_ptr exec, [](const ValueType x) { return conj(x); }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); @@ -728,7 +731,7 @@ void calculate_nonzeros_per_row_in_span( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL); @@ -775,7 +778,7 @@ void calculate_nonzeros_per_row_in_index_set( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL); @@ -808,7 +811,7 @@ void compute_submatrix(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL); @@ -868,7 +871,7 @@ void compute_submatrix_from_index_set( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL); @@ -881,7 +884,7 @@ void inv_symm_permute(std::shared_ptr exec, inv_nonsymm_permute(exec, perm, perm, orig, permuted); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); @@ -921,7 +924,7 @@ void inv_nonsymm_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL); @@ -959,7 +962,7 @@ void row_permute(std::shared_ptr exec, const IndexType* perm, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); @@ -998,7 +1001,7 @@ void inv_row_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL); @@ -1011,7 +1014,7 @@ void inv_symm_scale_permute(std::shared_ptr exec, inv_nonsymm_scale_permute(exec, scale, perm, scale, perm, orig, permuted); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL); @@ -1055,7 +1058,7 @@ void inv_nonsymm_scale_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL); @@ -1096,7 +1099,7 @@ void row_scale_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( 
GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL); @@ -1137,7 +1140,7 @@ void inv_row_scale_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL); @@ -1160,7 +1163,7 @@ void sort_by_column_index(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); @@ -1188,7 +1191,7 @@ void is_sorted_by_column_index( *is_sorted = local_is_sorted; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); @@ -1214,7 +1217,8 @@ void extract_diagonal(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_EXTRACT_DIAGONAL); template @@ -1241,7 +1245,7 @@ void check_diagonal_entries_exist(std::shared_ptr exec, has_all_diags = l_has_all_diags; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST); @@ -1270,7 +1274,7 @@ void add_scaled_identity(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index d1c0f2f8949..4ca5aa0c075 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -46,7 +46,7 @@ void compute_dot_dispatch(std::shared_ptr exec, compute_dot(exec, x, y, result, tmp); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); @@ -60,7 +60,7 @@ void compute_conj_dot_dispatch(std::shared_ptr exec, compute_conj_dot(exec, x, y, 
result, tmp); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); @@ -73,7 +73,7 @@ void compute_norm2_dispatch(std::shared_ptr exec, compute_norm2(exec, x, result, tmp); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); @@ -100,7 +100,8 @@ void simple_apply(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); template @@ -136,7 +137,7 @@ void apply(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL); template @@ -168,7 +169,7 @@ void convert_to_coo(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); @@ -199,7 +200,7 @@ void convert_to_csr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); @@ -232,7 +233,7 @@ void convert_to_ell(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); @@ -280,7 +281,7 @@ void convert_to_fbcsr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); @@ -326,7 +327,7 @@ void convert_to_hybrid(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL); @@ -368,7 +369,7 @@ void convert_to_sellp(std::shared_ptr exec, } 
} -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL); @@ -398,7 +399,7 @@ void convert_to_sparsity_csr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); @@ -415,7 +416,8 @@ void transpose(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); template @@ -431,7 +433,8 @@ void conj_transpose(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); template @@ -461,7 +464,7 @@ void count_nonzero_blocks_per_row(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); diff --git a/omp/matrix/diagonal_kernels.cpp b/omp/matrix/diagonal_kernels.cpp index 71363c7bc6e..c16e740dc45 100644 --- a/omp/matrix/diagonal_kernels.cpp +++ b/omp/matrix/diagonal_kernels.cpp @@ -43,7 +43,7 @@ void apply_to_csr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL); diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp index c35a3654b86..dc200ae0f93 100644 --- a/omp/matrix/ell_kernels.cpp +++ b/omp/matrix/ell_kernels.cpp @@ -185,7 +185,7 @@ void spmv(std::shared_ptr exec, spmv_blocked<4>(exec, a, b, c, out); } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_SPMV_KERNEL); @@ -228,7 +228,7 @@ void advanced_spmv(std::shared_ptr exec, 
spmv_blocked<4>(exec, a, b, c, out); } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp index d17d47a7467..14dcb1db77a 100644 --- a/omp/matrix/fbcsr_kernels.cpp +++ b/omp/matrix/fbcsr_kernels.cpp @@ -74,7 +74,8 @@ void spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_FBCSR_SPMV_KERNEL); template @@ -118,7 +119,7 @@ void advanced_spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); @@ -176,7 +177,7 @@ void fill_in_matrix_data(std::shared_ptr exec, std::copy(col_idx_vec.begin(), col_idx_vec.end(), col_idxs.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL); @@ -209,7 +210,7 @@ void fill_in_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); @@ -255,7 +256,7 @@ void convert_to_csr(const std::shared_ptr exec, row_ptrs[result->get_size()[0]] = source->get_num_stored_elements(); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); @@ -330,7 +331,7 @@ void transpose(std::shared_ptr exec, [](const ValueType x) { return x; }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); @@ -343,7 +344,7 @@ void conj_transpose(std::shared_ptr exec, [](const ValueType x) { return conj(x); }); } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); @@ -371,7 +372,7 @@ void is_sorted_by_column_index( *is_sorted = local_is_sorted; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); @@ -426,7 +427,7 @@ void sort_by_column_index(const std::shared_ptr exec, syn::value_list(), syn::type_list<>(), to_sort); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX); @@ -463,7 +464,7 @@ void extract_diagonal(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); diff --git a/omp/matrix/sellp_kernels.cpp b/omp/matrix/sellp_kernels.cpp index 7f8b16264ce..6306093b36d 100644 --- a/omp/matrix/sellp_kernels.cpp +++ b/omp/matrix/sellp_kernels.cpp @@ -155,7 +155,8 @@ void spmv(std::shared_ptr exec, spmv_blocked<4>(exec, a, b, c, out); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SELLP_SPMV_KERNEL); template @@ -194,7 +195,7 @@ void advanced_spmv(std::shared_ptr exec, spmv_blocked<4>(exec, a, b, c, out); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); diff --git a/omp/matrix/sparsity_csr_kernels.cpp b/omp/matrix/sparsity_csr_kernels.cpp index 35bb42c70a6..560ee6d4890 100644 --- a/omp/matrix/sparsity_csr_kernels.cpp +++ b/omp/matrix/sparsity_csr_kernels.cpp @@ -58,7 +58,7 @@ void spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); @@ -95,7 
+95,7 @@ void advanced_spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); @@ -149,7 +149,7 @@ void transpose(std::shared_ptr exec, transpose_and_transform(exec, trans, orig); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL); @@ -168,7 +168,7 @@ void sort_by_column_index(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX); @@ -197,7 +197,7 @@ void is_sorted_by_column_index( *is_sorted = local_is_sorted; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX); diff --git a/reference/matrix/coo_kernels.cpp b/reference/matrix/coo_kernels.cpp index f9bf9f5f33d..ebb8c1dfce6 100644 --- a/reference/matrix/coo_kernels.cpp +++ b/reference/matrix/coo_kernels.cpp @@ -38,7 +38,8 @@ void spmv(std::shared_ptr exec, spmv2(exec, a, b, c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_COO_SPMV_KERNEL); template @@ -53,7 +54,7 @@ void advanced_spmv(std::shared_ptr exec, advanced_spmv2(exec, alpha, a, b, c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL); @@ -73,7 +74,8 @@ void spmv2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_COO_SPMV2_KERNEL); template @@ -96,7 +98,7 @@ void advanced_spmv2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); @@ -113,7 +115,7 @@ void fill_in_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL); @@ -136,7 +138,7 @@ void extract_diagonal(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL); diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index a0607110b79..679844084d2 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -76,7 +76,7 @@ void spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_SPMV_KERNEL); @@ -94,8 +94,8 @@ void advanced_spmv(std::shared_ptr exec, auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - arithmetic_type valpha = alpha->at(0, 0); - arithmetic_type vbeta = beta->at(0, 0); + auto valpha = static_cast(alpha->at(0, 0)); + auto vbeta = static_cast(beta->at(0, 0)); const auto a_vals = acc::helper::build_const_rrm_accessor(a); @@ -116,7 +116,7 @@ void advanced_spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); @@ -240,7 +240,8 @@ void spgemm(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_SPGEMM_KERNEL); template @@ -295,7 +296,7 @@ void advanced_spgemm(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); @@ -345,7 +346,8 @@ 
void spgeam(std::shared_ptr exec, [](IndexType, IndexType) {}); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_SPGEAM_KERNEL); template @@ -367,7 +369,7 @@ void fill_in_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); @@ -414,7 +416,7 @@ void convert_to_sellp(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); @@ -445,7 +447,7 @@ void convert_to_ell(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); @@ -515,7 +517,7 @@ void convert_to_fbcsr(std::shared_ptr exec, std::copy(col_idx_vec.begin(), col_idx_vec.end(), col_idxs.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); @@ -574,7 +576,8 @@ void transpose(std::shared_ptr exec, [](const ValueType x) { return x; }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_TRANSPOSE_KERNEL); template @@ -586,7 +589,7 @@ void conj_transpose(std::shared_ptr exec, [](const ValueType x) { return conj(x); }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); @@ -610,7 +613,7 @@ void calculate_nonzeros_per_row_in_span( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL); @@ -657,7 +660,7 @@ void calculate_nonzeros_per_row_in_index_set( } } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL); @@ -691,7 +694,7 @@ void compute_submatrix(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL); @@ -749,7 +752,7 @@ void compute_submatrix_from_index_set( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL); @@ -800,7 +803,7 @@ void convert_to_hybrid(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); @@ -813,7 +816,7 @@ void inv_symm_permute(std::shared_ptr exec, inv_nonsymm_permute(exec, perm, perm, orig, permuted); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); @@ -851,7 +854,7 @@ void inv_nonsymm_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL); @@ -886,7 +889,7 @@ void row_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); @@ -921,7 +924,7 @@ void inv_row_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL); @@ -951,7 +954,7 @@ void inv_col_permute(std::shared_ptr exec, cp_row_ptrs[num_rows] = in_row_ptrs[num_rows]; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL); @@ 
-964,7 +967,7 @@ void inv_symm_scale_permute(std::shared_ptr exec, inv_nonsymm_scale_permute(exec, scale, perm, scale, perm, orig, permuted); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL); @@ -1006,7 +1009,7 @@ void inv_nonsymm_scale_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL); @@ -1043,7 +1046,7 @@ void row_scale_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL); @@ -1080,7 +1083,7 @@ void inv_row_scale_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL); @@ -1111,7 +1114,7 @@ void inv_col_scale_permute(std::shared_ptr exec, cp_row_ptrs[num_rows] = in_row_ptrs[num_rows]; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL); @@ -1133,7 +1136,7 @@ void sort_by_column_index(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); @@ -1157,7 +1160,7 @@ void is_sorted_by_column_index( return; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); @@ -1182,7 +1185,8 @@ void extract_diagonal(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_EXTRACT_DIAGONAL); template @@ -1198,7 +1202,8 @@ void scale(std::shared_ptr exec, 
} } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_SCALE_KERNEL); template @@ -1214,7 +1219,8 @@ void inv_scale(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CSR_INV_SCALE_KERNEL); template @@ -1240,7 +1246,7 @@ void check_diagonal_entries_exist(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST); @@ -1263,7 +1269,7 @@ void add_scaled_identity(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index 921a49998b7..561073c8c2d 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -56,7 +56,8 @@ void simple_apply(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); template @@ -89,7 +90,7 @@ void apply(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_APPLY_KERNEL); template @@ -105,7 +106,7 @@ void copy(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY( +GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY_WITH_HALF( GKO_DECLARE_DENSE_COPY_KERNEL); @@ -120,7 +121,7 @@ void fill(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_FILL_KERNEL); template @@ -142,7 +143,8 @@ void 
scale(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_SCALE_KERNEL); template @@ -165,7 +167,7 @@ void inv_scale(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_SCALE_KERNEL); @@ -189,7 +191,7 @@ void add_scaled(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF( GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); @@ -213,7 +215,7 @@ void sub_scaled(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF( GKO_DECLARE_DENSE_SUB_SCALED_KERNEL); @@ -229,7 +231,8 @@ void add_scaled_diag(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); template @@ -244,7 +247,8 @@ void sub_scaled_diag(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); template @@ -263,7 +267,8 @@ void compute_dot(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); template @@ -275,7 +280,7 @@ void compute_dot_dispatch(std::shared_ptr exec, compute_dot(exec, x, y, result, tmp); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); @@ -295,7 +300,8 @@ void compute_conj_dot(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); 
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); template @@ -308,7 +314,7 @@ void compute_conj_dot_dispatch(std::shared_ptr exec, compute_conj_dot(exec, x, y, result, tmp); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); @@ -331,7 +337,8 @@ void compute_norm2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); template @@ -343,7 +350,7 @@ void compute_norm2_dispatch(std::shared_ptr exec, compute_norm2(exec, x, result, tmp); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); @@ -363,7 +370,8 @@ void compute_norm1(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); template @@ -386,7 +394,8 @@ void compute_mean(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL); template @@ -400,7 +409,7 @@ void fill_in_matrix_data(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); @@ -420,7 +429,7 @@ void compute_squared_norm2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL); @@ -435,7 +444,7 @@ void compute_sqrt(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL); @@ -466,7 +475,7 @@ 
void convert_to_coo(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); @@ -498,7 +507,7 @@ void convert_to_csr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); @@ -530,7 +539,7 @@ void convert_to_ell(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); @@ -577,7 +586,7 @@ void convert_to_fbcsr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); @@ -626,7 +635,7 @@ void convert_to_hybrid(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL); @@ -662,7 +671,7 @@ void convert_to_sellp(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL); @@ -692,7 +701,7 @@ void convert_to_sparsity_csr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); @@ -713,7 +722,7 @@ void compute_max_nnz_per_row(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); @@ -745,7 +754,7 @@ void compute_slice_sets(std::shared_ptr exec, components::prefix_sum_nonnegative(exec, slice_sets, num_slices + 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); @@ 
-765,9 +774,9 @@ void count_nonzeros_per_row(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); @@ -797,7 +806,7 @@ void count_nonzero_blocks_per_row(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); @@ -813,7 +822,8 @@ void transpose(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); template @@ -828,7 +838,8 @@ void conj_transpose(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); template @@ -844,7 +855,7 @@ void symm_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); @@ -862,7 +873,7 @@ void inv_symm_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); @@ -879,7 +890,7 @@ void nonsymm_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL); @@ -896,7 +907,7 @@ void inv_nonsymm_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL); @@ -912,7 +923,7 @@ void 
row_gather(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF( GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); @@ -937,7 +948,7 @@ void advanced_row_gather(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2_WITH_HALF( GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); @@ -953,7 +964,7 @@ void col_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL); @@ -970,7 +981,7 @@ void inv_row_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); @@ -987,7 +998,7 @@ void inv_col_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL); @@ -1006,7 +1017,7 @@ void symm_scale_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL); @@ -1025,7 +1036,7 @@ void inv_symm_scale_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL); @@ -1048,7 +1059,7 @@ void nonsymm_scale_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL); @@ -1071,7 +1082,7 @@ void inv_nonsymm_scale_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( 
GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL); @@ -1089,7 +1100,7 @@ void row_scale_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL); @@ -1107,7 +1118,7 @@ void inv_row_scale_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL); @@ -1125,7 +1136,7 @@ void col_scale_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL); @@ -1143,7 +1154,7 @@ void inv_col_scale_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL); @@ -1158,7 +1169,8 @@ void extract_diagonal(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); template @@ -1173,7 +1185,8 @@ void inplace_absolute_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); template @@ -1189,7 +1202,8 @@ void outplace_absolute_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); template @@ -1205,7 +1219,7 @@ void make_complex(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MAKE_COMPLEX_KERNEL); template @@ -1221,7 +1235,7 @@ void 
get_real(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_REAL_KERNEL); template @@ -1237,7 +1251,7 @@ void get_imag(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GET_IMAG_KERNEL); template @@ -1257,7 +1271,7 @@ void add_scaled_identity(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE_WITH_HALF( GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL); diff --git a/reference/matrix/diagonal_kernels.cpp b/reference/matrix/diagonal_kernels.cpp index 028b7685c2b..47d59728ab0 100644 --- a/reference/matrix/diagonal_kernels.cpp +++ b/reference/matrix/diagonal_kernels.cpp @@ -35,7 +35,8 @@ void apply_to_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DIAGONAL_APPLY_TO_DENSE_KERNEL); template @@ -52,7 +53,7 @@ void right_apply_to_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_DENSE_KERNEL); @@ -77,7 +78,7 @@ void apply_to_csr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL); @@ -101,7 +102,7 @@ void right_apply_to_csr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DIAGONAL_RIGHT_APPLY_TO_CSR_KERNEL); @@ -118,7 +119,7 @@ void fill_in_matrix_data(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DIAGONAL_FILL_IN_MATRIX_DATA_KERNEL); @@ -141,7 +142,7 @@ void 
convert_to_csr(std::shared_ptr exec, row_ptrs[size] = size; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DIAGONAL_CONVERT_TO_CSR_KERNEL); @@ -159,7 +160,8 @@ void conj_transpose(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_DIAGONAL_CONJ_TRANSPOSE_KERNEL); } // namespace diagonal diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index 1fa37c4e250..ece95b38a39 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -68,7 +68,7 @@ void spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_SPMV_KERNEL); @@ -107,7 +107,7 @@ void advanced_spmv(std::shared_ptr exec, for (size_type j = 0; j < c->get_size()[1]; j++) { for (size_type row = 0; row < a->get_size()[0]; row++) { - arithmetic_type result = c->at(row, j); + auto result = static_cast(c->at(row, j)); result *= beta_val; for (size_type i = 0; i < num_stored_elements_per_row; i++) { arithmetic_type val = a_vals(row + i * stride); @@ -121,7 +121,7 @@ void advanced_spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); @@ -161,7 +161,7 @@ void fill_in_matrix_data(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL); @@ -185,7 +185,7 @@ void fill_in_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL); @@ -203,7 +203,8 @@ void copy(std::shared_ptr exec, } } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COPY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_ELL_COPY_KERNEL); template @@ -234,7 +235,7 @@ void convert_to_csr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL); @@ -258,7 +259,7 @@ void count_nonzeros_per_row(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL); @@ -283,7 +284,7 @@ void extract_diagonal(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL); diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp index 4c170a973a7..048158136be 100644 --- a/reference/matrix/fbcsr_kernels.cpp +++ b/reference/matrix/fbcsr_kernels.cpp @@ -74,7 +74,8 @@ void spmv(const std::shared_ptr, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_FBCSR_SPMV_KERNEL); template @@ -118,7 +119,7 @@ void advanced_spmv(const std::shared_ptr, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); @@ -176,7 +177,7 @@ void fill_in_matrix_data(std::shared_ptr exec, std::copy(col_idx_vec.begin(), col_idx_vec.end(), col_idxs.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL); @@ -212,7 +213,7 @@ void fill_in_dense(const std::shared_ptr, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); @@ 
-271,7 +272,7 @@ void convert_to_csr(const std::shared_ptr, static_cast(source->get_num_stored_elements()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); @@ -353,7 +354,7 @@ void transpose(std::shared_ptr exec, [](const ValueType x) { return x; }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); @@ -366,7 +367,7 @@ void conj_transpose(std::shared_ptr exec, [](const ValueType x) { return conj(x); }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); @@ -391,7 +392,7 @@ void is_sorted_by_column_index( return; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); @@ -448,7 +449,7 @@ void sort_by_column_index(const std::shared_ptr exec, syn::value_list(), syn::type_list<>(), to_sort); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX); @@ -487,7 +488,7 @@ void extract_diagonal(std::shared_ptr, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp index f2a06c321f2..5fe013297f3 100644 --- a/reference/matrix/hybrid_kernels.cpp +++ b/reference/matrix/hybrid_kernels.cpp @@ -86,7 +86,7 @@ void fill_in_matrix_data(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL); @@ -130,7 +130,7 @@ void convert_to_csr(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); diff --git a/reference/matrix/scaled_permutation_kernels.cpp b/reference/matrix/scaled_permutation_kernels.cpp index b00e06f72f2..a352c0f777d 100644 --- a/reference/matrix/scaled_permutation_kernels.cpp +++ b/reference/matrix/scaled_permutation_kernels.cpp @@ -26,7 +26,7 @@ void invert(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL); @@ -51,7 +51,7 @@ void compose(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL); diff --git a/reference/matrix/sellp_kernels.cpp b/reference/matrix/sellp_kernels.cpp index 120194d6952..70cfc3cac3a 100644 --- a/reference/matrix/sellp_kernels.cpp +++ b/reference/matrix/sellp_kernels.cpp @@ -55,7 +55,8 @@ void spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SELLP_SPMV_KERNEL); template @@ -96,7 +97,7 @@ void advanced_spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); @@ -163,7 +164,7 @@ void fill_in_matrix_data(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL); @@ -198,7 +199,7 @@ void fill_in_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL); @@ -234,7 +235,7 @@ void count_nonzeros_per_row(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL); @@ -280,7 +281,7 @@ void convert_to_csr(std::shared_ptr exec, result_row_ptrs[num_rows] = cur_ptr; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); @@ -317,7 +318,7 @@ void extract_diagonal(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL); diff --git a/reference/matrix/sparsity_csr_kernels.cpp b/reference/matrix/sparsity_csr_kernels.cpp index c511a16a292..b773d3b9a50 100644 --- a/reference/matrix/sparsity_csr_kernels.cpp +++ b/reference/matrix/sparsity_csr_kernels.cpp @@ -55,7 +55,7 @@ void spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); @@ -92,7 +92,7 @@ void advanced_spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); @@ -113,7 +113,7 @@ void fill_in_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL); @@ -138,7 +138,7 @@ void diagonal_element_prefix_sum( prefix_sum[num_rows] = num_diag; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_DIAGONAL_ELEMENT_PREFIX_SUM_KERNEL); @@ -173,7 +173,7 @@ void remove_diagonal_elements(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL); @@ -227,7 +227,7 @@ void 
transpose(std::shared_ptr exec, transpose_and_transform(exec, orig, trans); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL); @@ -245,7 +245,7 @@ void sort_by_column_index(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX); @@ -269,7 +269,7 @@ void is_sorted_by_column_index( return; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX); diff --git a/reference/test/base/combination.cpp b/reference/test/base/combination.cpp index aea578f4e7e..149aaa33256 100644 --- a/reference/test/base/combination.cpp +++ b/reference/test/base/combination.cpp @@ -34,7 +34,8 @@ class Combination : public ::testing::Test { std::vector> operators; }; -TYPED_TEST_SUITE(Combination, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Combination, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Combination, CopiesOnSameExecutor) @@ -114,7 +115,7 @@ TYPED_TEST(Combination, AppliesToMixedVector) cmb = [ 8 7 ] [ 5 4 ] */ - using value_type = gko::next_precision; + using value_type = gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto cmb = gko::Combination::create( this->coefficients[0], this->operators[0], this->coefficients[1], @@ -156,7 +157,8 @@ TYPED_TEST(Combination, AppliesToMixedComplexVector) cmb = [ 8 7 ] [ 5 4 ] */ - using value_type = gko::to_complex>; + using value_type = + gko::to_complex>; using Mtx = gko::matrix::Dense; auto cmb = gko::Combination::create( this->coefficients[0], this->operators[0], this->coefficients[1], @@ -200,7 +202,7 @@ TYPED_TEST(Combination, AppliesLinearCombinationToMixedVector) cmb = [ 8 7 ] [ 5 4 ] */ - using value_type = gko::next_precision; + using value_type = 
gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto cmb = gko::Combination::create( this->coefficients[0], this->operators[0], this->coefficients[1], @@ -248,7 +250,8 @@ TYPED_TEST(Combination, AppliesLinearCombinationToMixedComplexVector) cmb = [ 8 7 ] [ 5 4 ] */ - using MixedDense = gko::matrix::Dense>; + using MixedDense = + gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using value_type = typename MixedDenseComplex::value_type; auto cmb = gko::Combination::create( diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp index fcca61a33d4..53efc588e1c 100644 --- a/reference/test/matrix/coo_kernels.cpp +++ b/reference/test/matrix/coo_kernels.cpp @@ -79,16 +79,17 @@ TYPED_TEST(Coo, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto tmp = OtherCoo::create(this->exec); auto res = Coo::create(this->exec); // If OtherType is more precise: 0, otherwise r - auto residual = - r::value < r::value - ? gko::remove_complex{0} - : static_cast>(r::value); + auto residual = r::value < r::value + ? 
gko::remove_complex{0} + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx->convert_to(tmp); tmp->convert_to(res); @@ -101,7 +102,7 @@ TYPED_TEST(Coo, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto tmp = OtherCoo::create(this->exec); @@ -214,7 +215,7 @@ TYPED_TEST(Coo, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto empty = OtherCoo::create(this->exec); @@ -231,7 +232,7 @@ TYPED_TEST(Coo, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto empty = OtherCoo::create(this->exec); diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index 2dd68bd9239..b84ac958f02 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -788,7 +788,7 @@ TYPED_TEST(Csr, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto tmp = OtherCsr::create(this->exec); @@ -814,7 +814,7 @@ TYPED_TEST(Csr, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using 
OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto tmp = OtherCsr::create(this->exec); @@ -992,7 +992,7 @@ TYPED_TEST(Csr, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto empty = OtherCsr::create(this->exec); @@ -1011,7 +1011,7 @@ TYPED_TEST(Csr, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto empty = OtherCsr::create(this->exec); diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 51b0aa148fd..a8d37ce5a09 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -75,8 +75,7 @@ class Dense : public ::testing::Test { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution(num_cols, num_cols), - std::normal_distribution>(0.0, 1.0), - rand_engine, exec); + std::normal_distribution<>(0.0, 1.0), rand_engine, exec); } }; @@ -751,9 +750,11 @@ TYPED_TEST(Dense, ConvertsToPrecision) auto tmp = OtherDense::create(this->exec); auto res = Dense::create(this->exec); // If OtherT is more precise: 0, otherwise r - auto residual = r::value < r::value - ? gko::remove_complex{0} - : static_cast>(r::value); + auto residual = + r::value < r::value + ? 
gko::remove_complex{0} + : gko::remove_complex{ + static_cast>(r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -771,9 +772,11 @@ TYPED_TEST(Dense, MovesToPrecision) auto tmp = OtherDense::create(this->exec); auto res = Dense::create(this->exec); // If OtherT is more precise: 0, otherwise r - auto residual = r::value < r::value - ? gko::remove_complex{0} - : static_cast>(r::value); + auto residual = + r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{ + static_cast>(r::value)}; this->mtx1->move_to(tmp); tmp->move_to(res); @@ -3549,7 +3552,7 @@ class DenseComplex : public ::testing::Test { }; -TYPED_TEST_SUITE(DenseComplex, gko::test::ComplexValueTypes, +TYPED_TEST_SUITE(DenseComplex, gko::test::ComplexValueTypesWithHalf, TypenameNameGenerator); diff --git a/reference/test/matrix/diagonal_kernels.cpp b/reference/test/matrix/diagonal_kernels.cpp index b0932c7eb66..e2ac67190d0 100644 --- a/reference/test/matrix/diagonal_kernels.cpp +++ b/reference/test/matrix/diagonal_kernels.cpp @@ -85,16 +85,17 @@ TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypes, TypenameNameGenerator); TYPED_TEST(Diagonal, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Diagonal = typename TestFixture::Diag; using OtherDiagonal = gko::matrix::Diagonal; auto tmp = OtherDiagonal::create(this->exec); auto res = Diagonal::create(this->exec); // If OtherType is more precise: 0, otherwise r - auto residual = - r::value < r::value - ? gko::remove_complex{0} - : static_cast>(r::value); + auto residual = r::value < r::value + ? 
gko::remove_complex{0} + : gko::remove_complex{ + static_cast>( + r::value)}; this->diag1->convert_to(tmp); tmp->convert_to(res); @@ -106,7 +107,7 @@ TYPED_TEST(Diagonal, ConvertsToPrecision) TYPED_TEST(Diagonal, MovesToPrecision) { using ValueType = typename TestFixture::value_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Diagonal = typename TestFixture::Diag; using OtherDiagonal = gko::matrix::Diagonal; auto tmp = OtherDiagonal::create(this->exec); @@ -672,7 +673,7 @@ class DiagonalComplex : public ::testing::Test { using Diag = gko::matrix::Diagonal; }; -TYPED_TEST_SUITE(DiagonalComplex, gko::test::ComplexValueTypes, +TYPED_TEST_SUITE(DiagonalComplex, gko::test::ComplexValueTypesWithHalf, TypenameNameGenerator); diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp index e1eef9f087c..6214db82d1c 100644 --- a/reference/test/matrix/ell_kernels.cpp +++ b/reference/test/matrix/ell_kernels.cpp @@ -443,16 +443,17 @@ TYPED_TEST(Ell, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto tmp = OtherEll::create(this->exec); auto res = Ell::create(this->exec); // If OtherType is more precise: 0, otherwise r - auto residual = - r::value < r::value - ? gko::remove_complex{0} - : static_cast>(r::value); + auto residual = r::value < r::value + ? 
gko::remove_complex{0} + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -465,7 +466,7 @@ TYPED_TEST(Ell, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto tmp = OtherEll::create(this->exec); @@ -734,7 +735,7 @@ TYPED_TEST(Ell, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto empty = Ell::create(this->exec); @@ -751,7 +752,7 @@ TYPED_TEST(Ell, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto empty = Ell::create(this->exec); diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp index f7c6d2197ef..665df4ace31 100644 --- a/reference/test/matrix/fbcsr_kernels.cpp +++ b/reference/test/matrix/fbcsr_kernels.cpp @@ -271,16 +271,17 @@ TYPED_TEST(Fbcsr, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto tmp = OtherFbcsr::create(this->exec); auto res = Fbcsr::create(this->exec); // If OtherType is more precise: 0, otherwise r - auto residual = - r::value < r::value - ? 
gko::remove_complex{0} - : static_cast>(r::value); + auto residual = r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx->convert_to(tmp); tmp->convert_to(res); @@ -293,7 +294,7 @@ TYPED_TEST(Fbcsr, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto tmp = OtherFbcsr::create(this->exec); @@ -391,7 +392,7 @@ TYPED_TEST(Fbcsr, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto empty = OtherFbcsr::create(this->exec); @@ -410,7 +411,7 @@ TYPED_TEST(Fbcsr, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto empty = OtherFbcsr::create(this->exec); diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp index 754e599b8fe..87fd4c02811 100644 --- a/reference/test/matrix/hybrid_kernels.cpp +++ b/reference/test/matrix/hybrid_kernels.cpp @@ -233,16 +233,17 @@ TYPED_TEST(Hybrid, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto tmp = OtherHybrid::create(this->exec); auto res = 
Hybrid::create(this->exec); // If OtherType is more precise: 0, otherwise r - auto residual = - r::value < r::value - ? gko::remove_complex{0} - : static_cast>(r::value); + auto residual = r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -255,7 +256,7 @@ TYPED_TEST(Hybrid, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto tmp = OtherHybrid::create(this->exec); @@ -366,7 +367,7 @@ TYPED_TEST(Hybrid, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto other = Hybrid::create(this->exec); @@ -383,7 +384,7 @@ TYPED_TEST(Hybrid, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto other = Hybrid::create(this->exec); diff --git a/reference/test/matrix/scaled_permutation.cpp b/reference/test/matrix/scaled_permutation.cpp index ba65705bf29..6d8d49f5662 100644 --- a/reference/test/matrix/scaled_permutation.cpp +++ b/reference/test/matrix/scaled_permutation.cpp @@ -145,8 +145,7 @@ TYPED_TEST(ScaledPermutation, CombineWithInverse) using index_type = typename TestFixture::index_type; const gko::size_type size = 20; auto rng = std::default_random_engine{3754}; - auto dist = std::uniform_real_distribution>{ - 1.0, 2.0}; + 
auto dist = std::uniform_real_distribution<>{1.0, 2.0}; auto perm = gko::matrix::ScaledPermutation::create( this->exec, size); std::iota(perm->get_permutation(), perm->get_permutation() + size, 0); diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp index a39d8e16832..3208b8c42be 100644 --- a/reference/test/matrix/sellp_kernels.cpp +++ b/reference/test/matrix/sellp_kernels.cpp @@ -189,16 +189,17 @@ TYPED_TEST(Sellp, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto tmp = OtherSellp::create(this->exec); auto res = Sellp::create(this->exec); // If OtherType is more precise: 0, otherwise r - auto residual = - r::value < r::value - ? gko::remove_complex{0} - : static_cast>(r::value); + auto residual = r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -211,16 +212,17 @@ TYPED_TEST(Sellp, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto tmp = OtherSellp::create(this->exec); auto res = Sellp::create(this->exec); // If OtherType is more precise: 0, otherwise r - auto residual = - r::value < r::value - ? gko::remove_complex{0} - : static_cast>(r::value); + auto residual = r::value < r::value + ? 
gko::remove_complex{0} + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->move_to(tmp); tmp->move_to(res); @@ -308,7 +310,7 @@ TYPED_TEST(Sellp, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto empty = OtherSellp::create(this->exec); @@ -327,7 +329,7 @@ TYPED_TEST(Sellp, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = gko::next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto empty = OtherSellp::create(this->exec); diff --git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp index 8cff04c28a0..4ff8e1fc36a 100644 --- a/test/matrix/fbcsr_kernels.cpp +++ b/test/matrix/fbcsr_kernels.cpp @@ -37,7 +37,7 @@ class Fbcsr : public CommonTestFixture { std::unique_ptr rsorted; - std::normal_distribution> distb; + std::normal_distribution<> distb; std::default_random_engine engine; value_type get_random_value() @@ -123,6 +123,9 @@ TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted) using Mtx = typename TestFixture::Mtx; using Dense = typename TestFixture::Dense; using value_type = typename Mtx::value_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + } auto drand = gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 1)); @@ -145,6 +148,9 @@ TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted) using Mtx = typename TestFixture::Mtx; using Dense = typename TestFixture::Dense; using value_type = typename Mtx::value_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + } auto drand = 
gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 3)); @@ -168,6 +174,9 @@ TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted) using Dense = typename TestFixture::Dense; using value_type = typename TestFixture::value_type; using real_type = typename TestFixture::real_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + } auto drand = gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 1)); @@ -198,6 +207,9 @@ TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted) using Dense = typename TestFixture::Dense; using value_type = typename TestFixture::value_type; using real_type = typename TestFixture::real_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + } auto drand = gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 3)); diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index eea1a67ef5f..0b06f76df85 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -586,10 +586,7 @@ class Matrix : public CommonTestFixture { template gko::matrix_data gen_dense_data(gko::dim<2> size) { - return { - size, - std::normal_distribution>(0.0, 1.0), - rand_engine}; + return {size, std::normal_distribution<>(0.0, 1.0), rand_engine}; } template @@ -609,10 +606,7 @@ class Matrix : public CommonTestFixture { return {gko::initialize( {gko::test::detail::get_rand_value< typename VecType::value_type>( - std::normal_distribution< - gko::remove_complex>( - 0.0, 1.0), - rand_engine)}, + std::normal_distribution<>(0.0, 1.0), rand_engine)}, ref), exec}; } From 15e51ba02e779f52798a5adb494f0d3ce053b7d5 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 24 Oct 2024 11:59:30 +0200 Subject: [PATCH 09/69] device_matrix_data and mtx_io --- .../base/device_matrix_data_kernels.cpp | 16 +++++++-- .../base/device_matrix_data_kernels.cpp | 4 +-- core/base/device_matrix_data.cpp | 3 +- core/base/mtx_io.cpp | 35 ++++++++++++++----- core/device_hooks/common_kernels.inc.cpp | 12 ++++--- core/test/base/mtx_io.cpp | 20 +++++++++-- dpcpp/base/device_matrix_data_kernels.dp.cpp | 4 +-- omp/base/device_matrix_data_kernels.cpp | 6 ++-- reference/base/device_matrix_data_kernels.cpp | 10 +++--- test/base/device_matrix_data_kernels.cpp | 7 ++-- 10 files changed, 80 insertions(+), 37 deletions(-) diff --git a/common/cuda_hip/base/device_matrix_data_kernels.cpp b/common/cuda_hip/base/device_matrix_data_kernels.cpp index c5742653a93..ebfed84dba2 100644 --- a/common/cuda_hip/base/device_matrix_data_kernels.cpp +++ b/common/cuda_hip/base/device_matrix_data_kernels.cpp @@ -12,6 +12,7 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" @@ -22,6 +23,15 @@ namespace GKO_DEVICE_NAMESPACE { namespace components { +// __half `!=` operation is only available in __device__ +// Although gko::is_nonzero is constexpr, it still shows calling __device__ in +// __host__ +template +GKO_INLINE __device__ constexpr bool is_nonzero(T value) +{ + return value != zero(); +} + template void remove_zeros(std::shared_ptr exec, array& values, array& row_idxs, @@ -58,7 +68,7 @@ void remove_zeros(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL); @@ -102,7 +112,7 @@ void sum_duplicates(std::shared_ptr exec, size_type, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL); @@ -117,7 +127,7 @@ void 
sort_row_major(std::shared_ptr exec, it + data.get_num_stored_elements(), vals); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL); diff --git a/common/unified/base/device_matrix_data_kernels.cpp b/common/unified/base/device_matrix_data_kernels.cpp index d801b47fcd5..b72c6bf3476 100644 --- a/common/unified/base/device_matrix_data_kernels.cpp +++ b/common/unified/base/device_matrix_data_kernels.cpp @@ -30,7 +30,7 @@ void soa_to_aos(std::shared_ptr exec, in.get_const_col_idxs(), in.get_const_values(), out); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_SOA_TO_AOS_KERNEL); @@ -50,7 +50,7 @@ void aos_to_soa(std::shared_ptr exec, out.get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_AOS_TO_SOA_KERNEL); diff --git a/core/base/device_matrix_data.cpp b/core/base/device_matrix_data.cpp index 4c71fffe275..cb9d332f5ab 100644 --- a/core/base/device_matrix_data.cpp +++ b/core/base/device_matrix_data.cpp @@ -157,7 +157,8 @@ device_matrix_data::empty_out() #define GKO_DECLARE_DEVICE_MATRIX_DATA(ValueType, IndexType) \ class device_matrix_data -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DEVICE_MATRIX_DATA); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DEVICE_MATRIX_DATA); } // namespace gko diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index 33c3b07d487..0897349d08c 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -757,19 +758,28 @@ static constexpr uint64 binary_format_magic() { constexpr auto is_int = std::is_same::value; constexpr auto is_long = std::is_same::value; + constexpr auto is_half = std::is_same::value; constexpr auto is_double = 
std::is_same::value; constexpr auto is_float = std::is_same::value; constexpr auto is_complex_double = std::is_same>::value; constexpr auto is_complex_float = std::is_same>::value; + constexpr auto is_complex_half = + std::is_same>::value; static_assert(is_int || is_long, "invalid storage index type"); - static_assert( - is_double || is_float || is_complex_double || is_complex_float, - "invalid storage value type"); + static_assert(is_half || is_complex_half || is_double || is_float || + is_complex_double || is_complex_float, + "invalid storage value type"); constexpr auto index_bit = is_int ? 'I' : 'L'; constexpr auto value_bit = - is_double ? 'D' : (is_float ? 'S' : (is_complex_double ? 'Z' : 'C')); + is_double + ? 'D' + : (is_float + ? 'S' + : (is_complex_double + ? 'Z' + : (is_complex_float ? 'C' : (is_half ? 'H' : 'X')))); constexpr uint64 shift = 256; constexpr uint64 type_bits = index_bit * shift + value_bit; return 'G' + @@ -879,12 +889,16 @@ matrix_data read_binary_raw(std::istream& is) } DECLARE_OVERLOAD(double, int32) DECLARE_OVERLOAD(float, int32) + DECLARE_OVERLOAD(half, int32) DECLARE_OVERLOAD(std::complex, int32) DECLARE_OVERLOAD(std::complex, int32) + DECLARE_OVERLOAD(std::complex, int32) DECLARE_OVERLOAD(double, int64) DECLARE_OVERLOAD(float, int64) + DECLARE_OVERLOAD(half, int64) DECLARE_OVERLOAD(std::complex, int64) DECLARE_OVERLOAD(std::complex, int64) + DECLARE_OVERLOAD(std::complex, int64) #undef DECLARE_OVERLOAD else { @@ -970,11 +984,14 @@ void write_raw(std::ostream& os, const matrix_data& data, const matrix_data& data) #define GKO_DECLARE_READ_GENERIC_RAW(ValueType, IndexType) \ matrix_data read_generic_raw(std::istream& is) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_READ_RAW); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_WRITE_RAW); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_READ_BINARY_RAW); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_WRITE_BINARY_RAW); 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_READ_GENERIC_RAW); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_READ_RAW); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_WRITE_RAW); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_READ_BINARY_RAW); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_WRITE_BINARY_RAW); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_READ_GENERIC_RAW); } // namespace gko diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 78b80ec2859..439cda481a2 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -251,14 +251,16 @@ GKO_STUB_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL); GKO_STUB_TEMPLATE_TYPE_WITH_HALF(GKO_DECLARE_REDUCE_ADD_ARRAY_KERNEL); GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_INPLACE_ABSOLUTE_ARRAY_KERNEL); GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_OUTPLACE_ABSOLUTE_ARRAY_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DEVICE_MATRIX_DATA_AOS_TO_SOA_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DEVICE_MATRIX_DATA_SOA_TO_AOS_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DEVICE_MATRIX_DATA_AOS_TO_SOA_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DEVICE_MATRIX_DATA_SOA_TO_AOS_KERNEL); template GKO_DECLARE_CONVERT_PTRS_TO_IDXS(IndexType, RowPtrType) diff --git a/core/test/base/mtx_io.cpp b/core/test/base/mtx_io.cpp index 8ac1ced0e50..14d44335b85 100644 --- 
a/core/test/base/mtx_io.cpp +++ b/core/test/base/mtx_io.cpp @@ -7,6 +7,7 @@ #include +#include #include #include #include @@ -570,6 +571,12 @@ TEST(MtxReader, ReadsBinary) test_read(gko::matrix_data{}); test_read(gko::matrix_data, gko::int64>{}); test_read(gko::matrix_data, gko::int64>{}); +#if GINKGO_ENABLE_HALF + test_read(gko::matrix_data{}); + test_read(gko::matrix_data, gko::int32>{}); + test_read(gko::matrix_data{}); + test_read(gko::matrix_data, gko::int64>{}); +#endif } @@ -625,6 +632,12 @@ TEST(MtxReader, ReadsComplexBinary) test_read_fail(gko::matrix_data{}); test_read(gko::matrix_data, gko::int64>{}); test_read(gko::matrix_data, gko::int64>{}); +#if GINKGO_ENABLE_HALF + test_read_fail(gko::matrix_data{}); + test_read(gko::matrix_data, gko::int32>{}); + test_read_fail(gko::matrix_data{}); + test_read(gko::matrix_data, gko::int64>{}); +#endif } @@ -960,7 +973,7 @@ class RealDummyLinOpTest : public ::testing::Test { typename std::tuple_element<1, decltype(ValueIndexType())>::type; }; -TYPED_TEST_SUITE(RealDummyLinOpTest, gko::test::RealValueIndexTypes, +TYPED_TEST_SUITE(RealDummyLinOpTest, gko::test::RealValueIndexTypesWithHalf, PairTypenameNameGenerator); @@ -1165,7 +1178,7 @@ class DenseTest : public ::testing::Test { using index_type = typename std::tuple_element<1, ValueIndexType>::type; }; -TYPED_TEST_SUITE(DenseTest, gko::test::RealValueIndexTypes, +TYPED_TEST_SUITE(DenseTest, gko::test::RealValueIndexTypesWithHalf, PairTypenameNameGenerator); @@ -1209,7 +1222,8 @@ class ComplexDummyLinOpTest : public ::testing::Test { typename std::tuple_element<1, decltype(ValueIndexType())>::type; }; -TYPED_TEST_SUITE(ComplexDummyLinOpTest, gko::test::ComplexValueIndexTypes, +TYPED_TEST_SUITE(ComplexDummyLinOpTest, + gko::test::ComplexValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/dpcpp/base/device_matrix_data_kernels.dp.cpp b/dpcpp/base/device_matrix_data_kernels.dp.cpp index f39615613fe..a5f58831a27 100644 --- 
a/dpcpp/base/device_matrix_data_kernels.dp.cpp +++ b/dpcpp/base/device_matrix_data_kernels.dp.cpp @@ -49,7 +49,7 @@ void remove_zeros(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL); @@ -112,7 +112,7 @@ void sort_row_major(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL); diff --git a/omp/base/device_matrix_data_kernels.cpp b/omp/base/device_matrix_data_kernels.cpp index bce89e2f409..cb2dabd3010 100644 --- a/omp/base/device_matrix_data_kernels.cpp +++ b/omp/base/device_matrix_data_kernels.cpp @@ -69,7 +69,7 @@ void remove_zeros(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL); @@ -127,7 +127,7 @@ void sum_duplicates(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL); @@ -142,7 +142,7 @@ void sort_row_major(std::shared_ptr exec, aos_to_soa(exec, tmp, data); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL); diff --git a/reference/base/device_matrix_data_kernels.cpp b/reference/base/device_matrix_data_kernels.cpp index f9a23b35e69..78a2e25a712 100644 --- a/reference/base/device_matrix_data_kernels.cpp +++ b/reference/base/device_matrix_data_kernels.cpp @@ -29,7 +29,7 @@ void soa_to_aos(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_SOA_TO_AOS_KERNEL); @@ -46,7 +46,7 @@ void aos_to_soa(std::shared_ptr exec, } 
} -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_AOS_TO_SOA_KERNEL); @@ -78,7 +78,7 @@ void remove_zeros(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL); @@ -127,7 +127,7 @@ void sum_duplicates(std::shared_ptr exec, size_type, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL); @@ -142,7 +142,7 @@ void sort_row_major(std::shared_ptr exec, aos_to_soa(exec, tmp, data); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL); diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp index 6ddc926b76c..d2543ae7cbb 100644 --- a/test/base/device_matrix_data_kernels.cpp +++ b/test/base/device_matrix_data_kernels.cpp @@ -35,8 +35,7 @@ class DeviceMatrixData : public CommonTestFixture { 0, host_data.size[0] - 1); std::uniform_int_distribution col_distr( 0, host_data.size[1] - 1); - std::uniform_real_distribution> - val_distr(1.0, 2.0); + std::uniform_real_distribution<> val_distr(1.0, 2.0); // add random entries for (int i = 0; i < 1000; i++) { host_data.nonzeros.emplace_back( @@ -85,7 +84,7 @@ class DeviceMatrixData : public CommonTestFixture { gko::matrix_data deduplicated_data; }; -TYPED_TEST_SUITE(DeviceMatrixData, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(DeviceMatrixData, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); @@ -339,7 +338,7 @@ TYPED_TEST(DeviceMatrixData, SumsDuplicates) arrays.values.set_executor(this->exec->get_master()); for (int i = 0; i < arrays.values.get_size(); i++) { max_error = std::max( - max_error, std::abs(arrays.values.get_const_data()[i] - + 
max_error, gko::abs(arrays.values.get_const_data()[i] - ref_arrays.values.get_const_data()[i])); } // when Hip with GNU < 7, it will give a little difference. From 0bc9285a1d2fd1e3a4ed52b1c6135f610975ccb5 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 24 Oct 2024 15:39:04 +0200 Subject: [PATCH 10/69] components such as array/iterator/segmented_array test with half --- core/test/base/array.cpp | 3 ++- core/test/base/iterator_factory.cpp | 4 ++-- core/test/base/segmented_array.cpp | 3 ++- core/test/components/addressable_pq.cpp | 4 ++-- cuda/test/base/array.cpp | 3 ++- reference/test/base/array.cpp | 3 ++- reference/test/components/absolute_array_kernels.cpp | 3 ++- reference/test/components/fill_array_kernels.cpp | 2 +- reference/test/components/reduce_array_kernels.cpp | 2 +- test/components/fill_array_kernels.cpp | 2 +- test/components/reduce_array_kernels.cpp | 11 ++++++++--- 11 files changed, 25 insertions(+), 15 deletions(-) diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp index f7e03855d06..23515d70fc4 100644 --- a/core/test/base/array.cpp +++ b/core/test/base/array.cpp @@ -40,7 +40,8 @@ class Array : public ::testing::Test { gko::array x; }; -TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Array, CanBeCreatedWithoutAnExecutor) diff --git a/core/test/base/iterator_factory.cpp b/core/test/base/iterator_factory.cpp index bbc3bbfd04f..3685242f78a 100644 --- a/core/test/base/iterator_factory.cpp +++ b/core/test/base/iterator_factory.cpp @@ -78,7 +78,7 @@ class ZipIterator : public ::testing::Test { const std::vector ordered_value; }; -TYPED_TEST_SUITE(ZipIterator, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(ZipIterator, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); @@ -366,7 +366,7 @@ class PermuteIterator : public ::testing::Test { using value_type = ValueType; }; 
-TYPED_TEST_SUITE(PermuteIterator, gko::test::ComplexAndPODTypes, +TYPED_TEST_SUITE(PermuteIterator, gko::test::ComplexAndPODTypesWithHalf, TypenameNameGenerator); diff --git a/core/test/base/segmented_array.cpp b/core/test/base/segmented_array.cpp index 2741990036f..31444d71d18 100644 --- a/core/test/base/segmented_array.cpp +++ b/core/test/base/segmented_array.cpp @@ -27,7 +27,8 @@ class SegmentedArray : public ::testing::Test { std::shared_ptr exec = gko::ReferenceExecutor::create(); }; -TYPED_TEST_SUITE(SegmentedArray, gko::test::PODTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(SegmentedArray, gko::test::PODTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(SegmentedArray, CanConstructFromExecutor) diff --git a/core/test/components/addressable_pq.cpp b/core/test/components/addressable_pq.cpp index 6301cd44fb4..87fcb289a77 100644 --- a/core/test/components/addressable_pq.cpp +++ b/core/test/components/addressable_pq.cpp @@ -91,8 +91,8 @@ class AddressablePriorityQueue : public ::testing::Test { std::shared_ptr exec; }; -TYPED_TEST_SUITE(AddressablePriorityQueue, gko::test::RealValueIndexTypes, - TypenameNameGenerator); +TYPED_TEST_SUITE(AddressablePriorityQueue, + gko::test::RealValueIndexTypesWithHalf, TypenameNameGenerator); TYPED_TEST(AddressablePriorityQueue, InitializesCorrectly) diff --git a/cuda/test/base/array.cpp b/cuda/test/base/array.cpp index db7d4c54536..7294cbff29f 100644 --- a/cuda/test/base/array.cpp +++ b/cuda/test/base/array.cpp @@ -32,7 +32,8 @@ class Array : public CudaTestFixture { gko::array x; }; -TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Array, CanCreateTemporaryCloneOnDifferentExecutor) diff --git a/reference/test/base/array.cpp b/reference/test/base/array.cpp index 666ab13063c..2c69f1afc8e 100644 --- a/reference/test/base/array.cpp +++ b/reference/test/base/array.cpp @@ -28,7 +28,8 @@ class Array 
: public ::testing::Test { gko::array x; }; -TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Array, CanBeFilledWithValue) diff --git a/reference/test/components/absolute_array_kernels.cpp b/reference/test/components/absolute_array_kernels.cpp index c192d540032..5ad75440c88 100644 --- a/reference/test/components/absolute_array_kernels.cpp +++ b/reference/test/components/absolute_array_kernels.cpp @@ -43,7 +43,8 @@ class AbsoluteArray : public ::testing::Test { gko::array vals; }; -TYPED_TEST_SUITE(AbsoluteArray, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(AbsoluteArray, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(AbsoluteArray, InplaceEqualsExpected) diff --git a/reference/test/components/fill_array_kernels.cpp b/reference/test/components/fill_array_kernels.cpp index 3c7520c6847..0a9239ce1bd 100644 --- a/reference/test/components/fill_array_kernels.cpp +++ b/reference/test/components/fill_array_kernels.cpp @@ -40,7 +40,7 @@ class FillArray : public ::testing::Test { gko::array seqs; }; -TYPED_TEST_SUITE(FillArray, gko::test::ComplexAndPODTypes, +TYPED_TEST_SUITE(FillArray, gko::test::ComplexAndPODTypesWithHalf, TypenameNameGenerator); diff --git a/reference/test/components/reduce_array_kernels.cpp b/reference/test/components/reduce_array_kernels.cpp index 8286817c853..c8839bc178d 100644 --- a/reference/test/components/reduce_array_kernels.cpp +++ b/reference/test/components/reduce_array_kernels.cpp @@ -31,7 +31,7 @@ class ReduceArray : public ::testing::Test { gko::array vals; }; -TYPED_TEST_SUITE(ReduceArray, gko::test::ComplexAndPODTypes, +TYPED_TEST_SUITE(ReduceArray, gko::test::ComplexAndPODTypesWithHalf, TypenameNameGenerator); diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp index 3d494b3f5f0..4237a75304a 100644 --- 
a/test/components/fill_array_kernels.cpp +++ b/test/components/fill_array_kernels.cpp @@ -36,7 +36,7 @@ class FillArray : public CommonTestFixture { gko::array seqs; }; -TYPED_TEST_SUITE(FillArray, gko::test::ComplexAndPODTypes, +TYPED_TEST_SUITE(FillArray, gko::test::ComplexAndPODTypesWithHalf, TypenameNameGenerator); diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp index b7407801a32..7940feec661 100644 --- a/test/components/reduce_array_kernels.cpp +++ b/test/components/reduce_array_kernels.cpp @@ -20,14 +20,19 @@ template class ReduceArray : public CommonTestFixture { protected: using value_type = T; + static constexpr bool using_half = + std::is_same_v, gko::half>; + + // due to half accuracy, the summation ordering will affect the result + // easily ReduceArray() - : total_size(6355), + : total_size(using_half ? 1024 : 6355), out{ref, I{2}}, dout{exec, out}, vals{ref, total_size}, dvals{exec} { - std::fill_n(vals.get_data(), total_size, 3); + std::fill_n(vals.get_data(), total_size, using_half ? 1 : 3); dvals = vals; } @@ -38,7 +43,7 @@ class ReduceArray : public CommonTestFixture { gko::array dvals; }; -TYPED_TEST_SUITE(ReduceArray, gko::test::ComplexAndPODTypes, +TYPED_TEST_SUITE(ReduceArray, gko::test::ComplexAndPODTypesWithHalf, TypenameNameGenerator); From c5720ffb99148db0b5a91ec6a808d1cc7294a1f4 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 24 Oct 2024 15:39:41 +0200 Subject: [PATCH 11/69] matrix test with half --- core/test/matrix/coo.cpp | 3 +- core/test/matrix/coo_builder.cpp | 2 +- core/test/matrix/csr.cpp | 3 +- core/test/matrix/csr_builder.cpp | 2 +- core/test/matrix/dense.cpp | 2 +- core/test/matrix/diagonal.cpp | 3 +- core/test/matrix/ell.cpp | 3 +- core/test/matrix/fbcsr.cpp | 7 ++- core/test/matrix/fbcsr_builder.cpp | 2 +- core/test/matrix/hybrid.cpp | 3 +- core/test/matrix/identity.cpp | 6 +- core/test/matrix/permutation.cpp | 2 +- core/test/matrix/row_gatherer.cpp | 2 +- core/test/matrix/sellp.cpp | 3 +- core/test/matrix/sparsity_csr.cpp | 2 +- core/test/utils/fb_matrix_generator.hpp | 13 ++--- core/test/utils/value_generator.hpp | 6 +- hip/test/matrix/fbcsr_kernels.cpp | 56 +++++++++++++------ reference/test/matrix/coo_kernels.cpp | 33 +++++------ reference/test/matrix/csr_kernels.cpp | 48 ++++++++-------- reference/test/matrix/dense_kernels.cpp | 19 ++++--- reference/test/matrix/diagonal_kernels.cpp | 14 +++-- reference/test/matrix/ell_kernels.cpp | 46 +++++++-------- reference/test/matrix/fbcsr_kernels.cpp | 15 ++--- reference/test/matrix/hybrid_kernels.cpp | 20 ++++--- reference/test/matrix/identity.cpp | 6 +- reference/test/matrix/permutation.cpp | 2 +- reference/test/matrix/scaled_permutation.cpp | 2 +- reference/test/matrix/sellp_kernels.cpp | 19 ++++--- reference/test/matrix/sparsity_csr.cpp | 2 +- .../test/matrix/sparsity_csr_kernels.cpp | 12 ++-- test/matrix/fbcsr_kernels.cpp | 23 ++++++-- 32 files changed, 219 insertions(+), 162 deletions(-) diff --git a/core/test/matrix/coo.cpp b/core/test/matrix/coo.cpp index ffb8d5aee9f..56735e792d5 100644 --- a/core/test/matrix/coo.cpp +++ b/core/test/matrix/coo.cpp @@ -77,7 +77,8 @@ class Coo : public ::testing::Test { } }; -TYPED_TEST_SUITE(Coo, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Coo, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Coo, KnowsItsSize) diff 
--git a/core/test/matrix/coo_builder.cpp b/core/test/matrix/coo_builder.cpp index 9bfae5cf3af..b1b22c5848a 100644 --- a/core/test/matrix/coo_builder.cpp +++ b/core/test/matrix/coo_builder.cpp @@ -32,7 +32,7 @@ class CooBuilder : public ::testing::Test { std::unique_ptr mtx; }; -TYPED_TEST_SUITE(CooBuilder, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(CooBuilder, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/core/test/matrix/csr.cpp b/core/test/matrix/csr.cpp index 4bbdc63851a..f199de423e8 100644 --- a/core/test/matrix/csr.cpp +++ b/core/test/matrix/csr.cpp @@ -82,7 +82,8 @@ class Csr : public ::testing::Test { } }; -TYPED_TEST_SUITE(Csr, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Csr, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Csr, KnowsItsSize) diff --git a/core/test/matrix/csr_builder.cpp b/core/test/matrix/csr_builder.cpp index 24cbe4718c5..2accb57770c 100644 --- a/core/test/matrix/csr_builder.cpp +++ b/core/test/matrix/csr_builder.cpp @@ -33,7 +33,7 @@ class CsrBuilder : public ::testing::Test { std::unique_ptr mtx; }; -TYPED_TEST_SUITE(CsrBuilder, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(CsrBuilder, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp index e7158a15aed..f1a673840ea 100644 --- a/core/test/matrix/dense.cpp +++ b/core/test/matrix/dense.cpp @@ -48,7 +48,7 @@ class Dense : public ::testing::Test { std::unique_ptr> mtx; }; -TYPED_TEST_SUITE(Dense, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Dense, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Dense, CanBeEmpty) diff --git a/core/test/matrix/diagonal.cpp b/core/test/matrix/diagonal.cpp index de03a9350bb..7e598d67a5e 100644 --- a/core/test/matrix/diagonal.cpp +++ b/core/test/matrix/diagonal.cpp @@ -47,7 +47,8 @@ class Diagonal : public ::testing::Test { } }; 
-TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Diagonal, KnowsItsSize) diff --git a/core/test/matrix/ell.cpp b/core/test/matrix/ell.cpp index bcc2b591a50..93fc73dde18 100644 --- a/core/test/matrix/ell.cpp +++ b/core/test/matrix/ell.cpp @@ -79,7 +79,8 @@ class Ell : public ::testing::Test { } }; -TYPED_TEST_SUITE(Ell, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Ell, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Ell, KnowsItsSize) diff --git a/core/test/matrix/fbcsr.cpp b/core/test/matrix/fbcsr.cpp index 3d3d4ee738d..fd024532a14 100644 --- a/core/test/matrix/fbcsr.cpp +++ b/core/test/matrix/fbcsr.cpp @@ -131,7 +131,7 @@ class FbcsrSample : public ::testing::Test { }; -TYPED_TEST_SUITE(FbcsrSample, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(FbcsrSample, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); @@ -183,7 +183,7 @@ template class FbcsrSampleComplex : public FbcsrSample {}; -TYPED_TEST_SUITE(FbcsrSampleComplex, gko::test::ComplexValueIndexTypes, +TYPED_TEST_SUITE(FbcsrSampleComplex, gko::test::ComplexValueIndexTypesWithHalf, PairTypenameNameGenerator); @@ -282,7 +282,8 @@ class Fbcsr : public ::testing::Test { } }; -TYPED_TEST_SUITE(Fbcsr, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Fbcsr, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Fbcsr, GetNumBlocksCorrectlyThrows) diff --git a/core/test/matrix/fbcsr_builder.cpp b/core/test/matrix/fbcsr_builder.cpp index d91a0c7b70a..241c7ccc6eb 100644 --- a/core/test/matrix/fbcsr_builder.cpp +++ b/core/test/matrix/fbcsr_builder.cpp @@ -33,7 +33,7 @@ class FbcsrBuilder : public ::testing::Test { std::unique_ptr mtx; }; -TYPED_TEST_SUITE(FbcsrBuilder, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(FbcsrBuilder, gko::test::ValueIndexTypesWithHalf, 
PairTypenameNameGenerator); diff --git a/core/test/matrix/hybrid.cpp b/core/test/matrix/hybrid.cpp index d1a69312755..6b1e2a4a747 100644 --- a/core/test/matrix/hybrid.cpp +++ b/core/test/matrix/hybrid.cpp @@ -96,7 +96,8 @@ class Hybrid : public ::testing::Test { } }; -TYPED_TEST_SUITE(Hybrid, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Hybrid, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Hybrid, KnowsItsSize) diff --git a/core/test/matrix/identity.cpp b/core/test/matrix/identity.cpp index bcf9c036992..80defae4441 100644 --- a/core/test/matrix/identity.cpp +++ b/core/test/matrix/identity.cpp @@ -31,7 +31,8 @@ class Identity : public ::testing::Test { std::shared_ptr exec; }; -TYPED_TEST_SUITE(Identity, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Identity, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Identity, CanBeEmpty) @@ -81,7 +82,8 @@ class IdentityFactory : public ::testing::Test { using value_type = T; }; -TYPED_TEST_SUITE(IdentityFactory, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(IdentityFactory, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(IdentityFactory, CanGenerateIdentityMatrix) diff --git a/core/test/matrix/permutation.cpp b/core/test/matrix/permutation.cpp index edb1532696b..fcd5aad789c 100644 --- a/core/test/matrix/permutation.cpp +++ b/core/test/matrix/permutation.cpp @@ -52,7 +52,7 @@ class Permutation : public ::testing::Test { std::unique_ptr> mtx; }; -TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/core/test/matrix/row_gatherer.cpp b/core/test/matrix/row_gatherer.cpp index 801f639c206..b808828cc08 100644 --- a/core/test/matrix/row_gatherer.cpp +++ b/core/test/matrix/row_gatherer.cpp @@ -65,7 +65,7 @@ class RowGatherer : public ::testing::Test { std::unique_ptr out; }; 
-TYPED_TEST_SUITE(RowGatherer, gko::test::TwoValueIndexType, +TYPED_TEST_SUITE(RowGatherer, gko::test::TwoValueIndexTypeWithHalf, TupleTypenameNameGenerator); diff --git a/core/test/matrix/sellp.cpp b/core/test/matrix/sellp.cpp index 123d7bae773..a79fcf2bbd3 100644 --- a/core/test/matrix/sellp.cpp +++ b/core/test/matrix/sellp.cpp @@ -107,7 +107,8 @@ class Sellp : public ::testing::Test { } }; -TYPED_TEST_SUITE(Sellp, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Sellp, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Sellp, KnowsItsSize) diff --git a/core/test/matrix/sparsity_csr.cpp b/core/test/matrix/sparsity_csr.cpp index e929f960f1e..67f8237adb6 100644 --- a/core/test/matrix/sparsity_csr.cpp +++ b/core/test/matrix/sparsity_csr.cpp @@ -74,7 +74,7 @@ class SparsityCsr : public ::testing::Test { } }; -TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/core/test/utils/fb_matrix_generator.hpp b/core/test/utils/fb_matrix_generator.hpp index 034dd95fce1..786f836e10a 100644 --- a/core/test/utils/fb_matrix_generator.hpp +++ b/core/test/utils/fb_matrix_generator.hpp @@ -131,16 +131,15 @@ std::unique_ptr> generate_fbcsr_from_csr( const IndexType* const row_ptrs = fmtx->get_const_row_ptrs(); const IndexType* const col_idxs = fmtx->get_const_col_idxs(); ValueType* const vals = fmtx->get_values(); - std::uniform_real_distribution> - off_diag_dist(-1.0, 1.0); + std::uniform_real_distribution<> off_diag_dist(-1.0, 1.0); for (IndexType ibrow = 0; ibrow < nbrows; ibrow++) { if (row_diag_dominant) { const IndexType nrownz = (row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * block_size; - std::uniform_real_distribution> - diag_dist(1.01 * nrownz, 2 * nrownz); + std::uniform_real_distribution<> diag_dist(1.01 * nrownz, + 2 * nrownz); for (IndexType ibz = row_ptrs[ibrow]; ibz < row_ptrs[ibrow + 1]; ibz++) { @@ -205,13 
+204,11 @@ std::unique_ptr> generate_random_fbcsr( matrix::Csr>( nbrows, nbcols, std::uniform_int_distribution(0, nbcols - 1), - std::normal_distribution(0.0, 1.0), - std::move(engine), ref) + std::normal_distribution<>(0.0, 1.0), std::move(engine), ref) : generate_random_matrix>( nbrows, nbcols, std::uniform_int_distribution(0, nbcols - 1), - std::normal_distribution(0.0, 1.0), - std::move(engine), ref); + std::normal_distribution<>(0.0, 1.0), std::move(engine), ref); if (unsort && rand_csr_ref->is_sorted_by_column_index()) { unsort_matrix(rand_csr_ref, engine); } diff --git a/core/test/utils/value_generator.hpp b/core/test/utils/value_generator.hpp index f18f2170c96..19e01b33356 100644 --- a/core/test/utils/value_generator.hpp +++ b/core/test/utils/value_generator.hpp @@ -33,7 +33,7 @@ template typename std::enable_if::value, ValueType>::type get_rand_value(ValueDistribution&& value_dist, Engine&& gen) { - return value_dist(gen); + return static_cast(value_dist(gen)); } /** @@ -45,7 +45,9 @@ template typename std::enable_if::value, ValueType>::type get_rand_value(ValueDistribution&& value_dist, Engine&& gen) { - return ValueType(value_dist(gen), value_dist(gen)); + using real_type = remove_complex; + return ValueType(static_cast(value_dist(gen)), + static_cast(value_dist(gen))); } diff --git a/hip/test/matrix/fbcsr_kernels.cpp b/hip/test/matrix/fbcsr_kernels.cpp index 0b4b16086ca..536ff3dc01c 100644 --- a/hip/test/matrix/fbcsr_kernels.cpp +++ b/hip/test/matrix/fbcsr_kernels.cpp @@ -8,6 +8,7 @@ #include +#include #include #include @@ -41,7 +42,7 @@ class Fbcsr : public HipTestFixture { std::unique_ptr rsorted_ref; - std::normal_distribution> distb; + std::normal_distribution<> distb; std::default_random_engine engine; value_type get_random_value() @@ -60,7 +61,8 @@ class Fbcsr : public HipTestFixture { } }; -TYPED_TEST_SUITE(Fbcsr, gko::test::RealValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Fbcsr, gko::test::RealValueTypesWithHalf, + 
TypenameNameGenerator); TYPED_TEST(Fbcsr, CanWriteFromMatrixOnDevice) @@ -145,11 +147,15 @@ TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted) this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 1)); auto prod_hip = Dense::create(this->exec, prod_ref->get_size()); - rand_hip->apply(x_hip, prod_hip); - this->rsorted_ref->apply(x_ref, prod_ref); + if (std::is_same::value) { + ASSERT_THROW(rand_hip->apply(x_hip, prod_hip), gko::NotImplemented); + } else { + rand_hip->apply(x_hip, prod_hip); + this->rsorted_ref->apply(x_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } @@ -169,11 +175,15 @@ TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted) this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 3)); auto prod_hip = Dense::create(this->exec, prod_ref->get_size()); - rand_hip->apply(x_hip, prod_hip); - this->rsorted_ref->apply(x_ref, prod_ref); + if (std::is_same::value) { + ASSERT_THROW(rand_hip->apply(x_hip, prod_hip), gko::NotImplemented); + } else { + rand_hip->apply(x_hip, prod_hip); + this->rsorted_ref->apply(x_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } @@ -205,11 +215,16 @@ TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted) auto beta = Dense::create(this->exec); beta->copy_from(beta_ref); - rand_hip->apply(alpha, x_hip, beta, prod_hip); - this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); + if (std::is_same::value) { + ASSERT_THROW(rand_hip->apply(alpha, x_hip, beta, prod_hip), + gko::NotImplemented); + } else { + rand_hip->apply(alpha, x_hip, beta, prod_hip); + this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + 
GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } @@ -241,11 +256,16 @@ TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted) auto beta = Dense::create(this->exec); beta->copy_from(beta_ref); - rand_hip->apply(alpha, x_hip, beta, prod_hip); - this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); + if (std::is_same::value) { + ASSERT_THROW(rand_hip->apply(alpha, x_hip, beta, prod_hip), + gko::NotImplemented); + } else { + rand_hip->apply(alpha, x_hip, beta, prod_hip); + this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp index 53efc588e1c..6ffea5d0e7d 100644 --- a/reference/test/matrix/coo_kernels.cpp +++ b/reference/test/matrix/coo_kernels.cpp @@ -32,7 +32,8 @@ class Coo : public ::testing::Test { using Csr = gko::matrix::Csr; using Mtx = gko::matrix::Coo; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = + gko::matrix::Dense>; Coo() : exec(gko::ReferenceExecutor::create()), mtx(Mtx::create(exec)) { @@ -72,24 +73,24 @@ class Coo : public ::testing::Test { std::unique_ptr uns_mtx; }; -TYPED_TEST_SUITE(Coo, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Coo, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Coo, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto tmp = OtherCoo::create(this->exec); auto res = Coo::create(this->exec); // If OtherType is more precise: 0, otherwise r - auto residual = r::value < r::value - ? 
gko::remove_complex{0} - : gko::remove_complex{ - static_cast>( - r::value)}; + auto residual = + r::value < r::value + ? gko::remove_complex{0} + : static_cast>(r::value); this->mtx->convert_to(tmp); tmp->convert_to(res); @@ -102,7 +103,7 @@ TYPED_TEST(Coo, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto tmp = OtherCoo::create(this->exec); @@ -215,7 +216,7 @@ TYPED_TEST(Coo, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto empty = OtherCoo::create(this->exec); @@ -232,7 +233,7 @@ TYPED_TEST(Coo, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto empty = OtherCoo::create(this->exec); @@ -703,7 +704,7 @@ TYPED_TEST(Coo, AppliesToComplex) TYPED_TEST(Coo, AppliesToMixedComplex) { using mixed_value_type = - gko::next_precision; + gko::next_precision_with_half; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -759,7 +760,7 @@ TYPED_TEST(Coo, AdvancedAppliesToComplex) TYPED_TEST(Coo, AdvancedAppliesToMixedComplex) { using mixed_value_type = - gko::next_precision; + gko::next_precision_with_half; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; @@ -817,7 +818,7 @@ TYPED_TEST(Coo, 
ApplyAddsToComplex) TYPED_TEST(Coo, ApplyAddsToMixedComplex) { using mixed_value_type = - gko::next_precision; + gko::next_precision_with_half; using mixed_complex_type = gko::to_complex; using MixedVec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -874,7 +875,7 @@ TYPED_TEST(Coo, ApplyAddsScaledToComplex) TYPED_TEST(Coo, ApplyAddsScaledToMixedComplex) { using mixed_value_type = - gko::next_precision; + gko::next_precision_with_half; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; @@ -911,7 +912,7 @@ class CooComplex : public ::testing::Test { using Mtx = gko::matrix::Coo; }; -TYPED_TEST_SUITE(CooComplex, gko::test::ComplexValueIndexTypes, +TYPED_TEST_SUITE(CooComplex, gko::test::ComplexValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index b84ac958f02..b417eb93f52 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -46,7 +46,8 @@ class Csr : public ::testing::Test { using Ell = gko::matrix::Ell; using Hybrid = gko::matrix::Hybrid; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = + gko::matrix::Dense>; using Perm = gko::matrix::Permutation; using ScaledPerm = gko::matrix::ScaledPermutation; @@ -347,7 +348,8 @@ class Csr : public ::testing::Test { index_type invalid_index = gko::invalid_index(); }; -TYPED_TEST_SUITE(Csr, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Csr, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Csr, AppliesToDenseVector) @@ -368,7 +370,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseVector1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec = typename 
gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); @@ -383,7 +385,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseVector2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); @@ -399,9 +401,9 @@ TYPED_TEST(Csr, MixedAppliesToDenseVector3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; - using Vec2 = gko::matrix::Dense>; + using Vec2 = gko::matrix::Dense>; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec1::create(this->exec, gko::dim<2>{2, 1}); @@ -432,7 +434,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseMatrix1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec = gko::matrix::Dense; // clang-format off auto x = gko::initialize( @@ -456,7 +458,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseMatrix2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; // clang-format off @@ -481,7 +483,7 @@ TYPED_TEST(Csr, MixedAppliesToDenseMatrix3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; // clang-format off @@ -522,7 +524,7 @@ 
TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseVector1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -539,7 +541,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseVector2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -557,7 +559,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseVector3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -595,7 +597,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseMatrix1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -619,7 +621,7 @@ TYPED_TEST(Csr, MixedAppliesLinearCombinationToDenseMatrix2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -639,7 +641,7 @@ TYPED_TEST(Csr, 
MixedAppliesLinearCombinationToDenseMatrix3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -788,7 +790,7 @@ TYPED_TEST(Csr, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto tmp = OtherCsr::create(this->exec); @@ -814,7 +816,7 @@ TYPED_TEST(Csr, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto tmp = OtherCsr::create(this->exec); @@ -992,7 +994,7 @@ TYPED_TEST(Csr, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto empty = OtherCsr::create(this->exec); @@ -1011,7 +1013,7 @@ TYPED_TEST(Csr, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto empty = OtherCsr::create(this->exec); @@ -2048,7 +2050,7 @@ TYPED_TEST(Csr, AppliesToComplex) TYPED_TEST(Csr, AppliesToMixedComplex) { using mixed_value_type = - gko::next_precision; + 
gko::next_precision_with_half; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -2104,7 +2106,7 @@ TYPED_TEST(Csr, AdvancedAppliesToComplex) TYPED_TEST(Csr, AdvancedAppliesToMixedComplex) { using mixed_value_type = - gko::next_precision; + gko::next_precision_with_half; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; @@ -2245,7 +2247,7 @@ class CsrComplex : public ::testing::Test { using Mtx = gko::matrix::Csr; }; -TYPED_TEST_SUITE(CsrComplex, gko::test::ComplexValueIndexTypes, +TYPED_TEST_SUITE(CsrComplex, gko::test::ComplexValueIndexTypesWithHalf, PairTypenameNameGenerator); @@ -2590,7 +2592,7 @@ class CsrLookup : public ::testing::Test { index_type invalid_index = gko::invalid_index(); }; -TYPED_TEST_SUITE(CsrLookup, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(CsrLookup, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); TYPED_TEST(CsrLookup, GeneratesLookupDataOffsets) diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index a8d37ce5a09..3854cd56dff 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -37,7 +37,8 @@ class Dense : public ::testing::Test { protected: using value_type = T; using Mtx = gko::matrix::Dense; - using MixedMtx = gko::matrix::Dense>; + using MixedMtx = + gko::matrix::Dense>; using ComplexMtx = gko::to_complex; using RealMtx = gko::remove_complex; Dense() @@ -80,7 +81,7 @@ class Dense : public ::testing::Test { }; -TYPED_TEST_SUITE(Dense, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Dense, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Dense, CopyRespectsStride) @@ -745,7 +746,7 @@ TYPED_TEST(Dense, ConvertsToPrecision) { using Dense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - using OtherT = typename 
gko::next_precision; + using OtherT = typename gko::next_precision_with_half; using OtherDense = typename gko::matrix::Dense; auto tmp = OtherDense::create(this->exec); auto res = Dense::create(this->exec); @@ -767,7 +768,7 @@ TYPED_TEST(Dense, MovesToPrecision) { using Dense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = typename gko::next_precision_with_half; using OtherDense = typename gko::matrix::Dense; auto tmp = OtherDense::create(this->exec); auto res = Dense::create(this->exec); @@ -1066,7 +1067,7 @@ TYPED_TEST(Dense, AppliesToComplex) TYPED_TEST(Dense, AppliesToMixedComplex) { using mixed_value_type = - gko::next_precision; + gko::next_precision_with_half; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -1120,7 +1121,7 @@ TYPED_TEST(Dense, AdvancedAppliesToComplex) TYPED_TEST(Dense, AdvancedAppliesToMixedComplex) { using mixed_value_type = - gko::next_precision; + gko::next_precision_with_half; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; @@ -1359,7 +1360,7 @@ class DenseWithIndexType std::unique_ptr scale_perm0; }; -TYPED_TEST_SUITE(DenseWithIndexType, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(DenseWithIndexType, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); @@ -2013,7 +2014,7 @@ TYPED_TEST(Dense, ConvertsEmptyToPrecision) { using Dense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = typename gko::next_precision_with_half; using OtherDense = typename gko::matrix::Dense; auto empty = OtherDense::create(this->exec); auto res = Dense::create(this->exec); @@ -2028,7 +2029,7 @@ TYPED_TEST(Dense, MovesEmptyToPrecision) { using Dense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; 
- using OtherT = typename gko::next_precision; + using OtherT = typename gko::next_precision_with_half; using OtherDense = typename gko::matrix::Dense; auto empty = OtherDense::create(this->exec); auto res = Dense::create(this->exec); diff --git a/reference/test/matrix/diagonal_kernels.cpp b/reference/test/matrix/diagonal_kernels.cpp index e2ac67190d0..d1208e96178 100644 --- a/reference/test/matrix/diagonal_kernels.cpp +++ b/reference/test/matrix/diagonal_kernels.cpp @@ -30,7 +30,8 @@ class Diagonal : public ::testing::Test { using Csr = gko::matrix::Csr; using Diag = gko::matrix::Diagonal; using Dense = gko::matrix::Dense; - using MixedDense = gko::matrix::Dense>; + using MixedDense = + gko::matrix::Dense>; Diagonal() : exec(gko::ReferenceExecutor::create()), @@ -79,13 +80,14 @@ class Diagonal : public ::testing::Test { std::unique_ptr dense3; }; -TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Diagonal, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Diagonal = typename TestFixture::Diag; using OtherDiagonal = gko::matrix::Diagonal; auto tmp = OtherDiagonal::create(this->exec); @@ -107,7 +109,7 @@ TYPED_TEST(Diagonal, ConvertsToPrecision) TYPED_TEST(Diagonal, MovesToPrecision) { using ValueType = typename TestFixture::value_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Diagonal = typename TestFixture::Diag; using OtherDiagonal = gko::matrix::Diagonal; auto tmp = OtherDiagonal::create(this->exec); @@ -574,7 +576,7 @@ TYPED_TEST(Diagonal, AppliesToComplex) TYPED_TEST(Diagonal, AppliesToMixedComplex) { using mixed_value_type = - gko::next_precision; + gko::next_precision_with_half; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec 
= gko::ReferenceExecutor::create(); @@ -634,7 +636,7 @@ TYPED_TEST(Diagonal, AppliesLinearCombinationToComplex) TYPED_TEST(Diagonal, AppliesLinearCombinationToMixedComplex) { using mixed_value_type = - gko::next_precision; + gko::next_precision_with_half; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; using Scalar = gko::matrix::Dense; diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp index 6214db82d1c..7f3c770c603 100644 --- a/reference/test/matrix/ell_kernels.cpp +++ b/reference/test/matrix/ell_kernels.cpp @@ -30,7 +30,8 @@ class Ell : public ::testing::Test { using Mtx = gko::matrix::Ell; using Csr = gko::matrix::Csr; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = + gko::matrix::Dense>; Ell() : exec(gko::ReferenceExecutor::create()), @@ -72,7 +73,8 @@ class Ell : public ::testing::Test { std::unique_ptr mtx2; }; -TYPED_TEST_SUITE(Ell, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Ell, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Ell, AppliesToDenseVector) @@ -91,7 +93,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec = typename gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); @@ -106,7 +108,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); @@ -122,9 +124,9 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector3) { // 
Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; - using Vec2 = gko::matrix::Dense>; + using Vec2 = gko::matrix::Dense>; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec1::create(this->exec, gko::dim<2>{2, 1}); @@ -160,7 +162,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec = gko::matrix::Dense; // clang-format off auto x = gko::initialize( @@ -184,7 +186,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; // clang-format off @@ -209,7 +211,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; // clang-format off @@ -248,7 +250,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -265,7 +267,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - 
using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -283,7 +285,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -327,7 +329,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -355,7 +357,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -384,7 +386,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = gko::next_precision_with_half; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -443,7 +445,7 @@ TYPED_TEST(Ell, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = 
gko::next_precision_with_half; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto tmp = OtherEll::create(this->exec); @@ -466,7 +468,7 @@ TYPED_TEST(Ell, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto tmp = OtherEll::create(this->exec); @@ -735,7 +737,7 @@ TYPED_TEST(Ell, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto empty = Ell::create(this->exec); @@ -752,7 +754,7 @@ TYPED_TEST(Ell, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto empty = Ell::create(this->exec); @@ -897,7 +899,7 @@ TYPED_TEST(Ell, AppliesToComplex) TYPED_TEST(Ell, AppliesToMixedComplex) { using mixed_value_type = - gko::next_precision; + gko::next_precision_with_half; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -954,7 +956,7 @@ TYPED_TEST(Ell, AdvancedAppliesToComplex) TYPED_TEST(Ell, AdvancedAppliesToMixedComplex) { using mixed_value_type = - gko::next_precision; + gko::next_precision_with_half; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; @@ -992,7 +994,7 @@ class EllComplex : public ::testing::Test { using Mtx = gko::matrix::Ell; }; -TYPED_TEST_SUITE(EllComplex, 
gko::test::ComplexValueIndexTypes, +TYPED_TEST_SUITE(EllComplex, gko::test::ComplexValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp index 665df4ace31..9d9e2144cc3 100644 --- a/reference/test/matrix/fbcsr_kernels.cpp +++ b/reference/test/matrix/fbcsr_kernels.cpp @@ -104,7 +104,8 @@ class Fbcsr : public ::testing::Test { const std::unique_ptr mtxsq; }; -TYPED_TEST_SUITE(Fbcsr, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Fbcsr, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); template @@ -114,7 +115,7 @@ std::unique_ptr> get_some_vectors( { using RT = gko::remove_complex; std::default_random_engine engine(39); - std::normal_distribution dist(0.0, 5.0); + std::normal_distribution<> dist(0.0, 5.0); std::uniform_int_distribution<> nnzdist(1, nrhs); return gko::test::generate_random_matrix>( nrows, nrhs, nnzdist, dist, engine, exec); @@ -271,7 +272,7 @@ TYPED_TEST(Fbcsr, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto tmp = OtherFbcsr::create(this->exec); @@ -294,7 +295,7 @@ TYPED_TEST(Fbcsr, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto tmp = OtherFbcsr::create(this->exec); @@ -392,7 +393,7 @@ TYPED_TEST(Fbcsr, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = 
gko::next_precision_with_half; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto empty = OtherFbcsr::create(this->exec); @@ -411,7 +412,7 @@ TYPED_TEST(Fbcsr, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto empty = OtherFbcsr::create(this->exec); @@ -619,7 +620,7 @@ class FbcsrComplex : public ::testing::Test { using Csr = gko::matrix::Csr; }; -TYPED_TEST_SUITE(FbcsrComplex, gko::test::ComplexValueIndexTypes, +TYPED_TEST_SUITE(FbcsrComplex, gko::test::ComplexValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp index 87fd4c02811..c5e6496dce1 100644 --- a/reference/test/matrix/hybrid_kernels.cpp +++ b/reference/test/matrix/hybrid_kernels.cpp @@ -32,7 +32,8 @@ class Hybrid : public ::testing::Test { using Mtx = gko::matrix::Hybrid; using Vec = gko::matrix::Dense; using Csr = gko::matrix::Csr; - using MixedVec = gko::matrix::Dense>; + using MixedVec = + gko::matrix::Dense>; Hybrid() : exec(gko::ReferenceExecutor::create()), @@ -96,7 +97,8 @@ class Hybrid : public ::testing::Test { std::unique_ptr mtx3; }; -TYPED_TEST_SUITE(Hybrid, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Hybrid, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Hybrid, AppliesToDenseVector) @@ -233,7 +235,7 @@ TYPED_TEST(Hybrid, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto tmp = 
OtherHybrid::create(this->exec); @@ -256,7 +258,7 @@ TYPED_TEST(Hybrid, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto tmp = OtherHybrid::create(this->exec); @@ -367,7 +369,7 @@ TYPED_TEST(Hybrid, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto other = Hybrid::create(this->exec); @@ -384,7 +386,7 @@ TYPED_TEST(Hybrid, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto other = Hybrid::create(this->exec); @@ -699,7 +701,7 @@ TYPED_TEST(Hybrid, AppliesToComplex) TYPED_TEST(Hybrid, AppliesToMixedComplex) { using mixed_value_type = - gko::next_precision; + gko::next_precision_with_half; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -756,7 +758,7 @@ TYPED_TEST(Hybrid, AdvancedAppliesToComplex) TYPED_TEST(Hybrid, AdvancedAppliesToMixedComplex) { using mixed_value_type = - gko::next_precision; + gko::next_precision_with_half; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; @@ -795,7 +797,7 @@ class HybridComplex : public ::testing::Test { using Mtx = gko::matrix::Hybrid; }; -TYPED_TEST_SUITE(HybridComplex, gko::test::ComplexValueIndexTypes, 
+TYPED_TEST_SUITE(HybridComplex, gko::test::ComplexValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/reference/test/matrix/identity.cpp b/reference/test/matrix/identity.cpp index 11953de338a..82704145978 100644 --- a/reference/test/matrix/identity.cpp +++ b/reference/test/matrix/identity.cpp @@ -19,7 +19,8 @@ class Identity : public ::testing::Test { using value_type = T; using Id = gko::matrix::Identity; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = + gko::matrix::Dense>; using ComplexVec = gko::to_complex; using MixedComplexVec = gko::to_complex; @@ -29,7 +30,8 @@ class Identity : public ::testing::Test { }; -TYPED_TEST_SUITE(Identity, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Identity, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Identity, AppliesToVector) diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp index 5418f97353b..b646a6fc67f 100644 --- a/reference/test/matrix/permutation.cpp +++ b/reference/test/matrix/permutation.cpp @@ -51,7 +51,7 @@ class Permutation : public ::testing::Test { std::shared_ptr exec; }; -TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/reference/test/matrix/scaled_permutation.cpp b/reference/test/matrix/scaled_permutation.cpp index 6d8d49f5662..f2b3e66b4cd 100644 --- a/reference/test/matrix/scaled_permutation.cpp +++ b/reference/test/matrix/scaled_permutation.cpp @@ -58,7 +58,7 @@ class ScaledPermutation : public ::testing::Test { std::unique_ptr perm2; }; -TYPED_TEST_SUITE(ScaledPermutation, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(ScaledPermutation, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp index 3208b8c42be..23251c63b8f 100644 --- 
a/reference/test/matrix/sellp_kernels.cpp +++ b/reference/test/matrix/sellp_kernels.cpp @@ -50,7 +50,8 @@ class Sellp : public ::testing::Test { std::unique_ptr mtx2; }; -TYPED_TEST_SUITE(Sellp, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Sellp, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Sellp, AppliesToDenseVector) @@ -67,7 +68,8 @@ TYPED_TEST(Sellp, AppliesToDenseVector) TYPED_TEST(Sellp, AppliesToMixedDenseVector) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Vec = gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); @@ -116,7 +118,8 @@ TYPED_TEST(Sellp, AppliesLinearCombinationToDenseVector) TYPED_TEST(Sellp, AppliesLinearCombinationToMixedDenseVector) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -189,7 +192,7 @@ TYPED_TEST(Sellp, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto tmp = OtherSellp::create(this->exec); @@ -212,7 +215,7 @@ TYPED_TEST(Sellp, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto tmp = OtherSellp::create(this->exec); @@ -310,7 +313,7 @@ TYPED_TEST(Sellp, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename 
TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto empty = OtherSellp::create(this->exec); @@ -329,7 +332,7 @@ TYPED_TEST(Sellp, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = gko::next_precision; + using OtherType = gko::next_precision_with_half; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto empty = OtherSellp::create(this->exec); @@ -751,7 +754,7 @@ class SellpComplex : public ::testing::Test { using Mtx = gko::matrix::Sellp; }; -TYPED_TEST_SUITE(SellpComplex, gko::test::ComplexValueIndexTypes, +TYPED_TEST_SUITE(SellpComplex, gko::test::ComplexValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/reference/test/matrix/sparsity_csr.cpp b/reference/test/matrix/sparsity_csr.cpp index d8ed6147e30..8db0dee144f 100644 --- a/reference/test/matrix/sparsity_csr.cpp +++ b/reference/test/matrix/sparsity_csr.cpp @@ -47,7 +47,7 @@ class SparsityCsr : public ::testing::Test { std::unique_ptr mtx; }; -TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/reference/test/matrix/sparsity_csr_kernels.cpp b/reference/test/matrix/sparsity_csr_kernels.cpp index f08d6c352ca..30805d033ab 100644 --- a/reference/test/matrix/sparsity_csr_kernels.cpp +++ b/reference/test/matrix/sparsity_csr_kernels.cpp @@ -125,7 +125,7 @@ class SparsityCsr : public ::testing::Test { std::unique_ptr mtx3_unsorted; }; -TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(SparsityCsr, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); @@ -145,7 +145,7 @@ TYPED_TEST(SparsityCsr, AppliesToDenseVector) TYPED_TEST(SparsityCsr, AppliesToMixedDenseVector) { - using T = 
gko::next_precision; + using T = gko::next_precision_with_half; using Vec = gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); @@ -192,7 +192,7 @@ TYPED_TEST(SparsityCsr, AppliesLinearCombinationToDenseVector) TYPED_TEST(SparsityCsr, AppliesLinearCombinationToMixedDenseVector) { - using T = gko::next_precision; + using T = gko::next_precision_with_half; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -243,8 +243,8 @@ TYPED_TEST(SparsityCsr, AppliesToComplex) TYPED_TEST(SparsityCsr, AppliesToMixedComplex) { - using T = - gko::next_precision>; + using T = gko::next_precision_with_half< + gko::to_complex>; using Vec = gko::matrix::Dense; auto x = gko::initialize({T{2.0, 4.0}, T{1.0, 2.0}, T{4.0, 8.0}}, this->exec); @@ -279,7 +279,7 @@ TYPED_TEST(SparsityCsr, AppliesLinearCombinationToComplex) TYPED_TEST(SparsityCsr, AppliesLinearCombinationToMixedComplex) { using Vec = gko::matrix::Dense< - gko::next_precision>; + gko::next_precision_with_half>; using ComplexVec = gko::to_complex; using T = typename ComplexVec::value_type; auto alpha = gko::initialize({-1.0}, this->exec); diff --git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp index 4ff8e1fc36a..5e3d4b1a112 100644 --- a/test/matrix/fbcsr_kernels.cpp +++ b/test/matrix/fbcsr_kernels.cpp @@ -48,18 +48,23 @@ class Fbcsr : public CommonTestFixture { void generate_sin(gko::ptr_param x) { value_type* const xarr = x->get_values(); + // we do not have sin for half, so we compute sin in double or + // complex + using working_type = std::conditional_t(), + std::complex, double>; for (index_type i = 0; i < x->get_size()[0] * x->get_size()[1]; i++) { - xarr[i] = - static_cast(2.0) * - std::sin(static_cast(i / 2.0) + get_random_value()); + xarr[i] = static_cast( + 2.0 * std::sin(i / 2.0 + + static_cast(get_random_value()))); } } }; #ifdef 
GKO_COMPILING_HIP -TYPED_TEST_SUITE(Fbcsr, gko::test::RealValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Fbcsr, gko::test::RealValueTypesWithHalf, + TypenameNameGenerator); #else -TYPED_TEST_SUITE(Fbcsr, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Fbcsr, gko::test::ValueTypesWithHalf, TypenameNameGenerator); #endif TYPED_TEST(Fbcsr, CanWriteFromMatrixOnDevice) @@ -124,6 +129,8 @@ TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted) using Dense = typename TestFixture::Dense; using value_type = typename Mtx::value_type; if (this->exec->get_master() != this->exec) { + // FBCSR on accelerator does not have half precision apply through + // vendor libraries. SKIP_IF_HALF(value_type); } auto drand = gko::clone(this->exec, this->rsorted); @@ -149,6 +156,8 @@ TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted) using Dense = typename TestFixture::Dense; using value_type = typename Mtx::value_type; if (this->exec->get_master() != this->exec) { + // FBCSR on accelerator does not have half precision apply through + // vendor libraries. SKIP_IF_HALF(value_type); } auto drand = gko::clone(this->exec, this->rsorted); @@ -175,6 +184,8 @@ TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted) using value_type = typename TestFixture::value_type; using real_type = typename TestFixture::real_type; if (this->exec->get_master() != this->exec) { + // FBCSR on accelerator does not have half precision apply through + // vendor libraries. SKIP_IF_HALF(value_type); } auto drand = gko::clone(this->exec, this->rsorted); @@ -208,6 +219,8 @@ TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted) using value_type = typename TestFixture::value_type; using real_type = typename TestFixture::real_type; if (this->exec->get_master() != this->exec) { + // FBCSR on accelerator does not have half precision apply through + // vendor libraries. 
SKIP_IF_HALF(value_type); } auto drand = gko::clone(this->exec, this->rsorted); From 2b09f038a83b38c4f21c42da3cf41bb61450320f Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Fri, 25 Oct 2024 16:29:56 +0200 Subject: [PATCH 12/69] base such as composition/combination with half and corr. test --- core/base/block_operator.cpp | 8 ++++++-- core/base/combination.cpp | 2 +- core/base/composition.cpp | 2 +- core/base/dense_cache.cpp | 2 +- core/base/perturbation.cpp | 2 +- core/test/base/combination.cpp | 3 ++- core/test/base/composition.cpp | 3 ++- core/test/base/dense_cache.cpp | 3 ++- reference/test/base/composition.cpp | 13 ++++++++----- reference/test/base/perturbation.cpp | 13 ++++++++----- 10 files changed, 32 insertions(+), 19 deletions(-) diff --git a/core/base/block_operator.cpp b/core/base/block_operator.cpp index f53375301a8..68c00aeee70 100644 --- a/core/base/block_operator.cpp +++ b/core/base/block_operator.cpp @@ -19,8 +19,12 @@ namespace { template auto dispatch_dense(Fn&& fn, LinOp* v) { - return run, - std::complex>(v, std::forward(fn)); + return run, +#endif + std::complex, std::complex>(v, + std::forward(fn)); } diff --git a/core/base/combination.cpp b/core/base/combination.cpp index 3b30b77d38c..53af6742f6e 100644 --- a/core/base/combination.cpp +++ b/core/base/combination.cpp @@ -168,7 +168,7 @@ void Combination::apply_impl(const LinOp* alpha, const LinOp* b, #define GKO_DECLARE_COMBINATION(_type) class Combination<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_COMBINATION); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMBINATION); } // namespace gko diff --git a/core/base/composition.cpp b/core/base/composition.cpp index 82c8152300b..f6a7df21e45 100644 --- a/core/base/composition.cpp +++ b/core/base/composition.cpp @@ -222,7 +222,7 @@ void Composition::apply_impl(const LinOp* alpha, const LinOp* b, #define GKO_DECLARE_COMPOSITION(_type) class Composition<_type> 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_COMPOSITION); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMPOSITION); } // namespace gko diff --git a/core/base/dense_cache.cpp b/core/base/dense_cache.cpp index 38a0decfa46..096ad1f761a 100644 --- a/core/base/dense_cache.cpp +++ b/core/base/dense_cache.cpp @@ -33,7 +33,7 @@ void DenseCache::init_from( #define GKO_DECLARE_DENSE_CACHE(_type) struct DenseCache<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CACHE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_DENSE_CACHE); } // namespace detail diff --git a/core/base/perturbation.cpp b/core/base/perturbation.cpp index 87501361c05..b17cba209e1 100644 --- a/core/base/perturbation.cpp +++ b/core/base/perturbation.cpp @@ -182,7 +182,7 @@ void Perturbation::apply_impl(const LinOp* alpha, const LinOp* b, #define GKO_DECLARE_PERTURBATION(_type) class Perturbation<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_PERTURBATION); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_PERTURBATION); } // namespace gko diff --git a/core/test/base/combination.cpp b/core/test/base/combination.cpp index 73c30ffe11c..63c73cfa168 100644 --- a/core/test/base/combination.cpp +++ b/core/test/base/combination.cpp @@ -43,7 +43,8 @@ class Combination : public ::testing::Test { std::vector> coefficients; }; -TYPED_TEST_SUITE(Combination, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Combination, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Combination, CanBeEmpty) diff --git a/core/test/base/composition.cpp b/core/test/base/composition.cpp index 122755b8f92..58c86894fc8 100644 --- a/core/test/base/composition.cpp +++ b/core/test/base/composition.cpp @@ -41,7 +41,8 @@ class Composition : public ::testing::Test { std::vector> operators; }; -TYPED_TEST_SUITE(Composition, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Composition, gko::test::ValueTypesWithHalf, + 
TypenameNameGenerator); TYPED_TEST(Composition, CanBeEmpty) diff --git a/core/test/base/dense_cache.cpp b/core/test/base/dense_cache.cpp index 526187610a4..54d904617db 100644 --- a/core/test/base/dense_cache.cpp +++ b/core/test/base/dense_cache.cpp @@ -31,7 +31,8 @@ class DenseCache : public ::testing::Test { }; -TYPED_TEST_SUITE(DenseCache, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(DenseCache, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(DenseCache, CanDefaultConstruct) diff --git a/reference/test/base/composition.cpp b/reference/test/base/composition.cpp index f736edb53f9..d17b8602ce8 100644 --- a/reference/test/base/composition.cpp +++ b/reference/test/base/composition.cpp @@ -75,7 +75,8 @@ class Composition : public ::testing::Test { std::shared_ptr product; }; -TYPED_TEST_SUITE(Composition, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Composition, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Composition, CopiesOnSameExecutor) @@ -142,7 +143,7 @@ TYPED_TEST(Composition, AppliesSingleToMixedVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using Mtx = gko::matrix::Dense>; + using Mtx = gko::matrix::Dense>; using value_type = typename Mtx::value_type; auto cmp = gko::Composition::create(this->product); auto x = gko::initialize({1.0, 2.0}, this->exec); @@ -182,7 +183,8 @@ TYPED_TEST(Composition, AppliesSingleToMixedComplexVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using value_type = gko::next_precision>; + using value_type = + gko::next_precision_with_half>; using Mtx = gko::matrix::Dense; auto cmp = gko::Composition::create(this->product); auto x = gko::initialize( @@ -222,7 +224,7 @@ TYPED_TEST(Composition, AppliesSingleLinearCombinationToMixedVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using value_type = gko::next_precision; + using value_type = gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto cmp = gko::Composition::create(this->product); auto alpha = gko::initialize({3.0}, 
this->exec); @@ -267,7 +269,8 @@ TYPED_TEST(Composition, AppliesSingleLinearCombinationToMixedComplexVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using MixedDense = gko::matrix::Dense>; + using MixedDense = + gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using value_type = typename MixedDenseComplex::value_type; auto cmp = gko::Composition::create(this->product); diff --git a/reference/test/base/perturbation.cpp b/reference/test/base/perturbation.cpp index b6be9ab1563..50a5fe7db20 100644 --- a/reference/test/base/perturbation.cpp +++ b/reference/test/base/perturbation.cpp @@ -33,7 +33,8 @@ class Perturbation : public ::testing::Test { std::shared_ptr scalar; }; -TYPED_TEST_SUITE(Perturbation, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Perturbation, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Perturbation, CopiesOnSameExecutor) @@ -101,7 +102,7 @@ TYPED_TEST(Perturbation, AppliesToMixedVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using Mtx = gko::matrix::Dense>; + using Mtx = gko::matrix::Dense>; using value_type = typename Mtx::value_type; auto cmp = gko::Perturbation::create(this->scalar, this->basis, this->projector); @@ -143,7 +144,8 @@ TYPED_TEST(Perturbation, AppliesToMixedComplexVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using value_type = gko::to_complex>; + using value_type = + gko::to_complex>; using Mtx = gko::matrix::Dense; auto cmp = gko::Perturbation::create(this->scalar, this->basis, this->projector); @@ -185,7 +187,7 @@ TYPED_TEST(Perturbation, AppliesLinearCombinationToMixedVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using value_type = gko::next_precision; + using value_type = gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto cmp = gko::Perturbation::create(this->scalar, this->basis, this->projector); @@ -232,7 +234,8 @@ TYPED_TEST(Perturbation, AppliesLinearCombinationToMixedComplexVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using MixedDense = 
gko::matrix::Dense>; + using MixedDense = + gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using value_type = typename MixedDenseComplex::value_type; auto cmp = gko::Perturbation::create(this->scalar, this->basis, From a0d4a042bb5cab76fcbcdc5eb29ad519565c7cf6 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 4 Nov 2024 15:15:17 +0100 Subject: [PATCH 13/69] test_utils test --- core/test/utils/array_generator_test.cpp | 18 +++++---- core/test/utils/matrix_generator.hpp | 18 +++++++-- core/test/utils/matrix_generator_test.cpp | 49 ++++++++++++++--------- core/test/utils/matrix_utils_test.cpp | 11 ++--- core/test/utils/unsort_matrix_test.cpp | 2 +- core/test/utils/value_generator_test.cpp | 16 +++++--- reference/test/utils/assertions_test.cpp | 3 +- 7 files changed, 73 insertions(+), 44 deletions(-) diff --git a/core/test/utils/array_generator_test.cpp b/core/test/utils/array_generator_test.cpp index ae66e4686da..ca96761ea4e 100644 --- a/core/test/utils/array_generator_test.cpp +++ b/core/test/utils/array_generator_test.cpp @@ -18,11 +18,12 @@ template class ArrayGenerator : public ::testing::Test { protected: using value_type = T; + using check_type = double; ArrayGenerator() : exec(gko::ReferenceExecutor::create()) { array = gko::test::generate_random_array( - 500, std::normal_distribution>(20.0, 5.0), + 500, std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec); } @@ -30,15 +31,17 @@ class ArrayGenerator : public ::testing::Test { gko::array array; template - ValueType get_nth_moment(int n, ValueType c, InputIterator sample_start, - InputIterator sample_end, Closure closure_op) + check_type get_nth_moment(int n, ValueType c, InputIterator sample_start, + InputIterator sample_end, Closure closure_op) { using std::pow; - ValueType res = 0; - ValueType num_elems = 0; + check_type res = 0; + check_type num_elems = 0; while (sample_start != sample_end) { auto tmp = *(sample_start++); - res += pow(closure_op(tmp) - c, 
n); + res += pow(static_cast(closure_op(tmp)) - + static_cast(c), + n); num_elems += 1; } return res / num_elems; @@ -62,7 +65,8 @@ class ArrayGenerator : public ::testing::Test { } }; -TYPED_TEST_SUITE(ArrayGenerator, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(ArrayGenerator, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(ArrayGenerator, OutputHasCorrectSize) diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp index 56ff38c520d..01ee40cdadc 100644 --- a/core/test/utils/matrix_generator.hpp +++ b/core/test/utils/matrix_generator.hpp @@ -659,10 +659,20 @@ gko::matrix_data generate_tridiag_inverse_matrix_data( auto off_diag = i < j ? upper : lower; auto min_idx = std::min(i, j); auto max_idx = std::max(i, j); - auto val = sign * - static_cast( - std::pow(off_diag, max_idx - min_idx)) * - alpha[min_idx] * beta[max_idx + 1] / alpha.back(); + // NVHPC 23.3 with O3 gives wrong result with std::pow on + // complex. We use the float variant to help it, also for + // half. 
+ using pow_type = std::conditional_t< + std::is_same, + gko::half>::value, + std::conditional_t(), + std::complex, float>, + ValueType>; + auto val = + sign * + static_cast(std::pow( + static_cast(off_diag), max_idx - min_idx)) * + alpha[min_idx] * beta[max_idx + 1] / alpha.back(); md.nonzeros.emplace_back(i, j, val); } } diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp index 43756bc1709..61710540e24 100644 --- a/core/test/utils/matrix_generator_test.cpp +++ b/core/test/utils/matrix_generator_test.cpp @@ -20,31 +20,32 @@ template class MatrixGenerator : public ::testing::Test { protected: using value_type = T; + using check_type = double; using real_type = gko::remove_complex; using mtx_type = gko::matrix::Dense; MatrixGenerator() : exec(gko::ReferenceExecutor::create()), mtx(gko::test::generate_random_matrix( - 500, 100, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 500, 100, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), dense_mtx(gko::test::generate_random_dense_matrix( - 500, 100, std::normal_distribution(20.0, 5.0), + 500, 100, std::normal_distribution<>(20.0, 5.0), std::default_random_engine(41), exec)), l_mtx(gko::test::generate_random_lower_triangular_matrix( - 4, true, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 4, true, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), u_mtx(gko::test::generate_random_upper_triangular_matrix( - 4, true, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 4, true, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), lower_bandwidth(2), upper_bandwidth(3), band_mtx(gko::test::generate_random_band_matrix( 100, lower_bandwidth, upper_bandwidth, - std::normal_distribution(20.0, 5.0), + 
std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), nnz_per_row_sample(500, 0), values_sample(0), @@ -96,15 +97,17 @@ class MatrixGenerator : public ::testing::Test { template - ValueType get_nth_moment(int n, ValueType c, InputIterator sample_start, - InputIterator sample_end, Closure closure_op) + check_type get_nth_moment(int n, ValueType c, InputIterator sample_start, + InputIterator sample_end, Closure closure_op) { using std::pow; - ValueType res = 0; - ValueType num_elems = 0; + check_type res = 0; + check_type num_elems = 0; while (sample_start != sample_end) { auto tmp = *(sample_start++); - res += pow(closure_op(tmp) - c, n); + res += pow(static_cast(closure_op(tmp)) - + static_cast(c), + n); num_elems += 1; } return res / num_elems; @@ -128,7 +131,8 @@ class MatrixGenerator : public ::testing::Test { } }; -TYPED_TEST_SUITE(MatrixGenerator, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(MatrixGenerator, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(MatrixGenerator, OutputHasCorrectSize) @@ -247,7 +251,7 @@ TYPED_TEST(MatrixGenerator, CanGenerateTridiagMatrix) { using T = typename TestFixture::value_type; using Dense = typename TestFixture::mtx_type; - auto dist = std::normal_distribution>(0, 1); + auto dist = std::normal_distribution<>(0, 1); auto engine = std::default_random_engine(42); auto lower = gko::test::detail::get_rand_value(dist, engine); auto diag = gko::test::detail::get_rand_value(dist, engine); @@ -271,18 +275,23 @@ TYPED_TEST(MatrixGenerator, CanGenerateTridiagInverseMatrix) { using T = typename TestFixture::value_type; using Dense = typename TestFixture::mtx_type; - auto dist = std::normal_distribution>(0, 1); + auto dist = std::normal_distribution<>(0, 1); auto engine = std::default_random_engine(42); auto lower = gko::test::detail::get_rand_value(dist, engine); auto upper = gko::test::detail::get_rand_value(dist, engine); // make diagonally dominant - auto diag = 
std::abs(gko::test::detail::get_rand_value(dist, engine)) + - std::abs(lower) + std::abs(upper); + auto diag = gko::abs(gko::test::detail::get_rand_value(dist, engine)) + + gko::abs(lower) + gko::abs(upper); + gko::size_type size = 50; + if (std::is_same_v, gko::half>) { + // half precision can only handle the inverse of small matrix. + size = 5; + } auto mtx = gko::test::generate_tridiag_matrix( - 50, {lower, diag, upper}, this->exec); + size, {lower, diag, upper}, this->exec); auto inv_mtx = gko::test::generate_tridiag_inverse_matrix( - 50, {lower, diag, upper}, this->exec); + size, {lower, diag, upper}, this->exec); auto result = Dense::create(this->exec, mtx->get_size()); inv_mtx->apply(mtx, result); diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp index 3c67571e1b2..f742d4561a2 100644 --- a/core/test/utils/matrix_utils_test.cpp +++ b/core/test/utils/matrix_utils_test.cpp @@ -30,8 +30,8 @@ class MatrixUtils : public ::testing::Test { MatrixUtils() : exec(gko::ReferenceExecutor::create()), data(gko::test::generate_random_matrix_data( - 500, 500, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 500, 500, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42))), rectangular_data(gko::dim<2>(500, 100)) {} @@ -41,7 +41,8 @@ class MatrixUtils : public ::testing::Test { mtx_data rectangular_data; }; -TYPED_TEST_SUITE(MatrixUtils, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(MatrixUtils, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(MatrixUtils, MakeSymmetricThrowsError) @@ -241,7 +242,7 @@ TYPED_TEST(MatrixUtils, MakeHpdMatrixCorrectly) TYPED_TEST(MatrixUtils, MakeHpdMatrixWithRatioCorrectly) { using T = typename TestFixture::value_type; - gko::remove_complex ratio = 1.00001; + gko::remove_complex ratio = 1.01; auto cpy_data = this->data; gko::utils::make_hpd(this->data, ratio); @@ -276,7 +277,7 @@ 
TYPED_TEST(MatrixUtils, MakeSpdMatrixCorrectly) TYPED_TEST(MatrixUtils, MakeSpdMatrixWithRatioCorrectly) { using T = typename TestFixture::value_type; - gko::remove_complex ratio = 1.00001; + gko::remove_complex ratio = 1.01; auto cpy_data = this->data; gko::utils::make_spd(this->data, ratio); diff --git a/core/test/utils/unsort_matrix_test.cpp b/core/test/utils/unsort_matrix_test.cpp index 5d2f88f982a..40ec65b08db 100644 --- a/core/test/utils/unsort_matrix_test.cpp +++ b/core/test/utils/unsort_matrix_test.cpp @@ -119,7 +119,7 @@ class UnsortMatrix : public ::testing::Test { std::unique_ptr coo_empty; }; -TYPED_TEST_SUITE(UnsortMatrix, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(UnsortMatrix, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/core/test/utils/value_generator_test.cpp b/core/test/utils/value_generator_test.cpp index 633565a66ef..57473c41b6e 100644 --- a/core/test/utils/value_generator_test.cpp +++ b/core/test/utils/value_generator_test.cpp @@ -20,19 +20,22 @@ template class ValueGenerator : public ::testing::Test { protected: using value_type = T; + using check_type = double; ValueGenerator() {} template - ValueType get_nth_moment(int n, ValueType c, InputIterator sample_start, - InputIterator sample_end, Closure closure_op) + check_type get_nth_moment(int n, ValueType c, InputIterator sample_start, + InputIterator sample_end, Closure closure_op) { using std::pow; - ValueType res = 0; - ValueType num_elems = 0; + check_type res = 0; + check_type num_elems = 0; while (sample_start != sample_end) { auto tmp = *(sample_start++); - res += pow(closure_op(tmp) - c, n); + res += pow(static_cast(closure_op(tmp)) - + static_cast(c), + n); num_elems += 1; } return res / num_elems; @@ -56,7 +59,8 @@ class ValueGenerator : public ::testing::Test { } }; -TYPED_TEST_SUITE(ValueGenerator, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(ValueGenerator, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); 
TYPED_TEST(ValueGenerator, OutputHasCorrectAverageAndDeviation) diff --git a/reference/test/utils/assertions_test.cpp b/reference/test/utils/assertions_test.cpp index 98f1ec68e0d..9c6b544172e 100644 --- a/reference/test/utils/assertions_test.cpp +++ b/reference/test/utils/assertions_test.cpp @@ -17,7 +17,8 @@ namespace { template class MatricesNear : public ::testing::Test {}; -TYPED_TEST_SUITE(MatricesNear, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(MatricesNear, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(MatricesNear, CanPassAnyMatrixType) From c366df516045a59387ad11f427b447d8e89fc12f Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 21 Nov 2024 11:14:50 +0100 Subject: [PATCH 14/69] constexpr restriction for nvc++ --- accessor/reference_helper.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/accessor/reference_helper.hpp b/accessor/reference_helper.hpp index a3a77352f8f..61e15bf8b22 100644 --- a/accessor/reference_helper.hpp +++ b/accessor/reference_helper.hpp @@ -12,10 +12,8 @@ #include "utils.hpp" -// CUDA TOOLKIT < 11 does not support constexpr in combination with -// thrust::complex, which is why constexpr is only present in later versions -#if defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && \ - (__CUDACC_VER_MAJOR__ < 11) +// NVC++ disallow a constexpr function has a nonliteral return type like half +#if defined(__NVCOMPILER) && GINKGO_ENABLE_HALF #define GKO_ACC_ENABLE_REFERENCE_CONSTEXPR @@ -23,7 +21,7 @@ #define GKO_ACC_ENABLE_REFERENCE_CONSTEXPR constexpr -#endif // __CUDA_ARCH__ && __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__ < 11 +#endif namespace gko { From 8d28cc797fbbba6f65c635fdd7d8b800446a9316 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 25 Nov 2024 15:03:07 +0100 Subject: [PATCH 15/69] cuda with CC<70 and hip do not support 16 bit atomic. 
throw error or fallback to a working version if it is the case for matrix --- common/cuda_hip/components/atomic.hpp | 48 -------- common/cuda_hip/matrix/coo_kernels.cpp | 114 ++++++++++-------- .../cuda_hip/matrix/csr_kernels.template.cpp | 97 ++++++++------- common/cuda_hip/matrix/ell_kernels.cpp | 93 ++++++++------ hip/components/cooperative_groups.hip.hpp | 12 +- 5 files changed, 182 insertions(+), 182 deletions(-) diff --git a/common/cuda_hip/components/atomic.hpp b/common/cuda_hip/components/atomic.hpp index 954bc7476ed..cd59485dac9 100644 --- a/common/cuda_hip/components/atomic.hpp +++ b/common/cuda_hip/components/atomic.hpp @@ -96,52 +96,6 @@ __forceinline__ __device__ ResultType reinterpret(ValueType val) } \ }; - -#define GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(CONVERTER_TYPE) \ - template \ - struct atomic_helper< \ - ValueType, \ - std::enable_if_t<(sizeof(ValueType) == sizeof(CONVERTER_TYPE))>> { \ - __forceinline__ __device__ static ValueType atomic_add( \ - ValueType* __restrict__ addr, ValueType val) \ - { \ - assert(false); \ - using c_type = CONVERTER_TYPE; \ - return atomic_wrapper( \ - addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \ - old = *c_addr; \ - *c_addr = reinterpret( \ - val + reinterpret(assumed)); \ - }); \ - } \ - __forceinline__ __device__ static ValueType atomic_max( \ - ValueType* __restrict__ addr, ValueType val) \ - { \ - assert(false); \ - using c_type = CONVERTER_TYPE; \ - return atomic_wrapper( \ - addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \ - if (reinterpret(assumed) < val) { \ - old = *c_addr; \ - *c_addr = reinterpret(assumed); \ - } \ - }); \ - } \ - \ - private: \ - template \ - __forceinline__ __device__ static ValueType atomic_wrapper( \ - ValueType* __restrict__ addr, Callable set_old) \ - { \ - CONVERTER_TYPE* address_as_converter = \ - reinterpret_cast(addr); \ - CONVERTER_TYPE old = *address_as_converter; \ - CONVERTER_TYPE assumed = old; \ - set_old(old, assumed, address_as_converter); 
\ - return reinterpret(old); \ - } \ - }; - // Support 64-bit ATOMIC_ADD and ATOMIC_MAX GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int); // Support 32-bit ATOMIC_ADD and ATOMIC_MAX @@ -152,8 +106,6 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); // Support 16-bit atomicCAS, atomicADD, and atomicMAX only on CUDA with CC // >= 7.0 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int); -#else -GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(unsigned short int) #endif diff --git a/common/cuda_hip/matrix/coo_kernels.cpp b/common/cuda_hip/matrix/coo_kernels.cpp index 4609f9f7f95..88d6dced504 100644 --- a/common/cuda_hip/matrix/coo_kernels.cpp +++ b/common/cuda_hip/matrix/coo_kernels.cpp @@ -268,30 +268,38 @@ void spmv2(std::shared_ptr exec, const dim3 coo_block(config::warp_size, warps_in_block, 1); const auto nwarps = host_kernel::calculate_nwarps(exec, nnz); - if (nwarps > 0 && b_ncols > 0) { - // TODO: b_ncols needs to be tuned for ROCm. - if (b_ncols < 4) { - const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); - int num_lines = ceildiv(nnz, nwarps * config::warp_size); - - abstract_spmv<<get_stream()>>>( - nnz, num_lines, as_device_type(a->get_const_values()), - a->get_const_col_idxs(), - as_device_type(a->get_const_row_idxs()), - as_device_type(b->get_const_values()), b->get_stride(), - as_device_type(c->get_values()), c->get_stride()); - } else { - int num_elems = - ceildiv(nnz, nwarps * config::warp_size) * config::warp_size; - const dim3 coo_grid(ceildiv(nwarps, warps_in_block), - ceildiv(b_ncols, config::warp_size)); - - abstract_spmm<<get_stream()>>>( - nnz, num_elems, as_device_type(a->get_const_values()), - a->get_const_col_idxs(), - as_device_type(a->get_const_row_idxs()), b_ncols, - as_device_type(b->get_const_values()), b->get_stride(), - as_device_type(c->get_values()), c->get_stride()); +// not support 16 bit atomic +#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700)) + if constexpr (std::is_same_v, gko::half>) { + GKO_NOT_SUPPORTED(c); + 
} else +#endif + { + if (nwarps > 0 && b_ncols > 0) { + // TODO: b_ncols needs to be tuned for ROCm. + if (b_ncols < 4) { + const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); + int num_lines = ceildiv(nnz, nwarps * config::warp_size); + + abstract_spmv<<get_stream()>>>( + nnz, num_lines, as_device_type(a->get_const_values()), + a->get_const_col_idxs(), + as_device_type(a->get_const_row_idxs()), + as_device_type(b->get_const_values()), b->get_stride(), + as_device_type(c->get_values()), c->get_stride()); + } else { + int num_elems = ceildiv(nnz, nwarps * config::warp_size) * + config::warp_size; + const dim3 coo_grid(ceildiv(nwarps, warps_in_block), + ceildiv(b_ncols, config::warp_size)); + + abstract_spmm<<get_stream()>>>( + nnz, num_elems, as_device_type(a->get_const_values()), + a->get_const_col_idxs(), + as_device_type(a->get_const_row_idxs()), b_ncols, + as_device_type(b->get_const_values()), b->get_stride(), + as_device_type(c->get_values()), c->get_stride()); + } } } } @@ -312,30 +320,40 @@ void advanced_spmv2(std::shared_ptr exec, const dim3 coo_block(config::warp_size, warps_in_block, 1); const auto b_ncols = b->get_size()[1]; - if (nwarps > 0 && b_ncols > 0) { - // TODO: b_ncols needs to be tuned for ROCm. 
- if (b_ncols < 4) { - int num_lines = ceildiv(nnz, nwarps * config::warp_size); - const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); - - abstract_spmv<<get_stream()>>>( - nnz, num_lines, as_device_type(alpha->get_const_values()), - as_device_type(a->get_const_values()), a->get_const_col_idxs(), - as_device_type(a->get_const_row_idxs()), - as_device_type(b->get_const_values()), b->get_stride(), - as_device_type(c->get_values()), c->get_stride()); - } else { - int num_elems = - ceildiv(nnz, nwarps * config::warp_size) * config::warp_size; - const dim3 coo_grid(ceildiv(nwarps, warps_in_block), - ceildiv(b_ncols, config::warp_size)); - - abstract_spmm<<get_stream()>>>( - nnz, num_elems, as_device_type(alpha->get_const_values()), - as_device_type(a->get_const_values()), a->get_const_col_idxs(), - as_device_type(a->get_const_row_idxs()), b_ncols, - as_device_type(b->get_const_values()), b->get_stride(), - as_device_type(c->get_values()), c->get_stride()); + // not support 16 bit atomic +#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700)) + if constexpr (std::is_same_v, gko::half>) { + GKO_NOT_SUPPORTED(c); + } else +#endif + { + if (nwarps > 0 && b_ncols > 0) { + // TODO: b_ncols needs to be tuned for ROCm. 
+ if (b_ncols < 4) { + int num_lines = ceildiv(nnz, nwarps * config::warp_size); + const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); + + abstract_spmv<<get_stream()>>>( + nnz, num_lines, as_device_type(alpha->get_const_values()), + as_device_type(a->get_const_values()), + a->get_const_col_idxs(), + as_device_type(a->get_const_row_idxs()), + as_device_type(b->get_const_values()), b->get_stride(), + as_device_type(c->get_values()), c->get_stride()); + } else { + int num_elems = ceildiv(nnz, nwarps * config::warp_size) * + config::warp_size; + const dim3 coo_grid(ceildiv(nwarps, warps_in_block), + ceildiv(b_ncols, config::warp_size)); + + abstract_spmm<<get_stream()>>>( + nnz, num_elems, as_device_type(alpha->get_const_values()), + as_device_type(a->get_const_values()), + a->get_const_col_idxs(), + as_device_type(a->get_const_row_idxs()), b_ncols, + as_device_type(b->get_const_values()), b->get_stride(), + as_device_type(c->get_values()), c->get_stride()); + } } } } diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp index f808e234670..bd2423d4306 100644 --- a/common/cuda_hip/matrix/csr_kernels.template.cpp +++ b/common/cuda_hip/matrix/csr_kernels.template.cpp @@ -2064,7 +2064,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); template -void load_balance_spmv(std::shared_ptr exec, +bool load_balance_spmv(std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c, @@ -2074,42 +2074,54 @@ void load_balance_spmv(std::shared_ptr exec, using arithmetic_type = highest_precision; - if (beta) { - dense::scale(exec, beta, c); - } else { - dense::fill(exec, c, zero()); - } - const IndexType nwarps = a->get_num_srow_elements(); - if (nwarps > 0) { - const dim3 csr_block(config::warp_size, warps_in_block, 1); - const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]); - const auto a_vals = - acc::helper::build_const_rrm_accessor(a); - const 
auto b_vals = - acc::helper::build_const_rrm_accessor(b); - auto c_vals = acc::helper::build_rrm_accessor(c); - if (alpha) { - if (csr_grid.x > 0 && csr_grid.y > 0) { - kernel::abstract_spmv<<get_stream()>>>( - nwarps, static_cast(a->get_size()[0]), - as_device_type(alpha->get_const_values()), - acc::as_device_range(a_vals), a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - as_device_type(a->get_const_srow()), - acc::as_device_range(b_vals), acc::as_device_range(c_vals)); - } + // not support 16 bit atomic +#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700)) + if constexpr (std::is_same_v, half>) { + return false; + } else +#endif + { + if (beta) { + dense::scale(exec, beta, c); } else { - if (csr_grid.x > 0 && csr_grid.y > 0) { - kernel::abstract_spmv<<get_stream()>>>( - nwarps, static_cast(a->get_size()[0]), - acc::as_device_range(a_vals), a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - as_device_type(a->get_const_srow()), - acc::as_device_range(b_vals), acc::as_device_range(c_vals)); + dense::fill(exec, c, zero()); + } + const IndexType nwarps = a->get_num_srow_elements(); + if (nwarps > 0) { + const dim3 csr_block(config::warp_size, warps_in_block, 1); + const dim3 csr_grid(ceildiv(nwarps, warps_in_block), + b->get_size()[1]); + const auto a_vals = + acc::helper::build_const_rrm_accessor(a); + const auto b_vals = + acc::helper::build_const_rrm_accessor(b); + auto c_vals = acc::helper::build_rrm_accessor(c); + if (alpha) { + if (csr_grid.x > 0 && csr_grid.y > 0) { + kernel::abstract_spmv<<get_stream()>>>( + nwarps, static_cast(a->get_size()[0]), + as_device_type(alpha->get_const_values()), + acc::as_device_range(a_vals), a->get_const_col_idxs(), + as_device_type(a->get_const_row_ptrs()), + as_device_type(a->get_const_srow()), + acc::as_device_range(b_vals), + acc::as_device_range(c_vals)); + } + } else { + if (csr_grid.x > 0 && csr_grid.y > 0) { + kernel::abstract_spmv<<get_stream()>>>( + nwarps, 
static_cast(a->get_size()[0]), + acc::as_device_range(a_vals), a->get_const_col_idxs(), + as_device_type(a->get_const_row_ptrs()), + as_device_type(a->get_const_srow()), + acc::as_device_range(b_vals), + acc::as_device_range(c_vals)); + } } } + return true; } } @@ -2257,8 +2269,6 @@ void spmv(std::shared_ptr exec, { if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do - } else if (a->get_strategy()->get_name() == "load_balance") { - host_kernel::load_balance_spmv(exec, a, b, c); } else if (a->get_strategy()->get_name() == "merge_path") { using arithmetic_type = highest_precision; @@ -2273,8 +2283,10 @@ void spmv(std::shared_ptr exec, syn::value_list(), syn::type_list<>(), exec, a, b, c); } else { bool use_classical = true; - if (a->get_strategy()->get_name() == "sparselib" || - a->get_strategy()->get_name() == "cusparse") { + if (a->get_strategy()->get_name() == "load_balance") { + use_classical = !host_kernel::load_balance_spmv(exec, a, b, c); + } else if (a->get_strategy()->get_name() == "sparselib" || + a->get_strategy()->get_name() == "cusparse") { use_classical = !host_kernel::try_sparselib_spmv(exec, a, b, c); } if (use_classical) { @@ -2316,8 +2328,6 @@ void advanced_spmv(std::shared_ptr exec, { if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do - } else if (a->get_strategy()->get_name() == "load_balance") { - host_kernel::load_balance_spmv(exec, a, b, c, alpha, beta); } else if (a->get_strategy()->get_name() == "merge_path") { using arithmetic_type = highest_precision; @@ -2333,8 +2343,11 @@ void advanced_spmv(std::shared_ptr exec, beta); } else { bool use_classical = true; - if (a->get_strategy()->get_name() == "sparselib" || - a->get_strategy()->get_name() == "cusparse") { + if (a->get_strategy()->get_name() == "load_balance") { + use_classical = + !host_kernel::load_balance_spmv(exec, a, b, c, alpha, beta); + } else if (a->get_strategy()->get_name() == "sparselib" || + 
a->get_strategy()->get_name() == "cusparse") { use_classical = !host_kernel::try_sparselib_spmv(exec, a, b, c, alpha, beta); } diff --git a/common/cuda_hip/matrix/ell_kernels.cpp b/common/cuda_hip/matrix/ell_kernels.cpp index 16371166662..23079092162 100644 --- a/common/cuda_hip/matrix/ell_kernels.cpp +++ b/common/cuda_hip/matrix/ell_kernels.cpp @@ -91,7 +91,7 @@ __device__ void spmv_kernel( using arithmetic_type = typename a_accessor::arithmetic_type; const auto tidx = thread::get_thread_id_flat(); const decltype(tidx) column_id = blockIdx.y; - if (num_thread_per_worker == 1) { + if constexpr (num_thread_per_worker == 1) { // Specialize the num_thread_per_worker = 1. It doesn't need the shared // memory, __syncthreads, and atomic_add if (tidx < num_rows) { @@ -137,7 +137,7 @@ __device__ void spmv_kernel( __syncthreads(); if (idx_in_worker == 0) { const auto c_ind = x * c_stride + column_id; - if (atomic) { + if constexpr (atomic) { atomic_add(&(c[c_ind]), op(storage[threadIdx.x], c[c_ind])); } else { c[c_ind] = op(storage[threadIdx.x], c[c_ind]); @@ -179,7 +179,7 @@ __global__ __launch_bounds__(default_block_size) void spmv( using arithmetic_type = typename a_accessor::arithmetic_type; const auto alpha_val = alpha(0); const OutputValueType beta_val = beta[0]; - if (atomic) { + if constexpr (atomic) { // Because the atomic operation changes the values of c during // computation, it can not directly do alpha * a * b + beta * c // operation. The beta * c needs to be done before calling this kernel. 
@@ -240,42 +240,59 @@ void abstract_spmv(syn::value_list, const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x), b->get_size()[1], 1); - const auto a_vals = acc::range( - std::array{{static_cast( - num_stored_elements_per_row * stride)}}, - a->get_const_values()); - const auto b_vals = acc::range( - std::array{ - {static_cast(b->get_size()[0]), - static_cast(b->get_size()[1])}}, - b->get_const_values(), - std::array{ - {static_cast(b->get_stride())}}); - - if (alpha == nullptr && beta == nullptr) { - if (grid_size.x > 0 && grid_size.y > 0) { - kernel::spmv - <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_device_range(a_vals), - a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_device_range(b_vals), - as_device_type(c->get_values()), c->get_stride()); - } - } else if (alpha != nullptr && beta != nullptr) { - const auto alpha_val = acc::range( - std::array{1}, alpha->get_const_values()); - if (grid_size.x > 0 && grid_size.y > 0) { - kernel::spmv - <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_device_range(alpha_val), - acc::as_device_range(a_vals), a->get_const_col_idxs(), - stride, num_stored_elements_per_row, - acc::as_device_range(b_vals), - as_device_type(beta->get_const_values()), - as_device_type(c->get_values()), c->get_stride()); - } - } else { +// not support 16 bit atomic +#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700)) + // We do atomic on shared memory when num_thread_per_worker is not 1. + // If atomic is also true, we also do atomic on out_vector. 
+ constexpr bool shared_half = + std::is_same_v, half>; + constexpr bool atomic_half_out = + atomic && std::is_same_v, half>; + if constexpr (num_thread_per_worker != 1 && + (shared_half || atomic_half_out)) { GKO_KERNEL_NOT_FOUND; + } else +#endif + { + const auto a_vals = acc::range( + std::array{{static_cast( + num_stored_elements_per_row * stride)}}, + a->get_const_values()); + const auto b_vals = acc::range( + std::array{ + {static_cast(b->get_size()[0]), + static_cast(b->get_size()[1])}}, + b->get_const_values(), + std::array{ + {static_cast(b->get_stride())}}); + + if (alpha == nullptr && beta == nullptr) { + if (grid_size.x > 0 && grid_size.y > 0) { + kernel::spmv + <<get_stream()>>>( + nrows, num_worker_per_row, acc::as_device_range(a_vals), + a->get_const_col_idxs(), stride, + num_stored_elements_per_row, + acc::as_device_range(b_vals), + as_device_type(c->get_values()), c->get_stride()); + } + } else if (alpha != nullptr && beta != nullptr) { + const auto alpha_val = acc::range( + std::array{1}, alpha->get_const_values()); + if (grid_size.x > 0 && grid_size.y > 0) { + kernel::spmv + <<get_stream()>>>( + nrows, num_worker_per_row, + acc::as_device_range(alpha_val), + acc::as_device_range(a_vals), a->get_const_col_idxs(), + stride, num_stored_elements_per_row, + acc::as_device_range(b_vals), + as_device_type(beta->get_const_values()), + as_device_type(c->get_values()), c->get_stride()); + } + } else { + GKO_KERNEL_NOT_FOUND; + } } } diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp index 46c2fb195bc..36618bb7f3e 100644 --- a/hip/components/cooperative_groups.hip.hpp +++ b/hip/components/cooperative_groups.hip.hpp @@ -306,7 +306,7 @@ class enable_extended_shuffle : public Group { SelectorType selector) const \ { \ return shuffle_impl( \ - [this](uint16 v, SelectorType s) { \ + [this](uint32 v, SelectorType s) { \ return static_cast(this)->_name(v, s); \ }, \ var, selector); \ @@ -326,12 +326,12 @@ class 
enable_extended_shuffle : public Group { shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var, SelectorType selector) { - static_assert(sizeof(ValueType) % sizeof(uint16) == 0, - "Unable to shuffle sizes which are not 2-byte multiples"); - constexpr auto value_size = sizeof(ValueType) / sizeof(uint16); + static_assert(sizeof(ValueType) % sizeof(uint32) == 0, + "Unable to shuffle sizes which are not 4-byte multiples"); + constexpr auto value_size = sizeof(ValueType) / sizeof(uint32); ValueType result; - auto var_array = reinterpret_cast(&var); - auto result_array = reinterpret_cast(&result); + auto var_array = reinterpret_cast(&var); + auto result_array = reinterpret_cast(&result); #pragma unroll for (std::size_t i = 0; i < value_size; ++i) { result_array[i] = intrinsic_shuffle(var_array[i], selector); From 70a8f260f8299cbe51d9cf965b66cfe8fa858654 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 26 Nov 2024 11:13:59 +0100 Subject: [PATCH 16/69] implement half shuffle via 32 bit impl --- hip/components/cooperative_groups.hip.hpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp index 36618bb7f3e..dce69421a31 100644 --- a/hip/components/cooperative_groups.hip.hpp +++ b/hip/components/cooperative_groups.hip.hpp @@ -319,6 +319,27 @@ class enable_extended_shuffle : public Group { #undef GKO_ENABLE_SHUFFLE_OPERATION +// hip does not support 16bit shuffle directly +#define GKO_ENABLE_SHUFFLE_OPERATION_HALF(_name, SelectorType) \ + __device__ __forceinline__ __half _name(const __half& var, \ + SelectorType selector) const \ + { \ + uint32 u; \ + memcpy(&u, &var, sizeof(__half)); \ + u = static_cast(this)->_name(u, selector); \ + __half result; \ + memcpy(&result, &u, sizeof(__half)); \ + return result; \ + } + + GKO_ENABLE_SHUFFLE_OPERATION_HALF(shfl, int32) + GKO_ENABLE_SHUFFLE_OPERATION_HALF(shfl_up, uint32) + 
GKO_ENABLE_SHUFFLE_OPERATION_HALF(shfl_down, uint32) + GKO_ENABLE_SHUFFLE_OPERATION_HALF(shfl_xor, int32) + +#undef GKO_ENABLE_SHUFFLE_OPERATION_HALF + + private: template From c31f6ec5ba78ead885ece0da680974474b4a92c9 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Fri, 25 Oct 2024 16:27:44 +0200 Subject: [PATCH 17/69] config --- core/config/config_helper.hpp | 4 ++- core/config/dispatch.hpp | 8 +++++ core/config/parse_macro.hpp | 50 ++++++++++++++++---------- core/config/type_descriptor.cpp | 2 +- core/config/type_descriptor_helper.hpp | 3 ++ 5 files changed, 47 insertions(+), 20 deletions(-) diff --git a/core/config/config_helper.hpp b/core/config/config_helper.hpp index 483366765aa..b1fa7bd69b5 100644 --- a/core/config/config_helper.hpp +++ b/core/config/config_helper.hpp @@ -202,7 +202,9 @@ get_value(const pnode& config) * This is specialization for floating point type */ template -inline std::enable_if_t::value, ValueType> +inline std::enable_if_t::value || + std::is_same::value, + ValueType> get_value(const pnode& config) { auto val = config.get_real(); diff --git a/core/config/dispatch.hpp b/core/config/dispatch.hpp index 0138665aac2..1c6d0eb12cd 100644 --- a/core/config/dispatch.hpp +++ b/core/config/dispatch.hpp @@ -105,6 +105,14 @@ deferred_factory_parameter dispatch( using value_type_list = syn::type_list, std::complex>; +#if GINKGO_ENABLE_HALF +using value_type_list_with_half = + syn::type_list, + std::complex, std::complex>; +#else +using value_type_list_with_half = value_type_list; +#endif // GINKGO_ENABLE_HALF + using index_type_list = syn::type_list; } // namespace config diff --git a/core/config/parse_macro.hpp b/core/config/parse_macro.hpp index 800b42f9493..e3734e5db7a 100644 --- a/core/config/parse_macro.hpp +++ b/core/config/parse_macro.hpp @@ -16,27 +16,33 @@ // for value_type only -#define GKO_PARSE_VALUE_TYPE(_type, _configurator) \ - template <> \ - deferred_factory_parameter \ - parse( \ - const gko::config::pnode& config, \ - 
const gko::config::registry& context, \ - const gko::config::type_descriptor& td) \ - { \ - auto updated = gko::config::update_type(config, td); \ - return gko::config::dispatch( \ - config, context, updated, \ - gko::config::make_type_selector(updated.get_value_typestr(), \ - gko::config::value_type_list())); \ - } \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ +#define GKO_PARSE_VALUE_TYPE_(_type, _configurator, _value_type_list) \ + template <> \ + deferred_factory_parameter \ + parse( \ + const gko::config::pnode& config, \ + const gko::config::registry& context, \ + const gko::config::type_descriptor& td) \ + { \ + auto updated = gko::config::update_type(config, td); \ + return gko::config::dispatch( \ + config, context, updated, \ + gko::config::make_type_selector(updated.get_value_typestr(), \ + _value_type_list)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ "semi-colon warnings") +#define GKO_PARSE_VALUE_TYPE(_type, _configurator) \ + GKO_PARSE_VALUE_TYPE_(_type, _configurator, gko::config::value_type_list()) +#define GKO_PARSE_VALUE_TYPE_WITH_HALF(_type, _configurator) \ + GKO_PARSE_VALUE_TYPE_(_type, _configurator, \ + gko::config::value_type_list_with_half()) // for value_type and index_type -#define GKO_PARSE_VALUE_AND_INDEX_TYPE(_type, _configurator) \ +#define GKO_PARSE_VALUE_AND_INDEX_TYPE_(_type, _configurator, \ + _value_type_list) \ template <> \ deferred_factory_parameter \ parse( \ @@ -48,7 +54,7 @@ return gko::config::dispatch( \ config, context, updated, \ gko::config::make_type_selector(updated.get_value_typestr(), \ - gko::config::value_type_list()), \ + _value_type_list), \ gko::config::make_type_selector(updated.get_index_typestr(), \ gko::config::index_type_list())); \ } \ @@ -56,5 +62,13 @@ "This assert is used to counter the false positive extra " \ "semi-colon warnings") +#define GKO_PARSE_VALUE_AND_INDEX_TYPE(_type, _configurator) \ + 
GKO_PARSE_VALUE_AND_INDEX_TYPE_(_type, _configurator, \ + gko::config::value_type_list()) + +#define GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(_type, _configurator) \ + GKO_PARSE_VALUE_AND_INDEX_TYPE_(_type, _configurator, \ + gko::config::value_type_list_with_half()) + #endif // GKO_CORE_CONFIG_PARSE_MACRO_HPP_ diff --git a/core/config/type_descriptor.cpp b/core/config/type_descriptor.cpp index fe11b785d6f..ef4cdc692f9 100644 --- a/core/config/type_descriptor.cpp +++ b/core/config/type_descriptor.cpp @@ -50,7 +50,7 @@ type_descriptor make_type_descriptor() GlobalIndexType) \ type_descriptor \ make_type_descriptor() -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_WITH_HALF( GKO_DECLARE_MAKE_TYPE_DESCRIPTOR); #define GKO_DECLARE_MAKE_VOID_TYPE_DESCRIPTOR(LocalIndexType, GlobalIndexType) \ diff --git a/core/config/type_descriptor_helper.hpp b/core/config/type_descriptor_helper.hpp index 0edc4376f1a..63a953e3a1e 100644 --- a/core/config/type_descriptor_helper.hpp +++ b/core/config/type_descriptor_helper.hpp @@ -8,6 +8,7 @@ #include +#include #include #include #include @@ -38,8 +39,10 @@ struct type_string {}; TYPE_STRING_OVERLOAD(void, "void"); TYPE_STRING_OVERLOAD(double, "float64"); TYPE_STRING_OVERLOAD(float, "float32"); +TYPE_STRING_OVERLOAD(half, "float16"); TYPE_STRING_OVERLOAD(std::complex, "complex"); TYPE_STRING_OVERLOAD(std::complex, "complex"); +TYPE_STRING_OVERLOAD(std::complex, "complex"); TYPE_STRING_OVERLOAD(int32, "int32"); TYPE_STRING_OVERLOAD(int64, "int64"); From c74aa24cd129032a06f703c59723cff1d6e7a84b Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Fri, 25 Oct 2024 14:47:14 +0200 Subject: [PATCH 18/69] residual with half --- .../cuda_hip/stop/residual_norm_kernels.cpp | 5 ++-- core/device_hooks/common_kernels.inc.cpp | 4 +-- core/stop/residual_norm.cpp | 5 ++-- dpcpp/stop/residual_norm_kernels.dp.cpp | 7 +++-- omp/stop/residual_norm_kernels.cpp | 5 ++-- reference/stop/residual_norm_kernels.cpp | 5 ++-- reference/test/stop/residual_norm_kernels.cpp | 21 ++++++++----- test/stop/residual_norm_kernels.cpp | 30 ++++++++++++------- 8 files changed, 51 insertions(+), 31 deletions(-) diff --git a/common/cuda_hip/stop/residual_norm_kernels.cpp b/common/cuda_hip/stop/residual_norm_kernels.cpp index 9d6db5211e8..23ca8e5d5f1 100644 --- a/common/cuda_hip/stop/residual_norm_kernels.cpp +++ b/common/cuda_hip/stop/residual_norm_kernels.cpp @@ -91,7 +91,7 @@ void residual_norm(std::shared_ptr exec, *one_changed = get_element(*device_storage, 1); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF( GKO_DECLARE_RESIDUAL_NORM_KERNEL); @@ -171,7 +171,8 @@ void implicit_residual_norm( *one_changed = get_element(*device_storage, 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL); } // namespace implicit_residual_norm diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 439cda481a2..c41f9e921cb 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -1136,7 +1136,7 @@ GKO_STUB(GKO_DECLARE_SET_ALL_STATUSES_KERNEL); namespace residual_norm { -GKO_STUB_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_KERNEL); +GKO_STUB_NON_COMPLEX_VALUE_TYPE_WITH_HALF(GKO_DECLARE_RESIDUAL_NORM_KERNEL); } // namespace residual_norm @@ -1145,7 +1145,7 @@ GKO_STUB_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_KERNEL); namespace implicit_residual_norm { 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL); } // namespace implicit_residual_norm diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp index c962784033a..5f75efcec82 100644 --- a/core/stop/residual_norm.cpp +++ b/core/stop/residual_norm.cpp @@ -227,12 +227,13 @@ bool ImplicitResidualNorm::check_impl( #define GKO_DECLARE_RESIDUAL_NORM(_type) class ResidualNormBase<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_RESIDUAL_NORM); #define GKO_DECLARE_IMPLICIT_RESIDUAL_NORM(_type) \ class ImplicitResidualNorm<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_IMPLICIT_RESIDUAL_NORM); } // namespace stop diff --git a/dpcpp/stop/residual_norm_kernels.dp.cpp b/dpcpp/stop/residual_norm_kernels.dp.cpp index ddb617a1a84..23d62e83729 100644 --- a/dpcpp/stop/residual_norm_kernels.dp.cpp +++ b/dpcpp/stop/residual_norm_kernels.dp.cpp @@ -69,7 +69,7 @@ void residual_norm(std::shared_ptr exec, *one_changed = get_element(*device_storage, 1); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF( GKO_DECLARE_RESIDUAL_NORM_KERNEL); @@ -108,7 +108,7 @@ void implicit_residual_norm( cgh.parallel_for( sycl::range<1>{tau->get_size()[1]}, [=](sycl::id<1> idx_id) { const auto tidx = idx_id[0]; - if (std::sqrt(std::abs(tau_val[tidx])) <= + if (gko::sqrt(gko::abs(tau_val[tidx])) <= rel_residual_goal * orig_tau_val[tidx]) { stop_status_val[tidx].converge(stoppingId, setFinalized); device_storage_val[1] = true; @@ -126,7 +126,8 @@ void implicit_residual_norm( *one_changed = get_element(*device_storage, 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + 
GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL); } // namespace implicit_residual_norm diff --git a/omp/stop/residual_norm_kernels.cpp b/omp/stop/residual_norm_kernels.cpp index 0ec4395a16b..ff259477d03 100644 --- a/omp/stop/residual_norm_kernels.cpp +++ b/omp/stop/residual_norm_kernels.cpp @@ -53,7 +53,7 @@ void residual_norm(std::shared_ptr exec, *all_converged = local_all_converged; } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF( GKO_DECLARE_RESIDUAL_NORM_KERNEL); @@ -98,7 +98,8 @@ void implicit_residual_norm( *all_converged = local_all_converged; } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL); } // namespace implicit_residual_norm diff --git a/reference/stop/residual_norm_kernels.cpp b/reference/stop/residual_norm_kernels.cpp index ba2672edc28..ed91ff390b6 100644 --- a/reference/stop/residual_norm_kernels.cpp +++ b/reference/stop/residual_norm_kernels.cpp @@ -50,7 +50,7 @@ void residual_norm(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF( GKO_DECLARE_RESIDUAL_NORM_KERNEL); @@ -90,7 +90,8 @@ void implicit_residual_norm( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL); } // namespace implicit_residual_norm diff --git a/reference/test/stop/residual_norm_kernels.cpp b/reference/test/stop/residual_norm_kernels.cpp index 43b865796b7..aed801afacf 100644 --- a/reference/test/stop/residual_norm_kernels.cpp +++ b/reference/test/stop/residual_norm_kernels.cpp @@ -45,7 +45,8 @@ class ResidualNorm : public ::testing::Test { std::shared_ptr exec_; }; -TYPED_TEST_SUITE(ResidualNorm, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(ResidualNorm, 
gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(ResidualNorm, CanCreateFactory) @@ -85,7 +86,8 @@ TYPED_TEST(ResidualNorm, CheckIfResZeroConverges) for (auto baseline : {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) { gko::remove_complex factor = - (baseline == mode::absolute) ? 0.0 : r::value; + (baseline == mode::absolute) ? gko::zero>() + : r::value; auto criterion = gko::stop::ResidualNorm::build() .with_reduction_factor(factor) .with_baseline(baseline) @@ -399,7 +401,9 @@ TYPED_TEST(ResidualNorm, SelfCalculatesAndWaitsTillResidualGoal) ASSERT_FALSE(abs_criterion->update().solution(solution).check( RelativeStoppingId, true, &stop_status, &one_changed)); - solution->at(0) = rhs_val - r::value * T{1.2}; + // TODO FIXME: NVHPC calculates different result of rhs - r*1.2 from + // rhs - tmp = rhs - (r * 1.2). https://godbolt.org/z/GrGE9PE67 + solution->at(0) = rhs_val - r::value * T{1.4}; ASSERT_FALSE(abs_criterion->update().solution(solution).check( RelativeStoppingId, true, &stop_status, &one_changed)); ASSERT_EQ(stop_status.get_data()[0].has_converged(), false); @@ -526,7 +530,7 @@ class ResidualNormWithInitialResnorm : public ::testing::Test { std::shared_ptr exec_; }; -TYPED_TEST_SUITE(ResidualNormWithInitialResnorm, gko::test::ValueTypes, +TYPED_TEST_SUITE(ResidualNormWithInitialResnorm, gko::test::ValueTypesWithHalf, TypenameNameGenerator); @@ -667,7 +671,7 @@ class ResidualNormWithRhsNorm : public ::testing::Test { std::shared_ptr exec_; }; -TYPED_TEST_SUITE(ResidualNormWithRhsNorm, gko::test::ValueTypes, +TYPED_TEST_SUITE(ResidualNormWithRhsNorm, gko::test::ValueTypesWithHalf, TypenameNameGenerator); @@ -804,7 +808,7 @@ class ImplicitResidualNorm : public ::testing::Test { std::shared_ptr exec_; }; -TYPED_TEST_SUITE(ImplicitResidualNorm, gko::test::ValueTypes, +TYPED_TEST_SUITE(ImplicitResidualNorm, gko::test::ValueTypesWithHalf, TypenameNameGenerator); @@ -836,7 +840,8 @@ TYPED_TEST(ImplicitResidualNorm, 
CheckIfResZeroConverges) for (auto baseline : {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) { gko::remove_complex factor = - (baseline == mode::absolute) ? 0.0 : r::value; + (baseline == mode::absolute) ? gko::zero>() + : r::value; auto criterion = gko::stop::ImplicitResidualNorm::build() .with_reduction_factor(factor) .with_baseline(baseline) @@ -979,7 +984,7 @@ class ResidualNormWithAbsolute : public ::testing::Test { std::shared_ptr exec_; }; -TYPED_TEST_SUITE(ResidualNormWithAbsolute, gko::test::ValueTypes, +TYPED_TEST_SUITE(ResidualNormWithAbsolute, gko::test::ValueTypesWithHalf, TypenameNameGenerator); diff --git a/test/stop/residual_norm_kernels.cpp b/test/stop/residual_norm_kernels.cpp index a0a144bcf3b..62f656bed59 100644 --- a/test/stop/residual_norm_kernels.cpp +++ b/test/stop/residual_norm_kernels.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include "core/test/utils.hpp" @@ -57,7 +58,8 @@ class ResidualNorm : public CommonTestFixture { std::unique_ptr::Factory> abs_factory; }; -TYPED_TEST_SUITE(ResidualNorm, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(ResidualNorm, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(ResidualNorm, CanIgorneResidualNorm) @@ -81,13 +83,16 @@ TYPED_TEST(ResidualNorm, CanIgorneResidualNorm) gko::NotSupported); } + TYPED_TEST(ResidualNorm, CheckIfResZeroConverges) { using Mtx = typename TestFixture::Mtx; using NormVector = typename TestFixture::NormVector; using T = typename TestFixture::ValueType; + // use csr to use half apply + using Csr = gko::matrix::Csr; using mode = gko::stop::mode; - std::shared_ptr mtx = gko::initialize({1.0}, this->exec); + std::shared_ptr mtx = gko::initialize({1.0}, this->exec); std::shared_ptr rhs = gko::initialize({0.0}, this->exec); std::shared_ptr x = gko::initialize({0.0}, this->exec); std::shared_ptr res_norm = @@ -96,7 +101,8 @@ TYPED_TEST(ResidualNorm, CheckIfResZeroConverges) for (auto baseline : {mode::rhs_norm, 
mode::initial_resnorm, mode::absolute}) { gko::remove_complex factor = - (baseline == mode::absolute) ? 0.0 : r::value; + (baseline == mode::absolute) ? gko::zero>() + : r::value; auto criterion = gko::stop::ResidualNorm::build() .with_reduction_factor(factor) .with_baseline(baseline) @@ -116,6 +122,7 @@ TYPED_TEST(ResidualNorm, CheckIfResZeroConverges) } } + TYPED_TEST(ResidualNorm, WaitsTillResidualGoal) { using Mtx = typename TestFixture::Mtx; @@ -338,7 +345,7 @@ class ResidualNormWithInitialResnorm : public CommonTestFixture { std::unique_ptr::Factory> factory; }; -TYPED_TEST_SUITE(ResidualNormWithInitialResnorm, gko::test::ValueTypes, +TYPED_TEST_SUITE(ResidualNormWithInitialResnorm, gko::test::ValueTypesWithHalf, TypenameNameGenerator); @@ -435,7 +442,7 @@ class ResidualNormWithRhsNorm : public CommonTestFixture { std::unique_ptr::Factory> factory; }; -TYPED_TEST_SUITE(ResidualNormWithRhsNorm, gko::test::ValueTypes, +TYPED_TEST_SUITE(ResidualNormWithRhsNorm, gko::test::ValueTypesWithHalf, TypenameNameGenerator); @@ -540,16 +547,18 @@ class ImplicitResidualNorm : public CommonTestFixture { factory; }; -TYPED_TEST_SUITE(ImplicitResidualNorm, gko::test::ValueTypes, +TYPED_TEST_SUITE(ImplicitResidualNorm, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(ImplicitResidualNorm, CheckIfResZeroConverges) { - using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::ValueType; + using Mtx = typename TestFixture::Mtx; + // use csr to use half apply + using Csr = gko::matrix::Csr; using gko::stop::mode; - std::shared_ptr mtx = gko::initialize({1.0}, this->exec); + std::shared_ptr mtx = gko::initialize({1.0}, this->exec); std::shared_ptr rhs = gko::initialize({0.0}, this->exec); std::shared_ptr x = gko::initialize({0.0}, this->exec); std::shared_ptr implicit_sq_res_norm = @@ -558,7 +567,8 @@ TYPED_TEST(ImplicitResidualNorm, CheckIfResZeroConverges) for (auto baseline : {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) { 
gko::remove_complex factor = - (baseline == mode::absolute) ? 0.0 : r::value; + (baseline == mode::absolute) ? gko::zero>() + : r::value; auto criterion = gko::stop::ImplicitResidualNorm::build() .with_reduction_factor(factor) .with_baseline(baseline) @@ -683,7 +693,7 @@ class ResidualNormWithAbsolute : public CommonTestFixture { std::unique_ptr::Factory> factory; }; -TYPED_TEST_SUITE(ResidualNormWithAbsolute, gko::test::ValueTypes, +TYPED_TEST_SUITE(ResidualNormWithAbsolute, gko::test::ValueTypesWithHalf, TypenameNameGenerator); From 1ecaabf8bcb08858df77039c9de6c5103af08c93 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 12 Nov 2024 16:56:55 +0100 Subject: [PATCH 19/69] residual norm default reduction_factor respect to precision Co-authored-by: Marcel Koch --- include/ginkgo/core/stop/residual_norm.hpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/include/ginkgo/core/stop/residual_norm.hpp b/include/ginkgo/core/stop/residual_norm.hpp index 7ee020207d4..c7f240950fa 100644 --- a/include/ginkgo/core/stop/residual_norm.hpp +++ b/include/ginkgo/core/stop/residual_norm.hpp @@ -6,10 +6,12 @@ #define GKO_PUBLIC_CORE_STOP_RESIDUAL_NORM_HPP_ +#include #include #include #include +#include #include #include #include @@ -120,7 +122,8 @@ class ResidualNorm : public ResidualNormBase { * Residual norm reduction factor */ remove_complex GKO_FACTORY_PARAMETER_SCALAR( - reduction_factor, static_cast>(1e-15)); + reduction_factor, + 5 * std::numeric_limits>::epsilon()); /** * The quantity the reduction is relative to. Choices include @@ -176,7 +179,8 @@ class ImplicitResidualNorm : public ResidualNormBase { * Implicit Residual norm goal */ remove_complex GKO_FACTORY_PARAMETER_SCALAR( - reduction_factor, static_cast>(1e-15)); + reduction_factor, + 5 * std::numeric_limits>::epsilon()); /** * The quantity the reduction is relative to. 
Choices include @@ -251,7 +255,8 @@ class GKO_DEPRECATED( * Factor by which the residual norm will be reduced */ remove_complex GKO_FACTORY_PARAMETER_SCALAR( - reduction_factor, static_cast>(1e-15)); + reduction_factor, + 5 * std::numeric_limits>::epsilon()); }; GKO_ENABLE_CRITERION_FACTORY(ResidualNormReduction, parameters, Factory); @@ -307,7 +312,8 @@ class GKO_DEPRECATED( * Relative residual norm goal */ remove_complex GKO_FACTORY_PARAMETER_SCALAR( - tolerance, static_cast>(1e-15)); + tolerance, + 5 * std::numeric_limits>::epsilon()); }; GKO_ENABLE_CRITERION_FACTORY(RelativeResidualNorm, parameters, Factory); @@ -360,7 +366,8 @@ class GKO_DEPRECATED( * Absolute residual norm goal */ remove_complex GKO_FACTORY_PARAMETER_SCALAR( - tolerance, static_cast>(1e-15)); + tolerance, + 5 * std::numeric_limits>::epsilon()); }; GKO_ENABLE_CRITERION_FACTORY(AbsoluteResidualNorm, parameters, Factory); From 475c3b81ce6ae9ff845480eefa7e97cab4c84d86 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Fri, 25 Oct 2024 16:57:19 +0200 Subject: [PATCH 20/69] residual config dispatch --- core/config/stop_config.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/core/config/stop_config.cpp b/core/config/stop_config.cpp index 4623eb768fc..2696b471a21 100644 --- a/core/config/stop_config.cpp +++ b/core/config/stop_config.cpp @@ -87,7 +87,8 @@ deferred_factory_parameter configure_residual( auto updated = update_type(config, td); return dispatch( config, context, updated, - make_type_selector(updated.get_value_typestr(), value_type_list())); + make_type_selector(updated.get_value_typestr(), + value_type_list_with_half())); } @@ -119,7 +120,8 @@ deferred_factory_parameter configure_implicit_residual( auto updated = update_type(config, td); return dispatch( config, context, updated, - make_type_selector(updated.get_value_typestr(), value_type_list())); + make_type_selector(updated.get_value_typestr(), + value_type_list_with_half())); } From 
88bab82a6b6b51915ae501ac438566dc46f69daf Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Fri, 25 Oct 2024 16:44:50 +0200 Subject: [PATCH 21/69] krylov solver --- common/cuda_hip/solver/idr_kernels.cpp | 20 ++++-- common/unified/solver/bicg_kernels.cpp | 7 +- common/unified/solver/bicgstab_kernels.cpp | 15 +++-- common/unified/solver/cg_kernels.cpp | 6 +- common/unified/solver/cgs_kernels.cpp | 9 +-- .../unified/solver/common_gmres_kernels.cpp | 7 +- common/unified/solver/fcg_kernels.cpp | 7 +- common/unified/solver/gcr_kernels.cpp | 7 +- common/unified/solver/gmres_kernels.cpp | 8 ++- core/device_hooks/common_kernels.inc.cpp | 64 +++++++++---------- core/solver/bicg.cpp | 4 +- core/solver/bicgstab.cpp | 4 +- core/solver/cg.cpp | 4 +- core/solver/cgs.cpp | 4 +- core/solver/fcg.cpp | 4 +- core/solver/gcr.cpp | 4 +- core/solver/gmres.cpp | 4 +- core/solver/idr.cpp | 16 ++++- core/solver/ir.cpp | 4 +- core/test/solver/bicg.cpp | 2 +- core/test/solver/bicgstab.cpp | 3 +- core/test/solver/cg.cpp | 2 +- core/test/solver/cgs.cpp | 2 +- core/test/solver/fcg.cpp | 2 +- core/test/solver/gcr.cpp | 23 +++---- core/test/solver/gmres.cpp | 23 +++---- core/test/solver/idr.cpp | 2 +- core/test/solver/ir.cpp | 2 +- cuda/base/curand_bindings.hpp | 13 ++++ dpcpp/solver/cb_gmres_kernels.dp.cpp | 10 +-- dpcpp/solver/common_gmres_kernels.dp.inc | 10 +-- dpcpp/solver/idr_kernels.dp.cpp | 35 ++++++---- hip/base/hiprand_bindings.hip.hpp | 13 ++++ omp/solver/idr_kernels.cpp | 24 ++++--- reference/solver/bicg_kernels.cpp | 7 +- reference/solver/bicgstab_kernels.cpp | 15 +++-- reference/solver/cg_kernels.cpp | 6 +- reference/solver/cgs_kernels.cpp | 9 +-- reference/solver/common_gmres_kernels.cpp | 7 +- reference/solver/fcg_kernels.cpp | 7 +- reference/solver/gcr_kernels.cpp | 7 +- reference/solver/gmres_kernels.cpp | 8 ++- reference/solver/idr_kernels.cpp | 24 ++++--- reference/test/solver/bicg_kernels.cpp | 22 +++++-- reference/test/solver/bicgstab_kernels.cpp | 40 ++++++++---- 
reference/test/solver/cg_kernels.cpp | 26 ++++++-- reference/test/solver/cgs_kernels.cpp | 35 +++++++--- reference/test/solver/fcg_kernels.cpp | 26 ++++++-- reference/test/solver/gcr_kernels.cpp | 38 ++++++++--- reference/test/solver/gmres_kernels.cpp | 30 +++++++-- reference/test/solver/idr_kernels.cpp | 28 ++++++-- reference/test/solver/ir_kernels.cpp | 11 ++-- test/solver/cb_gmres_kernels.cpp | 2 +- 53 files changed, 455 insertions(+), 257 deletions(-) diff --git a/common/cuda_hip/solver/idr_kernels.cpp b/common/cuda_hip/solver/idr_kernels.cpp index a0f605134eb..0dc310ebd2e 100644 --- a/common/cuda_hip/solver/idr_kernels.cpp +++ b/common/cuda_hip/solver/idr_kernels.cpp @@ -344,9 +344,13 @@ __global__ __launch_bounds__(config::warp_size) void compute_omega_kernel( if (!stop_status[global_id].has_stopped()) { auto thr = omega[global_id]; + const auto normt = sqrt(real(tht[global_id])); + if (normt == zero>()) { + omega[global_id] = zero(); + return; + } omega[global_id] /= tht[global_id]; - auto absrho = - abs(thr / (sqrt(real(tht[global_id])) * residual_norm[global_id])); + auto absrho = abs(thr / (normt * residual_norm[global_id])); if (absrho < kappa) { omega[global_id] *= kappa / absrho; @@ -555,7 +559,8 @@ void initialize(std::shared_ptr exec, orthonormalize_subspace_vectors(exec, subspace_vectors); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_IDR_INITIALIZE_KERNEL); template @@ -582,7 +587,7 @@ void step_1(std::shared_ptr exec, const size_type nrhs, stop_status->get_const_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL); template @@ -609,7 +614,7 @@ void step_2(std::shared_ptr exec, const size_type nrhs, stop_status->get_const_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL); 
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL); template @@ -626,7 +631,7 @@ void step_3(std::shared_ptr exec, const size_type nrhs, update_x_r_and_f(exec, nrhs, k, m, g, u, f, residual, x, stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL); template @@ -644,7 +649,8 @@ void compute_omega( as_device_type(omega->get_values()), stop_status->get_const_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); } // namespace idr diff --git a/common/unified/solver/bicg_kernels.cpp b/common/unified/solver/bicg_kernels.cpp index 7d15718c05d..4c6fe8cdc98 100644 --- a/common/unified/solver/bicg_kernels.cpp +++ b/common/unified/solver/bicg_kernels.cpp @@ -64,7 +64,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_BICG_INITIALIZE_KERNEL); template @@ -90,7 +91,7 @@ void step_1(std::shared_ptr exec, row_vector(prev_rho), *stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_1_KERNEL); template @@ -119,7 +120,7 @@ void step_2(std::shared_ptr exec, default_stride(q2), row_vector(beta), row_vector(rho), *stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_2_KERNEL); } // namespace bicg diff --git a/common/unified/solver/bicgstab_kernels.cpp b/common/unified/solver/bicgstab_kernels.cpp index c403da3bf96..ad5b1ed3302 100644 --- a/common/unified/solver/bicgstab_kernels.cpp +++ b/common/unified/solver/bicgstab_kernels.cpp @@ -69,7 +69,8 @@ void initialize(std::shared_ptr exec, } } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL); template @@ -98,7 +99,8 @@ void step_1(std::shared_ptr exec, row_vector(alpha), row_vector(omega), *stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_BICGSTAB_STEP_1_KERNEL); template @@ -127,7 +129,8 @@ void step_2(std::shared_ptr exec, *stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_BICGSTAB_STEP_2_KERNEL); template @@ -159,7 +162,8 @@ void step_3( row_vector(omega), *stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_BICGSTAB_STEP_3_KERNEL); template @@ -188,7 +192,8 @@ void finalize(std::shared_ptr exec, x->get_size()[1], *stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL); } // namespace bicgstab diff --git a/common/unified/solver/cg_kernels.cpp b/common/unified/solver/cg_kernels.cpp index 822dddf1c3b..e77f01de748 100644 --- a/common/unified/solver/cg_kernels.cpp +++ b/common/unified/solver/cg_kernels.cpp @@ -57,7 +57,7 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_INITIALIZE_KERNEL); template @@ -80,7 +80,7 @@ void step_1(std::shared_ptr exec, row_vector(rho), row_vector(prev_rho), *stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_1_KERNEL); template @@ -106,7 +106,7 @@ void step_2(std::shared_ptr exec, default_stride(q), 
row_vector(beta), row_vector(rho), *stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_2_KERNEL); } // namespace cg diff --git a/common/unified/solver/cgs_kernels.cpp b/common/unified/solver/cgs_kernels.cpp index 0618b8f8208..6ceaa883c9f 100644 --- a/common/unified/solver/cgs_kernels.cpp +++ b/common/unified/solver/cgs_kernels.cpp @@ -72,7 +72,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_CGS_INITIALIZE_KERNEL); template @@ -103,7 +104,7 @@ void step_1(std::shared_ptr exec, row_vector(prev_rho), *stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_1_KERNEL); template @@ -134,7 +135,7 @@ void step_2(std::shared_ptr exec, row_vector(alpha), row_vector(rho), row_vector(gamma), *stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_2_KERNEL); template void step_3(std::shared_ptr exec, @@ -157,7 +158,7 @@ void step_3(std::shared_ptr exec, *stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_3_KERNEL); } // namespace cgs diff --git a/common/unified/solver/common_gmres_kernels.cpp b/common/unified/solver/common_gmres_kernels.cpp index 679aebcfaa2..32fe526d7f6 100644 --- a/common/unified/solver/common_gmres_kernels.cpp +++ b/common/unified/solver/common_gmres_kernels.cpp @@ -52,7 +52,8 @@ void initialize(std::shared_ptr exec, b->get_size()[0]); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL); template @@ -125,7 +126,7 
@@ void hessenberg_qr(std::shared_ptr exec, stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL); @@ -158,7 +159,7 @@ void solve_krylov(std::shared_ptr exec, residual_norm_collection->get_size()[1]); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL); diff --git a/common/unified/solver/fcg_kernels.cpp b/common/unified/solver/fcg_kernels.cpp index 7853d97c358..01dd3cb3d9a 100644 --- a/common/unified/solver/fcg_kernels.cpp +++ b/common/unified/solver/fcg_kernels.cpp @@ -61,7 +61,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_FCG_INITIALIZE_KERNEL); template @@ -84,7 +85,7 @@ void step_1(std::shared_ptr exec, row_vector(rho_t), row_vector(prev_rho), *stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_1_KERNEL); template @@ -113,7 +114,7 @@ void step_2(std::shared_ptr exec, *stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_2_KERNEL); } // namespace fcg diff --git a/common/unified/solver/gcr_kernels.cpp b/common/unified/solver/gcr_kernels.cpp index 7adef77dfb1..d5c2e27097d 100644 --- a/common/unified/solver/gcr_kernels.cpp +++ b/common/unified/solver/gcr_kernels.cpp @@ -44,7 +44,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_GCR_INITIALIZE_KERNEL); template @@ -78,7 +79,7 @@ void restart(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_RESTART_KERNEL); 
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_RESTART_KERNEL); template @@ -104,7 +105,7 @@ void step_1(std::shared_ptr exec, stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_STEP_1_KERNEL); } // namespace gcr } // namespace GKO_DEVICE_NAMESPACE diff --git a/common/unified/solver/gmres_kernels.cpp b/common/unified/solver/gmres_kernels.cpp index f24ae445edb..38bb935df9f 100644 --- a/common/unified/solver/gmres_kernels.cpp +++ b/common/unified/solver/gmres_kernels.cpp @@ -56,7 +56,7 @@ void restart(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_RESTART_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_RESTART_KERNEL); template @@ -92,7 +92,8 @@ void multi_axpy(std::shared_ptr exec, before_preconditioner->get_size()[1], stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL); template @@ -119,7 +120,8 @@ void multi_dot(std::shared_ptr exec, next_krylov->get_size()[0]); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_GMRES_MULTI_DOT_KERNEL); } // namespace gmres } // namespace GKO_DEVICE_NAMESPACE diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index c41f9e921cb..1c57ca45177 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -549,9 +549,9 @@ GKO_STUB_BATCH_VALUE_MATRIX_PRECONDITIONER( namespace cg { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_CG_STEP_1_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_INITIALIZE_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_1_KERNEL); 
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_2_KERNEL); } // namespace cg @@ -560,9 +560,9 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL); namespace bicg { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_INITIALIZE_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_1_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_2_KERNEL); } // namespace bicg @@ -593,9 +593,9 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL); namespace fcg { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_FCG_STEP_1_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_FCG_STEP_2_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_INITIALIZE_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_1_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_2_KERNEL); } // namespace fcg @@ -604,11 +604,11 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_FCG_STEP_2_KERNEL); namespace bicgstab { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL); } // namespace bicgstab @@ -617,11 +617,11 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL); namespace idr { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL); 
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_INITIALIZE_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); } // namespace idr @@ -630,10 +630,10 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); namespace cgs { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_STEP_1_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_STEP_2_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_STEP_3_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_INITIALIZE_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_1_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_2_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_3_KERNEL); } // namespace cgs @@ -641,9 +641,9 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_STEP_3_KERNEL); namespace gcr { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_GCR_INITIALIZE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_GCR_RESTART_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_GCR_STEP_1_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_INITIALIZE_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_RESTART_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_STEP_1_KERNEL); } // namespace gcr @@ -651,9 +651,9 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_GCR_STEP_1_KERNEL); namespace common_gmres { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL); 
+GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL); } // namespace common_gmres @@ -662,9 +662,9 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL); namespace gmres { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_RESTART_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_RESTART_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL); } // namespace gmres diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp index 0b39b3664cc..55d18f7f01d 100644 --- a/core/solver/bicg.cpp +++ b/core/solver/bicg.cpp @@ -293,8 +293,8 @@ std::vector workspace_traits>::vectors(const Solver&) #define GKO_DECLARE_BICG(_type) class Bicg<_type> #define GKO_DECLARE_BICG_TRAITS(_type) struct workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_TRAITS); } // namespace solver diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp index c254b417765..1e27c046186 100644 --- a/core/solver/bicgstab.cpp +++ b/core/solver/bicgstab.cpp @@ -298,8 +298,8 @@ std::vector workspace_traits>::vectors(const Solver&) #define GKO_DECLARE_BICGSTAB(_type) class Bicgstab<_type> #define GKO_DECLARE_BICGSTAB_TRAITS(_type) \ struct workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICGSTAB_TRAITS); } // namespace solver diff --git a/core/solver/cg.cpp 
b/core/solver/cg.cpp index c512dc4313b..a7898577b8a 100644 --- a/core/solver/cg.cpp +++ b/core/solver/cg.cpp @@ -243,8 +243,8 @@ std::vector workspace_traits>::vectors(const Solver&) #define GKO_DECLARE_CG(_type) class Cg<_type> #define GKO_DECLARE_CG_TRAITS(_type) struct workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_TRAITS); } // namespace solver diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp index 19f625228a3..4ec702a8db5 100644 --- a/core/solver/cgs.cpp +++ b/core/solver/cgs.cpp @@ -265,8 +265,8 @@ std::vector workspace_traits>::vectors(const Solver&) #define GKO_DECLARE_CGS(_type) class Cgs<_type> #define GKO_DECLARE_CGS_TRAITS(_type) struct workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_TRAITS); } // namespace solver diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp index 6c65f63ccae..569061626ff 100644 --- a/core/solver/fcg.cpp +++ b/core/solver/fcg.cpp @@ -247,8 +247,8 @@ std::vector workspace_traits>::vectors(const Solver&) #define GKO_DECLARE_FCG(_type) class Fcg<_type> #define GKO_DECLARE_FCG_TRAITS(_type) struct workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_TRAITS); } // namespace solver diff --git a/core/solver/gcr.cpp b/core/solver/gcr.cpp index d5131632dc3..8219de79ef4 100644 --- a/core/solver/gcr.cpp +++ b/core/solver/gcr.cpp @@ -371,8 +371,8 @@ std::vector workspace_traits>::vectors(const Solver&) 
#define GKO_DECLARE_GCR(_type) class Gcr<_type> #define GKO_DECLARE_GCR_TRAITS(_type) struct workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_TRAITS); } // namespace solver diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index e066fc696a1..8a4fdf563c3 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -707,8 +707,8 @@ std::vector workspace_traits>::vectors(const Solver&) #define GKO_DECLARE_GMRES(_type) class Gmres<_type> #define GKO_DECLARE_GMRES_TRAITS(_type) struct workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_TRAITS); } // namespace solver diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index c6d89b84ea6..d090324fea1 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -65,6 +65,10 @@ std::unique_ptr Idr::transpose() const .with_generated_preconditioner( share(as(this->get_preconditioner())->transpose())) .with_criteria(this->get_stop_criterion_factory()) + .with_subspace_dim(this->get_subspace_dim()) + .with_kappa(this->get_kappa()) + .with_deterministic(this->get_deterministic()) + .with_complex_subspace(this->get_complex_subspace()) .on(this->get_executor()) ->generate( share(as(this->get_system_matrix())->transpose())); @@ -78,6 +82,10 @@ std::unique_ptr Idr::conj_transpose() const .with_generated_preconditioner(share( as(this->get_preconditioner())->conj_transpose())) .with_criteria(this->get_stop_criterion_factory()) + .with_subspace_dim(this->get_subspace_dim()) + .with_kappa(this->get_kappa()) + .with_deterministic(this->get_deterministic()) + 
.with_complex_subspace(this->get_complex_subspace()) .on(this->get_executor()) ->generate(share( as(this->get_system_matrix())->conj_transpose())); @@ -272,7 +280,9 @@ void Idr::iterate(const VectorType* dense_b, // omega = (t^H * residual) / (t^H * t) // rho = (t^H * residual) / (norm(t) * norm(residual)) - // if abs(rho) < kappa then + // if norm(t) is zero then + // omega = 0 + // else if abs(rho) < kappa then // omega *= kappa / abs(rho) // end if // residual -= omega * t @@ -396,8 +406,8 @@ std::vector workspace_traits>::vectors(const Solver&) #define GKO_DECLARE_IDR(_type) class Idr<_type> #define GKO_DECLARE_IDR_TRAITS(_type) struct workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_TRAITS); } // namespace solver diff --git a/core/solver/ir.cpp b/core/solver/ir.cpp index 75efac351f9..3c2854dcf98 100644 --- a/core/solver/ir.cpp +++ b/core/solver/ir.cpp @@ -370,8 +370,8 @@ std::vector workspace_traits>::vectors(const Solver&) #define GKO_DECLARE_IR(_type) class Ir<_type> #define GKO_DECLARE_IR_TRAITS(_type) struct workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IR); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IR_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IR); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IR_TRAITS); } // namespace solver diff --git a/core/test/solver/bicg.cpp b/core/test/solver/bicg.cpp index e5a40e0c4f8..a229bd85ed9 100644 --- a/core/test/solver/bicg.cpp +++ b/core/test/solver/bicg.cpp @@ -46,7 +46,7 @@ class Bicg : public ::testing::Test { std::unique_ptr solver; }; -TYPED_TEST_SUITE(Bicg, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Bicg, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Bicg, BicgFactoryKnowsItsExecutor) diff --git 
a/core/test/solver/bicgstab.cpp b/core/test/solver/bicgstab.cpp index f8b8d3c7b05..23695fe1355 100644 --- a/core/test/solver/bicgstab.cpp +++ b/core/test/solver/bicgstab.cpp @@ -45,7 +45,8 @@ class Bicgstab : public ::testing::Test { std::unique_ptr solver; }; -TYPED_TEST_SUITE(Bicgstab, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Bicgstab, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Bicgstab, BicgstabFactoryKnowsItsExecutor) diff --git a/core/test/solver/cg.cpp b/core/test/solver/cg.cpp index cbf637de302..95552d841ac 100644 --- a/core/test/solver/cg.cpp +++ b/core/test/solver/cg.cpp @@ -46,7 +46,7 @@ class Cg : public ::testing::Test { std::unique_ptr solver; }; -TYPED_TEST_SUITE(Cg, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Cg, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Cg, CgFactoryKnowsItsExecutor) diff --git a/core/test/solver/cgs.cpp b/core/test/solver/cgs.cpp index 5dc80892a1b..cc355b58270 100644 --- a/core/test/solver/cgs.cpp +++ b/core/test/solver/cgs.cpp @@ -46,7 +46,7 @@ class Cgs : public ::testing::Test { std::unique_ptr solver; }; -TYPED_TEST_SUITE(Cgs, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Cgs, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Cgs, CgsFactoryKnowsItsExecutor) diff --git a/core/test/solver/fcg.cpp b/core/test/solver/fcg.cpp index 2898a5f5c46..c92fa4bb7f1 100644 --- a/core/test/solver/fcg.cpp +++ b/core/test/solver/fcg.cpp @@ -44,7 +44,7 @@ class Fcg : public ::testing::Test { std::unique_ptr solver; }; -TYPED_TEST_SUITE(Fcg, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Fcg, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Fcg, FcgFactoryKnowsItsExecutor) diff --git a/core/test/solver/gcr.cpp b/core/test/solver/gcr.cpp index 2d7b5ea7974..58194f6e92a 100644 --- a/core/test/solver/gcr.cpp +++ b/core/test/solver/gcr.cpp @@ -27,8 +27,8 @@ class Gcr : public 
::testing::Test { using Solver = gko::solver::Gcr; using Big_solver = gko::solver::Gcr; - static constexpr gko::remove_complex reduction_factor = - gko::remove_complex(1e-6); + const gko::remove_complex reduction_factor = + r>::value; Gcr() : exec(gko::ReferenceExecutor::create()), @@ -70,10 +70,7 @@ class Gcr : public ::testing::Test { } }; -template -constexpr gko::remove_complex Gcr::reduction_factor; - -TYPED_TEST_SUITE(Gcr, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Gcr, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Gcr, GcrFactoryKnowsItsExecutor) @@ -160,10 +157,9 @@ TYPED_TEST(Gcr, CanSetPreconditionerGenerator) using value_type = typename TestFixture::value_type; auto gcr_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u), - gko::stop::ResidualNorm::build() - .with_reduction_factor(TestFixture::reduction_factor)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(this->reduction_factor)) .with_preconditioner(Solver::build().with_criteria( gko::stop::Iteration::build().with_max_iters(3u))) .on(this->exec); @@ -210,10 +206,9 @@ TYPED_TEST(Gcr, CanSetKrylovDim) auto gcr_factory = Solver::build() .with_krylov_dim(4u) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u), - gko::stop::ResidualNorm::build() - .with_reduction_factor(TestFixture::reduction_factor)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(4u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(this->reduction_factor)) .on(this->exec); auto solver = gcr_factory->generate(this->mtx); auto krylov_dim = solver->get_krylov_dim(); diff --git a/core/test/solver/gmres.cpp b/core/test/solver/gmres.cpp index 5d9c9e3c40e..50f505f6321 100644 --- a/core/test/solver/gmres.cpp +++ b/core/test/solver/gmres.cpp @@ -27,8 +27,8 @@ class Gmres : public ::testing::Test { using Solver = gko::solver::Gmres; using 
Big_solver = gko::solver::Gmres; - static constexpr gko::remove_complex reduction_factor = - gko::remove_complex(1e-6); + const gko::remove_complex reduction_factor = + r>::value; Gmres() : exec(gko::ReferenceExecutor::create()), @@ -60,10 +60,7 @@ class Gmres : public ::testing::Test { std::unique_ptr big_solver; }; -template -constexpr gko::remove_complex Gmres::reduction_factor; - -TYPED_TEST_SUITE(Gmres, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Gmres, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Gmres, GmresFactoryKnowsItsExecutor) @@ -146,10 +143,9 @@ TYPED_TEST(Gmres, CanSetPreconditionerGenerator) using value_type = typename TestFixture::value_type; auto gmres_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u), - gko::stop::ResidualNorm::build() - .with_reduction_factor(TestFixture::reduction_factor)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(this->reduction_factor)) .with_preconditioner(Solver::build().with_criteria( gko::stop::Iteration::build().with_max_iters(3u))) .on(this->exec); @@ -197,10 +193,9 @@ TYPED_TEST(Gmres, CanSetKrylovDim) auto gmres_factory = Solver::build() .with_krylov_dim(4u) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u), - gko::stop::ResidualNorm::build() - .with_reduction_factor(TestFixture::reduction_factor)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(4u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(this->reduction_factor)) .on(this->exec); auto solver = gmres_factory->generate(this->mtx); auto krylov_dim = solver->get_krylov_dim(); diff --git a/core/test/solver/idr.cpp b/core/test/solver/idr.cpp index 9eb79356046..823327e337e 100644 --- a/core/test/solver/idr.cpp +++ b/core/test/solver/idr.cpp @@ -45,7 +45,7 @@ class Idr : public ::testing::Test { std::unique_ptr solver; }; -TYPED_TEST_SUITE(Idr, 
gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Idr, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Idr, IdrFactoryKnowsItsExecutor) diff --git a/core/test/solver/ir.cpp b/core/test/solver/ir.cpp index 1137862a395..59f85f42321 100644 --- a/core/test/solver/ir.cpp +++ b/core/test/solver/ir.cpp @@ -46,7 +46,7 @@ class Ir : public ::testing::Test { std::unique_ptr solver; }; -TYPED_TEST_SUITE(Ir, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Ir, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Ir, IrFactoryKnowsItsExecutor) diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp index eb3dbee6b7b..80ceff2dacd 100644 --- a/cuda/base/curand_bindings.hpp +++ b/cuda/base/curand_bindings.hpp @@ -23,6 +23,17 @@ namespace cuda { * @ingroup curand */ namespace curand { +namespace detail { + + +template +inline int64 not_implemented(Args...) +{ + return static_cast(CURAND_STATUS_TYPE_ERROR); +} + + +} // namespace detail template @@ -77,6 +88,8 @@ GKO_BIND_CURAND_RANDOM_VECTOR(float, curandGenerateNormal); GKO_BIND_CURAND_RANDOM_VECTOR(double, curandGenerateNormalDouble); GKO_BIND_CURAND_RANDOM_VECTOR(std::complex, curandGenerateNormal); GKO_BIND_CURAND_RANDOM_VECTOR(std::complex, curandGenerateNormalDouble); +template +GKO_BIND_CURAND_RANDOM_VECTOR(ValueType, detail::not_implemented); #undef GKO_BIND_CURAND_RANDOM_VECTOR diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index 7ab010ba29f..e3424944309 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -285,9 +285,9 @@ void multinorminf_without_stop_kernel( i += default_dot_dim) { const auto next_krylov_idx = i * stride_next_krylov + col_idx; local_max = - (local_max >= std::abs(next_krylov_basis[next_krylov_idx])) + (local_max >= gko::abs(next_krylov_basis[next_krylov_idx])) ? 
local_max - : std::abs(next_krylov_basis[next_krylov_idx]); + : gko::abs(next_krylov_basis[next_krylov_idx]); } } reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_max; @@ -373,7 +373,7 @@ void multinorm2_inf_kernel( local_res += squared_norm(num); if (compute_inf) { local_max = - ((local_max >= std::abs(num)) ? local_max : std::abs(num)); + ((local_max >= gko::abs(num)) ? local_max : gko::abs(num)); } } } @@ -729,8 +729,8 @@ void check_arnoldi_norms( gko::cb_gmres::detail::has_3d_scaled_accessor::value; if (col_idx < num_rhs && !stop_status[col_idx].has_stopped()) { - const auto num0 = (std::sqrt(eta_squared * arnoldi_norm[col_idx])); - const auto num11 = std::sqrt(arnoldi_norm[col_idx + stride_norm]); + const auto num0 = gko::sqrt(eta_squared * arnoldi_norm[col_idx]); + const auto num11 = gko::sqrt(arnoldi_norm[col_idx + stride_norm]); const auto num2 = has_scalar ? (arnoldi_norm[col_idx + 2 * stride_norm]) : remove_complex{}; if (num11 < num0) { diff --git a/dpcpp/solver/common_gmres_kernels.dp.inc b/dpcpp/solver/common_gmres_kernels.dp.inc index 0b5de8188f2..f8a54fe5116 100644 --- a/dpcpp/solver/common_gmres_kernels.dp.inc +++ b/dpcpp/solver/common_gmres_kernels.dp.inc @@ -72,12 +72,12 @@ void calculate_sin_and_cos_kernel(size_type col_idx, size_type num_cols, register_cos = zero(); register_sin = one(); } else { - const auto scale = std::abs(this_hess) + std::abs(next_hess); + const auto scale = gko::abs(this_hess) + gko::abs(next_hess); const auto hypotenuse = scale * - std::sqrt( - std::abs(this_hess / scale) * std::abs(this_hess / scale) + - std::abs(next_hess / scale) * std::abs(next_hess / scale)); + gko::sqrt( + gko::abs(this_hess / scale) * gko::abs(this_hess / scale) + + gko::abs(next_hess / scale) * gko::abs(next_hess / scale)); register_cos = conj(this_hess) / hypotenuse; register_sin = conj(next_hess) / hypotenuse; } @@ -102,7 +102,7 @@ void calculate_residual_norm_kernel(size_type col_idx, size_type num_cols, const auto next_rnc = 
-conj(register_sin) * this_rnc; residual_norm_collection[iter * stride_residual_norm_collection + col_idx] = register_cos * this_rnc; - residual_norm[col_idx] = std::abs(next_rnc); + residual_norm[col_idx] = gko::abs(next_rnc); residual_norm_collection[(iter + 1) * stride_residual_norm_collection + col_idx] = next_rnc; } diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index d59ada362f9..29cdd70cd64 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -8,6 +8,7 @@ #include #include +#include #include @@ -127,7 +128,7 @@ void orthonormalize_subspace_vectors_kernel( const remove_complex& b) { return a + b; }); item_ct1.barrier(sycl::access::fence_space::local_space); - norm = std::sqrt(reduction_helper_real[0]); + norm = gko::sqrt(reduction_helper_real[0]); for (size_type j = tidx; j < num_cols; j += block_size) { values[row * stride + j] /= norm; } @@ -542,8 +543,12 @@ void compute_omega_kernel( if (!stop_status[global_id].has_stopped()) { auto thr = omega[global_id]; omega[global_id] /= tht[global_id]; - auto absrho = std::abs( - thr / (std::sqrt(real(tht[global_id])) * residual_norm[global_id])); + const auto normt = sqrt(real(tht[global_id])); + if (normt == zero>()) { + omega[global_id] = zero(); + return; + } + auto absrho = gko::abs(thr / (normt * residual_norm[global_id])); if (absrho < kappa) { omega[global_id] *= kappa / absrho; @@ -594,18 +599,20 @@ void initialize_subspace_vectors(std::shared_ptr exec, { if (!deterministic) { auto seed = std::random_device{}(); - auto work = reinterpret_cast*>( - subspace_vectors->get_values()); + using real_type = remove_complex; + auto work = + reinterpret_cast(subspace_vectors->get_values()); auto n = subspace_vectors->get_size()[0] * subspace_vectors->get_stride(); + using rand_type = std::conditional_t, + float, real_type>; n = is_complex() ? 
2 * n : n; exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::range<1>(n), [=](sycl::item<1> idx) { std::uint64_t offset = idx.get_linear_id(); oneapi::dpl::minstd_rand engine(seed, offset); - oneapi::dpl::normal_distribution> - distr(0, 1); - auto res = distr(engine); + oneapi::dpl::normal_distribution distr(0, 1); + auto res = static_cast(distr(engine)); work[idx] = res; }); @@ -761,7 +768,8 @@ void initialize(std::shared_ptr exec, const size_type nrhs, orthonormalize_subspace_vectors(exec, subspace_vectors); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_IDR_INITIALIZE_KERNEL); template @@ -787,7 +795,7 @@ void step_1(std::shared_ptr exec, const size_type nrhs, stop_status->get_const_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL); template @@ -812,7 +820,7 @@ void step_2(std::shared_ptr exec, const size_type nrhs, stop_status->get_const_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL); template @@ -829,7 +837,7 @@ void step_3(std::shared_ptr exec, const size_type nrhs, update_x_r_and_f(exec, nrhs, k, m, g, u, f, residual, x, stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL); template @@ -846,7 +854,8 @@ void compute_omega( stop_status->get_const_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); } // namespace idr diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp index 7cd76b9d320..76a7f4e79ce 100644 --- a/hip/base/hiprand_bindings.hip.hpp +++ 
b/hip/base/hiprand_bindings.hip.hpp @@ -29,6 +29,17 @@ namespace hip { * @ingroup hiprand */ namespace hiprand { +namespace detail { + + +template +inline int64 not_implemented(Args...) +{ + return static_cast(HIPRAND_STATUS_TYPE_ERROR); +} + + +} // namespace detail template @@ -83,6 +94,8 @@ GKO_BIND_HIPRAND_RANDOM_VECTOR(double, hiprandGenerateNormalDouble); GKO_BIND_HIPRAND_RANDOM_VECTOR(std::complex, hiprandGenerateNormal); GKO_BIND_HIPRAND_RANDOM_VECTOR(std::complex, hiprandGenerateNormalDouble); +template +GKO_BIND_HIPRAND_RANDOM_VECTOR(ValueType, detail::not_implemented); #undef GKO_BIND_HIPRAND_RANDOM_VECTOR diff --git a/omp/solver/idr_kernels.cpp b/omp/solver/idr_kernels.cpp index a93002e4833..eb0eb1074e5 100644 --- a/omp/solver/idr_kernels.cpp +++ b/omp/solver/idr_kernels.cpp @@ -93,7 +93,7 @@ template typename std::enable_if::value, ValueType>::type get_rand_value(Distribution&& dist, Generator&& gen) { - return dist(gen); + return static_cast(dist(gen)); } @@ -101,7 +101,9 @@ template typename std::enable_if::value, ValueType>::type get_rand_value(Distribution&& dist, Generator&& gen) { - return ValueType(dist(gen), dist(gen)); + using real_value_type = remove_complex; + return ValueType(get_rand_value(dist, gen), + get_rand_value(dist, gen)); } @@ -135,7 +137,7 @@ void initialize(std::shared_ptr exec, const size_type nrhs, // Initialize and Orthonormalize P const auto num_rows = subspace_vectors->get_size()[0]; const auto num_cols = subspace_vectors->get_size()[1]; - auto dist = std::normal_distribution>(0.0, 1.0); + auto dist = std::normal_distribution<>(0.0, 1.0); auto seed = std::random_device{}(); auto gen = std::default_random_engine(seed); for (size_type row = 0; row < num_rows; row++) { @@ -182,7 +184,8 @@ void initialize(std::shared_ptr exec, const size_type nrhs, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_IDR_INITIALIZE_KERNEL); template @@ -216,7 
+219,7 @@ void step_1(std::shared_ptr exec, const size_type nrhs, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL); template @@ -242,7 +245,7 @@ void step_2(std::shared_ptr exec, const size_type nrhs, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL); template @@ -288,7 +291,7 @@ void step_3(std::shared_ptr exec, const size_type nrhs, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL); template @@ -306,6 +309,10 @@ void compute_omega( auto thr = omega->at(0, i); auto normt = sqrt(real(tht->at(0, i))); + if (normt == zero>()) { + omega->at(0, i) = 0; + continue; + } omega->at(0, i) /= tht->at(0, i); auto absrho = abs(thr / (normt * residual_norm->at(0, i))); @@ -315,7 +322,8 @@ void compute_omega( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); } // namespace idr diff --git a/reference/solver/bicg_kernels.cpp b/reference/solver/bicg_kernels.cpp index dee2d30b8dc..511d4375ae5 100644 --- a/reference/solver/bicg_kernels.cpp +++ b/reference/solver/bicg_kernels.cpp @@ -46,7 +46,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_BICG_INITIALIZE_KERNEL); template @@ -74,7 +75,7 @@ void step_1(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_1_KERNEL); template @@ -102,7 +103,7 @@ void step_2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL); 
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_BICG_STEP_2_KERNEL); } // namespace bicg diff --git a/reference/solver/bicgstab_kernels.cpp b/reference/solver/bicgstab_kernels.cpp index 31955a59c53..e762dc88533 100644 --- a/reference/solver/bicgstab_kernels.cpp +++ b/reference/solver/bicgstab_kernels.cpp @@ -57,7 +57,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL); template @@ -87,7 +88,8 @@ void step_1(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_BICGSTAB_STEP_1_KERNEL); template @@ -115,7 +117,8 @@ void step_2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_BICGSTAB_STEP_2_KERNEL); template @@ -149,7 +152,8 @@ void step_3( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_BICGSTAB_STEP_3_KERNEL); template @@ -169,7 +173,8 @@ void finalize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL); } // namespace bicgstab diff --git a/reference/solver/cg_kernels.cpp b/reference/solver/cg_kernels.cpp index 5af15692414..fe548b9a03a 100644 --- a/reference/solver/cg_kernels.cpp +++ b/reference/solver/cg_kernels.cpp @@ -42,7 +42,7 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_INITIALIZE_KERNEL); template @@ -67,7 +67,7 @@ void step_1(std::shared_ptr exec, } } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_1_KERNEL); template @@ -93,7 +93,7 @@ void step_2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CG_STEP_2_KERNEL); } // namespace cg diff --git a/reference/solver/cgs_kernels.cpp b/reference/solver/cgs_kernels.cpp index a5a5f8c5862..f2f2200b996 100644 --- a/reference/solver/cgs_kernels.cpp +++ b/reference/solver/cgs_kernels.cpp @@ -51,7 +51,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_CGS_INITIALIZE_KERNEL); template @@ -83,7 +84,7 @@ void step_1(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_1_KERNEL); template @@ -114,7 +115,7 @@ void step_2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_2_KERNEL); template @@ -135,7 +136,7 @@ void step_3(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CGS_STEP_3_KERNEL); } // namespace cgs diff --git a/reference/solver/common_gmres_kernels.cpp b/reference/solver/common_gmres_kernels.cpp index 4ba091e03ae..24c6135f0b1 100644 --- a/reference/solver/common_gmres_kernels.cpp +++ b/reference/solver/common_gmres_kernels.cpp @@ -132,7 +132,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL); template @@ -156,7 +157,7 @@ void hessenberg_qr(std::shared_ptr exec, 
residual_norm_collection, iter, stop_status); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL); @@ -186,7 +187,7 @@ void solve_krylov(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL); diff --git a/reference/solver/fcg_kernels.cpp b/reference/solver/fcg_kernels.cpp index 65b6bf27698..5ba997da941 100644 --- a/reference/solver/fcg_kernels.cpp +++ b/reference/solver/fcg_kernels.cpp @@ -43,7 +43,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_FCG_INITIALIZE_KERNEL); template @@ -68,7 +69,7 @@ void step_1(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_1_KERNEL); template @@ -96,7 +97,7 @@ void step_2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_FCG_STEP_2_KERNEL); } // namespace fcg diff --git a/reference/solver/gcr_kernels.cpp b/reference/solver/gcr_kernels.cpp index 531814c641e..d51728b15cf 100644 --- a/reference/solver/gcr_kernels.cpp +++ b/reference/solver/gcr_kernels.cpp @@ -37,7 +37,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_GCR_INITIALIZE_KERNEL); template @@ -56,7 +57,7 @@ void restart(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_RESTART_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_RESTART_KERNEL); template @@ -82,7 +83,7 @@ void step_1(std::shared_ptr exec, } } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GCR_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GCR_STEP_1_KERNEL); } // namespace gcr diff --git a/reference/solver/gmres_kernels.cpp b/reference/solver/gmres_kernels.cpp index a7f5a751a3b..6d5eaae1490 100644 --- a/reference/solver/gmres_kernels.cpp +++ b/reference/solver/gmres_kernels.cpp @@ -40,7 +40,7 @@ void restart(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_RESTART_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_GMRES_RESTART_KERNEL); template @@ -69,7 +69,8 @@ void multi_axpy(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL); template void multi_dot(std::shared_ptr exec, @@ -91,7 +92,8 @@ void multi_dot(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_GMRES_MULTI_DOT_KERNEL); } // namespace gmres } // namespace reference diff --git a/reference/solver/idr_kernels.cpp b/reference/solver/idr_kernels.cpp index 606def8a18b..27315da3565 100644 --- a/reference/solver/idr_kernels.cpp +++ b/reference/solver/idr_kernels.cpp @@ -86,7 +86,7 @@ template typename std::enable_if::value, ValueType>::type get_rand_value(Distribution&& dist, Generator&& gen) { - return dist(gen); + return static_cast(dist(gen)); } @@ -94,7 +94,9 @@ template typename std::enable_if::value, ValueType>::type get_rand_value(Distribution&& dist, Generator&& gen) { - return ValueType(dist(gen), dist(gen)); + using real_value_type = remove_complex; + return ValueType(get_rand_value(dist, gen), + get_rand_value(dist, gen)); } @@ -122,7 +124,7 @@ void initialize(std::shared_ptr exec, // Initialize and Orthonormalize P const auto num_rows = subspace_vectors->get_size()[0]; const auto num_cols = 
subspace_vectors->get_size()[1]; - auto dist = std::normal_distribution>(0.0, 1.0); + auto dist = std::normal_distribution<>(0.0, 1.0); auto seed = std::random_device{}(); auto gen = std::default_random_engine(seed); for (size_type row = 0; row < num_rows; row++) { @@ -158,7 +160,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_IDR_INITIALIZE_KERNEL); template @@ -188,7 +191,7 @@ void step_1(std::shared_ptr exec, const size_type nrhs, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_1_KERNEL); template @@ -213,7 +216,7 @@ void step_2(std::shared_ptr exec, const size_type nrhs, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_2_KERNEL); template @@ -256,7 +259,7 @@ void step_3(std::shared_ptr exec, const size_type nrhs, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_IDR_STEP_3_KERNEL); template @@ -275,14 +278,17 @@ void compute_omega( auto normt = sqrt(real(tht->at(0, i))); omega->at(0, i) /= tht->at(0, i); auto absrho = abs(thr / (normt * residual_norm->at(0, i))); - if (absrho < kappa) { omega->at(0, i) *= kappa / absrho; } + if (normt == zero>()) { + omega->at(0, i) = 0; + } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); } // namespace idr diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp index fd24c52bcc8..13d81de0c7a 100644 --- a/reference/test/solver/bicg_kernels.cpp +++ b/reference/test/solver/bicg_kernels.cpp @@ -119,7 +119,7 @@ class Bicg : public ::testing::Test { std::unique_ptr 
bicg_factory_non_symmetric; }; -TYPED_TEST_SUITE(Bicg, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Bicg, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Bicg, KernelInitialize) @@ -266,7 +266,8 @@ TYPED_TEST(Bicg, SolvesStencilSystem) TYPED_TEST(Bicg, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->bicg_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -302,8 +303,8 @@ TYPED_TEST(Bicg, SolvesStencilSystemComplex) TYPED_TEST(Bicg, SolvesStencilSystemMixedComplex) { - using value_type = - gko::to_complex>; + using value_type = gko::to_complex< + gko::next_precision_with_half>; using Mtx = gko::matrix::Dense; auto solver = this->bicg_factory->generate(this->mtx); auto b = gko::initialize( @@ -358,7 +359,8 @@ TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->bicg_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -400,7 +402,7 @@ TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { using Scalar = gko::matrix::Dense< - gko::next_precision>; + gko::next_precision_with_half>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->bicg_factory->generate(this->mtx); @@ -446,6 +448,8 @@ TYPED_TEST(Bicg, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 
906410.0, -42679.5, 846779.5, 1176858.5}, @@ -463,6 +467,8 @@ TYPED_TEST(Bicg, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -480,6 +486,8 @@ TYPED_TEST(Bicg, SolvesBigDenseSystemImplicitResNormCrit) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -511,6 +519,8 @@ TYPED_TEST(Bicg, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp index f09e78137b3..ce17f25f47e 100644 --- a/reference/test/solver/bicgstab_kernels.cpp +++ b/reference/test/solver/bicgstab_kernels.cpp @@ -121,7 +121,8 @@ class Bicgstab : public ::testing::Test { std::unique_ptr bicgstab_factory_precision; }; -TYPED_TEST_SUITE(Bicgstab, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Bicgstab, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Bicgstab, KernelInitialize) @@ -383,7 +384,8 @@ TYPED_TEST(Bicgstab, SolvesDenseSystem) TYPED_TEST(Bicgstab, SolvesDenseSystemMixed) { - using value_type = gko::next_precision; + using value_type = + 
gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->bicgstab_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -419,8 +421,8 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemComplex) TYPED_TEST(Bicgstab, SolvesDenseSystemMixedComplex) { - using value_type = - gko::to_complex>; + using value_type = gko::to_complex< + gko::next_precision_with_half>; using Mtx = gko::matrix::Dense; auto solver = this->bicgstab_factory->generate(this->mtx); auto b = gko::initialize( @@ -489,13 +491,14 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApply) solver->apply(alpha, b, beta, x); - GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), r::value); + GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), 2 * r::value); } TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->bicgstab_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -506,7 +509,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixed) solver->apply(alpha, b, beta, x); GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), - (r_mixed())); + (2 * r_mixed())); } @@ -522,14 +525,14 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyComplex) {value_type{-1.0, 2.0}, value_type{3.0, -6.0}, value_type{1.0, -2.0}}, this->exec); auto x = gko::initialize( - {value_type{0.5, -1.0}, value_type{1.0, -2.0}, value_type{2.0, -4.0}}, + {value_type{0.5, -0.5}, value_type{1.0, 0.5}, value_type{2.0, -1.0}}, this->exec); solver->apply(alpha, b, beta, x); GKO_ASSERT_MTX_NEAR(x, - l({value_type{-8.5, 17.0}, value_type{-3.0, 6.0}, - value_type{6.0, -12.0}}), + l({value_type{-8.5, 16.5}, value_type{-3.0, 3.5}, + value_type{6.0, -15.0}}), r::value); } @@ -537,7 +540,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyComplex) TYPED_TEST(Bicgstab, 
SolvesDenseSystemUsingAdvancedApplyMixedComplex) { using Scalar = gko::matrix::Dense< - gko::next_precision>; + gko::next_precision_with_half>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->bicgstab_factory->generate(this->mtx); @@ -547,14 +550,14 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixedComplex) {value_type{-1.0, 2.0}, value_type{3.0, -6.0}, value_type{1.0, -2.0}}, this->exec); auto x = gko::initialize( - {value_type{0.5, -1.0}, value_type{1.0, -2.0}, value_type{2.0, -4.0}}, + {value_type{0.5, -0.5}, value_type{1.0, 0.5}, value_type{2.0, -1.0}}, this->exec); solver->apply(alpha, b, beta, x); GKO_ASSERT_MTX_NEAR(x, - l({value_type{-8.5, 17.0}, value_type{-3.0, 6.0}, - value_type{6.0, -12.0}}), + l({value_type{-8.5, 16.5}, value_type{-3.0, 3.5}, + value_type{6.0, -15.0}}), (r_mixed())); } @@ -585,6 +588,9 @@ TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // beta encounters huge value out of the half-precision range in the first + // part of the second iteration + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, @@ -613,6 +619,9 @@ TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // beta encounters huge value out of the half-precision range in the first + // part of second iteration + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, @@ -642,6 +651,9 @@ TYPED_TEST(Bicgstab, SolvesMultipleDenseSystemsDivergenceCheck) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using T = value_type; + // beta encounters huge value out of the half-precision range 
in the first + // part of second iteration + SKIP_IF_HALF(value_type); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, {-8.0, -66.0, 29.0, -96.0, -95.0, -14.0}, diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp index 7cbc629717c..fd708d736bc 100644 --- a/reference/test/solver/cg_kernels.cpp +++ b/reference/test/solver/cg_kernels.cpp @@ -107,7 +107,7 @@ class Cg : public ::testing::Test { std::unique_ptr cg_factory_big2; }; -TYPED_TEST_SUITE(Cg, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Cg, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Cg, KernelInitialize) @@ -228,7 +228,8 @@ TYPED_TEST(Cg, SolvesStencilSystem) TYPED_TEST(Cg, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->cg_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -264,8 +265,8 @@ TYPED_TEST(Cg, SolvesStencilSystemComplex) TYPED_TEST(Cg, SolvesStencilSystemMixedComplex) { - using value_type = - gko::to_complex>; + using value_type = gko::to_complex< + gko::next_precision_with_half>; using Mtx = gko::matrix::Dense; auto solver = this->cg_factory->generate(this->mtx); auto b = gko::initialize( @@ -320,7 +321,8 @@ TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->cg_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -362,7 +364,7 @@ TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { using Scalar = gko::matrix::Dense< - gko::next_precision>; + gko::next_precision_with_half>; using Mtx = gko::to_complex; using 
value_type = typename Mtx::value_type; auto solver = this->cg_factory->generate(this->mtx); @@ -408,6 +410,8 @@ TYPED_TEST(Cg, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -425,6 +429,8 @@ TYPED_TEST(Cg, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -442,6 +448,8 @@ TYPED_TEST(Cg, SolvesBigDenseSystem3) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -459,6 +467,8 @@ TYPED_TEST(Cg, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -527,6 +537,8 @@ TYPED_TEST(Cg, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( 
{1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -544,6 +556,8 @@ TYPED_TEST(Cg, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp index 9024623ade8..a06c087776c 100644 --- a/reference/test/solver/cgs_kernels.cpp +++ b/reference/test/solver/cgs_kernels.cpp @@ -121,7 +121,7 @@ class Cgs : public ::testing::Test { std::unique_ptr cgs_factory_big2; }; -TYPED_TEST_SUITE(Cgs, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Cgs, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Cgs, KernelInitialize) @@ -293,7 +293,8 @@ TYPED_TEST(Cgs, SolvesDenseSystem) TYPED_TEST(Cgs, SolvesDenseSystemMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->cgs_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -329,8 +330,8 @@ TYPED_TEST(Cgs, SolvesDenseSystemComplex) TYPED_TEST(Cgs, SolvesDenseSystemMixedComplex) { - using value_type = - gko::to_complex>; + using value_type = gko::to_complex< + gko::next_precision_with_half>; using Mtx = gko::matrix::Dense; auto solver = this->cgs_factory->generate(this->mtx); auto b = gko::initialize( @@ -386,7 +387,8 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApply) TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->cgs_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ 
-413,13 +415,13 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyComplex) {value_type{-1.0, 2.0}, value_type{3.0, -6.0}, value_type{1.0, -2.0}}, this->exec); auto x = gko::initialize( - {value_type{0.5, -1.0}, value_type{1.0, -2.0}, value_type{2.0, -4.0}}, + {value_type{-2.0, 4.0}, value_type{-0.5, 1.0}, value_type{2.0, -4.0}}, this->exec); solver->apply(alpha, b, beta, x); GKO_ASSERT_MTX_NEAR(x, - l({value_type{-8.5, 17.0}, value_type{-3.0, 6.0}, + l({value_type{-6.0, 12.0}, value_type{-1.5, 3.0}, value_type{6.0, -12.0}}), r::value * 1e3); } @@ -428,7 +430,7 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyComplex) TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixedComplex) { using Scalar = gko::matrix::Dense< - gko::next_precision>; + gko::next_precision_with_half>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->cgs_factory->generate(this->mtx); @@ -438,13 +440,14 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixedComplex) {value_type{-1.0, 2.0}, value_type{3.0, -6.0}, value_type{1.0, -2.0}}, this->exec); auto x = gko::initialize( - {value_type{0.5, -1.0}, value_type{1.0, -2.0}, value_type{2.0, -4.0}}, + {value_type{-2.0, 4.0}, value_type{-0.5, 1.0}, value_type{2.0, -4.0}}, this->exec); + solver->apply(alpha, b, beta, x); GKO_ASSERT_MTX_NEAR(x, - l({value_type{-8.5, 17.0}, value_type{-3.0, 6.0}, + l({value_type{-6.0, 12.0}, value_type{-1.5, 3.0}, value_type{6.0, -12.0}}), (r_mixed()) * 1e3); } @@ -475,6 +478,8 @@ TYPED_TEST(Cgs, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // squared_norm of residual(=b) exceeds the range of half precision. 
+ SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b = gko::initialize( {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); @@ -491,6 +496,8 @@ TYPED_TEST(Cgs, SolvesBigDenseSystemWithImplicitResNormCrit) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // squared_norm of residual(=b) exceeds the range of half precision. + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, this->exec); @@ -507,6 +514,8 @@ TYPED_TEST(Cgs, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // squared_norm of residual(=b) exceeds the range of half precision. + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b = gko::initialize( {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, this->exec); @@ -523,6 +532,8 @@ TYPED_TEST(Cgs, SolvesMultipleDenseSystems) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // squared_norm of residual(=b) exceeds the range of half precision. + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); @@ -589,6 +600,8 @@ TYPED_TEST(Cgs, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // squared_norm of residual(=b) exceeds the range of half precision. 
+ SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big->transpose()); auto b = gko::initialize( {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); @@ -605,6 +618,8 @@ TYPED_TEST(Cgs, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // squared_norm of residual(=b) exceeds the range of half precision. + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big->conj_transpose()); auto b = gko::initialize( diff --git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp index 2b7b97ffc3b..88615921f34 100644 --- a/reference/test/solver/fcg_kernels.cpp +++ b/reference/test/solver/fcg_kernels.cpp @@ -112,7 +112,7 @@ class Fcg : public ::testing::Test { std::unique_ptr fcg_factory_big2; }; -TYPED_TEST_SUITE(Fcg, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Fcg, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Fcg, KernelInitialize) @@ -242,7 +242,8 @@ TYPED_TEST(Fcg, SolvesStencilSystem) TYPED_TEST(Fcg, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->fcg_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -278,8 +279,8 @@ TYPED_TEST(Fcg, SolvesStencilSystemComplex) TYPED_TEST(Fcg, SolvesStencilSystemMixedComplex) { - using value_type = - gko::to_complex>; + using value_type = gko::to_complex< + gko::next_precision_with_half>; using Mtx = gko::matrix::Dense; auto solver = this->fcg_factory->generate(this->mtx); auto b = gko::initialize( @@ -334,7 +335,8 @@ TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = 
gko::matrix::Dense; auto solver = this->fcg_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -376,7 +378,7 @@ TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { using Scalar = gko::matrix::Dense< - gko::next_precision>; + gko::next_precision_with_half>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->fcg_factory->generate(this->mtx); @@ -422,6 +424,8 @@ TYPED_TEST(Fcg, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -439,6 +443,8 @@ TYPED_TEST(Fcg, SolvesBigDenseSystemWithImplicitResNormCrit) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -456,6 +462,8 @@ TYPED_TEST(Fcg, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -473,6 +481,8 @@ TYPED_TEST(Fcg, SolvesMultipleBigDenseSystems) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b1 = 
gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -541,6 +551,8 @@ TYPED_TEST(Fcg, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -558,6 +570,8 @@ TYPED_TEST(Fcg, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, diff --git a/reference/test/solver/gcr_kernels.cpp b/reference/test/solver/gcr_kernels.cpp index 7ca885cfab8..af8e74888d0 100644 --- a/reference/test/solver/gcr_kernels.cpp +++ b/reference/test/solver/gcr_kernels.cpp @@ -119,7 +119,7 @@ class Gcr : public ::testing::Test { std::unique_ptr gcr_factory_big2; }; -TYPED_TEST_SUITE(Gcr, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Gcr, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Gcr, KernelInitialize) @@ -225,7 +225,8 @@ TYPED_TEST(Gcr, SolvesStencilSystem) TYPED_TEST(Gcr, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->gcr_factory->generate(this->mtx); auto b = gko::initialize({13.0, 7.0, 1.0}, this->exec); @@ -234,7 +235,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemMixed) solver->apply(b.get(), x.get()); GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), - (r_mixed())); + (r_mixed() * 1e1)); } @@ -256,14 +257,14 @@ TYPED_TEST(Gcr, SolvesStencilSystemComplex) GKO_ASSERT_MTX_NEAR(x, l({value_type{1.0, 
-2.0}, value_type{3.0, -6.0}, value_type{2.0, -4.0}}), - r::value * 1e1); + r::value); } TYPED_TEST(Gcr, SolvesStencilSystemMixedComplex) { - using value_type = - gko::to_complex>; + using value_type = gko::to_complex< + gko::next_precision_with_half>; using Mtx = gko::matrix::Dense; auto solver = this->gcr_factory->generate(this->mtx); auto b = @@ -319,7 +320,8 @@ TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->gcr_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -330,7 +332,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApplyMixed) solver->apply(alpha.get(), b.get(), beta.get(), x.get()); GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), - (r_mixed()) * 1e1); + (r_mixed() * 2e1)); } @@ -362,7 +364,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { using Scalar = gko::matrix::Dense< - gko::next_precision>; + gko::next_precision_with_half>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->gcr_factory->generate(this->mtx); @@ -409,6 +411,8 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -426,6 +430,8 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = 
this->gcr_factory_big->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, @@ -443,6 +449,8 @@ TYPED_TEST(Gcr, SolveWithImplicitResNormCritIsDisabled) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, @@ -456,7 +464,7 @@ TYPED_TEST(Gcr, SolveWithImplicitResNormCritIsDisabled) template gko::remove_complex infNorm(gko::matrix::Dense* mat, size_t col = 0) { - using std::abs; + using gko::abs; using no_cpx_t = gko::remove_complex; no_cpx_t norm = 0.0; for (size_t i = 0; i < mat->get_size()[0]; ++i) { @@ -471,6 +479,8 @@ TYPED_TEST(Gcr, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -537,6 +547,8 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem1WithRestart) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); auto gcr_factory_restart = Solver::build() @@ -562,6 +574,8 @@ TYPED_TEST(Gcr, SolvesWithPreconditioner) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto gcr_factory_preconditioner = Solver::build() 
.with_criteria(gko::stop::Iteration::build().with_max_iters(100u), @@ -588,6 +602,8 @@ TYPED_TEST(Gcr, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big->transpose()); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -605,6 +621,8 @@ TYPED_TEST(Gcr, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big->conj_transpose()); auto b = gko::initialize( diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index 3f11b087bb7..abecc6b2a79 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -136,7 +136,7 @@ class Gmres : public ::testing::Test { std::unique_ptr gmres_factory_big2; }; -TYPED_TEST_SUITE(Gmres, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Gmres, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Gmres, KernelInitialize) @@ -434,7 +434,8 @@ TYPED_TEST(Gmres, SolvesStencilSystem) TYPED_TEST(Gmres, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->gmres_factory->generate(this->mtx); auto b = gko::initialize({13.0, 7.0, 1.0}, this->exec); @@ -471,8 +472,8 @@ TYPED_TEST(Gmres, SolvesStencilSystemComplex) TYPED_TEST(Gmres, SolvesStencilSystemMixedComplex) { - using value_type = - gko::to_complex>; + using value_type = gko::to_complex< + gko::next_precision_with_half>; using Mtx = gko::matrix::Dense; auto solver = 
this->gmres_factory->generate(this->mtx); auto b = @@ -528,7 +529,8 @@ TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->gmres_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -571,7 +573,7 @@ TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { using Scalar = gko::matrix::Dense< - gko::next_precision>; + gko::next_precision_with_half>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->gmres_factory->generate(this->mtx); @@ -618,6 +620,8 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -635,6 +639,8 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, @@ -652,6 +658,8 @@ TYPED_TEST(Gmres, SolveWithImplicitResNormCritIsDisabled) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, 
-43564.90}, @@ -666,6 +674,8 @@ TYPED_TEST(Gmres, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -732,6 +742,8 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1WithRestart) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); auto gmres_factory_restart = Solver::build() @@ -759,6 +771,8 @@ TYPED_TEST(Gmres, SolvesWithPreconditioner) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); for (auto ortho : {ortho_method::mgs, ortho_method::cgs, ortho_method::cgs2}) { SCOPED_TRACE(ortho); @@ -792,6 +806,8 @@ TYPED_TEST(Gmres, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big->transpose()); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -809,6 +825,8 @@ TYPED_TEST(Gmres, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the system is already out of half precision range + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big->conj_transpose()); auto b = gko::initialize( diff --git 
a/reference/test/solver/idr_kernels.cpp b/reference/test/solver/idr_kernels.cpp index c3ca4fc1bd9..420a3f15684 100644 --- a/reference/test/solver/idr_kernels.cpp +++ b/reference/test/solver/idr_kernels.cpp @@ -2,10 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + #include #include #include +#include #include #include #include @@ -57,7 +60,7 @@ class Idr : public ::testing::Test { std::unique_ptr idr_factory_precision; }; -TYPED_TEST_SUITE(Idr, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Idr, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Idr, SolvesDenseSystem) @@ -76,7 +79,8 @@ TYPED_TEST(Idr, SolvesDenseSystem) TYPED_TEST(Idr, SolvesDenseSystemMixed) { - using value_type = gko::next_precision; + using T = typename TestFixture::value_type; + using value_type = gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->idr_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -91,6 +95,7 @@ TYPED_TEST(Idr, SolvesDenseSystemMixed) TYPED_TEST(Idr, SolvesDenseSystemComplex) { + using T = typename TestFixture::value_type; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->idr_factory->generate(this->mtx); @@ -112,8 +117,8 @@ TYPED_TEST(Idr, SolvesDenseSystemComplex) TYPED_TEST(Idr, SolvesDenseSystemMixedComplex) { - using value_type = - gko::to_complex>; + using T = typename TestFixture::value_type; + using value_type = gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->idr_factory->generate(this->mtx); auto b = gko::initialize( @@ -137,6 +142,7 @@ TYPED_TEST(Idr, SolvesDenseSystemWithComplexSubSpace) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using Solver = typename TestFixture::Solver; + // intermediate value is too small to represent in half auto half_tol = std::sqrt(r::value); auto solver_factory = Solver::build() @@ -231,7 +237,8 @@ 
TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApply) TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->idr_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -273,7 +280,7 @@ TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyComplex) TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyMixedComplex) { using Scalar = gko::matrix::Dense< - gko::next_precision>; + gko::next_precision_with_half>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->idr_factory->generate(this->mtx); @@ -321,6 +328,9 @@ TYPED_TEST(Idr, SolvesBigDenseSystemForDivergenceCheck1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the internal vector t will be too large in the first run and then out of + // the half precision range. + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, @@ -357,6 +367,9 @@ TYPED_TEST(Idr, SolvesBigDenseSystemForDivergenceCheck2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + // the internal vector t will be too large in the first run and then out of + // the half precision range. + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, @@ -386,6 +399,9 @@ TYPED_TEST(Idr, SolvesMultipleDenseSystemsDivergenceCheck) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using T = value_type; + // the internal vector t will be too large in the first run and then out of + // the half precision range. 
+ SKIP_IF_HALF(value_type); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, {-8.0, -66.0, 29.0, -96.0, -95.0, -14.0}, diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp index b0c1029f693..f329a16d932 100644 --- a/reference/test/solver/ir_kernels.cpp +++ b/reference/test/solver/ir_kernels.cpp @@ -47,7 +47,7 @@ class Ir : public ::testing::Test { std::unique_ptr ir_factory; }; -TYPED_TEST_SUITE(Ir, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Ir, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Ir, KernelInitialize) @@ -82,7 +82,8 @@ TYPED_TEST(Ir, SolvesTriangularSystem) TYPED_TEST(Ir, SolvesTriangularSystemMixed) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto solver = this->ir_factory->generate(this->mtx); auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); @@ -118,8 +119,8 @@ TYPED_TEST(Ir, SolvesTriangularSystemComplex) TYPED_TEST(Ir, SolvesTriangularSystemMixedComplex) { - using value_type = - gko::to_complex>; + using value_type = gko::to_complex< + gko::next_precision_with_half>; using Mtx = gko::matrix::Dense; auto solver = this->ir_factory->generate(this->mtx); auto b = gko::initialize( @@ -244,7 +245,7 @@ TYPED_TEST(Ir, SolvesTriangularSystemUsingAdvancedApplyComplex) TYPED_TEST(Ir, SolvesTriangularSystemUsingAdvancedApplyMixedComplex) { using Scalar = gko::matrix::Dense< - gko::next_precision>; + gko::next_precision_with_half>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->ir_factory->generate(this->mtx); diff --git a/test/solver/cb_gmres_kernels.cpp b/test/solver/cb_gmres_kernels.cpp index 022899d21e6..98eb295091b 100644 --- a/test/solver/cb_gmres_kernels.cpp +++ b/test/solver/cb_gmres_kernels.cpp @@ -146,7 +146,7 @@ class CbGmres : public CommonTestFixture { auto& krylov_bases = range_helper.get_bases(); 
d_to_host = d_range_helper.get_bases(); const auto tolerance = r::value; - using std::abs; + using gko::abs; for (gko::size_type i = 0; i < krylov_bases.get_size(); ++i) { const auto ref_value = krylov_bases.get_const_data()[i]; const auto dev_value = d_to_host.get_const_data()[i]; From 87125563d9666b650ca8e9fb4eba823ab1c094e1 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Fri, 25 Oct 2024 16:58:01 +0200 Subject: [PATCH 22/69] solver config dispatch --- core/config/solver_config.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/core/config/solver_config.cpp b/core/config/solver_config.cpp index b35a639b8e7..eb566986526 100644 --- a/core/config/solver_config.cpp +++ b/core/config/solver_config.cpp @@ -30,15 +30,15 @@ namespace gko { namespace config { -GKO_PARSE_VALUE_TYPE(Cg, gko::solver::Cg); -GKO_PARSE_VALUE_TYPE(Bicg, gko::solver::Bicg); -GKO_PARSE_VALUE_TYPE(Bicgstab, gko::solver::Bicgstab); -GKO_PARSE_VALUE_TYPE(Cgs, gko::solver::Cgs); -GKO_PARSE_VALUE_TYPE(Fcg, gko::solver::Fcg); -GKO_PARSE_VALUE_TYPE(Ir, gko::solver::Ir); -GKO_PARSE_VALUE_TYPE(Idr, gko::solver::Idr); -GKO_PARSE_VALUE_TYPE(Gcr, gko::solver::Gcr); -GKO_PARSE_VALUE_TYPE(Gmres, gko::solver::Gmres); +GKO_PARSE_VALUE_TYPE_WITH_HALF(Cg, gko::solver::Cg); +GKO_PARSE_VALUE_TYPE_WITH_HALF(Bicg, gko::solver::Bicg); +GKO_PARSE_VALUE_TYPE_WITH_HALF(Bicgstab, gko::solver::Bicgstab); +GKO_PARSE_VALUE_TYPE_WITH_HALF(Cgs, gko::solver::Cgs); +GKO_PARSE_VALUE_TYPE_WITH_HALF(Fcg, gko::solver::Fcg); +GKO_PARSE_VALUE_TYPE_WITH_HALF(Ir, gko::solver::Ir); +GKO_PARSE_VALUE_TYPE_WITH_HALF(Idr, gko::solver::Idr); +GKO_PARSE_VALUE_TYPE_WITH_HALF(Gcr, gko::solver::Gcr); +GKO_PARSE_VALUE_TYPE_WITH_HALF(Gmres, gko::solver::Gmres); GKO_PARSE_VALUE_TYPE(CbGmres, gko::solver::CbGmres); GKO_PARSE_VALUE_AND_INDEX_TYPE(Direct, gko::experimental::solver::Direct); GKO_PARSE_VALUE_AND_INDEX_TYPE(LowerTrs, gko::solver::LowerTrs); From ac216bc595980ffe30c69b6019ff343cea8fa3fe Mon Sep 
17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 25 Nov 2024 16:11:20 +0100 Subject: [PATCH 23/69] cuda with CC<70 and hip do not support 16 bit atomic. throw error for idr --- common/cuda_hip/solver/idr_kernels.cpp | 34 +++++++++++++++++++------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/common/cuda_hip/solver/idr_kernels.cpp b/common/cuda_hip/solver/idr_kernels.cpp index 0dc310ebd2e..649d8a1769c 100644 --- a/common/cuda_hip/solver/idr_kernels.cpp +++ b/common/cuda_hip/solver/idr_kernels.cpp @@ -454,11 +454,19 @@ void update_g_and_u(std::shared_ptr exec, if (nrhs > 1 || is_complex()) { components::fill_array(exec, alpha->get_values(), nrhs, zero()); - multidot_kernel<<get_stream()>>>( - size, nrhs, as_device_type(p_i), - as_device_type(g_k->get_values()), g_k->get_stride(), - as_device_type(alpha->get_values()), - stop_status->get_const_data()); + // not support 16 bit atomic +#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700)) + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(alpha); + } else +#endif + { + multidot_kernel<<get_stream()>>>( + size, nrhs, as_device_type(p_i), + as_device_type(g_k->get_values()), g_k->get_stride(), + as_device_type(alpha->get_values()), + stop_status->get_const_data()); + } } else { blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_values(), g_k->get_stride(), alpha->get_values()); @@ -505,10 +513,18 @@ void update_m(std::shared_ptr exec, const size_type nrhs, auto m_i = m->get_values() + i * m_stride + k * nrhs; if (nrhs > 1 || is_complex()) { components::fill_array(exec, m_i, nrhs, zero()); - multidot_kernel<<get_stream()>>>( - size, nrhs, as_device_type(p_i), - as_device_type(g_k->get_const_values()), g_k->get_stride(), - as_device_type(m_i), stop_status->get_const_data()); + // not support 16 bit atomic +#if !(defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700)) + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(m_i); + } else +#endif + { + multidot_kernel<<get_stream()>>>( + 
size, nrhs, as_device_type(p_i), + as_device_type(g_k->get_const_values()), g_k->get_stride(), + as_device_type(m_i), stop_status->get_const_data()); + } } else { blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_const_values(), g_k->get_stride(), m_i); From fa69a93ba1006925c65bb32aa1f5198d241520dd Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Fri, 25 Oct 2024 16:47:50 +0200 Subject: [PATCH 24/69] triangular and direct solver --- .../cuda_hip/components/memory.nvidia.hpp.inc | 364 ++++++++++++++++++ core/device_hooks/common_kernels.inc.cpp | 8 +- core/solver/direct.cpp | 5 +- core/solver/lower_trs.cpp | 5 +- core/solver/upper_trs.cpp | 5 +- core/test/solver/direct.cpp | 3 +- core/test/solver/lower_trs.cpp | 2 +- core/test/solver/upper_trs.cpp | 2 +- cuda/solver/common_trs_kernels.cuh | 15 +- cuda/solver/lower_trs_kernels.cu | 4 +- cuda/solver/upper_trs_kernels.cu | 4 +- dev_tools/scripts/generate_cuda_memory_ptx.py | 96 +++++ dpcpp/solver/lower_trs_kernels.dp.cpp | 4 +- dpcpp/solver/upper_trs_kernels.dp.cpp | 4 +- hip/solver/lower_trs_kernels.hip.cpp | 4 +- hip/solver/upper_trs_kernels.hip.cpp | 4 +- omp/solver/lower_trs_kernels.cpp | 4 +- omp/solver/upper_trs_kernels.cpp | 4 +- reference/solver/lower_trs_kernels.cpp | 4 +- reference/solver/upper_trs_kernels.cpp | 4 +- reference/test/solver/direct.cpp | 5 +- reference/test/solver/lower_trs.cpp | 2 +- reference/test/solver/lower_trs_kernels.cpp | 12 +- reference/test/solver/upper_trs.cpp | 2 +- reference/test/solver/upper_trs_kernels.cpp | 12 +- test/solver/direct.cpp | 6 +- 26 files changed, 530 insertions(+), 54 deletions(-) diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc index a695904e82a..f39c600ce6c 100644 --- a/common/cuda_hip/components/memory.nvidia.hpp.inc +++ b/common/cuda_hip/components/memory.nvidia.hpp.inc @@ -1031,3 +1031,367 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, "d"(real_result), 
"d"(imag_result) : "memory"); } + + +__device__ __forceinline__ __half load_relaxed_shared(const __half* ptr) +{ + float result; + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + " ld.volatile.shared.b16 t, [%1];\n\t" +#else + " ld.relaxed.cta.shared.b16 t, [%1];\n\t" +#endif + " cvt.f32.f16 %0, t;\n\t" + "}" + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast<__half*>(ptr))) + : "memory"); + + return static_cast<__half>(result); +} + + +__device__ __forceinline__ void store_relaxed_shared(__half* ptr, __half result) +{ + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" + " cvt.rn.f16.f32 t, %1;\n\t" +#if __CUDA_ARCH__ < 700 + " st.volatile.shared.b16 [%0], t;\n\t" +#else + " st.relaxed.cta.shared.b16 [%0], t;\n\t" +#endif + "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "f"(static_cast(result)) + : "memory"); +} + + +__device__ __forceinline__ __half load_acquire_shared(const __half* ptr) +{ + float result; + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + " ld.volatile.shared.b16 t, [%1];\n\t" +#else + " ld.acquire.cta.shared.b16 t, [%1];\n\t" +#endif + " cvt.f32.f16 %0, t;\n\t" + "}" + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast<__half*>(ptr))) + : "memory"); + membar_acq_rel_shared(); + return static_cast<__half>(result); +} + + +__device__ __forceinline__ void store_release_shared(__half* ptr, __half result) +{ + membar_acq_rel_shared(); + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" + " cvt.rn.f16.f32 t, %1;\n\t" +#if __CUDA_ARCH__ < 700 + " st.volatile.shared.b16 [%0], t;\n\t" +#else + " st.release.cta.shared.b16 [%0], t;\n\t" +#endif + "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "f"(static_cast(result)) + : "memory"); +} + + +__device__ __forceinline__ __half load_relaxed_local(const __half* ptr) +{ + float result; + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + " ld.volatile.b16 t, [%1];\n\t" +#else + " ld.relaxed.cta.b16 t, 
[%1];\n\t" +#endif + " cvt.f32.f16 %0, t;\n\t" + "}" + : "=f"(result) + : "l"(const_cast<__half*>(ptr)) + : "memory"); + + return static_cast<__half>(result); +} + + +__device__ __forceinline__ void store_relaxed_local(__half* ptr, __half result) +{ + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" + " cvt.rn.f16.f32 t, %1;\n\t" +#if __CUDA_ARCH__ < 700 + " st.volatile.b16 [%0], t;\n\t" +#else + " st.relaxed.cta.b16 [%0], t;\n\t" +#endif + "}" ::"l"(ptr), + "f"(static_cast(result)) + : "memory"); +} + + +__device__ __forceinline__ __half load_acquire_local(const __half* ptr) +{ + float result; + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + " ld.volatile.b16 t, [%1];\n\t" +#else + " ld.acquire.cta.b16 t, [%1];\n\t" +#endif + " cvt.f32.f16 %0, t;\n\t" + "}" + : "=f"(result) + : "l"(const_cast<__half*>(ptr)) + : "memory"); + membar_acq_rel_local(); + return static_cast<__half>(result); +} + + +__device__ __forceinline__ void store_release_local(__half* ptr, __half result) +{ + membar_acq_rel_local(); + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" + " cvt.rn.f16.f32 t, %1;\n\t" +#if __CUDA_ARCH__ < 700 + " st.volatile.b16 [%0], t;\n\t" +#else + " st.release.cta.b16 [%0], t;\n\t" +#endif + "}" ::"l"(ptr), + "f"(static_cast(result)) + : "memory"); +} + + +__device__ __forceinline__ __half load_relaxed(const __half* ptr) +{ + float result; + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + " ld.volatile.b16 t, [%1];\n\t" +#else + " ld.relaxed.gpu.b16 t, [%1];\n\t" +#endif + " cvt.f32.f16 %0, t;\n\t" + "}" + : "=f"(result) + : "l"(const_cast<__half*>(ptr)) + : "memory"); + + return static_cast<__half>(result); +} + + +__device__ __forceinline__ void store_relaxed(__half* ptr, __half result) +{ + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" + " cvt.rn.f16.f32 t, %1;\n\t" +#if __CUDA_ARCH__ < 700 + " st.volatile.b16 [%0], t;\n\t" +#else + " st.relaxed.gpu.b16 [%0], t;\n\t" +#endif + "}" ::"l"(ptr), + 
"f"(static_cast(result)) + : "memory"); +} + + +__device__ __forceinline__ __half load_acquire(const __half* ptr) +{ + float result; + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + " ld.volatile.b16 t, [%1];\n\t" +#else + " ld.acquire.gpu.b16 t, [%1];\n\t" +#endif + " cvt.f32.f16 %0, t;\n\t" + "}" + : "=f"(result) + : "l"(const_cast<__half*>(ptr)) + : "memory"); + membar_acq_rel(); + return static_cast<__half>(result); +} + + +__device__ __forceinline__ void store_release(__half* ptr, __half result) +{ + membar_acq_rel(); + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" + " cvt.rn.f16.f32 t, %1;\n\t" +#if __CUDA_ARCH__ < 700 + " st.volatile.b16 [%0], t;\n\t" +#else + " st.release.gpu.b16 [%0], t;\n\t" +#endif + "}" ::"l"(ptr), + "f"(static_cast(result)) + : "memory"); +} + + +__device__ __forceinline__ thrust::complex<__half> load_relaxed_shared( + const thrust::complex<__half>* ptr) +{ + float real_result; + float imag_result; + asm volatile( + "{\n\t" + " .reg .v2 .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.v2.b16 {t.x, t.y}, [%2];\n\t" +#else + "ld.relaxed.cta.shared.v2.b16 {t.x, t.y}, [%2];\n\t" +#endif + " cvt.f32.f16 %0, t.x;\n\t" + " cvt.f32.f16 %1, t.y;\n\t" + "}" + : "=f"(real_result), "=f"(imag_result) + : "r"(convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))) + : "memory"); + return thrust::complex<__half>{real_result, imag_result}; +} + + +__device__ __forceinline__ void store_relaxed_shared( + thrust::complex<__half>* ptr, thrust::complex<__half> result) +{ + auto real_result = static_cast(result.real()); + auto imag_result = static_cast(result.imag()); + asm volatile( + "{\n\t" + " .reg .v2 .f16 t;\n\t" + " cvt.rn.f16.f32 t.x, %1;\n\t" + " cvt.rn.f16.f32 t.y, %2;\n\t" +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.v2.b16 [%0], t;\n\t" +#else + "st.relaxed.cta.shared.v2.b16 [%0], t;\n\t" +#endif + "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "f"(real_result), "f"(imag_result) + : "memory"); +} + + 
+__device__ __forceinline__ thrust::complex<__half> load_relaxed_local( + const thrust::complex<__half>* ptr) +{ + float real_result; + float imag_result; + asm volatile( + "{\n\t" + " .reg .v2 .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + "ld.volatile.v2.b16 {t.x, t.y}, [%2];\n\t" +#else + "ld.relaxed.cta.v2.b16 {t.x, t.y}, [%2];\n\t" +#endif + " cvt.f32.f16 %0, t.x;\n\t" + " cvt.f32.f16 %1, t.y;\n\t" + "}" + : "=f"(real_result), "=f"(imag_result) + : "l"(const_cast*>(ptr)) + : "memory"); + return thrust::complex<__half>{real_result, imag_result}; +} + + +__device__ __forceinline__ void store_relaxed_local( + thrust::complex<__half>* ptr, thrust::complex<__half> result) +{ + auto real_result = static_cast(result.real()); + auto imag_result = static_cast(result.imag()); + asm volatile( + "{\n\t" + " .reg .v2 .f16 t;\n\t" + " cvt.rn.f16.f32 t.x, %1;\n\t" + " cvt.rn.f16.f32 t.y, %2;\n\t" +#if __CUDA_ARCH__ < 700 + "st.volatile.v2.b16 [%0], t;\n\t" +#else + "st.relaxed.cta.v2.b16 [%0], t;\n\t" +#endif + "}" ::"l"(ptr), + "f"(real_result), "f"(imag_result) + : "memory"); +} + + +__device__ __forceinline__ thrust::complex<__half> load_relaxed( + const thrust::complex<__half>* ptr) +{ + float real_result; + float imag_result; + asm volatile( + "{\n\t" + " .reg .v2 .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + "ld.volatile.v2.b16 {t.x, t.y}, [%2];\n\t" +#else + "ld.relaxed.gpu.v2.b16 {t.x, t.y}, [%2];\n\t" +#endif + " cvt.f32.f16 %0, t.x;\n\t" + " cvt.f32.f16 %1, t.y;\n\t" + "}" + : "=f"(real_result), "=f"(imag_result) + : "l"(const_cast*>(ptr)) + : "memory"); + return thrust::complex<__half>{real_result, imag_result}; +} + + +__device__ __forceinline__ void store_relaxed(thrust::complex<__half>* ptr, + thrust::complex<__half> result) +{ + auto real_result = static_cast(result.real()); + auto imag_result = static_cast(result.imag()); + asm volatile( + "{\n\t" + " .reg .v2 .f16 t;\n\t" + " cvt.rn.f16.f32 t.x, %1;\n\t" + " cvt.rn.f16.f32 t.y, %2;\n\t" +#if __CUDA_ARCH__ < 700 + 
"st.volatile.v2.b16 [%0], t;\n\t" +#else + "st.relaxed.gpu.v2.b16 [%0], t;\n\t" +#endif + "}" ::"l"(ptr), + "f"(real_result), "f"(imag_result) + : "memory"); +} diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 1c57ca45177..f37166613b7 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -572,8 +572,8 @@ namespace lower_trs { GKO_STUB(GKO_DECLARE_LOWER_TRS_SHOULD_PERFORM_TRANSPOSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL); } // namespace lower_trs @@ -583,8 +583,8 @@ namespace upper_trs { GKO_STUB(GKO_DECLARE_UPPER_TRS_SHOULD_PERFORM_TRANSPOSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL); } // namespace upper_trs diff --git a/core/solver/direct.cpp b/core/solver/direct.cpp index cf15bc4a9ae..69c2f9512dd 100644 --- a/core/solver/direct.cpp +++ b/core/solver/direct.cpp @@ -221,7 +221,7 @@ void Direct::apply_impl(const LinOp* alpha, #define GKO_DECLARE_DIRECT(ValueType, IndexType) \ class Direct -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIRECT); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_DIRECT); } // namespace solver @@ -283,7 +283,8 @@ std::vector workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIRECT_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_DIRECT_TRAITS); } // namespace solver diff --git a/core/solver/lower_trs.cpp 
b/core/solver/lower_trs.cpp index 3048c877dbd..da16061db03 100644 --- a/core/solver/lower_trs.cpp +++ b/core/solver/lower_trs.cpp @@ -248,8 +248,9 @@ std::vector workspace_traits>::vectors( #define GKO_DECLARE_LOWER_TRS(_vtype, _itype) class LowerTrs<_vtype, _itype> #define GKO_DECLARE_LOWER_TRS_TRAITS(_vtype, _itype) \ struct workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_TRS); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_TRS_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LOWER_TRS); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_LOWER_TRS_TRAITS); } // namespace solver diff --git a/core/solver/upper_trs.cpp b/core/solver/upper_trs.cpp index c759c119647..5e1dfb23df2 100644 --- a/core/solver/upper_trs.cpp +++ b/core/solver/upper_trs.cpp @@ -248,8 +248,9 @@ std::vector workspace_traits>::vectors( #define GKO_DECLARE_UPPER_TRS(_vtype, _itype) class UpperTrs<_vtype, _itype> #define GKO_DECLARE_UPPER_TRS_TRAITS(_vtype, _itype) \ struct workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_UPPER_TRS); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_UPPER_TRS_TRAITS); } // namespace solver diff --git a/core/test/solver/direct.cpp b/core/test/solver/direct.cpp index d895892a8be..43acdd0bdf1 100644 --- a/core/test/solver/direct.cpp +++ b/core/test/solver/direct.cpp @@ -35,7 +35,8 @@ class Direct : public ::testing::Test { std::unique_ptr factory; }; -TYPED_TEST_SUITE(Direct, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Direct, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Direct, FactoryKnowsItsExecutor) diff --git a/core/test/solver/lower_trs.cpp b/core/test/solver/lower_trs.cpp index 
dfcb564ca12..ae07e08c3f7 100644 --- a/core/test/solver/lower_trs.cpp +++ b/core/test/solver/lower_trs.cpp @@ -33,7 +33,7 @@ class LowerTrs : public ::testing::Test { std::unique_ptr lower_trs_factory; }; -TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/core/test/solver/upper_trs.cpp b/core/test/solver/upper_trs.cpp index 2e84cb81e10..bc53d1a193c 100644 --- a/core/test/solver/upper_trs.cpp +++ b/core/test/solver/upper_trs.cpp @@ -33,7 +33,7 @@ class UpperTrs : public ::testing::Test { std::unique_ptr upper_trs_factory; }; -TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 291c842325f..66643c0aa9f 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -212,12 +212,16 @@ struct CudaSolveStruct : gko::solver::SolveStruct { size_type work_size{}; + // nullptr is considered nullptr_t not casted to the function signature + // automatically Explicitly cast `nullptr` to `const ValueType*` to + // prevent compiler issues with gnu/llvm 9 sparselib::buffer_size_ext( handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE, SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, + matrix->get_const_col_idxs(), + static_cast(nullptr), num_rhs, solve_info, policy, &work_size); // allocate workspace @@ -228,7 +232,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct { SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - 
matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, + matrix->get_const_col_idxs(), + static_cast(nullptr), num_rhs, solve_info, policy, work.get_data()); } @@ -357,6 +362,10 @@ struct float_to_unsigned_impl { using type = uint32; }; +template <> +struct float_to_unsigned_impl<__half> { + using type = uint16; +}; /** * Checks if a floating point number representation matches the representation @@ -503,7 +512,7 @@ __global__ void sptrsv_naive_legacy_kernel( const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1]; const int row_step = is_upper ? -1 : 1; - ValueType sum = 0.0; + ValueType sum = zero(); auto j = row_begin; auto col = colidxs[j]; while (j != row_end) { diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu index b37f6536b0f..7832cf9e4c5 100644 --- a/cuda/solver/lower_trs_kernels.cu +++ b/cuda/solver/lower_trs_kernels.cu @@ -50,7 +50,7 @@ void generate(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL); @@ -70,7 +70,7 @@ void solve(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL); diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu index eb7d8386083..b6828bc0c92 100644 --- a/cuda/solver/upper_trs_kernels.cu +++ b/cuda/solver/upper_trs_kernels.cu @@ -50,7 +50,7 @@ void generate(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL); @@ -70,7 +70,7 @@ void solve(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL); diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py 
b/dev_tools/scripts/generate_cuda_memory_ptx.py index 49f99d4d96f..a408f1bb879 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -191,3 +191,99 @@ class type_desc: : "memory"); }} """) + +# since there are no constraints for f16 register an intermediate conversion needs to happen +t = type_desc(ptx_type_suffix='.f16', val_constraint='f', name='__half') +t.parent_name = "float" +t.ptx_parent_type_suffix = '.f32' +t.ptx_mem_type_suffix = '.b16' +for s in memory_spaces: + for o in memory_orderings: + membar_expression = "" if o.is_relaxed else f"membar_acq_rel{s.fn_suffix}();" + const_ptr_expr = s.ptr_expr.format( + ptr=f"const_cast<{t.name}*>(ptr)") + mut_ptr_expr = s.ptr_expr.format(ptr="ptr") + print(f""" +__device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr) +{{ + {t.parent_name} result; + asm volatile("{{\\n\\t" + " .reg {t.ptx_type_suffix} t;\\n\\t" + #if __CUDA_ARCH__ < 700 + " ld.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];\\n\\t" + #else + " ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];\\n\\t" + #endif + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t;\\n\\t" + "}}" + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({const_ptr_expr}) + : "memory"); + {membar_expression} + return static_cast<{t.name}>(result); +}} + + +__device__ __forceinline__ void store{o.fn_store_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} result) +{{ + {membar_expression} + asm volatile("{{\\n\\t" + " .reg {t.ptx_type_suffix} t;\\n\\t" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t, %1;\\n\\t" + #if __CUDA_ARCH__ < 700 + " st.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + #else + " st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + #endif + "}}" + :: "{s.ptr_constraint}"({mut_ptr_expr}), 
"{t.val_constraint}"(static_cast<{t.parent_name}>(result)) + : "memory"); +}} +""") + +for s in memory_spaces: + o = ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", + ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True) + const_ptr_expr = s.ptr_expr.format( + ptr=f"const_cast*>(ptr)") + mut_ptr_expr = s.ptr_expr.format(ptr="ptr") + print(f""" +__device__ __forceinline__ thrust::complex<{t.name}> load_relaxed{s.fn_suffix}(const thrust::complex<{t.name}>* ptr) +{{ + {t.parent_name} real_result; + {t.parent_name} imag_result; + asm volatile("{{\\n\\t" + " .reg .v2 {t.ptx_type_suffix} t;\\n\\t" +#if __CUDA_ARCH__ < 700 + "ld.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];\\n\\t" +#else + "ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];\\n\\t" +#endif + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t.x;\\n\\t" + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %1, t.y;\\n\\t" + "}}" + : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) + : "{s.ptr_constraint}"({const_ptr_expr}) + : "memory"); + return thrust::complex<{t.name}>{{real_result, imag_result}}; +}} + + +__device__ __forceinline__ void store_relaxed{s.fn_suffix}(thrust::complex<{t.name}>* ptr, thrust::complex<{t.name}> result) +{{ + auto real_result = static_cast<{t.parent_name}>(result.real()); + auto imag_result = static_cast<{t.parent_name}>(result.imag()); + asm volatile("{{\\n\\t" + " .reg .v2 {t.ptx_type_suffix} t;\\n\\t" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.x, %1;\\n\\t" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.y, %2;\\n\\t" +#if __CUDA_ARCH__ < 700 + "st.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;\\n\\t" +#else + "st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;\\n\\t" +#endif + "}}" + :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), 
"{t.val_constraint}"(imag_result) + : "memory"); +}} +""") diff --git a/dpcpp/solver/lower_trs_kernels.dp.cpp b/dpcpp/solver/lower_trs_kernels.dp.cpp index 449bfe5cfcf..62cfe93a59d 100644 --- a/dpcpp/solver/lower_trs_kernels.dp.cpp +++ b/dpcpp/solver/lower_trs_kernels.dp.cpp @@ -42,7 +42,7 @@ void generate(std::shared_ptr exec, bool unit_diag, const solver::trisolve_algorithm algorithm, const size_type num_rhs) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL); @@ -59,7 +59,7 @@ void solve(std::shared_ptr exec, const matrix::Dense* b, matrix::Dense* x) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL); diff --git a/dpcpp/solver/upper_trs_kernels.dp.cpp b/dpcpp/solver/upper_trs_kernels.dp.cpp index 7ac4950fe82..49e0a931e74 100644 --- a/dpcpp/solver/upper_trs_kernels.dp.cpp +++ b/dpcpp/solver/upper_trs_kernels.dp.cpp @@ -42,7 +42,7 @@ void generate(std::shared_ptr exec, bool unit_diag, const solver::trisolve_algorithm algorithm, const size_type num_rhs) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL); @@ -59,7 +59,7 @@ void solve(std::shared_ptr exec, const matrix::Dense* b, matrix::Dense* x) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL); diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp index 5eab76ed5fa..6858f1eddc0 100644 --- a/hip/solver/lower_trs_kernels.hip.cpp +++ b/hip/solver/lower_trs_kernels.hip.cpp @@ -54,7 +54,7 @@ void generate(std::shared_ptr exec, false, unit_diag); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL); @@ -70,7 +70,7 @@ void solve(std::shared_ptr exec, trans_x, b, x); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL); diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp index fb480d9b22d..f1398faeea4 100644 --- a/hip/solver/upper_trs_kernels.hip.cpp +++ b/hip/solver/upper_trs_kernels.hip.cpp @@ -54,7 +54,7 @@ void generate(std::shared_ptr exec, true, unit_diag); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL); @@ -70,7 +70,7 @@ void solve(std::shared_ptr exec, trans_x, b, x); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL); diff --git a/omp/solver/lower_trs_kernels.cpp b/omp/solver/lower_trs_kernels.cpp index 6dac6b46078..c873e5e8958 100644 --- a/omp/solver/lower_trs_kernels.cpp +++ b/omp/solver/lower_trs_kernels.cpp @@ -47,7 +47,7 @@ void generate(std::shared_ptr exec, // "analysis" phase for the triangular matrix. } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL); @@ -88,7 +88,7 @@ void solve(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL); diff --git a/omp/solver/upper_trs_kernels.cpp b/omp/solver/upper_trs_kernels.cpp index ea05cabeb63..5014f823d35 100644 --- a/omp/solver/upper_trs_kernels.cpp +++ b/omp/solver/upper_trs_kernels.cpp @@ -47,7 +47,7 @@ void generate(std::shared_ptr exec, // "analysis" phase for the triangular matrix. 
} -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL); @@ -90,7 +90,7 @@ void solve(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL); diff --git a/reference/solver/lower_trs_kernels.cpp b/reference/solver/lower_trs_kernels.cpp index ba02c9c838c..49e3829d9af 100644 --- a/reference/solver/lower_trs_kernels.cpp +++ b/reference/solver/lower_trs_kernels.cpp @@ -44,7 +44,7 @@ void generate(std::shared_ptr exec, // "analysis" phase for the triangular matrix. } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL); @@ -88,7 +88,7 @@ void solve(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL); diff --git a/reference/solver/upper_trs_kernels.cpp b/reference/solver/upper_trs_kernels.cpp index f0c23a9c4cc..b1d045eeadb 100644 --- a/reference/solver/upper_trs_kernels.cpp +++ b/reference/solver/upper_trs_kernels.cpp @@ -44,7 +44,7 @@ void generate(std::shared_ptr exec, // "analysis" phase for the triangular matrix. 
} -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL); @@ -90,7 +90,7 @@ void solve(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL); diff --git a/reference/test/solver/direct.cpp b/reference/test/solver/direct.cpp index 1fb147a7a2b..e421811382f 100644 --- a/reference/test/solver/direct.cpp +++ b/reference/test/solver/direct.cpp @@ -49,7 +49,7 @@ class Direct : public ::testing::Test { symmetric)) .on(exec); solver = factory->generate(mtx); - std::normal_distribution> dist(0, 1); + std::normal_distribution<> dist(0, 1); x = gko::test::generate_random_dense_matrix( mtx->get_size()[0], nrhs, dist, rng, this->exec); x_ref = x->clone(); @@ -66,7 +66,8 @@ class Direct : public ::testing::Test { std::unique_ptr solver; }; -TYPED_TEST_SUITE(Direct, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Direct, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Direct, SolvesAni1SingleRhs) diff --git a/reference/test/solver/lower_trs.cpp b/reference/test/solver/lower_trs.cpp index d52ee028b53..fd6fe1e4b16 100644 --- a/reference/test/solver/lower_trs.cpp +++ b/reference/test/solver/lower_trs.cpp @@ -45,7 +45,7 @@ class LowerTrs : public ::testing::Test { std::unique_ptr solver; }; -TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/reference/test/solver/lower_trs_kernels.cpp b/reference/test/solver/lower_trs_kernels.cpp index 3680f19681f..6d54efd2913 100644 --- a/reference/test/solver/lower_trs_kernels.cpp +++ b/reference/test/solver/lower_trs_kernels.cpp @@ -75,7 +75,7 @@ class LowerTrs : public ::testing::Test { std::unique_ptr lower_trs_factory_unit; }; -TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypes, 
+TYPED_TEST_SUITE(LowerTrs, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); @@ -108,7 +108,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystem) TYPED_TEST(LowerTrs, SolvesTriangularSystemMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = gko::next_precision_with_half; using Mtx = gko::matrix::Dense; std::shared_ptr b = gko::initialize({1.0, 2.0, 1.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); @@ -146,7 +146,8 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemComplex) TYPED_TEST(LowerTrs, SolvesTriangularSystemMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; std::shared_ptr b = gko::initialize( @@ -217,7 +218,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApply) TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -259,7 +260,8 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyComplex) TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto alpha = gko::initialize({2.0}, this->exec); diff --git a/reference/test/solver/upper_trs.cpp b/reference/test/solver/upper_trs.cpp index 9980c51f9d1..b59744a0e8c 100644 --- a/reference/test/solver/upper_trs.cpp +++ b/reference/test/solver/upper_trs.cpp @@ -45,7 +45,7 @@ class UpperTrs 
: public ::testing::Test { std::unique_ptr upper_trs_solver; }; -TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/reference/test/solver/upper_trs_kernels.cpp b/reference/test/solver/upper_trs_kernels.cpp index a60f3b46079..870542593ff 100644 --- a/reference/test/solver/upper_trs_kernels.cpp +++ b/reference/test/solver/upper_trs_kernels.cpp @@ -75,7 +75,7 @@ class UpperTrs : public ::testing::Test { std::unique_ptr upper_trs_factory_unit; }; -TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(UpperTrs, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); @@ -108,7 +108,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystem) TYPED_TEST(UpperTrs, SolvesTriangularSystemMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = gko::next_precision_with_half; using Mtx = gko::matrix::Dense; std::shared_ptr b = gko::initialize({4.0, 2.0, 3.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); @@ -146,7 +146,8 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemComplex) TYPED_TEST(UpperTrs, SolvesTriangularSystemMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; std::shared_ptr b = gko::initialize( @@ -218,7 +219,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApply) TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = gko::next_precision_with_half; using Mtx = gko::matrix::Dense; auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -260,7 +261,8 @@ TYPED_TEST(UpperTrs, 
SolvesTriangularSystemUsingAdvancedApplyComplex) TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto alpha = gko::initialize({2.0}, this->exec); diff --git a/test/solver/direct.cpp b/test/solver/direct.cpp index da77682bcdd..1ee95806c37 100644 --- a/test/solver/direct.cpp +++ b/test/solver/direct.cpp @@ -51,9 +51,7 @@ class Direct : public CommonTestFixture { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution<>(num_cols, num_cols), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); } void initialize_data(const char* mtx_filename, int nrhs) @@ -102,7 +100,7 @@ class Direct : public CommonTestFixture { }; #ifdef GKO_COMPILING_OMP -using Types = gko::test::ValueIndexTypes; +using Types = gko::test::ValueIndexTypesWithHalf; #elif defined(GKO_COMPILING_CUDA) // CUDA don't support long indices for sorting, and the triangular solvers // seem broken From 153087f9400d2871344b12b6e2da755bd2afab9f Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Mon, 4 Nov 2024 10:21:40 +0100 Subject: [PATCH 25/69] workaround for half precision of load/store by using single precision in shared memory --- cuda/solver/common_trs_kernels.cuh | 32 ++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 66643c0aa9f..4058112a44b 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -213,7 +213,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct { size_type work_size{}; // nullptr is considered nullptr_t not casted to the function signature - // automatically Explicitly cast `nullptr` to `const ValueType*` to + // automatically explicitly cast `nullptr` to `const ValueType*` to // prevent compiler issues with gnu/llvm 9 sparselib::buffer_size_ext( handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE, @@ -406,7 +406,16 @@ __global__ void sptrsv_naive_caching_kernel( const size_type nrhs, bool unit_diag, bool* nan_produced, IndexType* atomic_counter) { - __shared__ uninitialized_array x_s_array; + // TODO: need to investigate + // memory operation on the half-precision shared_memory seem to give + // wrong result. we use float in shared_memory. + using SharedValueType = std::conditional_t< + std::is_same, __half>::value, + std::conditional_t(), thrust::complex, + float>, + ValueType>; + __shared__ uninitialized_array + x_s_array; __shared__ IndexType block_base_idx; if (threadIdx.x == 0) { @@ -426,8 +435,8 @@ __global__ void sptrsv_naive_caching_kernel( const auto self_shmem_id = full_gid / default_block_size; const auto self_shid = full_gid % default_block_size; - ValueType* x_s = x_s_array; - x_s[self_shid] = nan(); + SharedValueType* x_s = x_s_array; + x_s[self_shid] = nan(); __syncthreads(); @@ -439,20 +448,19 @@ __global__ void sptrsv_naive_caching_kernel( const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1]; const int row_step = is_upper ? 
-1 : 1; - auto sum = zero(); + auto sum = zero(); auto i = row_begin; for (; i != row_end; i += row_step) { const auto dependency = colidxs[i]; if (is_upper ? dependency <= row : dependency >= row) { break; } - auto x_p = &x[dependency * x_stride + rhs]; const auto dependency_gid = is_upper ? (n - 1 - dependency) * nrhs + rhs : dependency * nrhs + rhs; const bool shmem_possible = (dependency_gid / default_block_size) == self_shmem_id; - ValueType val{}; + SharedValueType val{}; if (shmem_possible) { const auto dependency_shid = dependency_gid % default_block_size; while (is_nan_exact( @@ -464,15 +472,17 @@ __global__ void sptrsv_naive_caching_kernel( } } - sum += val * vals[i]; + sum += val * static_cast(vals[i]); } // The first entry past the triangular part will be the diagonal - const auto diag = unit_diag ? one() : vals[i]; - const auto r = (b[row * b_stride + rhs] - sum) / diag; + const auto diag = unit_diag ? one() + : static_cast(vals[i]); + const auto r = + (static_cast(b[row * b_stride + rhs]) - sum) / diag; store_relaxed_shared(x_s + self_shid, r); - store_relaxed(x + row * x_stride + rhs, r); + store_relaxed(x + row * x_stride + rhs, static_cast(r)); // This check to ensure no infinite loops happen. if (is_nan_exact(r)) { From e95076f128f71cad4b23c69ae23837e3fdc4551a Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Mon, 18 Nov 2024 12:14:07 +0100 Subject: [PATCH 26/69] delete the current unusable half memory op on shared memory --- .../cuda_hip/components/memory.nvidia.hpp.inc | 122 ------------------ dev_tools/scripts/generate_cuda_memory_ptx.py | 6 +- 2 files changed, 4 insertions(+), 124 deletions(-) diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc index f39c600ce6c..f759c613f45 100644 --- a/common/cuda_hip/components/memory.nvidia.hpp.inc +++ b/common/cuda_hip/components/memory.nvidia.hpp.inc @@ -1033,83 +1033,6 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, } -__device__ __forceinline__ __half load_relaxed_shared(const __half* ptr) -{ - float result; - asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" -#if __CUDA_ARCH__ < 700 - " ld.volatile.shared.b16 t, [%1];\n\t" -#else - " ld.relaxed.cta.shared.b16 t, [%1];\n\t" -#endif - " cvt.f32.f16 %0, t;\n\t" - "}" - : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast<__half*>(ptr))) - : "memory"); - - return static_cast<__half>(result); -} - - -__device__ __forceinline__ void store_relaxed_shared(__half* ptr, __half result) -{ - asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" -#if __CUDA_ARCH__ < 700 - " st.volatile.shared.b16 [%0], t;\n\t" -#else - " st.relaxed.cta.shared.b16 [%0], t;\n\t" -#endif - "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), - "f"(static_cast(result)) - : "memory"); -} - - -__device__ __forceinline__ __half load_acquire_shared(const __half* ptr) -{ - float result; - asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" -#if __CUDA_ARCH__ < 700 - " ld.volatile.shared.b16 t, [%1];\n\t" -#else - " ld.acquire.cta.shared.b16 t, [%1];\n\t" -#endif - " cvt.f32.f16 %0, t;\n\t" - "}" - : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast<__half*>(ptr))) - : "memory"); - membar_acq_rel_shared(); - return static_cast<__half>(result); -} - - -__device__ __forceinline__ void 
store_release_shared(__half* ptr, __half result) -{ - membar_acq_rel_shared(); - asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" -#if __CUDA_ARCH__ < 700 - " st.volatile.shared.b16 [%0], t;\n\t" -#else - " st.release.cta.shared.b16 [%0], t;\n\t" -#endif - "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), - "f"(static_cast(result)) - : "memory"); -} - - __device__ __forceinline__ __half load_relaxed_local(const __half* ptr) { float result; @@ -1264,51 +1187,6 @@ __device__ __forceinline__ void store_release(__half* ptr, __half result) } -__device__ __forceinline__ thrust::complex<__half> load_relaxed_shared( - const thrust::complex<__half>* ptr) -{ - float real_result; - float imag_result; - asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" -#if __CUDA_ARCH__ < 700 - "ld.volatile.shared.v2.b16 {t.x, t.y}, [%2];\n\t" -#else - "ld.relaxed.cta.shared.v2.b16 {t.x, t.y}, [%2];\n\t" -#endif - " cvt.f32.f16 %0, t.x;\n\t" - " cvt.f32.f16 %1, t.y;\n\t" - "}" - : "=f"(real_result), "=f"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr( - const_cast*>(ptr))) - : "memory"); - return thrust::complex<__half>{real_result, imag_result}; -} - - -__device__ __forceinline__ void store_relaxed_shared( - thrust::complex<__half>* ptr, thrust::complex<__half> result) -{ - auto real_result = static_cast(result.real()); - auto imag_result = static_cast(result.imag()); - asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" - " cvt.rn.f16.f32 t.x, %1;\n\t" - " cvt.rn.f16.f32 t.y, %2;\n\t" -#if __CUDA_ARCH__ < 700 - "st.volatile.shared.v2.b16 [%0], t;\n\t" -#else - "st.relaxed.cta.shared.v2.b16 [%0], t;\n\t" -#endif - "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), - "f"(real_result), "f"(imag_result) - : "memory"); -} - - __device__ __forceinline__ thrust::complex<__half> load_relaxed_local( const thrust::complex<__half>* ptr) { diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index 
a408f1bb879..834c49dba46 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -193,11 +193,13 @@ class type_desc: """) # since there are no constraints for f16 register an intermediate conversion needs to happen +# There are some issues when using f16 on shared memory. We disable them currently. +memory_spaces_without_shared=memory_spaces[1:] t = type_desc(ptx_type_suffix='.f16', val_constraint='f', name='__half') t.parent_name = "float" t.ptx_parent_type_suffix = '.f32' t.ptx_mem_type_suffix = '.b16' -for s in memory_spaces: +for s in memory_spaces_without_shared: for o in memory_orderings: membar_expression = "" if o.is_relaxed else f"membar_acq_rel{s.fn_suffix}();" const_ptr_expr = s.ptr_expr.format( @@ -241,7 +243,7 @@ class type_desc: }} """) -for s in memory_spaces: +for s in memory_spaces_without_shared: o = ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True) const_ptr_expr = s.ptr_expr.format( From f2de94cd53cecb15742e47a5b30b567be25212ce Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Fri, 25 Oct 2024 16:58:48 +0200 Subject: [PATCH 27/69] direct and tri config dispatch --- core/config/solver_config.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/config/solver_config.cpp b/core/config/solver_config.cpp index eb566986526..04bf5f5fcd5 100644 --- a/core/config/solver_config.cpp +++ b/core/config/solver_config.cpp @@ -40,9 +40,10 @@ GKO_PARSE_VALUE_TYPE_WITH_HALF(Idr, gko::solver::Idr); GKO_PARSE_VALUE_TYPE_WITH_HALF(Gcr, gko::solver::Gcr); GKO_PARSE_VALUE_TYPE_WITH_HALF(Gmres, gko::solver::Gmres); GKO_PARSE_VALUE_TYPE(CbGmres, gko::solver::CbGmres); -GKO_PARSE_VALUE_AND_INDEX_TYPE(Direct, gko::experimental::solver::Direct); -GKO_PARSE_VALUE_AND_INDEX_TYPE(LowerTrs, gko::solver::LowerTrs); -GKO_PARSE_VALUE_AND_INDEX_TYPE(UpperTrs, gko::solver::UpperTrs); +GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Direct, + gko::experimental::solver::Direct); +GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(LowerTrs, gko::solver::LowerTrs); +GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(UpperTrs, gko::solver::UpperTrs); template <> From 17edd7354d90c351005069bfe37cf3a1d7205a80 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Fri, 25 Oct 2024 18:22:25 +0200 Subject: [PATCH 28/69] factorization --- .../factorization/cholesky_kernels.cpp | 12 ++-- .../factorization/factorization_kernels.cpp | 10 ++-- common/cuda_hip/factorization/ic_kernels.cpp | 2 +- common/cuda_hip/factorization/ilu_kernels.cpp | 2 +- common/cuda_hip/factorization/lu_kernels.cpp | 6 +- .../cuda_hip/factorization/par_ic_kernels.cpp | 29 +++++---- .../factorization/par_ict_kernels.cpp | 26 +++++--- .../factorization/par_ilu_kernels.cpp | 32 ++++++---- .../par_ilut_approx_filter_kernels.cpp | 2 +- .../factorization/par_ilut_filter_kernels.cpp | 2 +- .../factorization/par_ilut_select_common.cpp | 16 +++-- .../factorization/par_ilut_select_kernels.cpp | 19 ++++-- .../factorization/par_ilut_select_kernels.hpp | 4 +- .../factorization/par_ilut_spgeam_kernels.cpp | 2 +- .../factorization/par_ilut_sweep_kernels.cpp | 36 ++++++----- core/device_hooks/common_kernels.inc.cpp | 60 +++++++++++-------- core/factorization/cholesky.cpp | 2 +- core/factorization/elimination_forest.cpp | 3 +- core/factorization/factorization.cpp | 3 +- core/factorization/ic.cpp | 2 +- core/factorization/ilu.cpp | 2 +- core/factorization/lu.cpp | 2 +- core/factorization/par_ic.cpp | 2 +- core/factorization/par_ict.cpp | 2 +- core/factorization/par_ilu.cpp | 2 +- core/factorization/par_ilut.cpp | 2 +- core/factorization/symbolic.cpp | 8 ++- .../test/factorization/elimination_forest.cpp | 2 +- core/test/factorization/par_ic.cpp | 3 +- core/test/factorization/par_ict.cpp | 3 +- core/test/factorization/par_ilu.cpp | 3 +- core/test/factorization/par_ilut.cpp | 2 +- .../factorization_kernels.dp.cpp | 12 ++-- dpcpp/factorization/par_ic_kernels.dp.cpp | 8 +-- dpcpp/factorization/par_ict_kernels.dp.cpp | 6 +- .../par_ilut_filter_kernels.hpp.inc | 4 +- .../par_ilut_select_kernels.hpp.inc | 8 +-- omp/factorization/cholesky_kernels.cpp | 12 ++-- omp/factorization/factorization_kernels.cpp | 10 ++-- omp/factorization/ic_kernels.cpp | 2 +- 
omp/factorization/ilu_kernels.cpp | 2 +- omp/factorization/lu_kernels.cpp | 8 ++- omp/factorization/par_ic_kernels.cpp | 4 +- omp/factorization/par_ict_kernels.cpp | 4 +- omp/factorization/par_ilu_kernels.cpp | 2 +- omp/factorization/par_ilut_kernels.cpp | 17 ++++-- reference/factorization/cholesky_kernels.cpp | 12 ++-- .../factorization/factorization_kernels.cpp | 10 ++-- reference/factorization/ic_kernels.cpp | 2 +- reference/factorization/ilu_kernels.cpp | 2 +- reference/factorization/lu_kernels.cpp | 8 ++- reference/factorization/par_ic_kernels.cpp | 4 +- reference/factorization/par_ict_kernels.cpp | 4 +- reference/factorization/par_ilu_kernels.cpp | 2 +- reference/factorization/par_ilut_kernels.cpp | 17 ++++-- .../test/factorization/cholesky_kernels.cpp | 2 +- .../test/factorization/factorization.cpp | 2 +- reference/test/factorization/ic_kernels.cpp | 3 +- reference/test/factorization/ilu_kernels.cpp | 3 +- reference/test/factorization/lu_kernels.cpp | 11 ++-- .../test/factorization/par_ic_kernels.cpp | 3 +- .../test/factorization/par_ict_kernels.cpp | 3 +- .../test/factorization/par_ilu_kernels.cpp | 3 +- .../test/factorization/par_ilut_kernels.cpp | 27 ++++++--- test/factorization/lu_kernels.cpp | 2 +- test/factorization/par_ic_kernels.cpp | 8 ++- test/factorization/par_ict_kernels.cpp | 13 ++-- test/factorization/par_ilu_kernels.cpp | 9 ++- test/factorization/par_ilut_kernels.cpp | 53 +++++++++------- 69 files changed, 364 insertions(+), 241 deletions(-) diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp index 7ff1382d8c6..ef24bb47fe0 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -262,7 +262,7 @@ void symbolic_factorize( postorder, postorder_parent, out_row_ptrs, out_cols); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( 
GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); @@ -321,7 +321,7 @@ void forest_from_factor( build_children_from_parents(exec, forest); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); @@ -355,7 +355,8 @@ void initialize(std::shared_ptr exec, transpose_idxs); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CHOLESKY_INITIALIZE); template @@ -390,7 +391,8 @@ void factorize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CHOLESKY_FACTORIZE); template @@ -446,7 +448,7 @@ void symbolic_count(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); diff --git a/common/cuda_hip/factorization/factorization_kernels.cpp b/common/cuda_hip/factorization/factorization_kernels.cpp index f26ef668d34..8e8893df535 100644 --- a/common/cuda_hip/factorization/factorization_kernels.cpp +++ b/common/cuda_hip/factorization/factorization_kernels.cpp @@ -355,7 +355,7 @@ void add_diagonal_elements(std::shared_ptr exec, mtx_builder.get_col_idx_array() = std::move(new_col_idx_array); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL); @@ -385,7 +385,7 @@ void initialize_row_ptrs_l_u( components::prefix_sum_nonnegative(exec, u_row_ptrs, num_rows + 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL); @@ -418,7 +418,7 @@ void initialize_l_u(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL); @@ -446,7 +446,7 @@ void initialize_row_ptrs_l( components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL); @@ -483,7 +483,7 @@ void initialize_l(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL); diff --git a/common/cuda_hip/factorization/ic_kernels.cpp b/common/cuda_hip/factorization/ic_kernels.cpp index e84032bac35..c2ed0b17cf0 100644 --- a/common/cuda_hip/factorization/ic_kernels.cpp +++ b/common/cuda_hip/factorization/ic_kernels.cpp @@ -54,7 +54,7 @@ void sparselib_ic(std::shared_ptr exec, sparselib::destroy(desc); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_IC_SPARSELIB_IC_KERNEL); diff --git a/common/cuda_hip/factorization/ilu_kernels.cpp b/common/cuda_hip/factorization/ilu_kernels.cpp index b81f8fb9092..eb7677e117f 100644 --- a/common/cuda_hip/factorization/ilu_kernels.cpp +++ b/common/cuda_hip/factorization/ilu_kernels.cpp @@ -54,7 +54,7 @@ void sparselib_ilu(std::shared_ptr exec, sparselib::destroy(desc); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL); diff --git a/common/cuda_hip/factorization/lu_kernels.cpp b/common/cuda_hip/factorization/lu_kernels.cpp index b0d54e44217..4d98b611e28 100644 --- a/common/cuda_hip/factorization/lu_kernels.cpp +++ b/common/cuda_hip/factorization/lu_kernels.cpp @@ -253,7 +253,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE); 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_LU_INITIALIZE); template @@ -286,7 +287,8 @@ void factorize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_LU_FACTORIZE); template diff --git a/common/cuda_hip/factorization/par_ic_kernels.cpp b/common/cuda_hip/factorization/par_ic_kernels.cpp index ee8b7c97f64..87e2fefd823 100644 --- a/common/cuda_hip/factorization/par_ic_kernels.cpp +++ b/common/cuda_hip/factorization/par_ic_kernels.cpp @@ -110,7 +110,7 @@ void init_factor(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL); @@ -123,19 +123,28 @@ void compute_factor(std::shared_ptr exec, auto nnz = l->get_num_stored_elements(); auto num_blocks = ceildiv(nnz, default_block_size); if (num_blocks > 0) { - for (size_type i = 0; i < iterations; ++i) { - kernel::ic_sweep<<get_stream()>>>( - a_lower->get_const_row_idxs(), a_lower->get_const_col_idxs(), - as_device_type(a_lower->get_const_values()), - l->get_const_row_ptrs(), l->get_const_col_idxs(), - as_device_type(l->get_values()), - static_cast(l->get_num_stored_elements())); +#ifdef GKO_COMPILING_HIP + if constexpr (std::is_same, half>::value) { + // HIP does not support 16bit atomic operation + GKO_NOT_SUPPORTED(a_lower); + } else +#endif + { + for (size_type i = 0; i < iterations; ++i) { + kernel::ic_sweep<<get_stream()>>>( + a_lower->get_const_row_idxs(), + a_lower->get_const_col_idxs(), + as_device_type(a_lower->get_const_values()), + l->get_const_row_ptrs(), l->get_const_col_idxs(), + as_device_type(l->get_values()), + static_cast(l->get_num_stored_elements())); + } } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL); diff --git 
a/common/cuda_hip/factorization/par_ict_kernels.cpp b/common/cuda_hip/factorization/par_ict_kernels.cpp index 3446f124123..0acf0633a2c 100644 --- a/common/cuda_hip/factorization/par_ict_kernels.cpp +++ b/common/cuda_hip/factorization/par_ict_kernels.cpp @@ -390,13 +390,21 @@ void compute_factor(syn::value_list, auto block_size = default_block_size / subwarp_size; auto num_blocks = ceildiv(total_nnz, block_size); if (num_blocks > 0) { - kernel::ict_sweep - <<get_stream()>>>( - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_device_type(a->get_const_values()), l->get_const_row_ptrs(), - l_coo->get_const_row_idxs(), l->get_const_col_idxs(), - as_device_type(l->get_values()), - static_cast(l->get_num_stored_elements())); +#ifdef GKO_COMPILING_HIP + if constexpr (std::is_same, half>::value) { + // HIP does not support 16bit atomic operation + GKO_NOT_SUPPORTED(l); + } else +#endif + { + kernel::ict_sweep + <<get_stream()>>>( + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_device_type(a->get_const_values()), + l->get_const_row_ptrs(), l_coo->get_const_row_idxs(), + l->get_const_col_idxs(), as_device_type(l->get_values()), + static_cast(l->get_num_stored_elements())); + } } } @@ -427,7 +435,7 @@ void add_candidates(std::shared_ptr exec, syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); @@ -449,7 +457,7 @@ void compute_factor(std::shared_ptr exec, syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); diff --git a/common/cuda_hip/factorization/par_ilu_kernels.cpp b/common/cuda_hip/factorization/par_ilu_kernels.cpp index 8bf71c471a8..a22bb85275a 100644 --- a/common/cuda_hip/factorization/par_ilu_kernels.cpp +++ 
b/common/cuda_hip/factorization/par_ilu_kernels.cpp @@ -94,21 +94,31 @@ void compute_l_u_factors(std::shared_ptr exec, const auto grid_dim = static_cast( ceildiv(num_elements, static_cast(block_size))); if (grid_dim > 0) { - for (size_type i = 0; i < iterations; ++i) { - kernel::compute_l_u_factors<<get_stream()>>>( - num_elements, system_matrix->get_const_row_idxs(), - system_matrix->get_const_col_idxs(), - as_device_type(system_matrix->get_const_values()), - l_factor->get_const_row_ptrs(), l_factor->get_const_col_idxs(), - as_device_type(l_factor->get_values()), - u_factor->get_const_row_ptrs(), u_factor->get_const_col_idxs(), - as_device_type(u_factor->get_values())); +#ifdef GKO_COMPILING_HIP + if constexpr (std::is_same, half>::value) { + // HIP does not support 16bit atomic operation + GKO_NOT_SUPPORTED(system_matrix); + } else +#endif + { + for (size_type i = 0; i < iterations; ++i) { + kernel::compute_l_u_factors<<get_stream()>>>( + num_elements, system_matrix->get_const_row_idxs(), + system_matrix->get_const_col_idxs(), + as_device_type(system_matrix->get_const_values()), + l_factor->get_const_row_ptrs(), + l_factor->get_const_col_idxs(), + as_device_type(l_factor->get_values()), + u_factor->get_const_row_ptrs(), + u_factor->get_const_col_idxs(), + as_device_type(u_factor->get_values())); + } } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL); diff --git a/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp b/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp index 12d8da9e4f5..475d87b8bda 100644 --- a/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp +++ b/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp @@ -168,7 +168,7 @@ void threshold_filter_approx(std::shared_ptr exec, &threshold, m_out, m_out_coo); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL); diff --git a/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp b/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp index 25432fb44d2..d6ad2f477eb 100644 --- a/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp +++ b/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp @@ -123,7 +123,7 @@ void threshold_filter(std::shared_ptr exec, m_out_coo, lower); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); diff --git a/common/cuda_hip/factorization/par_ilut_select_common.cpp b/common/cuda_hip/factorization/par_ilut_select_common.cpp index fccb89fcf5a..3bb67d96e4f 100644 --- a/common/cuda_hip/factorization/par_ilut_select_common.cpp +++ b/common/cuda_hip/factorization/par_ilut_select_common.cpp @@ -43,9 +43,17 @@ void sampleselect_count(std::shared_ptr exec, auto num_threads_total = ceildiv(size, items_per_thread); auto num_blocks = static_cast(ceildiv(num_threads_total, default_block_size)); - // pick sample, build searchtree - kernel::build_searchtree<<<1, bucket_count, 0, exec->get_stream()>>>( - as_device_type(values), size, as_device_type(tree)); +#ifdef GKO_COMPILING_HIP + if constexpr (std::is_same, half>::value) { + // HIP does not support 16bit atomic operation + GKO_NOT_SUPPORTED(values); + } else +#endif + { + // pick sample, build searchtree + kernel::build_searchtree<<<1, bucket_count, 0, exec->get_stream()>>>( + as_device_type(values), size, as_device_type(tree)); + } // determine bucket sizes if (num_blocks > 0) { kernel::count_buckets<< exec, unsigned char* oracles, IndexType* partial_counts, \ IndexType* total_counts) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(DECLARE_SSSS_COUNT); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(DECLARE_SSSS_COUNT); template diff --git 
a/common/cuda_hip/factorization/par_ilut_select_kernels.cpp b/common/cuda_hip/factorization/par_ilut_select_kernels.cpp index e03ee379977..a15adf580e8 100644 --- a/common/cuda_hip/factorization/par_ilut_select_kernels.cpp +++ b/common/cuda_hip/factorization/par_ilut_select_kernels.cpp @@ -141,13 +141,22 @@ void threshold_select(std::shared_ptr exec, // base case auto out_ptr = reinterpret_cast(tmp1.get_data()); - kernel::basecase_select<<<1, kernel::basecase_block_size, 0, - exec->get_stream()>>>( - as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr)); - threshold = exec->copy_val_to_host(out_ptr); + +#ifdef GKO_COMPILING_HIP + if constexpr (std::is_same, half>::value) { + // HIP does not support 16bit atomic operation + GKO_NOT_SUPPORTED(m); + } else +#endif + { + kernel::basecase_select<<<1, kernel::basecase_block_size, 0, + exec->get_stream()>>>( + as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr)); + threshold = exec->copy_val_to_host(out_ptr); + } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp index 79a562ff834..b88c052d19a 100644 --- a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp +++ b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp @@ -254,12 +254,12 @@ __global__ __launch_bounds__(basecase_block_size) void basecase_select( const ValueType* __restrict__ input, IndexType size, IndexType rank, ValueType* __restrict__ out) { - constexpr auto sentinel = device_numeric_limits::inf(); + const auto sentinel = device_numeric_limits::inf(); ValueType local[basecase_local_size]; __shared__ ValueType sh_local[basecase_size]; for (int i = 0; i < basecase_local_size; ++i) { auto idx = threadIdx.x + i * basecase_block_size; - local[i] = idx < size ? 
input[idx] : sentinel; + local[i] = idx < size ? input[idx] : static_cast(sentinel); } bitonic_sort(local, sh_local); if (threadIdx.x == rank / basecase_local_size) { diff --git a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp index a29cf6f2cb3..8f7a8af0443 100644 --- a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp +++ b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp @@ -389,7 +389,7 @@ void add_candidates(std::shared_ptr exec, u_new); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp index 52f62b50e6a..c0f962a89c8 100644 --- a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp +++ b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp @@ -154,18 +154,26 @@ void compute_l_u_factors(syn::value_list, auto block_size = default_block_size / subwarp_size; auto num_blocks = ceildiv(total_nnz, block_size); if (num_blocks > 0) { - kernel::sweep - <<get_stream()>>>( - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_device_type(a->get_const_values()), l->get_const_row_ptrs(), - l_coo->get_const_row_idxs(), l->get_const_col_idxs(), - as_device_type(l->get_values()), - static_cast(l->get_num_stored_elements()), - u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), - as_device_type(u->get_values()), u_csc->get_const_row_ptrs(), - u_csc->get_const_col_idxs(), - as_device_type(u_csc->get_values()), - static_cast(u->get_num_stored_elements())); +#ifdef GKO_COMPILING_HIP + if constexpr (std::is_same, half>::value) { + // HIP does not support 16bit atomic operation + GKO_NOT_SUPPORTED(a); + } else +#endif + { + kernel::sweep + <<get_stream()>>>( + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_device_type(a->get_const_values()), + 
l->get_const_row_ptrs(), l_coo->get_const_row_idxs(), + l->get_const_col_idxs(), as_device_type(l->get_values()), + static_cast(l->get_num_stored_elements()), + u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), + as_device_type(u->get_values()), + u_csc->get_const_row_ptrs(), u_csc->get_const_col_idxs(), + as_device_type(u_csc->get_values()), + static_cast(u->get_num_stored_elements())); + } } } @@ -199,11 +207,11 @@ void compute_l_u_factors(std::shared_ptr exec, u_csc); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); } // namespace par_ilut_factorization } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index f37166613b7..4e64134a9f2 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -995,11 +995,13 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); namespace cholesky { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CHOLESKY_INITIALIZE); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CHOLESKY_FACTORIZE); } // namespace cholesky @@ -1008,14 +1010,16 @@ 
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); namespace factorization { -GKO_STUB_VALUE_AND_INDEX_TYPE( +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL); } // namespace factorization @@ -1024,7 +1028,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL); namespace ic_factorization { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_SPARSELIB_IC_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_IC_SPARSELIB_IC_KERNEL); } // namespace ic_factorization @@ -1033,7 +1037,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_SPARSELIB_IC_KERNEL); namespace ilu_factorization { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL); } // namespace ilu_factorization @@ -1042,8 +1046,8 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL); namespace lu_factorization { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LU_INITIALIZE); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LU_FACTORIZE); GKO_STUB_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE); GKO_STUB_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE); @@ -1054,8 +1058,9 
@@ GKO_STUB_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE); namespace par_ic_factorization { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL); } // namespace par_ic_factorization @@ -1064,8 +1069,10 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL); namespace par_ict_factorization { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); } // namespace par_ict_factorization @@ -1074,7 +1081,8 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); namespace par_ilu_factorization { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL); } // namespace par_ilu_factorization @@ -1083,11 +1091,15 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL); namespace par_ilut_factorization { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + 
GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL); diff --git a/core/factorization/cholesky.cpp b/core/factorization/cholesky.cpp index 92d598f0bd7..a552ec37ec1 100644 --- a/core/factorization/cholesky.cpp +++ b/core/factorization/cholesky.cpp @@ -146,7 +146,7 @@ std::unique_ptr Cholesky::generate_impl( #define GKO_DECLARE_CHOLESKY(ValueType, IndexType) \ class Cholesky -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_CHOLESKY); } // namespace factorization diff --git a/core/factorization/elimination_forest.cpp b/core/factorization/elimination_forest.cpp index 1dc8ff060a0..f8d6d861c2d 100644 --- a/core/factorization/elimination_forest.cpp +++ b/core/factorization/elimination_forest.cpp @@ -173,7 +173,8 @@ void compute_elim_forest(const matrix::Csr* mtx, const matrix::Csr* mtx, \ std::unique_ptr>& forest) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COMPUTE_ELIM_FOREST); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_COMPUTE_ELIM_FOREST); } // namespace factorization diff --git a/core/factorization/factorization.cpp b/core/factorization/factorization.cpp index 1df1f49aa13..e0e4ccdc3c7 100644 --- a/core/factorization/factorization.cpp +++ b/core/factorization/factorization.cpp @@ -362,7 +362,8 @@ void Factorization::apply_impl(const LinOp* alpha, #define GKO_DECLARE_FACTORIZATION(ValueType, IndexType) \ class Factorization -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FACTORIZATION); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_FACTORIZATION); } // namespace factorization diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp index bf9d5e7bbf4..d8706c8b8e3 100644 --- a/core/factorization/ic.cpp +++ 
b/core/factorization/ic.cpp @@ -203,7 +203,7 @@ std::unique_ptr> Ic::generate( #define GKO_DECLARE_IC(ValueType, IndexType) class Ic -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_IC); } // namespace factorization diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp index f7703f3d20b..1c6079700e3 100644 --- a/core/factorization/ilu.cpp +++ b/core/factorization/ilu.cpp @@ -188,7 +188,7 @@ std::unique_ptr> Ilu::generate_l_u( #define GKO_DECLARE_ILU(ValueType, IndexType) class Ilu -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_ILU); } // namespace factorization diff --git a/core/factorization/lu.cpp b/core/factorization/lu.cpp index 4feb78083d2..dfdce26f140 100644 --- a/core/factorization/lu.cpp +++ b/core/factorization/lu.cpp @@ -166,7 +166,7 @@ std::unique_ptr Lu::generate_impl( #define GKO_DECLARE_LU(ValueType, IndexType) class Lu -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LU); } // namespace factorization diff --git a/core/factorization/par_ic.cpp b/core/factorization/par_ic.cpp index f4a4afd23d6..b310025eb8d 100644 --- a/core/factorization/par_ic.cpp +++ b/core/factorization/par_ic.cpp @@ -146,7 +146,7 @@ std::unique_ptr> ParIc::generate( #define GKO_DECLARE_PAR_IC(ValueType, IndexType) \ class ParIc -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_IC); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_IC); } // namespace factorization diff --git a/core/factorization/par_ict.cpp b/core/factorization/par_ict.cpp index a0e8a628ca8..696b185e969 100644 --- a/core/factorization/par_ict.cpp +++ b/core/factorization/par_ict.cpp @@ -300,7 +300,7 @@ void ParIctState::iterate() #define GKO_DECLARE_PAR_ICT(ValueType, IndexType) \ class ParIct 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ICT); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_ICT); } // namespace factorization diff --git a/core/factorization/par_ilu.cpp b/core/factorization/par_ilu.cpp index 68c0c0c4fc6..177c150df1d 100644 --- a/core/factorization/par_ilu.cpp +++ b/core/factorization/par_ilu.cpp @@ -161,7 +161,7 @@ ParIlu::generate_l_u( #define GKO_DECLARE_PAR_ILU(ValueType, IndexType) \ class ParIlu -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILU); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_ILU); } // namespace factorization diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp index 42e3cc03130..e90dbb8140f 100644 --- a/core/factorization/par_ilut.cpp +++ b/core/factorization/par_ilut.cpp @@ -352,7 +352,7 @@ void ParIlutState::iterate() #define GKO_DECLARE_PAR_ILUT(ValueType, IndexType) \ class ParIlut -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PAR_ILUT); } // namespace factorization diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp index 23f6b94cc14..495b830d7ea 100644 --- a/core/factorization/symbolic.cpp +++ b/core/factorization/symbolic.cpp @@ -80,7 +80,8 @@ void symbolic_cholesky( std::unique_ptr>& factors, \ std::unique_ptr>& forest) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_CHOLESKY); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SYMBOLIC_CHOLESKY); template @@ -158,7 +159,7 @@ void symbolic_lu_near_symm( const matrix::Csr* mtx, \ std::unique_ptr>& factors) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SYMBOLIC_LU_NEAR_SYMM); @@ -245,7 +246,8 @@ void symbolic_lu(const matrix::Csr* mtx, const matrix::Csr* mtx, \ std::unique_ptr>& factors) 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_LU); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SYMBOLIC_LU); } // namespace factorization diff --git a/core/test/factorization/elimination_forest.cpp b/core/test/factorization/elimination_forest.cpp index 292b366f50e..cf9ddb7536e 100644 --- a/core/test/factorization/elimination_forest.cpp +++ b/core/test/factorization/elimination_forest.cpp @@ -33,7 +33,7 @@ class EliminationForest : public ::testing::Test { std::shared_ptr ref; }; -TYPED_TEST_SUITE(EliminationForest, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(EliminationForest, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/core/test/factorization/par_ic.cpp b/core/test/factorization/par_ic.cpp index d6de0f9fc98..efd4c1e3ebd 100644 --- a/core/test/factorization/par_ic.cpp +++ b/core/test/factorization/par_ic.cpp @@ -29,7 +29,8 @@ class ParIc : public ::testing::Test { std::shared_ptr ref; }; -TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(ParIc, SetIterations) diff --git a/core/test/factorization/par_ict.cpp b/core/test/factorization/par_ict.cpp index 07eec8db549..5d5ac8bc815 100644 --- a/core/test/factorization/par_ict.cpp +++ b/core/test/factorization/par_ict.cpp @@ -29,7 +29,8 @@ class ParIct : public ::testing::Test { std::shared_ptr ref; }; -TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(ParIct, SetIterations) diff --git a/core/test/factorization/par_ilu.cpp b/core/test/factorization/par_ilu.cpp index a0b8f37e3d4..e06a90741af 100644 --- a/core/test/factorization/par_ilu.cpp +++ b/core/test/factorization/par_ilu.cpp @@ -29,7 +29,8 @@ class ParIlu : public ::testing::Test { std::shared_ptr ref; }; 
-TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(ParIlu, SetIterations) diff --git a/core/test/factorization/par_ilut.cpp b/core/test/factorization/par_ilut.cpp index ad466e62407..a2d0287d22a 100644 --- a/core/test/factorization/par_ilut.cpp +++ b/core/test/factorization/par_ilut.cpp @@ -30,7 +30,7 @@ class ParIlut : public ::testing::Test { std::shared_ptr ref; }; -TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/dpcpp/factorization/factorization_kernels.dp.cpp b/dpcpp/factorization/factorization_kernels.dp.cpp index 885fe481609..24736f9e00c 100644 --- a/dpcpp/factorization/factorization_kernels.dp.cpp +++ b/dpcpp/factorization/factorization_kernels.dp.cpp @@ -393,7 +393,7 @@ void initialize_l(dim3 grid, dim3 block, size_type dynamic_shared_memory, helpers::triangular_mtx_closure( [use_sqrt](auto val) { if (use_sqrt) { - val = sqrt(val); + val = gko::sqrt(val); if (!is_finite(val)) { val = one(); } @@ -482,7 +482,7 @@ void add_diagonal_elements(std::shared_ptr exec, mtx_builder.get_col_idx_array() = std::move(new_col_idxs); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL); @@ -509,7 +509,7 @@ void initialize_row_ptrs_l_u( components::prefix_sum_nonnegative(exec, u_row_ptrs, num_rows + 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL); @@ -534,7 +534,7 @@ void initialize_l_u(std::shared_ptr exec, csr_u->get_col_idxs(), csr_u->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( 
GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL); @@ -559,7 +559,7 @@ void initialize_row_ptrs_l( components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL); @@ -582,7 +582,7 @@ void initialize_l(std::shared_ptr exec, csr_l->get_values(), diag_sqrt); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL); diff --git a/dpcpp/factorization/par_ic_kernels.dp.cpp b/dpcpp/factorization/par_ic_kernels.dp.cpp index 5428460fac5..91819dd98d0 100644 --- a/dpcpp/factorization/par_ic_kernels.dp.cpp +++ b/dpcpp/factorization/par_ic_kernels.dp.cpp @@ -41,7 +41,7 @@ void ic_init(const IndexType* __restrict__ l_row_ptrs, return; } auto l_nz = l_row_ptrs[row + 1] - 1; - auto diag = std::sqrt(l_vals[l_nz]); + auto diag = gko::sqrt(l_vals[l_nz]); if (is_finite(diag)) { l_vals[l_nz] = diag; } else { @@ -93,7 +93,7 @@ void ic_sweep(const IndexType* __restrict__ a_row_idxs, lh_col_begin += l_col >= lh_row; } auto to_write = row == col - ? std::sqrt(a_val - sum) + ? 
gko::sqrt(a_val - sum) : (a_val - sum) / l_vals[l_row_ptrs[col + 1] - 1]; if (is_finite(to_write)) { l_vals[l_nz] = to_write; @@ -130,7 +130,7 @@ void init_factor(std::shared_ptr exec, l_row_ptrs, l_vals, num_rows); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL); @@ -152,7 +152,7 @@ void compute_factor(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL); diff --git a/dpcpp/factorization/par_ict_kernels.dp.cpp b/dpcpp/factorization/par_ict_kernels.dp.cpp index fb99b662dec..6a704641252 100644 --- a/dpcpp/factorization/par_ict_kernels.dp.cpp +++ b/dpcpp/factorization/par_ict_kernels.dp.cpp @@ -356,7 +356,7 @@ void ict_sweep(const IndexType* __restrict__ a_row_ptrs, if (subwarp.thread_rank() == 0) { auto to_write = row == col - ? std::sqrt(a_val - sum) + ? gko::sqrt(a_val - sum) : (a_val - sum) / l_vals[l_row_ptrs[col + 1] - 1]; if (is_finite(to_write)) { l_vals[l_nz] = to_write; @@ -483,7 +483,7 @@ void add_candidates(std::shared_ptr exec, syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); @@ -505,7 +505,7 @@ void compute_factor(std::shared_ptr exec, syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); diff --git a/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc b/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc index d2345848d1f..6081bc0f417 100644 --- a/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc +++ b/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc @@ -102,7 +102,7 @@ void threshold_filter_nnz(const IndexType* 
__restrict__ row_ptrs, row_ptrs, num_rows, [&](IndexType idx, IndexType row_begin, IndexType row_end) { auto diag_idx = lower ? row_end - 1 : row_begin; - return std::abs(vals[idx]) >= threshold || idx == diag_idx; + return gko::abs(vals[idx]) >= threshold || idx == diag_idx; }, nnz, item_ct1); } @@ -140,7 +140,7 @@ void threshold_filter(const IndexType* __restrict__ old_row_ptrs, old_row_ptrs, old_col_idxs, old_vals, num_rows, [&](IndexType idx, IndexType row_begin, IndexType row_end) { auto diag_idx = lower ? row_end - 1 : row_begin; - return std::abs(old_vals[idx]) >= threshold || idx == diag_idx; + return gko::abs(old_vals[idx]) >= threshold || idx == diag_idx; }, new_row_ptrs, new_row_idxs, new_col_idxs, new_vals, item_ct1); } diff --git a/dpcpp/factorization/par_ilut_select_kernels.hpp.inc b/dpcpp/factorization/par_ilut_select_kernels.hpp.inc index 67cc9cdba15..430bf650e07 100644 --- a/dpcpp/factorization/par_ilut_select_kernels.hpp.inc +++ b/dpcpp/factorization/par_ilut_select_kernels.hpp.inc @@ -38,7 +38,7 @@ void build_searchtree(const ValueType* __restrict__ input, IndexType size, for (int i = 0; i < sampleselect_oversampling; ++i) { auto lidx = idx * sampleselect_oversampling + i; auto val = input[static_cast(lidx * stride)]; - samples[i] = std::abs(val); + samples[i] = gko::abs(val); } bitonic_sort(samples, sh_samples, @@ -113,7 +113,7 @@ void count_buckets(const ValueType* __restrict__ input, IndexType size, auto end = min(block_end, size); for (IndexType i = begin; i < end; i += default_block_size) { // traverse the search tree with the input element - auto el = std::abs(input[i]); + auto el = gko::abs(input[i]); IndexType tree_idx{}; #pragma unroll for (int level = 0; level < sampleselect_searchtree_height; ++level) { @@ -297,7 +297,7 @@ void filter_bucket(const ValueType* __restrict__ input, IndexType size, auto found = bucket == oracles[i]; auto ofs = atomic_add(&*counter, IndexType{found}); if (found) { - output[ofs] = std::abs(input[i]); + 
output[ofs] = gko::abs(input[i]); } } } @@ -337,7 +337,7 @@ void basecase_select(const ValueType* __restrict__ input, IndexType size, for (int i = 0; i < basecase_local_size; ++i) { auto idx = item_ct1.get_local_id(2) + i * basecase_block_size; - local[i] = idx < size ? input[idx] : sentinel; + local[i] = idx < size ? input[idx] : static_cast(sentinel); } bitonic_sort(local, sh_local, item_ct1); if (item_ct1.get_local_id(2) == rank / basecase_local_size) { diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp index aa4aabfc731..0eb30441405 100644 --- a/omp/factorization/cholesky_kernels.cpp +++ b/omp/factorization/cholesky_kernels.cpp @@ -78,7 +78,7 @@ void symbolic_count(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); @@ -126,7 +126,7 @@ void symbolic_factorize( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); @@ -169,7 +169,7 @@ void forest_from_factor( num_rows, num_rows + 1, child_ptrs); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); @@ -201,7 +201,8 @@ void initialize(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CHOLESKY_INITIALIZE); namespace { @@ -282,7 +283,8 @@ void factorize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CHOLESKY_FACTORIZE); } // namespace cholesky diff --git a/omp/factorization/factorization_kernels.cpp b/omp/factorization/factorization_kernels.cpp index e7b66f6f887..47cd38d89c3 100644 --- 
a/omp/factorization/factorization_kernels.cpp +++ b/omp/factorization/factorization_kernels.cpp @@ -180,7 +180,7 @@ void add_diagonal_elements(std::shared_ptr exec, mtx_builder.get_col_idx_array() = std::move(new_col_idxs); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL); @@ -215,7 +215,7 @@ void initialize_row_ptrs_l_u( components::prefix_sum_nonnegative(exec, u_row_ptrs, num_rows + 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL); @@ -233,7 +233,7 @@ void initialize_l_u(std::shared_ptr exec, helpers::identity{})); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL); @@ -264,7 +264,7 @@ void initialize_row_ptrs_l( components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL); @@ -287,7 +287,7 @@ void initialize_l(std::shared_ptr exec, helpers::identity{})); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL); diff --git a/omp/factorization/ic_kernels.cpp b/omp/factorization/ic_kernels.cpp index c071ba2ca87..313bf8c7982 100644 --- a/omp/factorization/ic_kernels.cpp +++ b/omp/factorization/ic_kernels.cpp @@ -20,7 +20,7 @@ template void sparselib_ic(std::shared_ptr exec, matrix::Csr* m) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_IC_SPARSELIB_IC_KERNEL); diff --git a/omp/factorization/ilu_kernels.cpp b/omp/factorization/ilu_kernels.cpp index b88e6a77900..db3fd5ef7a8 
100644 --- a/omp/factorization/ilu_kernels.cpp +++ b/omp/factorization/ilu_kernels.cpp @@ -20,7 +20,7 @@ template void sparselib_ilu(std::shared_ptr exec, matrix::Csr* m) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL); diff --git a/omp/factorization/lu_kernels.cpp b/omp/factorization/lu_kernels.cpp index 4b13f9a352c..5f766a7208a 100644 --- a/omp/factorization/lu_kernels.cpp +++ b/omp/factorization/lu_kernels.cpp @@ -59,7 +59,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_LU_INITIALIZE); namespace { @@ -126,7 +127,8 @@ void factorize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_LU_FACTORIZE); template @@ -215,4 +217,4 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( } // namespace lu_factorization } // namespace omp } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/omp/factorization/par_ic_kernels.cpp b/omp/factorization/par_ic_kernels.cpp index 93093783acc..9488c448519 100644 --- a/omp/factorization/par_ic_kernels.cpp +++ b/omp/factorization/par_ic_kernels.cpp @@ -42,7 +42,7 @@ void init_factor(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL); @@ -96,7 +96,7 @@ void compute_factor(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL); diff --git a/omp/factorization/par_ict_kernels.cpp b/omp/factorization/par_ict_kernels.cpp index b5546e1a644..a67ad860965 100644 --- 
a/omp/factorization/par_ict_kernels.cpp +++ b/omp/factorization/par_ict_kernels.cpp @@ -91,7 +91,7 @@ void compute_factor(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); @@ -166,7 +166,7 @@ void add_candidates(std::shared_ptr exec, [](IndexType, row_state) {}); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); diff --git a/omp/factorization/par_ilu_kernels.cpp b/omp/factorization/par_ilu_kernels.cpp index da42a631b81..0504bca8b1d 100644 --- a/omp/factorization/par_ilu_kernels.cpp +++ b/omp/factorization/par_ilu_kernels.cpp @@ -88,7 +88,7 @@ void compute_l_u_factors(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL); diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp index a24709e4f1a..af9229f3509 100644 --- a/omp/factorization/par_ilut_kernels.cpp +++ b/omp/factorization/par_ilut_kernels.cpp @@ -54,7 +54,7 @@ void threshold_select(std::shared_ptr exec, threshold = abs(*target); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); @@ -144,7 +144,7 @@ void threshold_filter(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); @@ -181,7 +181,12 @@ void threshold_filter_approx(std::shared_ptr exec, // pick splitters for (IndexType i = 0; i < bucket_count - 1; ++i) { // shift by one so we get upper bounds for the buckets - sample[i] = sample[(i + 1) * sampleselect_oversampling]; + // NVHPC 23.3 seems to handle assignment index with + // optimization 
wrongly on a custom class when IndexType is long. We set + // the index explicitly with volatile to solve it. NVHPC24.1 fixed this + // issue. https://godbolt.org/z/srYhGndKn + volatile auto index = (i + 1) * sampleselect_oversampling; + sample[i] = sample[index]; } // count elements per bucket auto total_histogram = reinterpret_cast(sample + bucket_count); @@ -228,7 +233,7 @@ void threshold_filter_approx(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL); @@ -312,7 +317,7 @@ void compute_l_u_factors(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); @@ -428,7 +433,7 @@ void add_candidates(std::shared_ptr exec, [](IndexType, row_state) {}); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp index e4d7112a15f..199cae4c8fa 100644 --- a/reference/factorization/cholesky_kernels.cpp +++ b/reference/factorization/cholesky_kernels.cpp @@ -63,7 +63,7 @@ void symbolic_count(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); @@ -102,7 +102,7 @@ void symbolic_factorize( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); @@ -140,7 +140,7 @@ void forest_from_factor( num_rows + 1, child_ptrs); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); @@ -172,7 +172,8 @@ void initialize(std::shared_ptr exec, 
}); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CHOLESKY_INITIALIZE); namespace { @@ -254,7 +255,8 @@ void factorize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_CHOLESKY_FACTORIZE); } // namespace cholesky diff --git a/reference/factorization/factorization_kernels.cpp b/reference/factorization/factorization_kernels.cpp index 99b522ffba9..15d778c2235 100644 --- a/reference/factorization/factorization_kernels.cpp +++ b/reference/factorization/factorization_kernels.cpp @@ -127,7 +127,7 @@ void add_diagonal_elements(std::shared_ptr exec, mtx_builder.get_col_idx_array() = std::move(new_col_idxs_array); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL); @@ -159,7 +159,7 @@ void initialize_row_ptrs_l_u( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL); @@ -177,7 +177,7 @@ void initialize_l_u(std::shared_ptr exec, helpers::identity{})); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL); @@ -204,7 +204,7 @@ void initialize_row_ptrs_l( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL); @@ -227,7 +227,7 @@ void initialize_l(std::shared_ptr exec, helpers::identity{})); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL); diff --git a/reference/factorization/ic_kernels.cpp 
b/reference/factorization/ic_kernels.cpp index 93945c2da14..3557ee0b978 100644 --- a/reference/factorization/ic_kernels.cpp +++ b/reference/factorization/ic_kernels.cpp @@ -69,7 +69,7 @@ void sparselib_ic(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_IC_SPARSELIB_IC_KERNEL); diff --git a/reference/factorization/ilu_kernels.cpp b/reference/factorization/ilu_kernels.cpp index 3323e0b6cef..2eedd988929 100644 --- a/reference/factorization/ilu_kernels.cpp +++ b/reference/factorization/ilu_kernels.cpp @@ -65,7 +65,7 @@ void sparselib_ilu(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL); diff --git a/reference/factorization/lu_kernels.cpp b/reference/factorization/lu_kernels.cpp index c72b14456e1..d8bb8c427ef 100644 --- a/reference/factorization/lu_kernels.cpp +++ b/reference/factorization/lu_kernels.cpp @@ -58,7 +58,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_LU_INITIALIZE); namespace { @@ -124,7 +125,8 @@ void factorize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_LU_FACTORIZE); template @@ -212,4 +214,4 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( } // namespace lu_factorization } // namespace reference } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/reference/factorization/par_ic_kernels.cpp b/reference/factorization/par_ic_kernels.cpp index 4da317cf201..e8f3a9273f4 100644 --- a/reference/factorization/par_ic_kernels.cpp +++ b/reference/factorization/par_ic_kernels.cpp @@ -46,7 +46,7 @@ void init_factor(std::shared_ptr exec, } } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL); @@ -96,7 +96,7 @@ void compute_factor(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL); diff --git a/reference/factorization/par_ict_kernels.cpp b/reference/factorization/par_ict_kernels.cpp index 684158d380c..c6b192b328b 100644 --- a/reference/factorization/par_ict_kernels.cpp +++ b/reference/factorization/par_ict_kernels.cpp @@ -89,7 +89,7 @@ void compute_factor(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); @@ -167,7 +167,7 @@ void add_candidates(std::shared_ptr exec, [](IndexType, row_state) {}); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); diff --git a/reference/factorization/par_ilu_kernels.cpp b/reference/factorization/par_ilu_kernels.cpp index 44c2e5f66bc..ddcc41d1070 100644 --- a/reference/factorization/par_ilu_kernels.cpp +++ b/reference/factorization/par_ilu_kernels.cpp @@ -86,7 +86,7 @@ void compute_l_u_factors(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL); diff --git a/reference/factorization/par_ilut_kernels.cpp b/reference/factorization/par_ilut_kernels.cpp index abef6e9b5f2..c22c6924d6c 100644 --- a/reference/factorization/par_ilut_kernels.cpp +++ b/reference/factorization/par_ilut_kernels.cpp @@ -58,7 +58,7 @@ void threshold_select(std::shared_ptr exec, threshold = abs(*target); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( 
GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); @@ -150,7 +150,7 @@ void threshold_filter(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); @@ -191,7 +191,12 @@ void threshold_filter_approx(std::shared_ptr exec, // pick splitters for (IndexType i = 0; i < bucket_count - 1; ++i) { // shift by one so we get upper bounds for the buckets - sample[i] = sample[(i + 1) * sampleselect_oversampling]; + // NVHPC 23.3 seems to handle assignment index with + // optimization wrongly on a custom class when IndexType is long. We set + // the index explicitly with volatile to solve it. NVHPC24.1 fixed this + // issue. https://godbolt.org/z/srYhGndKn + volatile auto index = (i + 1) * sampleselect_oversampling; + sample[i] = sample[index]; } // count elements per bucket auto histogram = reinterpret_cast(sample + bucket_count); @@ -221,7 +226,7 @@ void threshold_filter_approx(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL); @@ -309,7 +314,7 @@ void compute_l_u_factors(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); @@ -432,7 +437,7 @@ void add_candidates(std::shared_ptr exec, [](IndexType, row_state) {}); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index b4c33d76ab9..671630c99eb 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -245,7 +245,7 @@ class Cholesky : public ::testing::Test { 
std::shared_ptr combined_ref; }; -TYPED_TEST_SUITE(Cholesky, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(Cholesky, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/reference/test/factorization/factorization.cpp b/reference/test/factorization/factorization.cpp index 2ded81d4867..73bf8cdc321 100644 --- a/reference/test/factorization/factorization.cpp +++ b/reference/test/factorization/factorization.cpp @@ -70,7 +70,7 @@ class Factorization : public ::testing::Test { std::shared_ptr beta; }; -TYPED_TEST_SUITE(Factorization, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(Factorization, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/reference/test/factorization/ic_kernels.cpp b/reference/test/factorization/ic_kernels.cpp index 1593da136a4..84faa3c3b45 100644 --- a/reference/test/factorization/ic_kernels.cpp +++ b/reference/test/factorization/ic_kernels.cpp @@ -80,7 +80,8 @@ class Ic : public ::testing::Test { gko::remove_complex tol; }; -TYPED_TEST_SUITE(Ic, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Ic, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Ic, ThrowNotSupportedForWrongLinOp) diff --git a/reference/test/factorization/ilu_kernels.cpp b/reference/test/factorization/ilu_kernels.cpp index aaeb44382f1..1ba1fedf13f 100644 --- a/reference/test/factorization/ilu_kernels.cpp +++ b/reference/test/factorization/ilu_kernels.cpp @@ -170,7 +170,8 @@ class Ilu : public ::testing::Test { std::unique_ptr ilu_factory_sort; }; -TYPED_TEST_SUITE(Ilu, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Ilu, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Ilu, ThrowNotSupportedForWrongLinOp1) diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp index c10937ac486..7b4a860b0d5 100644 --- a/reference/test/factorization/lu_kernels.cpp +++ 
b/reference/test/factorization/lu_kernels.cpp @@ -98,7 +98,8 @@ class Lu : public ::testing::Test { gko::array row_descs; }; -TYPED_TEST_SUITE(Lu, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Lu, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Lu, SymbolicCholeskyWorks) @@ -219,7 +220,7 @@ TYPED_TEST(Lu, KernelFactorizeWorks) diag_idxs.get_const_data(), this->mtx_lu.get(), true, tmp); GKO_ASSERT_MTX_NEAR(this->mtx_lu, mtx_lu_ref, - 15 * r::value); + 30 * r::value); }); } @@ -268,7 +269,7 @@ TYPED_TEST(Lu, FactorizeNonsymmetricWorks) GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), this->mtx_lu); GKO_ASSERT_MTX_NEAR(lu->get_combined(), this->mtx_lu, - 15 * r::value); + 30 * r::value); ASSERT_EQ(lu->get_storage_type(), gko::experimental::factorization::storage_type::combined_lu); ASSERT_EQ(lu->get_lower_factor(), nullptr); @@ -294,7 +295,7 @@ TYPED_TEST(Lu, FactorizeNearSymmetricWorks) GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), this->mtx_lu); GKO_ASSERT_MTX_NEAR(lu->get_combined(), this->mtx_lu, - 15 * r::value); + 30 * r::value); ASSERT_EQ(lu->get_storage_type(), gko::experimental::factorization::storage_type::combined_lu); ASSERT_EQ(lu->get_lower_factor(), nullptr); @@ -321,7 +322,7 @@ TYPED_TEST(Lu, FactorizeWithKnownSparsityWorks) auto lu = factory->generate(this->mtx); GKO_ASSERT_MTX_NEAR(lu->get_combined(), this->mtx_lu, - 15 * r::value); + 30 * r::value); ASSERT_EQ(lu->get_storage_type(), gko::experimental::factorization::storage_type::combined_lu); ASSERT_EQ(lu->get_lower_factor(), nullptr); diff --git a/reference/test/factorization/par_ic_kernels.cpp b/reference/test/factorization/par_ic_kernels.cpp index b9caf8c9e5e..481e89bb744 100644 --- a/reference/test/factorization/par_ic_kernels.cpp +++ b/reference/test/factorization/par_ic_kernels.cpp @@ -104,7 +104,8 @@ class ParIc : public ::testing::Test { gko::remove_complex tol; }; -TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypes, 
PairTypenameNameGenerator); +TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(ParIc, KernelCompute) diff --git a/reference/test/factorization/par_ict_kernels.cpp b/reference/test/factorization/par_ict_kernels.cpp index 55ac5771732..d3b6df59f42 100644 --- a/reference/test/factorization/par_ict_kernels.cpp +++ b/reference/test/factorization/par_ict_kernels.cpp @@ -137,7 +137,8 @@ class ParIct : public ::testing::Test { gko::remove_complex tol; }; -TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(ParIct, KernelInitializeRowPtrsL) diff --git a/reference/test/factorization/par_ilu_kernels.cpp b/reference/test/factorization/par_ilu_kernels.cpp index bf4e422f640..3d590c1a6d6 100644 --- a/reference/test/factorization/par_ilu_kernels.cpp +++ b/reference/test/factorization/par_ilu_kernels.cpp @@ -180,7 +180,8 @@ class ParIlu : public ::testing::Test { std::unique_ptr ilu_factory_sort; }; -TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(ParIlu, KernelAddDiagonalElementsEmpty) diff --git a/reference/test/factorization/par_ilut_kernels.cpp b/reference/test/factorization/par_ilut_kernels.cpp index 59805f246f8..3a6ba9232da 100644 --- a/reference/test/factorization/par_ilut_kernels.cpp +++ b/reference/test/factorization/par_ilut_kernels.cpp @@ -54,6 +54,7 @@ class ParIlut : public ::testing::Test { using ComplexCsr = gko::matrix::Csr>, index_type>; + using complex_value_type = std::complex>; ParIlut() : ref(gko::ReferenceExecutor::create()), @@ -75,16 +76,24 @@ class ParIlut : public ::testing::Test { {0., -3., 0., 1.}}, ref)), mtx1_complex(gko::initialize( - {{{.1, 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, - {{-1., .1}, {.1, -1.}, {0., 0.}, {0., 0.}}, - {{-1., 1.}, 
{-2., .2}, {-1., -.3}, {0., 0.}}, - {{1., -2.}, {-3., -.1}, {-1., .1}, {.1, 2.}}}, + {{complex_value_type{.1, 0.}, complex_value_type{0., 0.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{-1., .1}, complex_value_type{.1, -1.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{-1., 1.}, complex_value_type{-2., .2}, + complex_value_type{-1., -.3}, complex_value_type{0., 0.}}, + {complex_value_type{1., -2.}, complex_value_type{-3., -.1}, + complex_value_type{-1., .1}, complex_value_type{.1, 2.}}}, ref)), mtx1_expect_complex_thrm(gko::initialize( - {{{.1, 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, - {{0., 0.}, {.1, -1.}, {0., 0.}, {0., 0.}}, - {{-1., 1.}, {-2., .2}, {-1., -.3}, {0., 0.}}, - {{1., -2.}, {-3., -.1}, {0., 0.}, {.1, 2.}}}, + {{complex_value_type{.1, 0.}, complex_value_type{0., 0.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{0., 0.}, complex_value_type{.1, -1.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{-1., 1.}, complex_value_type{-2., .2}, + complex_value_type{-1., -.3}, complex_value_type{0., 0.}}, + {complex_value_type{1., -2.}, complex_value_type{-3., -.1}, + complex_value_type{0., 0.}, complex_value_type{.1, 2.}}}, ref)), identity(gko::initialize( {{1., 0., 0.}, {0., 1., 0.}, {0., 0., 1.}}, ref)), @@ -268,7 +277,7 @@ class ParIlut : public ::testing::Test { gko::remove_complex tol; }; // namespace -TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp index b9b8bbf00ee..e9f64bb1152 100644 --- a/test/factorization/lu_kernels.cpp +++ b/test/factorization/lu_kernels.cpp @@ -129,7 +129,7 @@ class Lu : public CommonTestFixture { }; #ifdef GKO_COMPILING_OMP -using Types = gko::test::ValueIndexTypes; +using Types = gko::test::ValueIndexTypesWithHalf; 
#elif defined(GKO_COMPILING_CUDA) // CUDA don't support long indices for sorting, and the triangular solvers // seem broken diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp index de2342a28db..bb53a454e21 100644 --- a/test/factorization/par_ic_kernels.cpp +++ b/test/factorization/par_ic_kernels.cpp @@ -41,8 +41,7 @@ class ParIc : public CommonTestFixture { mtx_l = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(0, 10.0), - rand_engine, ref); + std::normal_distribution<>(0, 10.0), rand_engine, ref); dmtx_ani = Csr::create(exec); dmtx_l_ani = Csr::create(exec); dmtx_l_ani_init = Csr::create(exec); @@ -87,7 +86,8 @@ class ParIc : public CommonTestFixture { std::unique_ptr dmtx_l_ani_init; }; -TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(ParIc, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(ParIc, KernelInitFactorIsEquivalentToRef) @@ -107,6 +107,8 @@ TYPED_TEST(ParIc, KernelComputeFactorIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); this->mtx_l_ani->convert_to(mtx_l_coo); diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp index 3b33e52630c..945f874ef26 100644 --- a/test/factorization/par_ict_kernels.cpp +++ b/test/factorization/par_ict_kernels.cpp @@ -47,15 +47,11 @@ class ParIct : public CommonTestFixture { mtx = gko::test::generate_random_matrix( mtx_size[0], mtx_size[1], std::uniform_int_distribution(10, mtx_size[1]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_l = 
gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); dmtx_ani = Csr::create(exec); dmtx_l_ani = Csr::create(exec); @@ -97,7 +93,8 @@ class ParIct : public CommonTestFixture { std::unique_ptr dmtx_l; }; -TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(ParIct, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(ParIct, KernelAddCandidatesIsEquivalentToRef) @@ -127,6 +124,8 @@ TYPED_TEST(ParIct, KernelComputeFactorIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); this->mtx_l_ani->convert_to(mtx_l_coo); diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp index 88f5ecff0d9..216a4f597cb 100644 --- a/test/factorization/par_ilu_kernels.cpp +++ b/test/factorization/par_ilu_kernels.cpp @@ -59,8 +59,7 @@ class ParIlu : public CommonTestFixture { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution(0, num_cols - 1), - std::normal_distribution>(0.0, 1.0), - rand_engine, ref); + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); } std::unique_ptr gen_unsorted_mtx(index_type num_rows, @@ -145,7 +144,8 @@ class ParIlu : public CommonTestFixture { } }; -TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(ParIlu, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(ParIlu, KernelAddDiagonalElementsSortedEquivalentToRef) @@ -237,6 +237,8 @@ TYPED_TEST(ParIlu, KernelInitializeParILUIsEquivalentToRef) TYPED_TEST(ParIlu, 
KernelComputeParILUIsEquivalentToRef) { using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); std::unique_ptr l_mtx{}; std::unique_ptr u_mtx{}; std::unique_ptr dl_mtx{}; @@ -255,6 +257,7 @@ TYPED_TEST(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); std::unique_ptr l_mtx{}; std::unique_ptr u_mtx{}; std::unique_ptr dl_mtx{}; diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp index dff3cc702c1..6804a3edcce 100644 --- a/test/factorization/par_ilut_kernels.cpp +++ b/test/factorization/par_ilut_kernels.cpp @@ -48,39 +48,27 @@ class ParIlut : public CommonTestFixture { mtx1 = gko::test::generate_random_matrix( mtx_size[0], mtx_size[1], std::uniform_int_distribution(10, mtx_size[1]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx2 = gko::test::generate_random_matrix( mtx_size[0], mtx_size[1], std::uniform_int_distribution(0, mtx_size[1]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_square = gko::test::generate_random_matrix( mtx_size[0], mtx_size[0], std::uniform_int_distribution(1, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_l = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_l2 = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], true, std::uniform_int_distribution(1, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + 
std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_u = gko::test::generate_random_upper_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); dmtx1 = gko::clone(exec, mtx1); dmtx2 = gko::clone(exec, mtx2); @@ -134,7 +122,7 @@ class ParIlut : public CommonTestFixture { const std::unique_ptr& dmtx, index_type rank) { double tolerance = - gko::is_complex() ? r::value : 0.0; + gko::is_complex() ? double(r::value) : 0.0; auto size = index_type(mtx->get_num_stored_elements()); using ValueType = typename Mtx::value_type; @@ -189,7 +177,7 @@ class ParIlut : public CommonTestFixture { const std::unique_ptr& dmtx, index_type rank) { double tolerance = - gko::is_complex() ? r::value : 0.0; + gko::is_complex() ? double(r::value) : 0.0; auto res = Mtx::create(ref, mtx_size); auto dres = Mtx::create(exec, mtx_size); auto res_coo = Coo::create(ref, mtx_size); @@ -245,12 +233,15 @@ class ParIlut : public CommonTestFixture { std::unique_ptr dmtx_u; }; -TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); TYPED_TEST(ParIlut, KernelThresholdSelectIsEquivalentToRef) { + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); + this->test_select(this->mtx_l, this->dmtx_l, this->mtx_l->get_num_stored_elements() / 3); } @@ -258,12 +249,18 @@ TYPED_TEST(ParIlut, KernelThresholdSelectIsEquivalentToRef) TYPED_TEST(ParIlut, KernelThresholdSelectMinIsEquivalentToRef) { + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); + this->test_select(this->mtx_l, this->dmtx_l, 0); } TYPED_TEST(ParIlut, KernelThresholdSelectMaxIsEquivalentToRef) { + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); + this->test_select(this->mtx_l, this->dmtx_l, 
this->mtx_l->get_num_stored_elements() - 1); } @@ -330,6 +327,7 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef) using Coo = typename TestFixture::Coo; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; + SKIP_IF_HALF(value_type); this->test_filter(this->mtx_l, this->dmtx_l, 0.5, true); auto res = Csr::create(this->ref, this->mtx_size); auto dres = Csr::create(this->exec, this->mtx_size); @@ -355,6 +353,9 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef) TYPED_TEST(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef) { + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); + this->test_filter_approx(this->mtx_l, this->dmtx_l, this->mtx_l->get_num_stored_elements() / 2); } @@ -362,12 +363,18 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef) TYPED_TEST(ParIlut, KernelThresholdFilterApproxNoneLowerIsEquivalentToRef) { + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); + this->test_filter_approx(this->mtx_l, this->dmtx_l, 0); } TYPED_TEST(ParIlut, KernelThresholdFilterApproxAllLowerIsEquivalentToRef) { + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); + this->test_filter_approx(this->mtx_l, this->dmtx_l, this->mtx_l->get_num_stored_elements() - 1); } @@ -377,6 +384,8 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; + // there's one value larger than half range + SKIP_IF_HALF(value_type); auto square_size = this->mtx_square->get_size(); auto mtx_lu = Csr::create(this->ref, square_size); this->mtx_l2->apply(this->mtx_u, mtx_lu); @@ -405,6 +414,8 @@ TYPED_TEST(ParIlut, KernelComputeLUIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; 
+ SKIP_IF_HALF(value_type); auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); auto mtx_u_coo = Coo::create(this->ref, square_size); From c6a92ce8278fa57f507f0b5e32b93ce3880e013c Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Fri, 25 Oct 2024 18:23:26 +0200 Subject: [PATCH 29/69] factorization config dispatch --- core/config/factorization_config.cpp | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/core/config/factorization_config.cpp b/core/config/factorization_config.cpp index 259d32cb872..dae4072cce8 100644 --- a/core/config/factorization_config.cpp +++ b/core/config/factorization_config.cpp @@ -23,15 +23,18 @@ namespace gko { namespace config { -GKO_PARSE_VALUE_AND_INDEX_TYPE(Factorization_Ic, gko::factorization::Ic); -GKO_PARSE_VALUE_AND_INDEX_TYPE(Factorization_Ilu, gko::factorization::Ilu); -GKO_PARSE_VALUE_AND_INDEX_TYPE(Cholesky, - gko::experimental::factorization::Cholesky); -GKO_PARSE_VALUE_AND_INDEX_TYPE(Lu, gko::experimental::factorization::Lu); -GKO_PARSE_VALUE_AND_INDEX_TYPE(ParIlu, gko::factorization::ParIlu); -GKO_PARSE_VALUE_AND_INDEX_TYPE(ParIlut, gko::factorization::ParIlut); -GKO_PARSE_VALUE_AND_INDEX_TYPE(ParIc, gko::factorization::ParIc); -GKO_PARSE_VALUE_AND_INDEX_TYPE(ParIct, gko::factorization::ParIct); +GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Factorization_Ic, + gko::factorization::Ic); +GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Factorization_Ilu, + gko::factorization::Ilu); +GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF( + Cholesky, gko::experimental::factorization::Cholesky); +GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Lu, + gko::experimental::factorization::Lu); +GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(ParIlu, gko::factorization::ParIlu); +GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(ParIlut, gko::factorization::ParIlut); +GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(ParIc, gko::factorization::ParIc); +GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(ParIct, 
gko::factorization::ParIct); } // namespace config From 48c8e447a04b064e0e2ab7d16dbb35986f95bb02 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 5 Nov 2024 16:28:10 +0100 Subject: [PATCH 30/69] cmake cuda test with cuda arch and fix is_finite --- cmake/create_test.cmake | 4 ---- common/cuda_hip/base/math.hpp | 12 ++++++++++++ cuda/test/base/math.cu | 24 ++++++++++++++++++++---- hip/test/base/math.hip.cpp | 24 ++++++++++++++++++++---- 4 files changed, 52 insertions(+), 12 deletions(-) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 20f074778a1..c540d6e2cf7 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -171,10 +171,6 @@ function(ginkgo_create_cuda_test_internal test_name filename test_target_name) PRIVATE $<$:--expt-extended-lambda --expt-relaxed-constexpr>) endif() - # we handle CUDA architecture flags for now, disable CMake handling - if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) - set_target_properties(${test_target_name} PROPERTIES CUDA_ARCHITECTURES OFF) - endif() ginkgo_set_test_target_properties(${test_target_name} "_cuda" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cudagpu) endfunction(ginkgo_create_cuda_test_internal) diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index f83533d8f0d..51a7fedf0c4 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -162,6 +162,18 @@ __device__ __forceinline__ __half sqrt(const __half& val) } +// using overload here. Otherwise, compiler still think the is_finite +// specialization is still __host__ __device__ function. 
+__device__ __forceinline__ bool is_finite(const __half& value) +{ + return abs(value) < device_numeric_limits<__half>::inf(); +} + +__device__ __forceinline__ bool is_finite(const thrust::complex<__half>& value) +{ + return is_finite(value.real()) && is_finite(value.imag()); +} + #endif diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu index 1025c3cc489..27a35b2421a 100644 --- a/cuda/test/base/math.cu +++ b/cuda/test/base/math.cu @@ -10,6 +10,7 @@ #include +#include #include #include #include @@ -26,8 +27,8 @@ namespace kernel { template __device__ bool test_real_is_finite_function(FuncType isfin) { - constexpr T inf = gko::device_numeric_limits::inf(); - constexpr T quiet_nan = NAN; + const T inf = gko::device_numeric_limits::inf(); + const auto quiet_nan = static_cast(NAN); bool test_true{}; bool test_false{}; @@ -46,8 +47,8 @@ __device__ bool test_complex_is_finite_function(FuncType isfin) "Template type must be a complex type."); using T = gko::remove_complex; using c_type = gko::kernels::cuda::cuda_type; - constexpr T inf = gko::device_numeric_limits::inf(); - constexpr T quiet_nan = NAN; + const T inf = gko::device_numeric_limits::inf(); + const auto quiet_nan = static_cast(NAN); bool test_true{}; bool test_false{}; @@ -109,6 +110,21 @@ TEST_F(IsFinite, Float) { ASSERT_TRUE(test_real_is_finite_kernel()); } TEST_F(IsFinite, Double) { ASSERT_TRUE(test_real_is_finite_kernel()); } +#if GINKGO_ENABLE_HALF + + +TEST_F(IsFinite, Half) { ASSERT_TRUE(test_real_is_finite_kernel<__half>()); } + + +TEST_F(IsFinite, HalfComplex) +{ + ASSERT_TRUE(test_complex_is_finite_kernel>()); +} + + +#endif // GINKGO_ENABLE_HALF + + TEST_F(IsFinite, FloatComplex) { ASSERT_TRUE(test_complex_is_finite_kernel>()); diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp index f69ca804aa9..ef25220957b 100644 --- a/hip/test/base/math.hip.cpp +++ b/hip/test/base/math.hip.cpp @@ -16,6 +16,7 @@ #include +#include #include #include #include @@ -32,8 +33,8 @@ namespace 
kernel { template __device__ bool test_real_is_finite_function(FuncType isfin) { - constexpr T inf = gko::device_numeric_limits::inf(); - constexpr T quiet_nan = NAN; + const T inf = gko::device_numeric_limits::inf(); + const auto quiet_nan = static_cast(NAN); bool test_true{}; bool test_false{}; @@ -52,8 +53,8 @@ __device__ bool test_complex_is_finite_function(FuncType isfin) "Template type must be a complex type."); using T = gko::remove_complex; using c_type = gko::kernels::hip::hip_type; - constexpr T inf = gko::device_numeric_limits::inf(); - constexpr T quiet_nan = NAN; + const T inf = gko::device_numeric_limits::inf(); + const auto quiet_nan = static_cast(NAN); bool test_true{}; bool test_false{}; @@ -115,6 +116,21 @@ TEST_F(IsFinite, Float) { ASSERT_TRUE(test_real_is_finite_kernel()); } TEST_F(IsFinite, Double) { ASSERT_TRUE(test_real_is_finite_kernel()); } +#if GINKGO_ENABLE_HALF + + +TEST_F(IsFinite, Half) { ASSERT_TRUE(test_real_is_finite_kernel<__half>()); } + + +TEST_F(IsFinite, HalfComplex) +{ + ASSERT_TRUE(test_complex_is_finite_kernel>()); +} + + +#endif // GINKGO_ENABLE_HALF + + TEST_F(IsFinite, FloatComplex) { ASSERT_TRUE(test_complex_is_finite_kernel>()); From 3baaa5354b7cc2b51a449f13638807b42073e1be Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Tue, 5 Nov 2024 17:02:00 +0100 Subject: [PATCH 31/69] figure out factorization test --- core/test/utils/assertions.hpp | 5 ++- test/factorization/cholesky_kernels.cpp | 2 +- test/factorization/lu_kernels.cpp | 2 +- test/factorization/par_ic_kernels.cpp | 15 +++++++- test/factorization/par_ict_kernels.cpp | 3 ++ test/factorization/par_ilu_kernels.cpp | 6 ++++ test/factorization/par_ilut_kernels.cpp | 46 ++++++++++++++++++++----- 7 files changed, 67 insertions(+), 12 deletions(-) diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 7bdc71ea94e..3dae62151b3 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -259,9 +259,12 @@ template double get_relative_error(const MatrixData1& first, const MatrixData2& second) { using std::abs; - using vt = typename detail::biggest_valuetype< + using biggest_vt = typename detail::biggest_valuetype< typename MatrixData1::value_type, typename MatrixData2::value_type>::type; + // using the double or complex to check the error + using vt = std::conditional_t(), + std::complex, double>; using real_vt = remove_complex; real_vt diff = 0.0; diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index 61bc86bbf43..007f3cbf6fd 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -115,7 +115,7 @@ using Types = gko::test::ValueIndexTypes; #elif defined(GKO_COMPILING_CUDA) // CUDA doesn't support long indices for sorting, and the triangular solvers // seem broken -using Types = gko::test::cartesian_type_product_t>; #else // HIP only supports real types and int32 diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp index e9f64bb1152..59f3cb30327 100644 --- a/test/factorization/lu_kernels.cpp +++ b/test/factorization/lu_kernels.cpp @@ -133,7 +133,7 @@ using Types = gko::test::ValueIndexTypesWithHalf; #elif defined(GKO_COMPILING_CUDA) // CUDA don't support 
long indices for sorting, and the triangular solvers // seem broken -using Types = gko::test::cartesian_type_product_t>; #else // HIP only supports real types and int32 diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp index bb53a454e21..10eccb83f10 100644 --- a/test/factorization/par_ic_kernels.cpp +++ b/test/factorization/par_ic_kernels.cpp @@ -108,16 +108,29 @@ TYPED_TEST(ParIc, KernelComputeFactorIsEquivalentToRef) using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; using value_type = typename TestFixture::value_type; +#ifdef GKO_COMPILING_HIP + // hip does not support memory operation in 16bit SKIP_IF_HALF(value_type); +#endif auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); this->mtx_l_ani->convert_to(mtx_l_coo); auto dmtx_l_coo = gko::clone(this->exec, mtx_l_coo); + // If we compute the mtx_near in half, we still get less 1e-4 in half + // precision By using double in mtx_near, we get around 2.4e-4. + // TODO: when gko::half support subnormal value, revisit this. + // Use the reference result as initial values in device::compute_factor, it + // still converges to the same result, which gives around 2.4e-4 against the + // reference result. Applying more iterations on the device side does not + // change the result. It might mean some values are subnormal such that both + // converges to different stable result. 
+ auto tol = std::max( + 1e-4, static_cast(r>::value)); gko::kernels::reference::par_ic_factorization::compute_factor( this->ref, 1, mtx_l_coo.get(), this->mtx_l_ani_init.get()); gko::kernels::GKO_DEVICE_NAMESPACE::par_ic_factorization::compute_factor( this->exec, 100, dmtx_l_coo.get(), this->dmtx_l_ani_init.get()); - GKO_ASSERT_MTX_NEAR(this->mtx_l_ani_init, this->dmtx_l_ani_init, 1e-4); + GKO_EXPECT_MTX_NEAR(this->mtx_l_ani_init, this->dmtx_l_ani_init, tol); } diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp index 945f874ef26..07a4ddc63ff 100644 --- a/test/factorization/par_ict_kernels.cpp +++ b/test/factorization/par_ict_kernels.cpp @@ -125,7 +125,10 @@ TYPED_TEST(ParIct, KernelComputeFactorIsEquivalentToRef) using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; using value_type = typename TestFixture::value_type; +#ifdef GKO_COMPILING_HIP + // hip does not support memory operation in 16bit SKIP_IF_HALF(value_type); +#endif auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); this->mtx_l_ani->convert_to(mtx_l_coo); diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp index 216a4f597cb..8c3ab20a674 100644 --- a/test/factorization/par_ilu_kernels.cpp +++ b/test/factorization/par_ilu_kernels.cpp @@ -238,7 +238,10 @@ TYPED_TEST(ParIlu, KernelComputeParILUIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; +#ifdef GKO_COMPILING_HIP + // hip does not support memory operation in 16bit SKIP_IF_HALF(value_type); +#endif std::unique_ptr l_mtx{}; std::unique_ptr u_mtx{}; std::unique_ptr dl_mtx{}; @@ -257,7 +260,10 @@ TYPED_TEST(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; +#ifdef GKO_COMPILING_HIP + // hip does not support memory operation in 16bit 
SKIP_IF_HALF(value_type); +#endif std::unique_ptr l_mtx{}; std::unique_ptr u_mtx{}; std::unique_ptr dl_mtx{}; diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp index 6804a3edcce..ba2d84b4cc7 100644 --- a/test/factorization/par_ilut_kernels.cpp +++ b/test/factorization/par_ilut_kernels.cpp @@ -48,27 +48,27 @@ class ParIlut : public CommonTestFixture { mtx1 = gko::test::generate_random_matrix( mtx_size[0], mtx_size[1], std::uniform_int_distribution(10, mtx_size[1]), - std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); mtx2 = gko::test::generate_random_matrix( mtx_size[0], mtx_size[1], std::uniform_int_distribution(0, mtx_size[1]), - std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); mtx_square = gko::test::generate_random_matrix( mtx_size[0], mtx_size[0], std::uniform_int_distribution(1, mtx_size[0]), - std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); mtx_l = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); mtx_l2 = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], true, std::uniform_int_distribution(1, mtx_size[0]), - std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); mtx_u = gko::test::generate_random_upper_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); dmtx1 = gko::clone(exec, mtx1); dmtx2 = gko::clone(exec, mtx2); @@ -240,7 +240,10 @@ TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypesWithHalf, 
TYPED_TEST(ParIlut, KernelThresholdSelectIsEquivalentToRef) { using value_type = typename TestFixture::value_type; +#ifdef GKO_COMPILING_HIP + // hip does not support memory operation in 16bit SKIP_IF_HALF(value_type); +#endif this->test_select(this->mtx_l, this->dmtx_l, this->mtx_l->get_num_stored_elements() / 3); @@ -250,7 +253,10 @@ TYPED_TEST(ParIlut, KernelThresholdSelectIsEquivalentToRef) TYPED_TEST(ParIlut, KernelThresholdSelectMinIsEquivalentToRef) { using value_type = typename TestFixture::value_type; +#ifdef GKO_COMPILING_HIP + // hip does not support memory operation in 16bit SKIP_IF_HALF(value_type); +#endif this->test_select(this->mtx_l, this->dmtx_l, 0); } @@ -259,7 +265,10 @@ TYPED_TEST(ParIlut, KernelThresholdSelectMinIsEquivalentToRef) TYPED_TEST(ParIlut, KernelThresholdSelectMaxIsEquivalentToRef) { using value_type = typename TestFixture::value_type; +#ifdef GKO_COMPILING_HIP + // hip does not support memory operation in 16bit SKIP_IF_HALF(value_type); +#endif this->test_select(this->mtx_l, this->dmtx_l, this->mtx_l->get_num_stored_elements() - 1); @@ -327,7 +336,12 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef) using Coo = typename TestFixture::Coo; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; +#ifdef GKO_COMPILING_HIP + // hip does not support memory operation in 16bit + // threshold_filter_approx calls sampleselect_count which needs 16 bits + // memory operation SKIP_IF_HALF(value_type); +#endif this->test_filter(this->mtx_l, this->dmtx_l, 0.5, true); auto res = Csr::create(this->ref, this->mtx_size); auto dres = Csr::create(this->exec, this->mtx_size); @@ -354,7 +368,12 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef) TYPED_TEST(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef) { using value_type = typename TestFixture::value_type; +#ifdef GKO_COMPILING_HIP + // hip does not support memory operation in 16bit + // 
threshold_filter_approx calls sampleselect_count which needs 16 bits + // memory operation SKIP_IF_HALF(value_type); +#endif this->test_filter_approx(this->mtx_l, this->dmtx_l, this->mtx_l->get_num_stored_elements() / 2); @@ -364,7 +383,12 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef) TYPED_TEST(ParIlut, KernelThresholdFilterApproxNoneLowerIsEquivalentToRef) { using value_type = typename TestFixture::value_type; +#ifdef GKO_COMPILING_HIP + // hip does not support memory operation in 16bit + // threshold_filter_approx calls sampleselect_count which needs 16 bits + // memory operation SKIP_IF_HALF(value_type); +#endif this->test_filter_approx(this->mtx_l, this->dmtx_l, 0); } @@ -373,7 +397,12 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNoneLowerIsEquivalentToRef) TYPED_TEST(ParIlut, KernelThresholdFilterApproxAllLowerIsEquivalentToRef) { using value_type = typename TestFixture::value_type; +#ifdef GKO_COMPILING_HIP + // hip does not support memory operation in 16bit + // threshold_filter_approx calls sampleselect_count which needs 16 bits + // memory operation SKIP_IF_HALF(value_type); +#endif this->test_filter_approx(this->mtx_l, this->dmtx_l, this->mtx_l->get_num_stored_elements() - 1); @@ -384,8 +413,6 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; - // there's one value larger than half range - SKIP_IF_HALF(value_type); auto square_size = this->mtx_square->get_size(); auto mtx_lu = Csr::create(this->ref, square_size); this->mtx_l2->apply(this->mtx_u, mtx_lu); @@ -415,7 +442,10 @@ TYPED_TEST(ParIlut, KernelComputeLUIsEquivalentToRef) using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; using value_type = typename TestFixture::value_type; +#ifdef GKO_COMPILING_HIP + // hip does not support memory operation in 16bit SKIP_IF_HALF(value_type); +#endif auto square_size = this->mtx_ani->get_size(); auto 
mtx_l_coo = Coo::create(this->ref, square_size); auto mtx_u_coo = Coo::create(this->ref, square_size); From 53a1d800b02ff7dcfe8f2222d60a17a10e8afa98 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 13 Nov 2024 19:14:11 +0100 Subject: [PATCH 32/69] change the diagonal to reduce random on parilut/parict --- test/factorization/par_ict_kernels.cpp | 13 +++++++++++++ test/factorization/par_ilut_kernels.cpp | 14 +++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp index 07a4ddc63ff..8d6579d584e 100644 --- a/test/factorization/par_ict_kernels.cpp +++ b/test/factorization/par_ict_kernels.cpp @@ -101,6 +101,19 @@ TYPED_TEST(ParIct, KernelAddCandidatesIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; + if (std::is_same_v, gko::half>) { + // We set the diagonal larger than 1 in half precision to reduce the + // possibility of resulting inf. 
It might introduce (a - llh)/diag when + // the entry is not present in the original matrix + auto dist = std::uniform_real_distribution<>(1.0, 10.0); + for (gko::size_type i = 0; i < this->mtx_l->get_size()[0]; i++) { + this->mtx_l + ->get_values()[this->mtx_l->get_const_row_ptrs()[i + 1] - 1] = + gko::detail::get_rand_value(dist, + this->rand_engine); + } + this->dmtx_l->copy_from(this->mtx_l); + } auto mtx_llh = Csr::create(this->ref, this->mtx_size); this->mtx_l->apply(this->mtx_l->conj_transpose(), mtx_llh); auto dmtx_llh = Csr::create(this->exec, this->mtx_size); diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp index ba2d84b4cc7..b1af2b4c748 100644 --- a/test/factorization/par_ilut_kernels.cpp +++ b/test/factorization/par_ilut_kernels.cpp @@ -413,6 +413,18 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; + if (std::is_same_v, gko::half>) { + // We set the diagonal larger than 1 in half precision to reduce the + // possibility of resulting inf.
It might introduce (a - lu)/u_diag when + // the entry is not present in the original matrix + auto dist = std::uniform_real_distribution<>(1.0, 10.0); + for (gko::size_type i = 0; i < this->mtx_u->get_size()[0]; i++) { + this->mtx_u->get_values()[this->mtx_u->get_const_row_ptrs()[i]] = + gko::detail::get_rand_value(dist, + this->rand_engine); + } + this->dmtx_u->copy_from(this->mtx_u); + } auto square_size = this->mtx_square->get_size(); auto mtx_lu = Csr::create(this->ref, square_size); this->mtx_l2->apply(this->mtx_u, mtx_lu); @@ -422,7 +434,7 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef) auto res_mtx_u = Csr::create(this->ref, square_size); auto dres_mtx_l = Csr::create(this->exec, square_size); auto dres_mtx_u = Csr::create(this->exec, square_size); - + // gko::write(std::cout, mtx_lu); gko::kernels::reference::par_ilut_factorization::add_candidates( this->ref, mtx_lu.get(), this->mtx_square.get(), this->mtx_l2.get(), this->mtx_u.get(), res_mtx_l.get(), res_mtx_u.get()); From 94bddee0b8ab2ff292a82623bb276f9a65926d39 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M.
Tsai" Date: Fri, 25 Oct 2024 16:47:14 +0200 Subject: [PATCH 33/69] multigrid and the multigridlevel --- common/cuda_hip/multigrid/pgm_kernels.cpp | 5 +++-- common/cuda_hip/solver/multigrid_kernels.cpp | 12 +++++----- common/unified/multigrid/pgm_kernels.cpp | 4 ++-- core/device_hooks/common_kernels.inc.cpp | 16 ++++++++------ core/multigrid/fixed_coarsening.cpp | 3 ++- core/multigrid/pgm.cpp | 2 +- core/solver/multigrid.cpp | 22 +++++++++++++++++++ core/test/multigrid/fixed_coarsening.cpp | 2 +- core/test/multigrid/pgm.cpp | 2 +- core/test/solver/multigrid.cpp | 7 +++--- dpcpp/multigrid/pgm_kernels.dp.cpp | 5 +++-- dpcpp/solver/multigrid_kernels.dp.cpp | 8 ++++--- omp/multigrid/pgm_kernels.cpp | 5 +++-- omp/solver/multigrid_kernels.cpp | 8 ++++--- reference/multigrid/pgm_kernels.cpp | 9 ++++---- reference/solver/multigrid_kernels.cpp | 8 ++++--- .../multigrid/fixed_coarsening_kernels.cpp | 2 +- reference/test/multigrid/pgm_kernels.cpp | 3 ++- reference/test/solver/multigrid_kernels.cpp | 6 ++--- 19 files changed, 83 insertions(+), 46 deletions(-) diff --git a/common/cuda_hip/multigrid/pgm_kernels.cpp b/common/cuda_hip/multigrid/pgm_kernels.cpp index d3c44cf540e..0077b801e46 100644 --- a/common/cuda_hip/multigrid/pgm_kernels.cpp +++ b/common/cuda_hip/multigrid/pgm_kernels.cpp @@ -54,7 +54,8 @@ void sort_row_major(std::shared_ptr exec, size_type nnz, thrust::sort_by_key(thrust_policy(exec), it, it + nnz, vals_it); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_PGM_SORT_ROW_MAJOR); template @@ -78,7 +79,7 @@ void compute_coarse_coo(std::shared_ptr exec, vals_it, coarse_key_it, coarse_vals_it); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PGM_COMPUTE_COARSE_COO); diff --git a/common/cuda_hip/solver/multigrid_kernels.cpp b/common/cuda_hip/solver/multigrid_kernels.cpp index 
9b22e457203..b5d8a0f77b9 100644 --- a/common/cuda_hip/solver/multigrid_kernels.cpp +++ b/common/cuda_hip/solver/multigrid_kernels.cpp @@ -141,7 +141,8 @@ void kcycle_step_1(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL); template @@ -174,7 +175,8 @@ void kcycle_step_2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL); template @@ -192,13 +194,13 @@ void kcycle_check_stop(std::shared_ptr exec, kernel::kcycle_check_stop_kernel<<get_stream()>>>( nrhs, as_device_type(old_norm->get_const_values()), - as_device_type(new_norm->get_const_values()), rel_tol, - as_device_type(dis_stop.get_data())); + as_device_type(new_norm->get_const_values()), + as_device_type(rel_tol), as_device_type(dis_stop.get_data())); } is_stop = get_element(dis_stop, 0); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF( GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL); diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp index 9ba144cba2e..9e59671a821 100644 --- a/common/unified/multigrid/pgm_kernels.cpp +++ b/common/unified/multigrid/pgm_kernels.cpp @@ -217,7 +217,7 @@ void find_strongest_neighbor( strongest_neighbor.get_data()); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PGM_FIND_STRONGEST_NEIGHBOR); template @@ -305,7 +305,7 @@ void assign_to_exist_agg(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PGM_ASSIGN_TO_EXIST_AGG); diff --git 
a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 4e64134a9f2..9c492871a84 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -694,9 +694,10 @@ GKO_STUB(GKO_DECLARE_IR_INITIALIZE_KERNEL); namespace multigrid { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL); -GKO_STUB_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL); +GKO_STUB_NON_COMPLEX_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL); } // namespace multigrid @@ -1125,11 +1126,12 @@ GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_SORT_AGG_KERNEL); GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_MAP_ROW_KERNEL); GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_MAP_COL_KERNEL); GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_COUNT_UNREPEATED_NNZ_KERNEL); -GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE( +GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PGM_FIND_STRONGEST_NEIGHBOR); -GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_ASSIGN_TO_EXIST_AGG); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_COMPUTE_COARSE_COO); +GKO_STUB_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_PGM_ASSIGN_TO_EXIST_AGG); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PGM_SORT_ROW_MAJOR); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PGM_COMPUTE_COARSE_COO); GKO_STUB_INDEX_TYPE(GKO_DECLARE_PGM_GATHER_INDEX); diff --git a/core/multigrid/fixed_coarsening.cpp b/core/multigrid/fixed_coarsening.cpp index 1cbdd557fb4..f62ce746d6b 100644 --- a/core/multigrid/fixed_coarsening.cpp +++ b/core/multigrid/fixed_coarsening.cpp @@ -90,7 +90,8 @@ void FixedCoarsening::generate() #define GKO_DECLARE_FIXED_COARSENING(_vtype, 
_itype) \ class FixedCoarsening<_vtype, _itype> -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FIXED_COARSENING); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_FIXED_COARSENING); } // namespace multigrid diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index 9f1f5b50ba6..e531fb2b996 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -541,7 +541,7 @@ void Pgm::generate() #define GKO_DECLARE_PGM(_vtype, _itype) class Pgm<_vtype, _itype> -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_PGM); } // namespace multigrid diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 2ecd3dd74c4..0b918a13897 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -318,6 +319,9 @@ void MultigridState::generate(const LinOp* system_matrix_in, auto mg_level = mg_level_list.at(i); run, +#endif std::complex, std::complex>( mg_level, [&, this](auto mg_level, auto i, auto cycle, auto current_nrows, @@ -456,6 +460,9 @@ void MultigridState::run_mg_cycle(multigrid::cycle cycle, size_type level, } auto mg_level = multigrid->get_mg_level_list().at(level); run, +#endif std::complex, std::complex>( mg_level, [&, this](auto mg_level) { #if GINKGO_BUILD_MPI @@ -705,6 +712,9 @@ void Multigrid::generate() } run, +#endif std::complex, std::complex>( mg_level, [this](auto mg_level, auto index, auto matrix) { @@ -743,6 +753,9 @@ void Multigrid::generate() // generate coarsest solver run, +#endif std::complex, std::complex>( last_mg_level, [this](auto mg_level, auto level, auto matrix) { @@ -860,6 +873,9 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* b, LinOp* x, }; auto first_mg_level = this->get_mg_level_list().front(); run, +#endif std::complex, std::complex>(first_mg_level, lambda, b, x); } @@ -899,6 +915,9 @@ void 
Multigrid::apply_with_initial_guess_impl(const LinOp* alpha, }; auto first_mg_level = this->get_mg_level_list().front(); run, +#endif std::complex, std::complex>(first_mg_level, lambda, alpha, b, beta, x); } @@ -964,6 +983,9 @@ void Multigrid::apply_dense_impl(const VectorType* b, VectorType* x, auto first_mg_level = this->get_mg_level_list().front(); run, +#endif std::complex, std::complex>(first_mg_level, lambda, b, x); } diff --git a/core/test/multigrid/fixed_coarsening.cpp b/core/test/multigrid/fixed_coarsening.cpp index 5cab7282b5d..35bd04bb067 100644 --- a/core/test/multigrid/fixed_coarsening.cpp +++ b/core/test/multigrid/fixed_coarsening.cpp @@ -38,7 +38,7 @@ class FixedCoarseningFactory : public ::testing::Test { std::unique_ptr fixed_coarsening_factory; }; -TYPED_TEST_SUITE(FixedCoarseningFactory, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(FixedCoarseningFactory, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/core/test/multigrid/pgm.cpp b/core/test/multigrid/pgm.cpp index 7798e97f5d6..c06edda60a0 100644 --- a/core/test/multigrid/pgm.cpp +++ b/core/test/multigrid/pgm.cpp @@ -40,7 +40,7 @@ class PgmFactory : public ::testing::Test { std::unique_ptr pgm_factory; }; -TYPED_TEST_SUITE(PgmFactory, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(PgmFactory, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/core/test/solver/multigrid.cpp b/core/test/solver/multigrid.cpp index 8cb545f6cb2..54c4a18b8d3 100644 --- a/core/test/solver/multigrid.cpp +++ b/core/test/solver/multigrid.cpp @@ -75,9 +75,7 @@ class DummyLinOpWithFactory std::make_shared(this->get_executor(), gko::dim<2>{n_, n_ - 1}), gko::share(gko::test::generate_random_dense_matrix( - n_ - 1, n_ - 1, - std::uniform_real_distribution>( - 0, 1), + n_ - 1, n_ - 1, std::uniform_real_distribution<>(0, 1), std::default_random_engine{}, factory->get_executor())), std::make_shared(this->get_executor(), gko::dim<2>{n_ - 1, n_})); @@ -166,7 +164,8 @@ class 
Multigrid : public ::testing::Test { } }; -TYPED_TEST_SUITE(Multigrid, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Multigrid, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Multigrid, MultigridFactoryKnowsItsExecutor) diff --git a/dpcpp/multigrid/pgm_kernels.dp.cpp b/dpcpp/multigrid/pgm_kernels.dp.cpp index a9148c54ff4..e645ba3bc6e 100644 --- a/dpcpp/multigrid/pgm_kernels.dp.cpp +++ b/dpcpp/multigrid/pgm_kernels.dp.cpp @@ -56,7 +56,8 @@ void sort_row_major(std::shared_ptr exec, size_type nnz, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_PGM_SORT_ROW_MAJOR); template @@ -89,7 +90,7 @@ void compute_coarse_coo(std::shared_ptr exec, [](auto a, auto b) { return a + b; }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PGM_COMPUTE_COARSE_COO); diff --git a/dpcpp/solver/multigrid_kernels.dp.cpp b/dpcpp/solver/multigrid_kernels.dp.cpp index aaf0ab63354..cdbcb39d043 100644 --- a/dpcpp/solver/multigrid_kernels.dp.cpp +++ b/dpcpp/solver/multigrid_kernels.dp.cpp @@ -31,7 +31,8 @@ void kcycle_step_1(std::shared_ptr exec, matrix::Dense* g, matrix::Dense* d, matrix::Dense* e) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL); template @@ -44,7 +45,8 @@ void kcycle_step_2(std::shared_ptr exec, const matrix::Dense* d, matrix::Dense* e) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL); template @@ -54,7 +56,7 @@ void kcycle_check_stop(std::shared_ptr exec, const ValueType rel_tol, bool& is_stop) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF( GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL); diff --git a/omp/multigrid/pgm_kernels.cpp b/omp/multigrid/pgm_kernels.cpp index 4c824a0140b..bfe95291f2e 100644 --- a/omp/multigrid/pgm_kernels.cpp +++ b/omp/multigrid/pgm_kernels.cpp @@ -47,7 +47,8 @@ void sort_row_major(std::shared_ptr exec, size_type nnz, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_PGM_SORT_ROW_MAJOR); template @@ -83,7 +84,7 @@ void compute_coarse_coo(std::shared_ptr exec, coarse_val[coarse_idxs] = temp_val; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PGM_COMPUTE_COARSE_COO); diff --git a/omp/solver/multigrid_kernels.cpp b/omp/solver/multigrid_kernels.cpp index 12e5bad8577..509ecf51828 100644 --- a/omp/solver/multigrid_kernels.cpp +++ b/omp/solver/multigrid_kernels.cpp @@ -44,7 +44,8 @@ void kcycle_step_1(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL); template @@ -74,7 +75,8 @@ void kcycle_step_2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL); template @@ -92,7 +94,7 @@ void kcycle_check_stop(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF( GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL); diff --git a/reference/multigrid/pgm_kernels.cpp b/reference/multigrid/pgm_kernels.cpp index bff2a776c6b..2b4298377cb 100644 --- a/reference/multigrid/pgm_kernels.cpp +++ b/reference/multigrid/pgm_kernels.cpp @@ -208,7 +208,7 @@ void find_strongest_neighbor( } } 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PGM_FIND_STRONGEST_NEIGHBOR); @@ -260,7 +260,7 @@ void assign_to_exist_agg(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PGM_ASSIGN_TO_EXIST_AGG); @@ -274,7 +274,8 @@ void sort_row_major(std::shared_ptr exec, size_type nnz, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM_SORT_ROW_MAJOR); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_PGM_SORT_ROW_MAJOR); template @@ -311,7 +312,7 @@ void compute_coarse_coo(std::shared_ptr exec, coarse_val[coarse_idxs] = temp_val; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_PGM_COMPUTE_COARSE_COO); diff --git a/reference/solver/multigrid_kernels.cpp b/reference/solver/multigrid_kernels.cpp index b08c9857d3a..4ce4491c990 100644 --- a/reference/solver/multigrid_kernels.cpp +++ b/reference/solver/multigrid_kernels.cpp @@ -43,7 +43,8 @@ void kcycle_step_1(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL); template @@ -72,7 +73,8 @@ void kcycle_step_2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL); template @@ -89,7 +91,7 @@ void kcycle_check_stop(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_WITH_HALF( GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL); diff --git a/reference/test/multigrid/fixed_coarsening_kernels.cpp b/reference/test/multigrid/fixed_coarsening_kernels.cpp index 
582950b4e17..001e23d6124 100644 --- a/reference/test/multigrid/fixed_coarsening_kernels.cpp +++ b/reference/test/multigrid/fixed_coarsening_kernels.cpp @@ -143,7 +143,7 @@ class FixedCoarsening : public ::testing::Test { std::unique_ptr mg_level; }; -TYPED_TEST_SUITE(FixedCoarsening, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(FixedCoarsening, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/reference/test/multigrid/pgm_kernels.cpp b/reference/test/multigrid/pgm_kernels.cpp index 2fc754f23b3..e715b2175d3 100644 --- a/reference/test/multigrid/pgm_kernels.cpp +++ b/reference/test/multigrid/pgm_kernels.cpp @@ -187,7 +187,8 @@ class Pgm : public ::testing::Test { std::unique_ptr mg_level; }; -TYPED_TEST_SUITE(Pgm, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Pgm, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Pgm, CanBeCopied) diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp index 57ba8fba84d..7b79ba98ad2 100644 --- a/reference/test/solver/multigrid_kernels.cpp +++ b/reference/test/solver/multigrid_kernels.cpp @@ -154,7 +154,7 @@ class DummyLinOpWithFactory { auto alpha_value = gko::as>(alpha)->at(0, 0); - gko::remove_complex scale = std::real(alpha_value); + gko::remove_complex scale = gko::real(alpha_value); global_step *= static_cast(scale); step.push_back(global_step); global_step++; @@ -233,7 +233,7 @@ class Multigrid : public ::testing::Test { using Smoother = gko::solver::Ir; using InnerSolver = gko::preconditioner::Jacobi; using CoarsestSolver = gko::solver::Cg; - using CoarsestNextSolver = gko::solver::Cg>; + using CoarsestNextSolver = gko::solver::Cg>; using DummyRPFactory = DummyMultigridLevelWithFactory; using DummyFactory = DummyLinOpWithFactory; Multigrid() @@ -415,7 +415,7 @@ class Multigrid : public ::testing::Test { std::shared_ptr x2; }; -TYPED_TEST_SUITE(Multigrid, gko::test::ValueIndexTypes, 
+TYPED_TEST_SUITE(Multigrid, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); From 1c2444a5573c3e0cc0f6b216ee5b607d9002124e Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 28 Oct 2024 17:07:11 +0100 Subject: [PATCH 34/69] pgm uses gko::max to avoid ambiguous in hip --- common/unified/multigrid/pgm_kernels.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp index 9e59671a821..2b0c04592a7 100644 --- a/common/unified/multigrid/pgm_kernels.cpp +++ b/common/unified/multigrid/pgm_kernels.cpp @@ -183,7 +183,7 @@ void find_strongest_neighbor( continue; } auto weight = - weight_vals[idx] / max(abs(diag[row]), abs(diag[col])); + weight_vals[idx] / gko::max(abs(diag[row]), abs(diag[col])); if (agg[col] == -1 && device_std::tie(weight, col) > device_std::tie(max_weight_unagg, strongest_unagg)) { @@ -246,8 +246,8 @@ void assign_to_exist_agg(std::shared_ptr exec, if (col == row) { continue; } - auto weight = - weight_vals[idx] / max(abs(diag[row]), abs(diag[col])); + auto weight = weight_vals[idx] / + gko::max(abs(diag[row]), abs(diag[col])); if (agg_const_val[col] != -1 && device_std::tie(weight, col) > device_std::tie(max_weight_agg, strongest_agg)) { @@ -284,8 +284,8 @@ void assign_to_exist_agg(std::shared_ptr exec, if (col == row) { continue; } - auto weight = - weight_vals[idx] / max(abs(diag[row]), abs(diag[col])); + auto weight = weight_vals[idx] / + gko::max(abs(diag[row]), abs(diag[col])); if (agg_val[col] != -1 && device_std::tie(weight, col) > device_std::tie(max_weight_agg, strongest_agg)) { From 1dd8144870eac104b954cae68ef8d63f441833ea Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Fri, 25 Oct 2024 16:59:11 +0200 Subject: [PATCH 35/69] multigrid config dispatch --- core/config/multigrid_config.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/config/multigrid_config.cpp b/core/config/multigrid_config.cpp index 83be1a1742b..8cc4b4e1ca3 100644 --- a/core/config/multigrid_config.cpp +++ b/core/config/multigrid_config.cpp @@ -10,7 +10,7 @@ namespace gko { namespace config { -GKO_PARSE_VALUE_AND_INDEX_TYPE(Pgm, gko::multigrid::Pgm); +GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Pgm, gko::multigrid::Pgm); } // namespace config From c1bb518f26737183992a61e010161c68c9adb76d Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Fri, 25 Oct 2024 18:54:05 +0200 Subject: [PATCH 36/69] preconditioner with half --- common/cuda_hip/components/warp_blas.hpp | 2 +- .../cuda_hip/preconditioner/isai_kernels.cpp | 10 ++-- .../jacobi_advanced_apply_kernels.cpp | 3 +- ...obi_advanced_apply_kernels.instantiate.cpp | 2 +- .../jacobi_generate_kernels.cpp | 2 +- .../jacobi_generate_kernels.instantiate.cpp | 2 +- .../preconditioner/jacobi_kernels.cpp | 8 ++-- .../jacobi_simple_apply_kernels.cpp | 2 +- ...acobi_simple_apply_kernels.instantiate.cpp | 2 +- .../cuda_hip/preconditioner/sor_kernels.cpp | 17 +++---- .../unified/preconditioner/jacobi_kernels.cpp | 13 +++-- core/device_hooks/common_kernels.inc.cpp | 47 +++++++++++-------- core/preconditioner/gauss_seidel.cpp | 3 +- core/preconditioner/ic.cpp | 12 +++-- core/preconditioner/ilu.cpp | 23 +++++---- core/preconditioner/isai.cpp | 9 ++-- core/preconditioner/jacobi.cpp | 11 +++-- core/preconditioner/jacobi_utils.hpp | 4 +- core/preconditioner/sor.cpp | 2 +- core/test/preconditioner/isai.cpp | 2 +- core/test/preconditioner/jacobi.cpp | 2 +- dpcpp/preconditioner/isai_kernels.dp.cpp | 14 +++--- ...cobi_advanced_apply_instantiate.inc.dp.cpp | 2 +- .../jacobi_advanced_apply_kernel.dp.cpp | 3 +- .../jacobi_generate_instantiate.inc.dp.cpp | 2 +- .../jacobi_generate_kernel.dp.cpp | 2 +- 
dpcpp/preconditioner/jacobi_kernels.dp.cpp | 8 ++-- ...jacobi_simple_apply_instantiate.inc.dp.cpp | 2 +- .../jacobi_simple_apply_kernel.dp.cpp | 2 +- dpcpp/preconditioner/sor_kernels.dp.cpp | 4 +- .../test/preconditioner/jacobi_kernels.dp.cpp | 21 ++++----- include/ginkgo/core/preconditioner/ic.hpp | 2 +- include/ginkgo/core/preconditioner/ilu.hpp | 3 +- omp/preconditioner/isai_kernels.cpp | 10 ++-- omp/preconditioner/jacobi_kernels.cpp | 15 +++--- omp/preconditioner/sor_kernels.cpp | 4 +- reference/preconditioner/isai_kernels.cpp | 10 ++-- reference/preconditioner/jacobi_kernels.cpp | 28 ++++++----- reference/preconditioner/sor_kernels.cpp | 4 +- .../test/preconditioner/gauss_seidel.cpp | 2 +- reference/test/preconditioner/ic.cpp | 13 ++--- reference/test/preconditioner/ilu.cpp | 14 +++--- .../test/preconditioner/isai_kernels.cpp | 35 ++++++++++++-- reference/test/preconditioner/jacobi.cpp | 5 +- .../test/preconditioner/jacobi_kernels.cpp | 32 ++++++++----- reference/test/preconditioner/sor_kernels.cpp | 3 +- reference/test/solver/multigrid_kernels.cpp | 6 ++- 47 files changed, 250 insertions(+), 174 deletions(-) diff --git a/common/cuda_hip/components/warp_blas.hpp b/common/cuda_hip/components/warp_blas.hpp index 116b963ad11..0df0612152c 100644 --- a/common/cuda_hip/components/warp_blas.hpp +++ b/common/cuda_hip/components/warp_blas.hpp @@ -425,7 +425,7 @@ __device__ __forceinline__ remove_complex compute_infinity_norm( } } return reduce(group, sum, - [](result_type x, result_type y) { return max(x, y); }); + [](result_type x, result_type y) { return gko::max(x, y); }); } diff --git a/common/cuda_hip/preconditioner/isai_kernels.cpp b/common/cuda_hip/preconditioner/isai_kernels.cpp index d6fdd6389fc..77fdb3c0e23 100644 --- a/common/cuda_hip/preconditioner/isai_kernels.cpp +++ b/common/cuda_hip/preconditioner/isai_kernels.cpp @@ -487,7 +487,7 @@ void generate_tri_inverse(std::shared_ptr exec, components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1); } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL); @@ -516,7 +516,7 @@ void generate_general_inverse(std::shared_ptr exec, components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL); @@ -548,7 +548,7 @@ void generate_excess_system(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL); @@ -568,7 +568,7 @@ void scale_excess_solution(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL); @@ -593,7 +593,7 @@ void scatter_excess_solution(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); diff --git a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp index 27b4f57eb6c..fcd86bdba29 100644 --- a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp +++ b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp @@ -66,7 +66,8 @@ void apply(std::shared_ptr exec, size_type num_blocks, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_JACOBI_APPLY_KERNEL); } // namespace jacobi diff --git a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp index 131c530d2ee..62d9c1ece43 100644 --- 
a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp +++ b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp @@ -160,7 +160,7 @@ void advanced_apply( const preconditioner::block_interleaved_storage_scheme&, \ const ValueType*, const ValueType*, size_type, ValueType*, size_type) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION); diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp index 207550ff6b1..7c37e578045 100644 --- a/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp +++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp @@ -68,7 +68,7 @@ void generate(std::shared_ptr exec, block_pointers.get_const_data(), num_blocks); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_GENERATE_KERNEL); diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp index fdb0ad11e9e..5efd0c40632 100644 --- a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp +++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp @@ -268,7 +268,7 @@ void generate(syn::value_list, remove_complex*, precision_reduction*, const IndexType*, \ size_type) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( DECLARE_JACOBI_GENERATE_INSTANTIATION); diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.cpp index 6f2d4ae3974..adcc08e37e9 100644 --- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp +++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp @@ -297,7 +297,7 @@ void find_blocks(std::shared_ptr exec, exec, max_block_size, 
num_natural_blocks, block_pointers.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL); @@ -364,7 +364,7 @@ void transpose_jacobi( storage_scheme, out_blocks.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL); @@ -388,7 +388,7 @@ void conj_transpose_jacobi( storage_scheme, out_blocks.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); @@ -401,7 +401,7 @@ void convert_to_dense( storage_scheme, ValueType* result_values, size_type result_stride) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); diff --git a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp index e9b7b10fd88..fb73c22ccef 100644 --- a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp +++ b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp @@ -57,7 +57,7 @@ void simple_apply( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); diff --git a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp index faf869718a6..3a35fbe3f04 100644 --- a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp +++ b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp @@ -151,7 +151,7 @@ void apply(syn::value_list, const preconditioner::block_interleaved_storage_scheme&, \ const ValueType*, size_type, ValueType*, size_type) 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION); diff --git a/common/cuda_hip/preconditioner/sor_kernels.cpp b/common/cuda_hip/preconditioner/sor_kernels.cpp index 4805eca3ab3..f75a52b3af2 100644 --- a/common/cuda_hip/preconditioner/sor_kernels.cpp +++ b/common/cuda_hip/preconditioner/sor_kernels.cpp @@ -26,7 +26,7 @@ void initialize_weighted_l( const auto grid_dim = static_cast( ceildiv(num_rows, static_cast(block_size))); - auto inv_weight = one(weight) / weight; + auto inv_weight = as_device_type(one(weight) / weight); if (grid_dim > 0) { using namespace gko::factorization; @@ -46,7 +46,7 @@ void initialize_weighted_l( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L); @@ -62,9 +62,10 @@ void initialize_weighted_l_u( const auto grid_dim = static_cast( ceildiv(num_rows, static_cast(block_size))); - auto inv_weight = one(weight) / weight; - auto inv_two_minus_weight = - one(weight) / (static_cast>(2.0) - weight); + auto inv_weight = as_device_type(one(weight) / weight); + auto inv_two_minus_weight = as_device_type( + one(weight) / (static_cast>(2.0) - weight)); + auto d_weight = as_device_type(weight); if (grid_dim > 0) { using namespace gko::factorization; @@ -87,13 +88,13 @@ void initialize_weighted_l_u( [inv_two_minus_weight] __device__(auto val) { return val * inv_two_minus_weight; }, - [weight, inv_two_minus_weight] __device__(auto val) { - return val * weight * inv_two_minus_weight; + [d_weight, inv_two_minus_weight] __device__(auto val) { + return val * d_weight * inv_two_minus_weight; })); } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U); diff --git a/common/unified/preconditioner/jacobi_kernels.cpp b/common/unified/preconditioner/jacobi_kernels.cpp index 
dce00fd1366..00f3d62f312 100644 --- a/common/unified/preconditioner/jacobi_kernels.cpp +++ b/common/unified/preconditioner/jacobi_kernels.cpp @@ -32,7 +32,8 @@ void scalar_conj(std::shared_ptr exec, diag.get_size(), diag, conj_diag); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL); template @@ -49,7 +50,8 @@ void invert_diagonal(std::shared_ptr exec, diag.get_size(), diag, inv_diag); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL); template @@ -83,7 +85,8 @@ void scalar_apply(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); template @@ -100,7 +103,7 @@ void simple_scalar_apply(std::shared_ptr exec, x->get_size(), diag, b, x); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); @@ -120,7 +123,7 @@ void scalar_convert_to_dense(std::shared_ptr exec, result->get_size(), blocks, result); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL); diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 9c492871a84..7215a17aec5 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -952,18 +952,21 @@ GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL); namespace jacobi { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_GENERATE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL); 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_GENERATE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_APPLY_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL); +GKO_STUB_VALUE_TYPE_WITH_HALF(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); GKO_STUB(GKO_DECLARE_JACOBI_INITIALIZE_PRECISIONS_KERNEL); @@ -973,8 +976,9 @@ GKO_STUB(GKO_DECLARE_JACOBI_INITIALIZE_PRECISIONS_KERNEL); namespace sor { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + 
GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U); } // namespace sor @@ -983,11 +987,16 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U); namespace isai { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); } // namespace isai diff --git a/core/preconditioner/gauss_seidel.cpp b/core/preconditioner/gauss_seidel.cpp index aec7a4ff827..f4735cff5bc 100644 --- a/core/preconditioner/gauss_seidel.cpp +++ b/core/preconditioner/gauss_seidel.cpp @@ -71,7 +71,8 @@ std::unique_ptr GaussSeidel::generate_impl( #define GKO_DECLARE_GAUSS_SEIDEL(ValueType, IndexType) \ class GaussSeidel -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GAUSS_SEIDEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_GAUSS_SEIDEL); } // namespace preconditioner diff --git a/core/preconditioner/ic.cpp b/core/preconditioner/ic.cpp index 691795ad60b..2e9833c21f7 100644 --- a/core/preconditioner/ic.cpp +++ b/core/preconditioner/ic.cpp @@ -50,28 +50,32 @@ typename Ic::parameters_type ic_parse( ic_parse, IndexType>>( \ const config::pnode&, const config::registry&, \ const config::type_descriptor&) 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWERTRS_IC_PARSE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_LOWERTRS_IC_PARSE); #define GKO_DECLARE_IR_IC_PARSE(ValueType, IndexType) \ typename Ic, IndexType>::parameters_type \ ic_parse, IndexType>>( \ const config::pnode&, const config::registry&, \ const config::type_descriptor&) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IR_IC_PARSE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_IR_IC_PARSE); #define GKO_DECLARE_GMRES_IC_PARSE(ValueType, IndexType) \ typename Ic, IndexType>::parameters_type \ ic_parse, IndexType>>( \ const config::pnode&, const config::registry&, \ const config::type_descriptor&) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GMRES_IC_PARSE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_GMRES_IC_PARSE); #define GKO_DECLARE_LOWERISAI_IC_PARSE(ValueType, IndexType) \ typename Ic, IndexType>::parameters_type \ ic_parse, IndexType>>( \ const config::pnode&, const config::registry&, \ const config::type_descriptor&) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWERISAI_IC_PARSE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_LOWERISAI_IC_PARSE); } // namespace detail } // namespace preconditioner diff --git a/core/preconditioner/ilu.cpp b/core/preconditioner/ilu.cpp index d6f49e49588..dae6cf97829 100644 --- a/core/preconditioner/ilu.cpp +++ b/core/preconditioner/ilu.cpp @@ -59,7 +59,8 @@ typename Ilu::parameters_type ilu_parse( solver::UpperTrs, false, IndexType>>( \ const config::pnode&, const config::registry&, \ const config::type_descriptor&) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_TRS_ILU_FALSE_PARSE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_TRS_ILU_FALSE_PARSE); #define GKO_DECLARE_TRS_ILU_TRUE_PARSE(ValueType, IndexType) \ typename Ilu, \ @@ -69,7 +70,8 @@ 
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_TRS_ILU_FALSE_PARSE); solver::UpperTrs, true, IndexType>>( \ const config::pnode&, const config::registry&, \ const config::type_descriptor&) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_TRS_ILU_TRUE_PARSE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_TRS_ILU_TRUE_PARSE); #define GKO_DECLARE_GMRES_ILU_FALSE_PARSE(ValueType, IndexType) \ typename Ilu, solver::Gmres, false, \ @@ -77,7 +79,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_TRS_ILU_TRUE_PARSE); ilu_parse, solver::Gmres, false, \ IndexType>>(const config::pnode&, const config::registry&, \ const config::type_descriptor&) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_GMRES_ILU_FALSE_PARSE); #define GKO_DECLARE_GMRES_ILU_TRUE_PARSE(ValueType, IndexType) \ @@ -86,7 +88,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( ilu_parse, solver::Gmres, true, \ IndexType>>(const config::pnode&, const config::registry&, \ const config::type_descriptor&) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GMRES_ILU_TRUE_PARSE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_GMRES_ILU_TRUE_PARSE); #define GKO_DECLARE_IR_ILU_FALSE_PARSE(ValueType, IndexType) \ typename Ilu, solver::Ir, false, \ @@ -95,7 +98,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GMRES_ILU_TRUE_PARSE); Ilu, solver::Ir, false, IndexType>>( \ const config::pnode&, const config::registry&, \ const config::type_descriptor&) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IR_ILU_FALSE_PARSE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_IR_ILU_FALSE_PARSE); #define GKO_DECLARE_IR_ILU_TRUE_PARSE(ValueType, IndexType) \ typename Ilu, solver::Ir, true, \ @@ -104,7 +108,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IR_ILU_FALSE_PARSE); Ilu, solver::Ir, true, IndexType>>( 
\ const config::pnode&, const config::registry&, \ const config::type_descriptor&) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IR_ILU_TRUE_PARSE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_IR_ILU_TRUE_PARSE); #define GKO_DECLARE_ISAI_ILU_FALSE_PARSE(ValueType, IndexType) \ typename Ilu, \ @@ -114,7 +119,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IR_ILU_TRUE_PARSE); UpperIsai, false, IndexType>>( \ const config::pnode&, const config::registry&, \ const config::type_descriptor&) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_ILU_FALSE_PARSE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_ISAI_ILU_FALSE_PARSE); #define GKO_DECLARE_ISAI_ILU_TRUE_PARSE(ValueType, IndexType) \ typename Ilu, \ @@ -124,7 +130,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_ILU_FALSE_PARSE); UpperIsai, true, IndexType>>( \ const config::pnode&, const config::registry&, \ const config::type_descriptor&) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_ILU_TRUE_PARSE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_ISAI_ILU_TRUE_PARSE); } // namespace detail diff --git a/core/preconditioner/isai.cpp b/core/preconditioner/isai.cpp index 9684f1bdb27..ec0ef365592 100644 --- a/core/preconditioner/isai.cpp +++ b/core/preconditioner/isai.cpp @@ -358,19 +358,20 @@ std::unique_ptr Isai::conj_transpose() #define GKO_DECLARE_LOWER_ISAI(ValueType, IndexType) \ class Isai -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_ISAI); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_LOWER_ISAI); #define GKO_DECLARE_UPPER_ISAI(ValueType, IndexType) \ class Isai -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_ISAI); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_UPPER_ISAI); #define GKO_DECLARE_GENERAL_ISAI(ValueType, IndexType) \ class Isai 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_GENERAL_ISAI); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_GENERAL_ISAI); #define GKO_DECLARE_SPD_ISAI(ValueType, IndexType) \ class Isai -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPD_ISAI); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SPD_ISAI); } // namespace preconditioner diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index f6d5b042a23..3f773710ceb 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -328,10 +328,11 @@ void Jacobi::generate(const LinOp* system_matrix, if (parameters_.max_block_size == 1) { auto diag = share(as(system_matrix) ->extract_diagonal_linop()); - auto diag_vt = - ::gko::detail::temporary_conversion>:: - template create>>( - diag.get()); + auto diag_vt = ::gko::detail:: + temporary_conversion>::template create< + matrix::Diagonal>, + matrix::Diagonal>>>(diag.get()); if (!diag_vt) { GKO_NOT_SUPPORTED(system_matrix); } @@ -374,7 +375,7 @@ void Jacobi::generate(const LinOp* system_matrix, #define GKO_DECLARE_JACOBI(ValueType, IndexType) \ class Jacobi -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_JACOBI); } // namespace preconditioner diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp index e159fd15776..36e7d1ccd75 100644 --- a/core/preconditioner/jacobi_utils.hpp +++ b/core/preconditioner/jacobi_utils.hpp @@ -108,8 +108,8 @@ GKO_ATTRIBUTES GKO_INLINE uint32 get_supported_storage_reductions( using gko::detail::float_traits; using type = remove_complex; using prd = precision_reduction_descriptor; - auto accurate = [&cond, &accuracy](type eps) { - return cond * eps < accuracy; + auto accurate = [&cond, &accuracy](auto eps) { + return cond * static_cast(eps) < accuracy; }; uint8 is_verified1 = 2; auto supported = 
static_cast(prd::p0n0); diff --git a/core/preconditioner/sor.cpp b/core/preconditioner/sor.cpp index c9905c5f73c..b671a99c6fb 100644 --- a/core/preconditioner/sor.cpp +++ b/core/preconditioner/sor.cpp @@ -161,7 +161,7 @@ std::unique_ptr Sor::generate_impl( #define GKO_DECLARE_SOR(ValueType, IndexType) class Sor -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SOR); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_SOR); } // namespace preconditioner diff --git a/core/test/preconditioner/isai.cpp b/core/test/preconditioner/isai.cpp index b5e7400d0e8..b2ee8175d49 100644 --- a/core/test/preconditioner/isai.cpp +++ b/core/test/preconditioner/isai.cpp @@ -64,7 +64,7 @@ class IsaiFactory : public ::testing::Test { std::unique_ptr upper_isai_factory; }; -TYPED_TEST_SUITE(IsaiFactory, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(IsaiFactory, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/core/test/preconditioner/jacobi.cpp b/core/test/preconditioner/jacobi.cpp index 8813b4c3c4d..40bc9e8d494 100644 --- a/core/test/preconditioner/jacobi.cpp +++ b/core/test/preconditioner/jacobi.cpp @@ -43,7 +43,7 @@ class JacobiFactory : public ::testing::Test { std::shared_ptr> mtx; }; -TYPED_TEST_SUITE(JacobiFactory, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(JacobiFactory, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp index 4082035ff9f..8d5429b088a 100644 --- a/dpcpp/preconditioner/isai_kernels.dp.cpp +++ b/dpcpp/preconditioner/isai_kernels.dp.cpp @@ -365,7 +365,7 @@ void generate_general_inverse( if (spd) { auto diag = subwarp.shfl(sol, num_elems - 1); - sol /= std::sqrt(diag); + sol /= gko::sqrt(diag); } return sol; @@ -531,7 +531,7 @@ void scale_excess_solution(const IndexType* __restrict__ excess_block_ptrs, return; } const auto diag = excess_solution[block_end - 1]; - const ValueType scal = one() / 
std::sqrt(diag); + const ValueType scal = one() / gko::sqrt(diag); for (size_type i = block_begin + local_id; i < block_end; i += subwarp_size) { @@ -642,7 +642,7 @@ void generate_tri_inverse(std::shared_ptr exec, components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL); @@ -669,7 +669,7 @@ void generate_general_inverse(std::shared_ptr exec, components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL); @@ -699,7 +699,7 @@ void generate_excess_system(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL); @@ -718,7 +718,7 @@ void scale_excess_solution(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL); @@ -742,7 +742,7 @@ void scatter_excess_solution(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp index e8c086ec0a6..4b9077d5ec5 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp @@ -197,7 +197,7 @@ void advanced_apply( const preconditioner::block_interleaved_storage_scheme&, \ const ValueType*, const ValueType*, size_type, ValueType*, size_type) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION); diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp index 0e26989808e..72a32c2d5cb 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp @@ -65,7 +65,8 @@ void apply(std::shared_ptr exec, size_type num_blocks, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_JACOBI_APPLY_KERNEL); } // namespace jacobi diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index d957ea2c5be..fe0973a9f21 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -388,7 +388,7 @@ void generate(syn::value_list, remove_complex*, precision_reduction*, const IndexType*, \ size_type) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( DECLARE_JACOBI_GENERATE_INSTANTIATION); diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp index 62ff7fdbb51..826509be1df 100644 --- a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp @@ -61,7 +61,7 @@ void generate(std::shared_ptr exec, block_pointers.get_const_data(), num_blocks); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_GENERATE_KERNEL); diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 886f96e88e3..63449ba5b4b 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ 
-389,7 +389,7 @@ void find_blocks(std::shared_ptr exec, exec, max_block_size, num_natural_blocks, block_pointers.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL); @@ -452,7 +452,7 @@ void transpose_jacobi( storage_scheme, out_blocks.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL); @@ -476,7 +476,7 @@ void conj_transpose_jacobi( storage_scheme, out_blocks.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); @@ -489,7 +489,7 @@ void convert_to_dense( storage_scheme, ValueType* result_values, size_type result_stride) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp index c088ae8e986..8eafc3af69d 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp @@ -190,7 +190,7 @@ void apply(syn::value_list, const preconditioner::block_interleaved_storage_scheme&, \ const ValueType*, size_type, ValueType*, size_type) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION); diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp index 25701c6dc55..3d6ebe76226 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp @@ -61,7 +61,7 @@ void simple_apply( } } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); diff --git a/dpcpp/preconditioner/sor_kernels.dp.cpp b/dpcpp/preconditioner/sor_kernels.dp.cpp index 4af676288bd..aed20ab8c8a 100644 --- a/dpcpp/preconditioner/sor_kernels.dp.cpp +++ b/dpcpp/preconditioner/sor_kernels.dp.cpp @@ -50,7 +50,7 @@ void initialize_weighted_l( }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L); @@ -100,7 +100,7 @@ void initialize_weighted_l_u( }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U); diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp index b8950ed2d2a..36179402262 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp @@ -62,7 +62,7 @@ class Jacobi : public ::testing::Test { if (condition_numbers.size() == 0) { mtx = gko::test::generate_random_matrix( dim, dim, std::uniform_int_distribution<>(min_nnz, max_nnz), - std::normal_distribution(0.0, 1.0), engine, ref); + std::normal_distribution<>(0.0, 1.0), engine, ref); } else { std::vector blocks; for (gko::size_type i = 0; i < block_pointers.size() - 1; ++i) { @@ -70,8 +70,7 @@ class Jacobi : public ::testing::Test { begin(block_pointers)[i + 1] - begin(block_pointers)[i]; const auto cond = begin(condition_numbers)[i]; blocks.push_back(mtx_data::cond( - size, cond, std::normal_distribution(-1, 1), - engine)); + size, cond, std::normal_distribution<>(-1, 1), engine)); } mtx = Mtx::create(ref); mtx->read(mtx_data::diag(begin(blocks), end(blocks))); @@ -107,11 +106,11 @@ class Jacobi : public ::testing::Test { } b = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - 
std::normal_distribution(0.0, 1.0), engine, ref); + std::normal_distribution<>(0.0, 1.0), engine, ref); d_b = gko::clone(dpcpp, b); x = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - std::normal_distribution(0.0, 1.0), engine, ref); + std::normal_distribution<>(0.0, 1.0), engine, ref); d_x = gko::clone(dpcpp, x); } @@ -409,7 +408,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) smtx->copy_from(dense_smtx); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution(0.0, 1.0), engine, ref)); + std::normal_distribution<>(0.0, 1.0), engine, ref)); auto sx = Vec::create(ref, sb->get_size()); auto d_smtx = gko::share(Mtx::create(dpcpp)); @@ -453,7 +452,7 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) auto dense_data = gko::test::generate_random_matrix_data( dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution(1.0, 2.0), engine); + std::normal_distribution<>(1.0, 2.0), engine); gko::utils::make_diag_dominant(dense_data); auto dense_smtx = gko::share(Vec::create(ref)); dense_smtx->read(dense_data); @@ -461,12 +460,12 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) smtx->copy_from(dense_smtx); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution(0.0, 1.0), engine, ref, - gko::dim<2>(dim, 3), 4)); + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); auto sx = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution(0.0, 1.0), engine, ref, - gko::dim<2>(dim, 3), 4)); + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); auto d_smtx = gko::share(gko::clone(dpcpp, smtx)); auto d_sb = gko::share(gko::clone(dpcpp, sb)); diff --git a/include/ginkgo/core/preconditioner/ic.hpp 
b/include/ginkgo/core/preconditioner/ic.hpp index aea43af3cf1..9260bfbb891 100644 --- a/include/ginkgo/core/preconditioner/ic.hpp +++ b/include/ginkgo/core/preconditioner/ic.hpp @@ -441,7 +441,7 @@ class Ic : public EnableLinOp>, public Transposable { generate_default_solver(const std::shared_ptr& exec, const std::shared_ptr& mtx) { - constexpr gko::remove_complex default_reduce_residual{1e-4}; + const gko::remove_complex default_reduce_residual{1e-4}; const unsigned int default_max_iters{ static_cast(mtx->get_size()[0])}; diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp index 1f4be3e3046..98aa3ce70c1 100644 --- a/include/ginkgo/core/preconditioner/ilu.hpp +++ b/include/ginkgo/core/preconditioner/ilu.hpp @@ -498,7 +498,8 @@ class Ilu : public EnableLinOp< generate_default_solver(const std::shared_ptr& exec, const std::shared_ptr& mtx) { - constexpr gko::remove_complex default_reduce_residual{1e-4}; + // half can not use constexpr constructor + const gko::remove_complex default_reduce_residual{1e-4}; const unsigned int default_max_iters{ static_cast(mtx->get_size()[0])}; diff --git a/omp/preconditioner/isai_kernels.cpp b/omp/preconditioner/isai_kernels.cpp index 6f2fe4838d9..61a2193a2b3 100644 --- a/omp/preconditioner/isai_kernels.cpp +++ b/omp/preconditioner/isai_kernels.cpp @@ -230,7 +230,7 @@ void generate_tri_inverse(std::shared_ptr exec, trs_solve, true); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL); @@ -324,7 +324,7 @@ void generate_general_inverse(std::shared_ptr exec, general_solve, false); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL); @@ -388,7 +388,7 @@ void generate_excess_system(std::shared_ptr, e_row_ptrs[e_dim] = excess_nz_ptrs[e_end] - excess_nz_ptrs[e_start]; } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL); @@ -415,7 +415,7 @@ void scale_excess_solution(std::shared_ptr, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL); @@ -441,7 +441,7 @@ void scatter_excess_solution(std::shared_ptr, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); diff --git a/omp/preconditioner/jacobi_kernels.cpp b/omp/preconditioner/jacobi_kernels.cpp index 76224f97a2f..ee51f7adb40 100644 --- a/omp/preconditioner/jacobi_kernels.cpp +++ b/omp/preconditioner/jacobi_kernels.cpp @@ -132,7 +132,7 @@ void find_blocks(std::shared_ptr exec, block_pointers.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL); @@ -436,7 +436,7 @@ void generate(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_GENERATE_KERNEL); @@ -514,7 +514,8 @@ void apply(std::shared_ptr exec, size_type num_blocks, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_JACOBI_APPLY_KERNEL); template @@ -548,7 +549,7 @@ void simple_apply( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); @@ -585,7 +586,7 @@ void transpose_jacobi( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL); @@ -622,7 +623,7 @@ void conj_transpose_jacobi( } } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); @@ -661,7 +662,7 @@ void convert_to_dense( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); diff --git a/omp/preconditioner/sor_kernels.cpp b/omp/preconditioner/sor_kernels.cpp index 509946ac15a..670277b6ebd 100644 --- a/omp/preconditioner/sor_kernels.cpp +++ b/omp/preconditioner/sor_kernels.cpp @@ -29,7 +29,7 @@ void initialize_weighted_l( [](auto val) { return val; })); }; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L); @@ -57,7 +57,7 @@ void initialize_weighted_l_u( })); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U); diff --git a/reference/preconditioner/isai_kernels.cpp b/reference/preconditioner/isai_kernels.cpp index 55f56b5705e..6114d3d8e3c 100644 --- a/reference/preconditioner/isai_kernels.cpp +++ b/reference/preconditioner/isai_kernels.cpp @@ -219,7 +219,7 @@ void generate_tri_inverse(std::shared_ptr exec, trs_solve, true); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL); @@ -314,7 +314,7 @@ void generate_general_inverse(std::shared_ptr exec, general_solve, false); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL); @@ -377,7 +377,7 @@ void generate_excess_system(std::shared_ptr, e_row_ptrs[e_dim] = excess_nz_ptrs[e_end] - excess_nz_ptrs[e_start]; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( 
GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL); @@ -405,7 +405,7 @@ void scale_excess_solution(std::shared_ptr, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL); @@ -430,7 +430,7 @@ void scatter_excess_solution(std::shared_ptr, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); diff --git a/reference/preconditioner/jacobi_kernels.cpp b/reference/preconditioner/jacobi_kernels.cpp index 4eaf0988a00..52e3666ca30 100644 --- a/reference/preconditioner/jacobi_kernels.cpp +++ b/reference/preconditioner/jacobi_kernels.cpp @@ -116,7 +116,7 @@ void find_blocks(std::shared_ptr exec, block_pointers.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL); @@ -406,7 +406,7 @@ void generate(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_GENERATE_KERNEL); @@ -494,7 +494,8 @@ void apply(std::shared_ptr exec, size_type num_blocks, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_JACOBI_APPLY_KERNEL); template @@ -527,7 +528,7 @@ void simple_apply( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); @@ -547,7 +548,8 @@ void scalar_apply(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); template @@ -563,7 +565,7 @@ void simple_scalar_apply(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); @@ -576,7 +578,8 @@ void scalar_conj(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_JACOBI_SCALAR_CONJ_KERNEL); template @@ -591,7 +594,8 @@ void invert_diagonal(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( + GKO_DECLARE_JACOBI_INVERT_DIAGONAL_KERNEL); template @@ -626,7 +630,7 @@ void transpose_jacobi( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL); @@ -662,7 +666,7 @@ void conj_transpose_jacobi( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); @@ -682,7 +686,7 @@ void scalar_convert_to_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_SCALAR_CONVERT_TO_DENSE_KERNEL); @@ -720,7 +724,7 @@ void convert_to_dense( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); diff --git a/reference/preconditioner/sor_kernels.cpp b/reference/preconditioner/sor_kernels.cpp index 88ac422dd02..b5ada476f13 100644 --- a/reference/preconditioner/sor_kernels.cpp +++ b/reference/preconditioner/sor_kernels.cpp @@ -32,7 +32,7 @@ void initialize_weighted_l( [](auto val) { return val; })); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L); @@ -60,7 +60,7 @@ void initialize_weighted_l_u( })); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U); diff --git a/reference/test/preconditioner/gauss_seidel.cpp b/reference/test/preconditioner/gauss_seidel.cpp index 2b67b665d77..53db7f0781e 100644 --- a/reference/test/preconditioner/gauss_seidel.cpp +++ b/reference/test/preconditioner/gauss_seidel.cpp @@ -47,7 +47,7 @@ class GaussSeidel : public ::testing::Test { std::shared_ptr mtx = csr_type::create(exec); }; -TYPED_TEST_SUITE(GaussSeidel, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(GaussSeidel, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); diff --git a/reference/test/preconditioner/ic.cpp b/reference/test/preconditioner/ic.cpp index 16ffc8d7b3c..aabd6c64d73 100644 --- a/reference/test/preconditioner/ic.cpp +++ b/reference/test/preconditioner/ic.cpp @@ -67,7 +67,8 @@ class Ic : public ::testing::Test { gko::remove_complex tol; }; -TYPED_TEST_SUITE(Ic, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Ic, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Ic, BuildsTwoFactorComposition) @@ -245,7 +246,7 @@ TYPED_TEST(Ic, SolvesSingleRhsMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; using T = typename TestFixture::value_type; - using Vec = gko::matrix::Dense>; + using Vec = gko::matrix::Dense>; const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); auto x = Vec::create(this->exec, gko::dim<2>{3, 1}); auto preconditioner = @@ -278,8 +279,8 @@ TYPED_TEST(Ic, SolvesSingleRhsComplex) TYPED_TEST(Ic, SolvesSingleRhsComplexMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; - using Vec = gko::matrix::Dense< - gko::next_precision>>; + using Vec = gko::matrix::Dense>>; using T = typename Vec::value_type; const auto b = gko::initialize( {T{1.0, 2.0}, T{3.0, 6.0}, T{6.0, 12.0}}, this->exec); @@ -315,7 +316,7 @@ TYPED_TEST(Ic, AdvancedSolvesSingleRhsMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; using 
T = typename TestFixture::value_type; - using Vec = gko::matrix::Dense>; + using Vec = gko::matrix::Dense>; const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); const auto alpha = gko::initialize({2.0}, this->exec); const auto beta = gko::initialize({-1.0}, this->exec); @@ -355,7 +356,7 @@ TYPED_TEST(Ic, AdvancedSolvesSingleRhsComplexMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; using MixedDense = gko::matrix::Dense< - gko::next_precision>; + gko::next_precision_with_half>; using MixedDenseComplex = gko::to_complex; using T = typename MixedDenseComplex::value_type; const auto b = gko::initialize( diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp index 180b92be9ec..e4c4809f084 100644 --- a/reference/test/preconditioner/ilu.cpp +++ b/reference/test/preconditioner/ilu.cpp @@ -84,7 +84,7 @@ class Ilu : public ::testing::Test { std::shared_ptr ilu_rev_pre_factory; }; -TYPED_TEST_SUITE(Ilu, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Ilu, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Ilu, BuildsDefaultWithoutThrowing) @@ -316,7 +316,7 @@ TYPED_TEST(Ilu, SolvesSingleRhsWithMtx) TYPED_TEST(Ilu, SolvesSingleRhsWithMixedMtx) { using Mtx = gko::matrix::Dense< - gko::next_precision>; + gko::next_precision_with_half>; const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); auto x = Mtx::create(this->exec, gko::dim<2>{3, 1}); x->copy_from(b); @@ -349,8 +349,8 @@ TYPED_TEST(Ilu, SolvesSingleRhsWithComplexMtx) TYPED_TEST(Ilu, SolvesSingleRhsWithMixedComplexMtx) { - using Mtx = gko::matrix::Dense< - gko::to_complex>>; + using Mtx = gko::matrix::Dense>>; using T = typename Mtx::value_type; const auto b = gko::initialize( {T{1.0, 2.0}, T{3.0, 6.0}, T{6.0, 12.0}}, this->exec); @@ -403,7 +403,8 @@ TYPED_TEST(Ilu, SolvesAdvancedSingleRhs) TYPED_TEST(Ilu, SolvesAdvancedSingleRhsMixed) { - using value_type = gko::next_precision; + using value_type = + 
gko::next_precision_with_half; using Mtx = gko::matrix::Dense; const value_type alpha{2.0}; const auto alpha_linop = gko::initialize({alpha}, this->exec); @@ -453,7 +454,8 @@ TYPED_TEST(Ilu, SolvesAdvancedSingleRhsComplex) TYPED_TEST(Ilu, SolvesAdvancedSingleRhsMixedComplex) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::to_complex; diff --git a/reference/test/preconditioner/isai_kernels.cpp b/reference/test/preconditioner/isai_kernels.cpp index e989125c61d..f55d7e12b87 100644 --- a/reference/test/preconditioner/isai_kernels.cpp +++ b/reference/test/preconditioner/isai_kernels.cpp @@ -186,8 +186,20 @@ class Isai : public ::testing::Test { { lower_isai_factory = LowerIsai::build().on(exec); upper_isai_factory = UpperIsai::build().on(exec); - general_isai_factory = GeneralIsai::build().on(exec); - spd_isai_factory = SpdIsai::build().on(exec); + if (std::is_same_v, gko::half>) { + general_isai_factory = + GeneralIsai::build() + .with_excess_solver_reduction( + gko::remove_complex{1e-3}) + .on(exec); + spd_isai_factory = SpdIsai::build() + .with_excess_solver_reduction( + gko::remove_complex{1e-3}) + .on(exec); + } else { + general_isai_factory = GeneralIsai::build().on(exec); + spd_isai_factory = SpdIsai::build().on(exec); + } a_dense->convert_to(a_csr); a_dense_inv->convert_to(a_csr_inv); l_dense->convert_to(l_csr); @@ -310,7 +322,8 @@ class Isai : public ::testing::Test { std::shared_ptr spd_sparse_inv; }; -TYPED_TEST_SUITE(Isai, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Isai, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Isai, KernelGenerateA) @@ -1021,6 +1034,9 @@ TYPED_TEST(Isai, ReturnsCorrectInverseALongrowWithExcessSolver) { using value_type = typename TestFixture::value_type; using GeneralIsai = typename TestFixture::GeneralIsai; + // When 
using the other precision, we already need to drastically reduce the + // precision, so it is hard to work with half. + SKIP_IF_HALF(value_type); auto general_isai_factory = GeneralIsai::build() .with_excess_solver_factory(this->excess_solver_factory) @@ -1068,6 +1084,9 @@ TYPED_TEST(Isai, ReturnsCorrectInverseLLongrowWithExcessSolver) using Csr = typename TestFixture::Csr; using LowerIsai = typename TestFixture::LowerIsai; using value_type = typename TestFixture::value_type; + // When using the other precision, we already need to drastically reduce the + // precision, so it is hard to work with half. + SKIP_IF_HALF(value_type); auto lower_isai_factory = LowerIsai::build() .with_excess_solver_factory(this->excess_solver_factory) @@ -1115,6 +1134,9 @@ TYPED_TEST(Isai, ReturnsCorrectInverseULongrowWithExcessSolver) using Csr = typename TestFixture::Csr; using UpperIsai = typename TestFixture::UpperIsai; using value_type = typename TestFixture::value_type; + // When using the other precision, we already need to drastically reduce the + // precision, so it is hard to work with half. + SKIP_IF_HALF(value_type); auto upper_isai_factory = UpperIsai::build() .with_excess_solver_factory(this->excess_solver_factory) @@ -1228,8 +1250,8 @@ TYPED_TEST(Isai, ReturnsCorrectInverseSpdLongrow) // need to reduce precision due to spd ISAI using GMRES instead of // direct solve GKO_ASSERT_MTX_NEAR(lower, this->spd_csr_longrow_inv, - 10 * r::value); - GKO_ASSERT_MTX_NEAR(lower_t, expected_transpose, 10 * r::value); + 30 * r::value); + GKO_ASSERT_MTX_NEAR(lower_t, expected_transpose, 30 * r::value); } @@ -1238,6 +1260,9 @@ TYPED_TEST(Isai, ReturnsCorrectInverseSpdLongrowWithExcessSolver) using Csr = typename TestFixture::Csr; using SpdIsai = typename TestFixture::SpdIsai; using value_type = typename TestFixture::value_type; + // When using the other precision, we already need to drastically reduce the + // precision, so it is hard to work with half. 
+ SKIP_IF_HALF(value_type); const auto expected_transpose = gko::as(this->spd_csr_longrow_inv->transpose()); auto spd_isai_factory = diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp index 79c276579ad..1bc0aa37470 100644 --- a/reference/test/preconditioner/jacobi.cpp +++ b/reference/test/preconditioner/jacobi.cpp @@ -144,7 +144,8 @@ class Jacobi : public ::testing::Test { std::unique_ptr adaptive_bj; }; -TYPED_TEST_SUITE(Jacobi, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Jacobi, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Jacobi, GeneratesCorrectStorageScheme) @@ -477,7 +478,7 @@ TYPED_TEST(Jacobi, ScalarJacobiGeneratesOnDifferentPrecision) { using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - using next_type = gko::next_precision; + using next_type = gko::next_precision_with_half; using Bj = typename TestFixture::Bj; auto csr = gko::share(gko::matrix::Csr::create(this->exec)); diff --git a/reference/test/preconditioner/jacobi_kernels.cpp b/reference/test/preconditioner/jacobi_kernels.cpp index 97d9951be7a..2b75c2a5590 100644 --- a/reference/test/preconditioner/jacobi_kernels.cpp +++ b/reference/test/preconditioner/jacobi_kernels.cpp @@ -86,7 +86,8 @@ class Jacobi : public ::testing::Test { std::shared_ptr> mtx; }; -TYPED_TEST_SUITE(Jacobi, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Jacobi, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Jacobi, CanBeGenerated) @@ -561,11 +562,14 @@ TYPED_TEST(Jacobi, SelectsCorrectBlockPrecisions) auto prec = bj->get_parameters().storage_optimization.block_wise.get_const_data(); - auto precision2 = std::is_same, float>::value - ? 
gko::precision_reduction(0, 0) // float - : gko::precision_reduction(0, 1); // double - EXPECT_EQ(prec[0], gko::precision_reduction(0, 2)); // u * cond = ~1.2e-3 - ASSERT_EQ(prec[1], precision2); // u * cond = ~2.0e-3 + auto precision1 = std::is_same, gko::half>::value + ? gko::precision_reduction(2, 0) + : gko::precision_reduction(0, 2); + auto precision2 = std::is_same, double>::value + ? gko::precision_reduction(0, 1) // double + : gko::precision_reduction(0, 0); // float, half + EXPECT_EQ(prec[0], precision1); // u * cond = ~1.2e-3 + ASSERT_EQ(prec[1], precision2); // u * cond = ~2.0e-3 } @@ -606,6 +610,9 @@ TYPED_TEST(Jacobi, AvoidsPrecisionsThatOverflow) auto precision = std::is_same, float>::value ? gko::precision_reduction(0, 2) // float : gko::precision_reduction(1, 1); // double + if (std::is_same, gko::half>::value) { + precision = gko::precision_reduction(2, 0); + } EXPECT_EQ(prec[0], precision); ASSERT_EQ(prec[1], precision); } @@ -642,7 +649,8 @@ TYPED_TEST(Jacobi, ScalarJacobiAppliesToVector) TYPED_TEST(Jacobi, AppliesToMixedVector) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Vec = gko::matrix::Dense; auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec); auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec); @@ -682,8 +690,8 @@ TYPED_TEST(Jacobi, AppliesToComplexVector) TYPED_TEST(Jacobi, AppliesToMixedComplexVector) { - using value_type = - gko::to_complex>; + using value_type = gko::to_complex< + gko::next_precision_with_half>; using Vec = gko::matrix::Dense; auto x = gko::initialize( {value_type{1.0, 2.0}, value_type{-1.0, -2.0}, value_type{2.0, 4.0}, @@ -888,7 +896,8 @@ TYPED_TEST(Jacobi, ScalarJacobiAppliesLinearCombinationToVector) TYPED_TEST(Jacobi, AppliesLinearCombinationToMixedVector) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using Vec = gko::matrix::Dense; auto x = gko::initialize({1.0, -1.0, 2.0, 
-2.0, 3.0}, this->exec); auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec); @@ -931,7 +940,8 @@ TYPED_TEST(Jacobi, AppliesLinearCombinationToComplexVector) TYPED_TEST(Jacobi, AppliesLinearCombinationToMixedComplexVector) { - using value_type = gko::next_precision; + using value_type = + gko::next_precision_with_half; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::to_complex; using T = gko::to_complex; diff --git a/reference/test/preconditioner/sor_kernels.cpp b/reference/test/preconditioner/sor_kernels.cpp index 18c055aa6d9..cd2fa9af364 100644 --- a/reference/test/preconditioner/sor_kernels.cpp +++ b/reference/test/preconditioner/sor_kernels.cpp @@ -55,7 +55,8 @@ class Sor : public ::testing::Test { exec); }; -TYPED_TEST_SUITE(Sor, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Sor, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Sor, CanInitializeLFactor) diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp index 7b79ba98ad2..8aad93a1efb 100644 --- a/reference/test/solver/multigrid_kernels.cpp +++ b/reference/test/solver/multigrid_kernels.cpp @@ -229,11 +229,13 @@ class Multigrid : public ::testing::Test { using Mtx = gko::matrix::Dense; using Solver = gko::solver::Multigrid; using Coarse = gko::multigrid::Pgm; - using CoarseNext = gko::multigrid::Pgm>; + using CoarseNext = + gko::multigrid::Pgm>; using Smoother = gko::solver::Ir; using InnerSolver = gko::preconditioner::Jacobi; using CoarsestSolver = gko::solver::Cg; - using CoarsestNextSolver = gko::solver::Cg>; + using CoarsestNextSolver = + gko::solver::Cg>; using DummyRPFactory = DummyMultigridLevelWithFactory; using DummyFactory = DummyLinOpWithFactory; Multigrid() From 5f3eadea9bf62157d5309e754f7017f061910428 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Fri, 25 Oct 2024 19:02:11 +0200 Subject: [PATCH 37/69] preconditioner config dispatch --- core/config/preconditioner_config.cpp | 30 +++++++++++++++------------ 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/core/config/preconditioner_config.cpp b/core/config/preconditioner_config.cpp index 68cbf8595ba..a5669902d00 100644 --- a/core/config/preconditioner_config.cpp +++ b/core/config/preconditioner_config.cpp @@ -117,24 +117,28 @@ deferred_factory_parameter parse( return dispatch::Configurator>( config, context, updated, - make_type_selector(updated.get_value_typestr(), value_type_list()), + make_type_selector(updated.get_value_typestr(), + value_type_list_with_half()), make_type_selector(updated.get_index_typestr(), index_type_list())); } else if (str == "solver::Ir") { return dispatch::Configurator>( config, context, updated, - make_type_selector(updated.get_value_typestr(), value_type_list()), + make_type_selector(updated.get_value_typestr(), + value_type_list_with_half()), make_type_selector(updated.get_index_typestr(), index_type_list())); } else if (str == "preconditioner::LowerIsai") { return dispatch::Configurator>( config, context, updated, - make_type_selector(updated.get_value_typestr(), value_type_list()), + make_type_selector(updated.get_value_typestr(), + value_type_list_with_half()), make_type_selector(updated.get_index_typestr(), index_type_list())); } else if (str == "solver::Gmres") { return dispatch::Configurator>( config, context, updated, - make_type_selector(updated.get_value_typestr(), value_type_list()), + make_type_selector(updated.get_value_typestr(), + value_type_list_with_half()), make_type_selector(updated.get_index_typestr(), index_type_list())); } else { GKO_INVALID_CONFIG_VALUE("l_solver_type", str); @@ -194,7 +198,7 @@ deferred_factory_parameter parse( ReverseApply::value>::template Configurator>( config, context, updated, make_type_selector(updated.get_value_typestr(), - value_type_list()), + 
value_type_list_with_half()), make_type_selector(updated.get_index_typestr(), index_type_list())); } else if (str == "solver::Ir") { @@ -204,7 +208,7 @@ deferred_factory_parameter parse( ReverseApply::value>::template Configurator>( config, context, updated, make_type_selector(updated.get_value_typestr(), - value_type_list()), + value_type_list_with_half()), make_type_selector(updated.get_index_typestr(), index_type_list())); } else if (str == "preconditioner::LowerIsai") { @@ -214,7 +218,7 @@ deferred_factory_parameter parse( ReverseApply::value>::template Configurator>( config, context, updated, make_type_selector(updated.get_value_typestr(), - value_type_list()), + value_type_list_with_half()), make_type_selector(updated.get_index_typestr(), index_type_list())); } else if (str == "solver::Gmres") { @@ -224,7 +228,7 @@ deferred_factory_parameter parse( ReverseApply::value>::template Configurator>( config, context, updated, make_type_selector(updated.get_value_typestr(), - value_type_list()), + value_type_list_with_half()), make_type_selector(updated.get_index_typestr(), index_type_list())); } else { @@ -256,7 +260,7 @@ deferred_factory_parameter parse( IsaiHelper::Configurator>( config, context, updated, make_type_selector(updated.get_value_typestr(), - value_type_list()), + value_type_list_with_half()), make_type_selector(updated.get_index_typestr(), index_type_list())); } else if (str == "upper") { @@ -265,7 +269,7 @@ deferred_factory_parameter parse( IsaiHelper::Configurator>( config, context, updated, make_type_selector(updated.get_value_typestr(), - value_type_list()), + value_type_list_with_half()), make_type_selector(updated.get_index_typestr(), index_type_list())); } else if (str == "general") { @@ -274,7 +278,7 @@ deferred_factory_parameter parse( IsaiHelper::Configurator>( config, context, updated, make_type_selector(updated.get_value_typestr(), - value_type_list()), + value_type_list_with_half()), make_type_selector(updated.get_index_typestr(), 
index_type_list())); } else if (str == "spd") { @@ -283,7 +287,7 @@ deferred_factory_parameter parse( IsaiHelper::Configurator>( config, context, updated, make_type_selector(updated.get_value_typestr(), - value_type_list()), + value_type_list_with_half()), make_type_selector(updated.get_index_typestr(), index_type_list())); } else { @@ -296,7 +300,7 @@ deferred_factory_parameter parse( GKO_PARSE_VALUE_AND_INDEX_TYPE(GaussSeidel, gko::preconditioner::GaussSeidel); -GKO_PARSE_VALUE_AND_INDEX_TYPE(Jacobi, gko::preconditioner::Jacobi); +GKO_PARSE_VALUE_AND_INDEX_TYPE_WITH_HALF(Jacobi, gko::preconditioner::Jacobi); GKO_PARSE_VALUE_AND_INDEX_TYPE(Sor, gko::preconditioner::Sor); From 078897144c08dcff150627f8ad403d18a2146d9c Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Fri, 25 Oct 2024 19:20:59 +0200 Subject: [PATCH 38/69] reorder with half --- core/reorder/mc64.cpp | 32 ++++++++++----------- core/reorder/rcm.cpp | 2 +- core/reorder/scaled_reordered.cpp | 3 +- core/test/reorder/amd.cpp | 3 +- reference/test/reorder/mc64.cpp | 7 +++-- reference/test/reorder/mc64_kernels.cpp | 27 +++++++++++++---- reference/test/reorder/rcm.cpp | 3 +- reference/test/reorder/scaled_reordered.cpp | 26 +++++++++++++---- test/reorder/amd.cpp | 3 +- 9 files changed, 72 insertions(+), 34 deletions(-) diff --git a/core/reorder/mc64.cpp b/core/reorder/mc64.cpp index 97dd37b90fc..1319dea252a 100644 --- a/core/reorder/mc64.cpp +++ b/core/reorder/mc64.cpp @@ -37,8 +37,7 @@ void initialize_weights(const matrix::Csr* host_mtx, array>& row_maxima_array, gko::experimental::reorder::mc64_strategy strategy) { - constexpr auto inf = - std::numeric_limits>::infinity(); + const auto inf = std::numeric_limits>::infinity(); const auto num_rows = host_mtx->get_size()[0]; const auto row_ptrs = host_mtx->get_const_row_ptrs(); const auto col_idxs = host_mtx->get_const_col_idxs(); @@ -67,11 +66,13 @@ void initialize_weights(const matrix::Csr* host_mtx, } } }; - if (strategy == - 
gko::experimental::reorder::mc64_strategy::max_diagonal_sum) { - run_computation([](ValueType a) { return abs(a); }); + if (strategy == mc64_strategy::max_diagonal_sum) { + run_computation( + [](ValueType a) -> remove_complex { return abs(a); }); } else { - run_computation([](ValueType a) { return std::log2(abs(a)); }); + run_computation([](ValueType a) -> remove_complex { + return std::log2(abs(a)); + }); } } @@ -179,7 +180,7 @@ void shortest_augmenting_path( addressable_priority_queue& queue, std::vector& q_j, ValueType tolerance) { - constexpr auto inf = std::numeric_limits::infinity(); + const auto inf = std::numeric_limits::infinity(); auto weights = weights_array.get_data(); auto dual_u = dual_u_array.get_data(); auto distance = distance_array.get_data(); @@ -433,8 +434,7 @@ void compute_scaling(const matrix::Csr* host_mtx, mc64_strategy strategy, ValueType* row_scaling, ValueType* col_scaling) { - constexpr auto inf = - std::numeric_limits>::infinity(); + const auto inf = std::numeric_limits>::infinity(); const auto num_rows = host_mtx->get_size()[0]; const auto weights = weights_array.get_const_data(); const auto dual_u = dual_u_array.get_const_data(); @@ -459,13 +459,14 @@ void compute_scaling(const matrix::Csr* host_mtx, } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_MC64_INITIALIZE_WEIGHTS); -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_MC64_INITIAL_MATCHING); -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_WITH_HALF( GKO_DECLARE_MC64_SHORTEST_AUGMENTING_PATH); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_MC64_COMPUTE_SCALING); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_MC64_COMPUTE_SCALING); } // namespace mc64 @@ -538,8 +539,7 @@ std::unique_ptr Mc64::generate_impl( 
marked_cols.fill(0); matched_idxs.fill(0); unmatched_rows.fill(0); - constexpr auto inf = - std::numeric_limits>::infinity(); + const auto inf = std::numeric_limits>::infinity(); dual_u.fill(inf); distance.fill(inf); @@ -588,7 +588,7 @@ std::unique_ptr Mc64::generate_impl( #define GKO_DECLARE_MC64(ValueType, IndexType) class Mc64 -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_MC64); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_MC64); } // namespace reorder diff --git a/core/reorder/rcm.cpp b/core/reorder/rcm.cpp index 1acf4d97f1f..0d2bae4d7dc 100644 --- a/core/reorder/rcm.cpp +++ b/core/reorder/rcm.cpp @@ -114,7 +114,7 @@ Rcm::Rcm(const Factory* factory, #define GKO_DECLARE_RCM(ValueType, IndexType) class Rcm -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_RCM); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(GKO_DECLARE_RCM); } // namespace reorder diff --git a/core/reorder/scaled_reordered.cpp b/core/reorder/scaled_reordered.cpp index 264122c0b8f..210e513841b 100644 --- a/core/reorder/scaled_reordered.cpp +++ b/core/reorder/scaled_reordered.cpp @@ -84,7 +84,8 @@ void ScaledReordered::apply_impl(const LinOp* alpha, #define GKO_DECLARE_SCALED_REORDERED(ValueType, IndexType) \ class ScaledReordered -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_REORDERED); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF( + GKO_DECLARE_SCALED_REORDERED); } // namespace reorder diff --git a/core/test/reorder/amd.cpp b/core/test/reorder/amd.cpp index 9eecf3777e1..b97201e929e 100644 --- a/core/test/reorder/amd.cpp +++ b/core/test/reorder/amd.cpp @@ -177,7 +177,8 @@ class Amd : public ::testing::Test { std::shared_ptr> amd; }; -TYPED_TEST_SUITE(Amd, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Amd, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Amd, WorksAndReducesFillIn) diff --git a/reference/test/reorder/mc64.cpp 
b/reference/test/reorder/mc64.cpp index 2c64538e9b2..028c093c5f3 100644 --- a/reference/test/reorder/mc64.cpp +++ b/reference/test/reorder/mc64.cpp @@ -70,7 +70,8 @@ class Mc64 : public ::testing::Test { std::unique_ptr mc64_factory; }; -TYPED_TEST_SUITE(Mc64, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Mc64, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Mc64, HasSensibleDefaults) @@ -86,11 +87,13 @@ TYPED_TEST(Mc64, HasSensibleDefaults) TYPED_TEST(Mc64, CanBeCreatedWithReorderingStrategy) { using reorder_type = typename TestFixture::reorder_type; + using real_type = typename TestFixture::real_type; auto mc64 = reorder_type::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_sum) + .with_tolerance(real_type{1e-4}) .on(this->exec) ->generate(this->not_id3_mtx); @@ -123,7 +126,7 @@ TYPED_TEST(Mc64, CanBeCreatedWithTolerance) using real_type = typename TestFixture::real_type; auto mc64 = reorder_type::build() - .with_tolerance(real_type{1e-10}) + .with_tolerance(real_type{1e-4}) .on(this->exec) ->generate(this->id3_mtx); diff --git a/reference/test/reorder/mc64_kernels.cpp b/reference/test/reorder/mc64_kernels.cpp index fb20d4af2c8..81dd1aa59a1 100644 --- a/reference/test/reorder/mc64_kernels.cpp +++ b/reference/test/reorder/mc64_kernels.cpp @@ -12,6 +12,7 @@ #include +#include #include #include #include @@ -118,7 +119,7 @@ class Mc64 : public ::testing::Test { {0., 0., 0., 4., 2., 0.}, {0., 5., 8., 0., 0., 0.}}, ref)), - zero_tol{1e-14} + zero_tol{1e-4} {} std::pair, @@ -134,8 +135,8 @@ class Mc64 : public ::testing::Test { { ASSERT_EQ(a.get_size(), b.get_size()); for (gko::size_type i = 0; i < a.get_size(); i++) { - if (std::isfinite(a.get_const_data()[i]) || - std::isfinite(b.get_const_data()[i])) { + if (gko::is_finite(a.get_const_data()[i]) || + gko::is_finite(b.get_const_data()[i])) { ASSERT_NEAR(a.get_const_data()[i], b.get_const_data()[i], r::value) << name << '[' << i << 
']'; @@ -180,7 +181,8 @@ class Mc64 : public ::testing::Test { const real_type zero_tol; }; -TYPED_TEST_SUITE(Mc64, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Mc64, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Mc64, InitializeWeightsSum) @@ -284,6 +286,7 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingExampleSum) gko::experimental::reorder::Mc64::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_sum) + .with_tolerance(real_type{1e-4}) .on(this->ref); auto mc64 = mc64_factory->generate(this->mtx); @@ -303,10 +306,12 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingExampleProduct) { using index_type = typename TestFixture::index_type; using value_type = typename TestFixture::value_type; + using real_type = typename TestFixture::real_type; auto mc64_factory = gko::experimental::reorder::Mc64::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_product) + .with_tolerance(real_type{1e-4}) .on(this->ref); auto mc64 = mc64_factory->generate(this->mtx); @@ -344,6 +349,12 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeTrivialExampleProduct) using value_type = typename TestFixture::value_type; using matrix_type = typename TestFixture::matrix_type; using perm_type = typename TestFixture::perm_type; + // A few scaling factors are zero, which gives (inf, -nan) in inv_scaling + // for complex values. Depending on the compiler and optimization level, + // value / (inf, -nan) gives (0, 0), which can pass the test under the + // threshold, or (nan, nan), which fails. We disable not only complex + // but also half, because it relies on value / inf in half precision. 
+ SKIP_IF_HALF(value_type); // read input data std::ifstream mtx_stream{gko::matrices::location_1138_bus_mtx}; auto mtx = gko::share(gko::read(mtx_stream, this->ref)); @@ -354,6 +365,7 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeTrivialExampleProduct) gko::experimental::reorder::Mc64::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_product) + .with_tolerance(real_type{1e-4}) .on(this->ref); auto mc64 = mc64_factory->generate(mtx); // get components @@ -362,7 +374,7 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeTrivialExampleProduct) mtx = mtx->scale_permute(row_perm, col_perm); - GKO_ASSERT_MTX_NEAR(mtx, expected_result, r::value); + GKO_ASSERT_MTX_NEAR(mtx, expected_result, 20 * r::value); } @@ -373,6 +385,11 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeExampleProduct) using value_type = typename TestFixture::value_type; using matrix_type = typename TestFixture::matrix_type; using perm_type = typename TestFixture::perm_type; + // Some values are so small that log2(abs(v)) -> -inf, and some values + // are out of half-precision range -> inf. This leaves some permutation + // values as invalid_index after the kernel, so that scale_permute causes + // a segmentation fault. 
+ SKIP_IF_HALF(value_type); // read input data std::ifstream mtx_stream{gko::matrices::location_nontrivial_mc64_example}; auto mtx = gko::share(gko::read(mtx_stream, this->ref)); diff --git a/reference/test/reorder/rcm.cpp b/reference/test/reorder/rcm.cpp index ec547c141e3..ae63ca504bb 100644 --- a/reference/test/reorder/rcm.cpp +++ b/reference/test/reorder/rcm.cpp @@ -54,7 +54,8 @@ class Rcm : public ::testing::Test { std::unique_ptr reorder_op; }; -TYPED_TEST_SUITE(Rcm, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Rcm, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Rcm, CanBeCleared) diff --git a/reference/test/reorder/scaled_reordered.cpp b/reference/test/reorder/scaled_reordered.cpp index 75ab3728a30..b9924cd9418 100644 --- a/reference/test/reorder/scaled_reordered.cpp +++ b/reference/test/reorder/scaled_reordered.cpp @@ -132,7 +132,7 @@ class ScaledReordered : public ::testing::Test { gko::remove_complex tol; }; -TYPED_TEST_SUITE(ScaledReordered, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(ScaledReordered, gko::test::ValueIndexTypesWithHalf, PairTypenameNameGenerator); @@ -364,6 +364,9 @@ TYPED_TEST(ScaledReordered, AppliesWithRcmReordering) TYPED_TEST(ScaledReordered, SolvesSingleRhsWithOnlyInnerOperator) { using SR = typename TestFixture::SR; + using value_type = typename TestFixture::value_type; + // Need to solve them with scaling when using half + SKIP_IF_HALF(value_type); auto scaled_reordered_fact = SR::build().with_inner_operator(this->solver_factory).on(this->exec); auto scaled_reordered = scaled_reordered_fact->generate(this->rcm_mtx); @@ -410,6 +413,9 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithColScaling) TYPED_TEST(ScaledReordered, SolvesSingleRhsWithRcmReordering) { using SR = typename TestFixture::SR; + using value_type = typename TestFixture::value_type; + // Need to solve them with scaling when using half + SKIP_IF_HALF(value_type); auto scaled_reordered_fact = SR::build() 
.with_reordering(this->rcm_factory) .with_inner_operator(this->solver_factory) @@ -445,7 +451,8 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithScalingAndRcmReorderingMixed) { using SR = typename TestFixture::SR; using T = typename TestFixture::value_type; - using Vec = gko::matrix::Dense>; + using OtherT = gko::next_precision_with_half; + using Vec = gko::matrix::Dense; auto scaled_reordered_fact = SR::build() .with_row_scaling(this->diag2) .with_col_scaling(this->diag3) @@ -459,7 +466,10 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithScalingAndRcmReorderingMixed) scaled_reordered->apply(b, res); - GKO_ASSERT_MTX_NEAR(res, x, 1e-5); + auto tol = std::max(static_cast(r::value), + static_cast(r::value)) * + 15; + GKO_ASSERT_MTX_NEAR(res, x, tol); } @@ -467,6 +477,7 @@ TYPED_TEST(ScaledReordered, AdvancedSolvesSingleRhsWithScalingAndRcmReordering) { using SR = typename TestFixture::SR; using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; const auto alpha = gko::initialize({2.0}, this->exec); const auto beta = gko::initialize({-1.0}, this->exec); auto scaled_reordered_fact = SR::build() @@ -489,8 +500,8 @@ TYPED_TEST(ScaledReordered, { using SR = typename TestFixture::SR; using T = typename TestFixture::value_type; - using value_type = gko::next_precision; - using Vec = gko::matrix::Dense; + using OtherT = gko::next_precision_with_half; + using Vec = gko::matrix::Dense; auto scaled_reordered_fact = SR::build() .with_row_scaling(this->diag2) .with_col_scaling(this->diag3) @@ -506,7 +517,10 @@ TYPED_TEST(ScaledReordered, scaled_reordered->apply(alpha, b, beta, res); - GKO_ASSERT_MTX_NEAR(res, l({-8.3, -12.5, -5.9, -2., 2.9}), 1e-5); + auto tol = std::max(static_cast(r::value), + static_cast(r::value)) * + 15; + GKO_ASSERT_MTX_NEAR(res, l({-8.3, -12.5, -5.9, -2., 2.9}), tol); } diff --git a/test/reorder/amd.cpp b/test/reorder/amd.cpp index a1ca7c09359..f5a17e943e1 100644 --- a/test/reorder/amd.cpp +++ b/test/reorder/amd.cpp @@ -40,7 
+40,8 @@ class Amd : public CommonTestFixture { std::shared_ptr dmtx; }; -TYPED_TEST_SUITE(Amd, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST_SUITE(Amd, gko::test::ValueIndexTypesWithHalf, + PairTypenameNameGenerator); TYPED_TEST(Amd, IsEquivalentToRef) From 9b80b994aa113c7f643a533993039cc2d8c216fa Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 12 Nov 2024 16:32:21 +0100 Subject: [PATCH 39/69] change the default mc64 tolerance respect to precision Co-authored-by: Marcel Koch --- include/ginkgo/core/reorder/mc64.hpp | 6 ++++-- reference/test/reorder/mc64.cpp | 10 ++++------ reference/test/reorder/mc64_kernels.cpp | 5 +---- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/include/ginkgo/core/reorder/mc64.hpp b/include/ginkgo/core/reorder/mc64.hpp index b2c1fd1a644..82b4f8f5be5 100644 --- a/include/ginkgo/core/reorder/mc64.hpp +++ b/include/ginkgo/core/reorder/mc64.hpp @@ -6,6 +6,7 @@ #define GKO_PUBLIC_CORE_REORDER_MC64_HPP_ +#include #include #include @@ -100,8 +101,9 @@ class Mc64 final * This parameter controls the tolerance below which a weight is * considered to be zero. 
*/ - remove_complex GKO_FACTORY_PARAMETER_SCALAR(tolerance, - 1e-14); + remove_complex GKO_FACTORY_PARAMETER_SCALAR( + tolerance, + 50 * std::numeric_limits>::epsilon()); }; /** diff --git a/reference/test/reorder/mc64.cpp b/reference/test/reorder/mc64.cpp index 028c093c5f3..0670c77f6e2 100644 --- a/reference/test/reorder/mc64.cpp +++ b/reference/test/reorder/mc64.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -80,7 +81,8 @@ TYPED_TEST(Mc64, HasSensibleDefaults) ASSERT_EQ(this->mc64_factory->get_parameters().strategy, gko::experimental::reorder::mc64_strategy::max_diagonal_product); - ASSERT_EQ(this->mc64_factory->get_parameters().tolerance, real_type{1e-14}); + ASSERT_EQ(this->mc64_factory->get_parameters().tolerance, + 50 * std::numeric_limits::epsilon()); } @@ -93,7 +95,6 @@ TYPED_TEST(Mc64, CanBeCreatedWithReorderingStrategy) reorder_type::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_sum) - .with_tolerance(real_type{1e-4}) .on(this->exec) ->generate(this->not_id3_mtx); @@ -125,10 +126,7 @@ TYPED_TEST(Mc64, CanBeCreatedWithTolerance) using reorder_type = typename TestFixture::reorder_type; using real_type = typename TestFixture::real_type; - auto mc64 = reorder_type::build() - .with_tolerance(real_type{1e-4}) - .on(this->exec) - ->generate(this->id3_mtx); + auto mc64 = reorder_type::build().on(this->exec)->generate(this->id3_mtx); this->assert_correct_permutation(mc64.get()); } diff --git a/reference/test/reorder/mc64_kernels.cpp b/reference/test/reorder/mc64_kernels.cpp index 81dd1aa59a1..f31bf7ba658 100644 --- a/reference/test/reorder/mc64_kernels.cpp +++ b/reference/test/reorder/mc64_kernels.cpp @@ -119,7 +119,7 @@ class Mc64 : public ::testing::Test { {0., 0., 0., 4., 2., 0.}, {0., 5., 8., 0., 0., 0.}}, ref)), - zero_tol{1e-4} + zero_tol{50 * std::numeric_limits::epsilon()} {} std::pair, @@ -286,7 +286,6 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingExampleSum) 
gko::experimental::reorder::Mc64::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_sum) - .with_tolerance(real_type{1e-4}) .on(this->ref); auto mc64 = mc64_factory->generate(this->mtx); @@ -311,7 +310,6 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingExampleProduct) gko::experimental::reorder::Mc64::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_product) - .with_tolerance(real_type{1e-4}) .on(this->ref); auto mc64 = mc64_factory->generate(this->mtx); @@ -365,7 +363,6 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeTrivialExampleProduct) gko::experimental::reorder::Mc64::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_product) - .with_tolerance(real_type{1e-4}) .on(this->ref); auto mc64 = mc64_factory->generate(mtx); // get components From 90e0d25457450e938c5ff4ba85367df5c5dd3f89 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 28 Oct 2024 18:46:18 +0100 Subject: [PATCH 40/69] log with half --- core/log/convergence.cpp | 2 +- core/log/papi.cpp | 2 +- core/log/solver_progress.cpp | 8 ++++ core/log/stream.cpp | 2 +- core/test/log/convergence.cpp | 3 +- core/test/log/papi.cpp | 2 +- core/test/log/solver_progress.cpp | 3 +- core/test/log/stream.cpp | 72 +++++++++++++++--------------- reference/test/log/convergence.cpp | 3 +- reference/test/log/papi.cpp | 2 +- 10 files changed, 55 insertions(+), 44 deletions(-) diff --git a/core/log/convergence.cpp b/core/log/convergence.cpp index 7cfa764dfd1..78f004226cb 100644 --- a/core/log/convergence.cpp +++ b/core/log/convergence.cpp @@ -110,7 +110,7 @@ void Convergence::on_iteration_complete( #define GKO_DECLARE_CONVERGENCE(_type) class Convergence<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONVERGENCE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_CONVERGENCE); } // namespace log diff --git a/core/log/papi.cpp b/core/log/papi.cpp index 5ced377ca38..b5c56527687 100644 --- 
a/core/log/papi.cpp +++ b/core/log/papi.cpp @@ -279,7 +279,7 @@ void Papi::on_iteration_complete( #define GKO_DECLARE_PAPI(_type) class Papi<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_PAPI); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_PAPI); } // namespace log diff --git a/core/log/solver_progress.cpp b/core/log/solver_progress.cpp index effa0279bba..4d1566e159f 100644 --- a/core/log/solver_progress.cpp +++ b/core/log/solver_progress.cpp @@ -247,6 +247,14 @@ class SolverProgressStore : public SolverProgress { run, gko::matrix::Dense, gko::matrix::Dense>, gko::matrix::Dense>, +#if GINKGO_ENABLE_HALF + gko::matrix::Dense, + gko::matrix::Dense>, + gko::WritableToMatrixData, + gko::WritableToMatrixData, int32>, + gko::WritableToMatrixData, + gko::WritableToMatrixData, int64>, +#endif // fallback for other matrix types gko::WritableToMatrixData, gko::WritableToMatrixData, diff --git a/core/log/stream.cpp b/core/log/stream.cpp index 5e510d409e2..69eef2e0949 100644 --- a/core/log/stream.cpp +++ b/core/log/stream.cpp @@ -482,7 +482,7 @@ void Stream::on_iteration_complete( #define GKO_DECLARE_STREAM(_type) class Stream<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_STREAM); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_WITH_HALF(GKO_DECLARE_STREAM); } // namespace log diff --git a/core/test/log/convergence.cpp b/core/test/log/convergence.cpp index 8fff0c17b8e..64ec37e8942 100644 --- a/core/test/log/convergence.cpp +++ b/core/test/log/convergence.cpp @@ -45,7 +45,8 @@ class Convergence : public ::testing::Test { gko::array status = {exec, 1}; }; -TYPED_TEST_SUITE(Convergence, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Convergence, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Convergence, CanGetEmptyData) diff --git a/core/test/log/papi.cpp b/core/test/log/papi.cpp index 8278120cc49..e0404b04d90 100644 --- a/core/test/log/papi.cpp +++ b/core/test/log/papi.cpp @@ -91,7 +91,7 @@ class Papi : public 
::testing::Test { int eventset; }; -TYPED_TEST_SUITE(Papi, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Papi, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Papi, CatchesAllocationStarted) diff --git a/core/test/log/solver_progress.cpp b/core/test/log/solver_progress.cpp index e00044a908d..2b4a6ac599c 100644 --- a/core/test/log/solver_progress.cpp +++ b/core/test/log/solver_progress.cpp @@ -68,7 +68,8 @@ class SolverProgress : public ::testing::Test { std::unique_ptr solver; }; -TYPED_TEST_SUITE(SolverProgress, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(SolverProgress, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(SolverProgress, TableWorks) diff --git a/core/test/log/stream.cpp b/core/test/log/stream.cpp index 995a9975b89..7f4b41e5cc3 100644 --- a/core/test/log/stream.cpp +++ b/core/test/log/stream.cpp @@ -26,7 +26,7 @@ constexpr int num_iters = 10; template class Stream : public ::testing::Test {}; -TYPED_TEST_SUITE(Stream, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Stream, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Stream, CatchesAllocationStarted) @@ -380,17 +380,17 @@ TYPED_TEST(Stream, CatchesLinOpApplyStartedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_apply_started_mask, out, true); - auto A = gko::initialize({1.1}, exec); - auto b = gko::initialize({-2.2}, exec); - auto x = gko::initialize({3.3}, exec); + auto A = gko::initialize({1.5}, exec); + auto b = gko::initialize({-2.25}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on(A.get(), b.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -429,17 +429,17 @@ TYPED_TEST(Stream, 
CatchesLinOpApplyCompletedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_apply_completed_mask, out, true); - auto A = gko::initialize({1.1}, exec); - auto b = gko::initialize({-2.2}, exec); - auto x = gko::initialize({3.3}, exec); + auto A = gko::initialize({1.5}, exec); + auto b = gko::initialize({-2.25}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on( A.get(), b.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -486,21 +486,21 @@ TYPED_TEST(Stream, CatchesLinOpAdvancedApplyStartedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_advanced_apply_started_mask, out, true); - auto A = gko::initialize({1.1}, exec); - auto alpha = gko::initialize({-4.4}, exec); - auto b = gko::initialize({-2.2}, exec); + auto A = gko::initialize({1.5}, exec); + auto alpha = gko::initialize({-4.75}, exec); + auto b = gko::initialize({-2.25}, exec); auto beta = gko::initialize({-5.5}, exec); - auto x = gko::initialize({3.3}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on( A.get(), alpha.get(), b.get(), beta.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-4.4"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-4.75"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); GKO_ASSERT_STR_CONTAINS(os, "-5.5"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -547,21 +547,21 @@ TYPED_TEST(Stream, CatchesLinOpAdvancedApplyCompletedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_advanced_apply_completed_mask, out, true); - auto A = 
gko::initialize({1.1}, exec); - auto alpha = gko::initialize({-4.4}, exec); - auto b = gko::initialize({-2.2}, exec); + auto A = gko::initialize({1.5}, exec); + auto alpha = gko::initialize({-4.75}, exec); + auto b = gko::initialize({-2.25}, exec); auto beta = gko::initialize({-5.5}, exec); - auto x = gko::initialize({3.3}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on( A.get(), alpha.get(), b.get(), beta.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-4.4"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-4.75"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); GKO_ASSERT_STR_CONTAINS(os, "-5.5"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -782,11 +782,11 @@ TYPED_TEST(Stream, CatchesIterationsWithVerbose) gko::solver::Bicgstab::build() .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(exec); - auto solver = factory->generate(gko::initialize({1.1}, exec)); + auto solver = factory->generate(gko::initialize({1.25}, exec)); auto right_hand_side = gko::initialize({-5.5}, exec); - auto residual = gko::initialize({-4.4}, exec); - auto solution = gko::initialize({-2.2}, exec); - auto residual_norm = gko::initialize({-3.3}, exec); + auto residual = gko::initialize({-4.5}, exec); + auto solution = gko::initialize({-2.25}, exec); + auto residual_norm = gko::initialize({-3.125}, exec); gko::array stop_status(exec, 1); logger->template on( @@ -795,9 +795,9 @@ TYPED_TEST(Stream, CatchesIterationsWithVerbose) auto os = out.str(); GKO_ASSERT_STR_CONTAINS(os, "-5.5"); - GKO_ASSERT_STR_CONTAINS(os, "-4.4"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); - GKO_ASSERT_STR_CONTAINS(os, "-3.3"); + GKO_ASSERT_STR_CONTAINS(os, "-4.5"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); + GKO_ASSERT_STR_CONTAINS(os, "-3.125"); GKO_ASSERT_STR_CONTAINS(os, "Finalized:") } diff --git 
a/reference/test/log/convergence.cpp b/reference/test/log/convergence.cpp index 50db0db49c4..70fc004c030 100644 --- a/reference/test/log/convergence.cpp +++ b/reference/test/log/convergence.cpp @@ -19,7 +19,8 @@ namespace { template class Convergence : public ::testing::Test {}; -TYPED_TEST_SUITE(Convergence, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Convergence, gko::test::ValueTypesWithHalf, + TypenameNameGenerator); TYPED_TEST(Convergence, CatchesCriterionCheckCompleted) diff --git a/reference/test/log/papi.cpp b/reference/test/log/papi.cpp index 4f1d9e469f1..647a14af9b2 100644 --- a/reference/test/log/papi.cpp +++ b/reference/test/log/papi.cpp @@ -83,7 +83,7 @@ class Papi : public ::testing::Test { int eventset; }; -TYPED_TEST_SUITE(Papi, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Papi, gko::test::ValueTypesWithHalf, TypenameNameGenerator); TYPED_TEST(Papi, CatchesCriterionCheckCompleted) From d604ada23d2df47f14e79c61ca89c064abaa840c Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 29 Oct 2024 16:58:44 +0100 Subject: [PATCH 41/69] dispatch with distributed needs to throw with half --- core/distributed/helpers.hpp | 14 +- core/multigrid/pgm.cpp | 266 +++++++++--------- core/solver/multigrid.cpp | 174 ++++++++---- .../ginkgo/core/base/precision_dispatch.hpp | 143 ++++++---- 4 files changed, 350 insertions(+), 247 deletions(-) diff --git a/core/distributed/helpers.hpp b/core/distributed/helpers.hpp index 5536dbe32f0..9ce7d3b6ab4 100644 --- a/core/distributed/helpers.hpp +++ b/core/distributed/helpers.hpp @@ -122,11 +122,15 @@ void vector_dispatch(T* linop, F&& f, Args&&... 
args) { #if GINKGO_BUILD_MPI if (is_distributed(linop)) { - using type = std::conditional_t< - std::is_const::value, - const experimental::distributed::Vector, - experimental::distributed::Vector>; - f(dynamic_cast(linop), std::forward(args)...); + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(linop); + } else { + using type = std::conditional_t< + std::is_const::value, + const experimental::distributed::Vector, + experimental::distributed::Vector>; + f(dynamic_cast(linop), std::forward(args)...); + } } else #endif { diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index e531fb2b996..d4e4ffde4de 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -389,137 +389,147 @@ void Pgm::generate() #if GINKGO_BUILD_MPI if (std::dynamic_pointer_cast< const experimental::distributed::DistributedBase>(system_matrix_)) { - auto convert_fine_op = [&](auto matrix) { - using global_index_type = typename std::decay_t< - decltype(*matrix)>::result_type::global_index_type; - auto exec = as(matrix)->get_executor(); - auto comm = as(matrix) - ->get_communicator(); - auto fine = share( - experimental::distributed:: - Matrix::create( - exec, comm, - matrix::Csr::create(exec), - matrix::Csr::create(exec))); - matrix->convert_to(fine); - this->set_fine_op(fine); - }; - auto setup_fine_op = [&](auto matrix) { - // Only support csr matrix currently. - auto local_csr = std::dynamic_pointer_cast( - matrix->get_local_matrix()); - auto non_local_csr = std::dynamic_pointer_cast( - matrix->get_non_local_matrix()); - // If system matrix is not csr or need sorting, generate the csr. 
- if (!parameters_.skip_sorting || !local_csr || !non_local_csr) { + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(nullptr); + } else { + auto convert_fine_op = [&](auto matrix) { + using global_index_type = typename std::decay_t< + decltype(*matrix)>::result_type::global_index_type; + auto exec = as(matrix)->get_executor(); + auto comm = + as(matrix) + ->get_communicator(); + auto fine = share( + experimental::distributed:: + Matrix::create( + exec, comm, + matrix::Csr::create(exec), + matrix::Csr::create(exec))); + matrix->convert_to(fine); + this->set_fine_op(fine); + }; + auto setup_fine_op = [&](auto matrix) { + // Only support csr matrix currently. + auto local_csr = std::dynamic_pointer_cast( + matrix->get_local_matrix()); + auto non_local_csr = std::dynamic_pointer_cast( + matrix->get_non_local_matrix()); + // If system matrix is not csr or need sorting, generate the + // csr. + if (!parameters_.skip_sorting || !local_csr || !non_local_csr) { + using global_index_type = typename std::decay_t< + decltype(*matrix)>::global_index_type; + convert_fine_op( + as>>(matrix)); + } + }; + + using fst_mtx_type = + experimental::distributed::Matrix; + using snd_mtx_type = + experimental::distributed::Matrix; + // setup the fine op using Csr with current ValueType + // we do not use dispatcher run in the first place because we have + // the fallback option for that. + if (auto obj = std::dynamic_pointer_cast( + system_matrix_)) { + setup_fine_op(obj); + } else if (auto obj = std::dynamic_pointer_cast( + system_matrix_)) { + setup_fine_op(obj); + } else { + // handle other ValueTypes. 
+ run(obj, + convert_fine_op); + } + + auto distributed_setup = [&](auto matrix) { + auto exec = gko::as(matrix)->get_executor(); + auto comm = + gko::as(matrix) + ->get_communicator(); + auto num_rank = comm.size(); + auto pgm_local_op = + gko::as(matrix->get_local_matrix()); + auto result = this->generate_local(pgm_local_op); + + auto non_local_csr = + as(matrix->get_non_local_matrix()); + auto non_local_size = non_local_csr->get_size()[1]; + array non_local_agg(exec, non_local_size); + // get agg information (prolong_row_gather row idx) + communicate(matrix, agg_, non_local_agg); + // generate non_local_col_map + non_local_agg.set_executor(exec->get_master()); + array non_local_col_map(exec->get_master(), + non_local_size); + // add additional entry in tail such that the offset easily + // handle it. + array renumber(exec->get_master(), + non_local_size + 1); + auto recv_offsets = matrix->recv_offsets_; + generate_non_local_map(recv_offsets, non_local_agg, + non_local_col_map, renumber); + + // get new recv_size and recv_offsets + std::vector + new_recv_size(num_rank); + std::vector + new_recv_offsets(num_rank + 1); + array new_recv_gather_idxs(exec->get_master()); + compute_communication(recv_offsets, non_local_agg, renumber, + new_recv_size, new_recv_offsets, + new_recv_gather_idxs); + + non_local_col_map.set_executor(exec); + IndexType non_local_num_agg = new_recv_gather_idxs.get_size(); + // build csr from row and col map + // unlike non-distributed version, generate_coarse uses + // different row and col maps. 
+ auto result_non_local_csr = generate_coarse( + exec, non_local_csr.get(), + static_cast(std::get<1>(result)->get_size()[0]), + agg_, non_local_num_agg, non_local_col_map); + // use local and non-local to build coarse matrix + // also restriction and prolongation (Local-only-global matrix) + auto coarse_size = + static_cast(std::get<1>(result)->get_size()[0]); + comm.all_reduce(exec->get_master(), &coarse_size, 1, MPI_SUM); + new_recv_gather_idxs.set_executor(exec); + + // setup the generated linop. using global_index_type = typename std::decay_t::global_index_type; - convert_fine_op( - as>>(matrix)); - } - }; - - using fst_mtx_type = - experimental::distributed::Matrix; - using snd_mtx_type = - experimental::distributed::Matrix; - // setup the fine op using Csr with current ValueType - // we do not use dispatcher run in the first place because we have the - // fallback option for that. - if (auto obj = - std::dynamic_pointer_cast(system_matrix_)) { - setup_fine_op(obj); - } else if (auto obj = std::dynamic_pointer_cast( - system_matrix_)) { - setup_fine_op(obj); - } else { - // handle other ValueTypes. 
- run(obj, - convert_fine_op); + auto coarse = share( + experimental::distributed:: + Matrix::create( + exec, comm, gko::dim<2>(coarse_size, coarse_size), + std::get<1>(result), result_non_local_csr, + new_recv_size, new_recv_offsets, + new_recv_gather_idxs)); + auto restrict_op = share( + experimental::distributed:: + Matrix::create( + exec, comm, + dim<2>(coarse_size, + gko::as(matrix)->get_size()[0]), + std::get<2>(result))); + auto prolong_op = share( + experimental::distributed:: + Matrix::create( + exec, comm, + dim<2>(gko::as(matrix)->get_size()[0], + coarse_size), + std::get<0>(result))); + this->set_multigrid_level(prolong_op, coarse, restrict_op); + }; + + // the fine op is using csr with the current ValueType + run(this->get_fine_op(), + distributed_setup); } - - auto distributed_setup = [&](auto matrix) { - auto exec = gko::as(matrix)->get_executor(); - auto comm = - gko::as(matrix) - ->get_communicator(); - auto num_rank = comm.size(); - auto pgm_local_op = - gko::as(matrix->get_local_matrix()); - auto result = this->generate_local(pgm_local_op); - - auto non_local_csr = - as(matrix->get_non_local_matrix()); - auto non_local_size = non_local_csr->get_size()[1]; - array non_local_agg(exec, non_local_size); - // get agg information (prolong_row_gather row idx) - communicate(matrix, agg_, non_local_agg); - // generate non_local_col_map - non_local_agg.set_executor(exec->get_master()); - array non_local_col_map(exec->get_master(), - non_local_size); - // add additional entry in tail such that the offset easily handle - // it. 
- array renumber(exec->get_master(), non_local_size + 1); - auto recv_offsets = matrix->recv_offsets_; - generate_non_local_map(recv_offsets, non_local_agg, - non_local_col_map, renumber); - - // get new recv_size and recv_offsets - std::vector - new_recv_size(num_rank); - std::vector - new_recv_offsets(num_rank + 1); - array new_recv_gather_idxs(exec->get_master()); - compute_communication(recv_offsets, non_local_agg, renumber, - new_recv_size, new_recv_offsets, - new_recv_gather_idxs); - - non_local_col_map.set_executor(exec); - IndexType non_local_num_agg = new_recv_gather_idxs.get_size(); - // build csr from row and col map - // unlike non-distributed version, generate_coarse uses different - // row and col maps. - auto result_non_local_csr = generate_coarse( - exec, non_local_csr.get(), - static_cast(std::get<1>(result)->get_size()[0]), - agg_, non_local_num_agg, non_local_col_map); - // use local and non-local to build coarse matrix - // also restriction and prolongation (Local-only-global matrix) - auto coarse_size = - static_cast(std::get<1>(result)->get_size()[0]); - comm.all_reduce(exec->get_master(), &coarse_size, 1, MPI_SUM); - new_recv_gather_idxs.set_executor(exec); - - // setup the generated linop. 
- using global_index_type = - typename std::decay_t::global_index_type; - auto coarse = share( - experimental::distributed:: - Matrix::create( - exec, comm, gko::dim<2>(coarse_size, coarse_size), - std::get<1>(result), result_non_local_csr, - new_recv_size, new_recv_offsets, new_recv_gather_idxs)); - auto restrict_op = share( - experimental::distributed:: - Matrix::create( - exec, comm, - dim<2>(coarse_size, - gko::as(matrix)->get_size()[0]), - std::get<2>(result))); - auto prolong_op = share( - experimental::distributed:: - Matrix::create( - exec, comm, - dim<2>(gko::as(matrix)->get_size()[0], - coarse_size), - std::get<0>(result))); - this->set_multigrid_level(prolong_op, coarse, restrict_op); - }; - - // the fine op is using csr with the current ValueType - run(this->get_fine_op(), distributed_setup); } else #endif // GINKGO_BUILD_MPI { diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 0b918a13897..9b2a4a814e1 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -101,11 +101,16 @@ void handle_list( auto exec = matrix->get_executor(); #if GINKGO_BUILD_MPI if (gko::detail::is_distributed(matrix.get())) { - using experimental::distributed::Matrix; - return run, - Matrix, - Matrix>( - matrix, [exec, iteration, relaxation_factor](auto matrix) { + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(matrix); + } else { + using experimental::distributed::Matrix; + return run, + Matrix, + Matrix>(matrix, [exec, iteration, + relaxation_factor]( + auto matrix) { using Mtx = typename decltype(matrix)::element_type; return share( build_smoother( @@ -119,6 +124,7 @@ void handle_list( iteration, casting(relaxation_factor)) ->generate(matrix)); }); + } } #endif return share(build_smoother(preconditioner::Jacobi::build() @@ -330,30 +336,37 @@ void MultigridState::generate(const LinOp* system_matrix_in, if (gko::detail::is_distributed(system_matrix_in)) { using value_type = typename std::decay_t::value_type; - using VectorType = 
- experimental::distributed::Vector; - auto fine = mg_level->get_fine_op().get(); - auto coarse = mg_level->get_coarse_op().get(); - auto distributed_fine = dynamic_cast< - const experimental::distributed::DistributedBase*>( - fine); - auto distributed_coarse = dynamic_cast< - const experimental::distributed::DistributedBase*>( - coarse); - auto current_comm = distributed_fine->get_communicator(); - auto next_comm = distributed_coarse->get_communicator(); - auto current_local_nrows = - ::gko::detail::run_matrix(fine, [](auto* fine_mat) { - return fine_mat->get_local_matrix()->get_size()[0]; - }); - auto next_local_nrows = - ::gko::detail::run_matrix(coarse, [](auto* coarse_mat) { - return coarse_mat->get_non_local_matrix() - ->get_size()[0]; - }); - this->allocate_memory( - i, cycle, current_comm, next_comm, current_nrows, - next_nrows, current_local_nrows, next_local_nrows); + if constexpr (std::is_same_v, + half>) { + GKO_NOT_SUPPORTED(system_matrix_in); + } else { + using VectorType = + experimental::distributed::Vector; + auto fine = mg_level->get_fine_op().get(); + auto coarse = mg_level->get_coarse_op().get(); + auto distributed_fine = dynamic_cast< + const experimental::distributed::DistributedBase*>( + fine); + auto distributed_coarse = dynamic_cast< + const experimental::distributed::DistributedBase*>( + coarse); + auto current_comm = + distributed_fine->get_communicator(); + auto next_comm = distributed_coarse->get_communicator(); + auto current_local_nrows = + ::gko::detail::run_matrix(fine, [](auto* fine_mat) { + return fine_mat->get_local_matrix() + ->get_size()[0]; + }); + auto next_local_nrows = ::gko::detail::run_matrix( + coarse, [](auto* coarse_mat) { + return coarse_mat->get_non_local_matrix() + ->get_size()[0]; + }); + this->allocate_memory( + i, cycle, current_comm, next_comm, current_nrows, + next_nrows, current_local_nrows, next_local_nrows); + } } else #endif { @@ -446,6 +459,32 @@ void MultigridState::allocate_memory( initialize({-one()}, 
exec)); } +#if GINKGO_ENABLE_HALF +template <> +void MultigridState::allocate_memory< + gko::experimental::distributed::Vector>( + int level, multigrid::cycle cycle, + const experimental::mpi::communicator& current_comm, + const experimental::mpi::communicator& next_comm, size_type current_nrows, + size_type next_nrows, size_type current_local_nrows, + size_type next_local_nrows) +{ + GKO_NOT_SUPPORTED(nullptr); +} + +template <> +void MultigridState::allocate_memory< + gko::experimental::distributed::Vector>>( + int level, multigrid::cycle cycle, + const experimental::mpi::communicator& current_comm, + const experimental::mpi::communicator& next_comm, size_type current_nrows, + size_type next_nrows, size_type current_local_nrows, + size_type next_local_nrows) +{ + GKO_NOT_SUPPORTED(nullptr); +} +#endif + #endif @@ -594,6 +633,27 @@ void MultigridState::run_cycle(multigrid::cycle cycle, size_type level, } } +template <> +void MultigridState::run_cycle< + gko::experimental::distributed::Vector>( + multigrid::cycle cycle, size_type level, + const std::shared_ptr& matrix, const LinOp* b, LinOp* x, + cycle_mode mode) +{ + GKO_NOT_SUPPORTED(nullptr); +} + +template <> +void MultigridState::run_cycle< + gko::experimental::distributed::Vector>>( + multigrid::cycle cycle, size_type level, + const std::shared_ptr& matrix, const LinOp* b, LinOp* x, + cycle_mode mode) +{ + GKO_NOT_SUPPORTED(nullptr); +} + + } // namespace detail } // namespace multigrid @@ -770,35 +830,41 @@ void Multigrid::generate() if (gko::detail::is_distributed(matrix.get())) { using absolute_value_type = remove_complex; using experimental::distributed::Matrix; - return run, - Matrix, - Matrix>(matrix, [exec](auto matrix) { - using Mtx = typename decltype(matrix)::element_type; - return solver::Gmres::build() - .with_criteria( - stop::Iteration::build().with_max_iters( - matrix->get_size()[0]), - stop::ResidualNorm::build() - .with_reduction_factor( - std::numeric_limits< - 
absolute_value_type>::epsilon() * - absolute_value_type{10})) - .with_krylov_dim( - std::min(size_type(100), matrix->get_size()[0])) - .with_preconditioner( - experimental::distributed::preconditioner:: - Schwarz:: - build() + if constexpr (std::is_same_v) { + GKO_NOT_SUPPORTED(matrix); + } else { + return run, + Matrix, + Matrix>(matrix, [exec](auto matrix) { + using Mtx = typename decltype(matrix)::element_type; + return solver::Gmres::build() + .with_criteria( + stop::Iteration::build().with_max_iters( + matrix->get_size()[0]), + stop::ResidualNorm::build() + .with_reduction_factor( + std::numeric_limits< + absolute_value_type>:: + epsilon() * + absolute_value_type{10})) + .with_krylov_dim(std::min( + size_type(100), matrix->get_size()[0])) + .with_preconditioner( + experimental::distributed::preconditioner:: + Schwarz::build() .with_local_solver( preconditioner::Jacobi< value_type>::build() .with_max_block_size(1u))) - .on(exec) - ->generate(matrix); - }); + .on(exec) + ->generate(matrix); + }); + } } #endif if (dynamic_cast(exec.get())) { diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index ad31a6b19e8..4adc02763f0 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -382,7 +382,11 @@ make_temporary_conversion(const LinOp* matrix) template void precision_dispatch(Function fn, Args*... linops) { - fn(distributed::make_temporary_conversion(linops).get()...); + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(nullptr); + } else { + fn(distributed::make_temporary_conversion(linops).get()...); + } } @@ -398,23 +402,29 @@ void precision_dispatch(Function fn, Args*... 
linops) template void precision_dispatch_real_complex(Function fn, const LinOp* in, LinOp* out) { - auto complex_to_real = !( - is_complex() || - dynamic_cast>*>( - in)); - if (complex_to_real) { - auto dense_in = - distributed::make_temporary_conversion>(in); - auto dense_out = - distributed::make_temporary_conversion>(out); - using Vector = experimental::distributed::Vector; - // These dynamic_casts are only needed to make the code compile - // If ValueType is complex, this branch will never be taken - // If ValueType is real, the cast is a no-op - fn(dynamic_cast(dense_in->create_real_view().get()), - dynamic_cast(dense_out->create_real_view().get())); + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(nullptr); } else { - distributed::precision_dispatch(fn, in, out); + auto complex_to_real = !( + is_complex() || + dynamic_cast< + const ConvertibleTo>*>(in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>( + in); + auto dense_out = + distributed::make_temporary_conversion>( + out); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dynamic_cast(dense_in->create_real_view().get()), + dynamic_cast(dense_out->create_real_view().get())); + } else { + distributed::precision_dispatch(fn, in, out); + } } } @@ -426,27 +436,33 @@ template void precision_dispatch_real_complex(Function fn, const LinOp* alpha, const LinOp* in, LinOp* out) { - auto complex_to_real = !( - is_complex() || - dynamic_cast>*>( - in)); - if (complex_to_real) { - auto dense_in = - distributed::make_temporary_conversion>(in); - auto dense_out = - distributed::make_temporary_conversion>(out); - auto dense_alpha = gko::make_temporary_conversion(alpha); - using Vector = experimental::distributed::Vector; - // These dynamic_casts are only needed to make the code compile - // If 
ValueType is complex, this branch will never be taken - // If ValueType is real, the cast is a no-op - fn(dense_alpha.get(), - dynamic_cast(dense_in->create_real_view().get()), - dynamic_cast(dense_out->create_real_view().get())); + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(nullptr); } else { - fn(gko::make_temporary_conversion(alpha).get(), - distributed::make_temporary_conversion(in).get(), - distributed::make_temporary_conversion(out).get()); + auto complex_to_real = !( + is_complex() || + dynamic_cast< + const ConvertibleTo>*>(in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>( + in); + auto dense_out = + distributed::make_temporary_conversion>( + out); + auto dense_alpha = gko::make_temporary_conversion(alpha); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dense_alpha.get(), + dynamic_cast(dense_in->create_real_view().get()), + dynamic_cast(dense_out->create_real_view().get())); + } else { + fn(gko::make_temporary_conversion(alpha).get(), + distributed::make_temporary_conversion(in).get(), + distributed::make_temporary_conversion(out).get()); + } } } @@ -459,30 +475,36 @@ void precision_dispatch_real_complex(Function fn, const LinOp* alpha, const LinOp* in, const LinOp* beta, LinOp* out) { - auto complex_to_real = !( - is_complex() || - dynamic_cast>*>( - in)); - if (complex_to_real) { - auto dense_in = - distributed::make_temporary_conversion>(in); - auto dense_out = - distributed::make_temporary_conversion>(out); - auto dense_alpha = gko::make_temporary_conversion(alpha); - auto dense_beta = gko::make_temporary_conversion(beta); - using Vector = experimental::distributed::Vector; - // These dynamic_casts are only needed to make the code compile - // If ValueType is complex, this branch will never be taken - // If 
ValueType is real, the cast is a no-op - fn(dense_alpha.get(), - dynamic_cast(dense_in->create_real_view().get()), - dense_beta.get(), - dynamic_cast(dense_out->create_real_view().get())); + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(nullptr); } else { - fn(gko::make_temporary_conversion(alpha).get(), - distributed::make_temporary_conversion(in).get(), - gko::make_temporary_conversion(beta).get(), - distributed::make_temporary_conversion(out).get()); + auto complex_to_real = !( + is_complex() || + dynamic_cast< + const ConvertibleTo>*>(in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>( + in); + auto dense_out = + distributed::make_temporary_conversion>( + out); + auto dense_alpha = gko::make_temporary_conversion(alpha); + auto dense_beta = gko::make_temporary_conversion(beta); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dense_alpha.get(), + dynamic_cast(dense_in->create_real_view().get()), + dense_beta.get(), + dynamic_cast(dense_out->create_real_view().get())); + } else { + fn(gko::make_temporary_conversion(alpha).get(), + distributed::make_temporary_conversion(in).get(), + gko::make_temporary_conversion(beta).get(), + distributed::make_temporary_conversion(out).get()); + } } } @@ -547,6 +569,7 @@ void precision_dispatch_real_complex_distributed(Function fn, if (dynamic_cast(in)) { experimental::distributed::precision_dispatch_real_complex( fn, alpha, in, beta, out); + } else { gko::precision_dispatch_real_complex(fn, alpha, in, beta, out); From f069992c28c886f88a856aff779cf149c29b543c Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 21 Nov 2024 10:35:07 +0100 Subject: [PATCH 42/69] revert the distribution value_type in dpcpp/test/preconditioner/jacobi_kernels to make them work with single --- .../test/preconditioner/jacobi_kernels.dp.cpp | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp index 36179402262..cdf3a0d0298 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp @@ -23,6 +23,8 @@ namespace { +// We keep some distribution with value_type to make the test with +// GINKGO_DPCPP_SINGLE_MODE still work. class Jacobi : public ::testing::Test { protected: using index_type = int32_t; @@ -62,7 +64,7 @@ class Jacobi : public ::testing::Test { if (condition_numbers.size() == 0) { mtx = gko::test::generate_random_matrix( dim, dim, std::uniform_int_distribution<>(min_nnz, max_nnz), - std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), engine, ref); } else { std::vector blocks; for (gko::size_type i = 0; i < block_pointers.size() - 1; ++i) { @@ -70,7 +72,8 @@ class Jacobi : public ::testing::Test { begin(block_pointers)[i + 1] - begin(block_pointers)[i]; const auto cond = begin(condition_numbers)[i]; blocks.push_back(mtx_data::cond( - size, cond, std::normal_distribution<>(-1, 1), engine)); + size, cond, std::normal_distribution(-1, 1), + engine)); } mtx = Mtx::create(ref); mtx->read(mtx_data::diag(begin(blocks), end(blocks))); @@ -106,11 +109,11 @@ class Jacobi : public ::testing::Test { } b = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), engine, ref); d_b = gko::clone(dpcpp, b); x = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - 
std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), engine, ref); d_x = gko::clone(dpcpp, x); } @@ -408,7 +411,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) smtx->copy_from(dense_smtx); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref)); + std::normal_distribution(0.0, 1.0), engine, ref)); auto sx = Vec::create(ref, sb->get_size()); auto d_smtx = gko::share(Mtx::create(dpcpp)); @@ -452,7 +455,7 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) auto dense_data = gko::test::generate_random_matrix_data( dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution<>(1.0, 2.0), engine); + std::normal_distribution(1.0, 2.0), engine); gko::utils::make_diag_dominant(dense_data); auto dense_smtx = gko::share(Vec::create(ref)); dense_smtx->read(dense_data); @@ -460,12 +463,12 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) smtx->copy_from(dense_smtx); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), - 4)); + std::normal_distribution(0.0, 1.0), engine, ref, + gko::dim<2>(dim, 3), 4)); auto sx = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), - 4)); + std::normal_distribution(0.0, 1.0), engine, ref, + gko::dim<2>(dim, 3), 4)); auto d_smtx = gko::share(gko::clone(dpcpp, smtx)); auto d_sb = gko::share(gko::clone(dpcpp, sb)); From 0725476aa8050fe0cdca027db7a86251e1f660ab Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Sun, 24 Nov 2024 22:36:57 +0100 Subject: [PATCH 43/69] fix distributed mixed-precision pgm --- core/multigrid/pgm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index d4e4ffde4de..d11ebf32399 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -441,7 +441,7 @@ void Pgm::generate() setup_fine_op(obj); } else { // handle other ValueTypes. - run(obj, + run(system_matrix_, convert_fine_op); } From 1d85e30116bf4b6438cd147ebd3e08f34408c4e8 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 28 Nov 2024 19:05:12 +0100 Subject: [PATCH 44/69] fix type_size_impl for thrust::complex --- common/cuda_hip/base/math.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index 51a7fedf0c4..8c0da63c181 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -83,6 +83,12 @@ struct truncate_type_impl> { }; +template +struct type_size_impl> { + static constexpr auto value = sizeof(T) * byte_size; +}; + + template struct is_complex_impl> : public std::true_type {}; From 30e56d5e29e551aeb86d71906b028383649f8f13 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 25 Nov 2024 18:48:03 +0100 Subject: [PATCH 45/69] do not support half for nvhpc23.3 due to signal 11 --- CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c48d12989aa..8aeb00c0c30 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,8 +34,9 @@ option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP t option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF) option(GINKGO_ENABLE_HALF "Enable the use of half precision" ON) # We do not support MSVC. 
SYCL will come later -if(MSVC OR GINKGO_BUILD_SYCL) - message(STATUS "HALF is not supported in MSVC, and later support in SYCL") +# NVHPC 23.3 faces "termminated by signal 11" in reference/test/isal_kernel and core/config/preconditioner, so we don't support this version for half unfortunately. +if(MSVC OR GINKGO_BUILD_SYCL OR (CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC" AND CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 23.3.0)) + message(STATUS "HALF is not supported in MSVC, NVHPC 23.3, and later support in SYCL") set(GINKGO_ENABLE_HALF OFF CACHE BOOL "Enable the use of half precision" FORCE) endif() option(GINKGO_SKIP_DEPENDENCY_UPDATE From 1ce1b68578b23052fcf1b44fc9c7e0169a640ab8 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 27 Nov 2024 16:55:52 +0100 Subject: [PATCH 46/69] split config to different file to overcome nvhpc limit --- core/CMakeLists.txt | 3 + core/config/preconditioner_config.cpp | 280 --------------------- core/config/preconditioner_ic_config.cpp | 111 ++++++++ core/config/preconditioner_ilu_config.cpp | 147 +++++++++++ core/config/preconditioner_isai_config.cpp | 91 +++++++ 5 files changed, 352 insertions(+), 280 deletions(-) create mode 100644 core/config/preconditioner_ic_config.cpp create mode 100644 core/config/preconditioner_ilu_config.cpp create mode 100644 core/config/preconditioner_isai_config.cpp diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 598167c0d7c..7901edf5341 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -4,6 +4,9 @@ set(config_source config/factorization_config.cpp config/multigrid_config.cpp config/preconditioner_config.cpp + config/preconditioner_ic_config.cpp + config/preconditioner_ilu_config.cpp + config/preconditioner_isai_config.cpp config/registry.cpp config/solver_config.cpp ) diff --git a/core/config/preconditioner_config.cpp b/core/config/preconditioner_config.cpp index a5669902d00..840094b51c9 100644 --- a/core/config/preconditioner_config.cpp +++ 
b/core/config/preconditioner_config.cpp @@ -6,14 +6,8 @@ #include #include #include -#include -#include -#include #include #include -#include -#include -#include #include "core/config/config_helper.hpp" #include "core/config/dispatch.hpp" @@ -25,280 +19,6 @@ namespace gko { namespace config { -// For Ic and Ilu, we use additional ValueType to help Solver type decision -template -class IcSolverHelper { -public: - template - class Configurator { - public: - static - typename gko::preconditioner::Ic::parameters_type - parse(const pnode& config, const registry& context, - const type_descriptor& td_for_child) - { - return gko::preconditioner::Ic::parse( - config, context, td_for_child); - } - }; -}; - - -template -class IluSolverHelper { -public: - template - class Configurator { - public: - static typename preconditioner::Ilu::parameters_type - parse(const pnode& config, const registry& context, - const type_descriptor& td_for_child) - { - return preconditioner::Ilu::parse(config, context, - td_for_child); - } - }; -}; - - -template -class IsaiHelper { -public: - template - class Configurator { - public: - static typename preconditioner::Isai::parameters_type - parse(const pnode& config, const registry& context, - const type_descriptor& td_for_child) - { - return preconditioner::Isai::parse( - config, context, td_for_child); - } - }; -}; - -// Do not use the partial specialization for SolverBase and SolverBase -// because the default template arguments are allowed for a template template -// argument (detail: CWG 150 after c++17 -// https://en.cppreference.com/w/cpp/language/template_parameters#Template_template_arguments) -template