From aacdb85ed0771ccbf6c14386db79c711023d8b73 Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Sat, 13 Jul 2024 07:14:34 -0400 Subject: [PATCH 01/12] runtime works --- src/array.c | 2 - src/runtime_intrinsics.c | 133 ++++++++++++++++++++++++++------------- 2 files changed, 89 insertions(+), 46 deletions(-) diff --git a/src/array.c b/src/array.c index f0051ec17565a..53c939a5ccfc2 100644 --- a/src/array.c +++ b/src/array.c @@ -39,7 +39,6 @@ JL_DLLEXPORT int jl_array_validate_dims(size_t *nel, uint32_t ndims, size_t *dim return 0; } -#ifndef JL_NDEBUG static inline int is_ntuple_long(jl_value_t *v) { if (!jl_is_tuple(v)) @@ -53,7 +52,6 @@ static inline int is_ntuple_long(jl_value_t *v) } return 1; } -#endif #define jl_array_elsize(a) (((jl_datatype_t*)jl_typetagof((a)->ref.mem))->layout->size) diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index 450096eef5b01..ff7de09db5b46 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -1271,54 +1271,91 @@ static inline jl_value_t *jl_intrinsiclambda_checkeddiv(jl_value_t *ty, void *pa // floating point -#define bi_fintrinsic(OP, name) \ - bi_intrinsic_bfloat(OP, name) \ - bi_intrinsic_half(OP, name) \ - bi_intrinsic_ctype(OP, name, 32, float) \ - bi_intrinsic_ctype(OP, name, 64, double) \ -JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \ +static int is_ntuple_type(jl_value_t *tt) +{ + if (!jl_is_tuple_type(tt)) + { + return 0; + } + size_t i, nfields = jl_nparams(tt); + if(!nfields) + return 1; + jl_value_t *t1 = jl_tparam0(tt); + for (i = 1; i < nfields; i++) { + if (jl_tparam(tt, i) != t1) { + return 0; + } + } + return 1; +} + +#define bi_fintrinsic(OP, op_name) \ + bi_intrinsic_bfloat(OP, op_name) \ + bi_intrinsic_half(OP, op_name) \ + bi_intrinsic_ctype(OP, op_name, 32, float) \ + bi_intrinsic_ctype(OP, op_name, 64, double) \ +JL_DLLEXPORT jl_value_t *jl_##op_name(jl_value_t *a, jl_value_t *b) \ { \ jl_task_t *ct = jl_current_task; \ jl_value_t *ty = jl_typeof(a); \ + jl_value_t *et = ty; \ + int np=1; \ if (jl_typeof(b) != ty) \ - jl_error(#name ": types of a and b must match"); \ - if (!jl_is_primitivetype(ty)) \ - jl_error(#name ": values are not primitive types"); \ - int sz = jl_datatype_size(ty); \ - jl_value_t *newv = jl_gc_alloc(ct->ptls, sz, ty); \ + jl_error(#op_name ": types of a and b must match"); \ + if (jl_is_primitivetype(ty)){}\ + else if (is_ntuple_type(ty) && jl_nparams(ty) > 0) \ + { \ + et = jl_tparam0(ty); \ + np = jl_nparams(ty); \ + if (((jl_datatype_t*)et)->name == jl_vecelement_typename && jl_is_primitivetype(jl_tparam(et, 0))){} \ + else \ + jl_error(#op_name ": eltype is not a VecElement of a primitive type"); \ + }\ + else \ + jl_error(#op_name ": values are not primitive types"); \ + int sz = jl_datatype_size(et); \ + jl_value_t *newv = jl_gc_alloc(ct->ptls, sz*np, ty); \ void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b), *pr = jl_data_ptr(newv); \ switch (sz) { \ /* choose the right size c-type operation */ \ case 2: \ if ((jl_datatype_t*)ty == jl_float16_type) \ - jl_##name##16(16, pa, pb, pr); \ + jl_##op_name##16(16*np, pa, pb, pr); \ else /*if ((jl_datatype_t*)ty == jl_bfloat16_type)*/ \ - jl_##name##bf16(16, pa, pb, pr); \ + jl_##op_name##bf16(16*np, pa, pb, pr); \ break; \ case 4: \ - jl_##name##32(32, pa, pb, pr); \ + jl_##op_name##32(32*np, pa, pb, pr); \ break; \ case 8: \ - jl_##name##64(64, pa, pb, pr); \ + jl_##op_name##64(64*np, pa, pb, pr); \ break; \ default: \ - jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes 
other than 16, 32 and 64"); \ + jl_error(#op_name ": runtime floating point intrinsics are not implemented for bit sizes other than 16, 32 and 64"); \ } \ return newv; \ } -#define bool_fintrinsic(OP, name) \ - bool_intrinsic_bfloat(OP, name) \ - bool_intrinsic_half(OP, name) \ - bool_intrinsic_ctype(OP, name, 32, float) \ - bool_intrinsic_ctype(OP, name, 64, double) \ -JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \ +#define bool_fintrinsic(OP, op_name) \ + bool_intrinsic_bfloat(OP, op_name) \ + bool_intrinsic_half(OP, op_name) \ + bool_intrinsic_ctype(OP, op_name, 32, float) \ + bool_intrinsic_ctype(OP, op_name, 64, double) \ +JL_DLLEXPORT jl_value_t *jl_##op_name(jl_value_t *a, jl_value_t *b) \ { \ jl_value_t *ty = jl_typeof(a); \ if (jl_typeof(b) != ty) \ - jl_error(#name ": types of a and b must match"); \ - if (!jl_is_primitivetype(ty)) \ - jl_error(#name ": values are not primitive types"); \ + jl_error(#op_name ": types of a and b must match"); \ + if (jl_is_primitivetype(ty)) {}\ + else if (is_ntuple_type(ty) && jl_nparams(ty) > 0) \ + { \ + jl_value_t *et = jl_tparam(ty, 0); \ + if (((jl_datatype_t*)et)->name == jl_vecelement_typename && jl_is_primitivetype(jl_tparam(et, 0))){} \ + else \ + jl_error(#op_name ": eltype is not a VecElement of a primitive type"); \ + }\ + else \ + jl_error(#op_name ": values are not primitive types"); \ void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b); \ int sz = jl_datatype_size(ty); \ int cmp; \ @@ -1326,35 +1363,43 @@ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \ /* choose the right size c-type operation */ \ case 2: \ if ((jl_datatype_t*)ty == jl_float16_type) \ - cmp = jl_##name##16(16, pa, pb); \ + cmp = jl_##op_name##16(16, pa, pb); \ else /*if ((jl_datatype_t*)ty == jl_bfloat16_type)*/ \ - cmp = jl_##name##bf16(16, pa, pb); \ + cmp = jl_##op_name##bf16(16, pa, pb); \ break; \ case 4: \ - cmp = jl_##name##32(32, pa, pb); \ + cmp = jl_##op_name##32(32, pa, pb); \ break; \ case 8: \ - cmp = jl_##name##64(64, pa, pb); \ + cmp = jl_##op_name##64(64, pa, pb); \ break; \ default: \ - jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); \ + jl_error(#op_name ": runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); \ } \ return cmp ? 
jl_true : jl_false; \ } -#define ter_fintrinsic(OP, name) \ - ter_intrinsic_bfloat(OP, name) \ - ter_intrinsic_half(OP, name) \ - ter_intrinsic_ctype(OP, name, 32, float) \ - ter_intrinsic_ctype(OP, name, 64, double) \ -JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b, jl_value_t *c) \ +#define ter_fintrinsic(OP, op_name) \ + ter_intrinsic_bfloat(OP, op_name) \ + ter_intrinsic_half(OP, op_name) \ + ter_intrinsic_ctype(OP, op_name, 32, float) \ + ter_intrinsic_ctype(OP, op_name, 64, double) \ +JL_DLLEXPORT jl_value_t *jl_##op_name(jl_value_t *a, jl_value_t *b, jl_value_t *c) \ { \ jl_task_t *ct = jl_current_task; \ jl_value_t *ty = jl_typeof(a); \ if (jl_typeof(b) != ty || jl_typeof(c) != ty) \ - jl_error(#name ": types of a, b, and c must match"); \ - if (!jl_is_primitivetype(ty)) \ - jl_error(#name ": values are not primitive types"); \ + jl_error(#op_name ": types of a, b, and c must match"); \ + if (jl_is_primitivetype(ty)) {}\ + else if (is_ntuple_type(ty) && jl_nparams(ty) > 0) \ + { \ + jl_value_t *et = jl_tparam(ty, 0); \ + if (((jl_datatype_t*)et)->name == jl_vecelement_typename && jl_is_primitivetype(jl_tparam(et, 0))){} \ + else \ + jl_error(#op_name ": eltype is not a VecElement of a primitive type"); \ + }\ + else \ + jl_error(#op_name ": values are not primitive types"); \ int sz = jl_datatype_size(ty); \ jl_value_t *newv = jl_gc_alloc(ct->ptls, sz, ty); \ void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b), *pc = jl_data_ptr(c), *pr = jl_data_ptr(newv); \ @@ -1362,18 +1407,18 @@ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b, jl_value_t *c) /* choose the right size c-type operation */ \ case 2: \ if ((jl_datatype_t*)ty == jl_float16_type) \ - jl_##name##16(16, pa, pb, pc, pr); \ + jl_##op_name##16(16, pa, pb, pc, pr); \ else /*if ((jl_datatype_t*)ty == jl_bfloat16_type)*/ \ - jl_##name##bf16(16, pa, pb, pc, pr); \ + jl_##op_name##bf16(16, pa, pb, pc, pr); \ break; \ case 4: \ - jl_##name##32(32, pa, pb, pc, pr); \ + jl_##op_name##32(32, pa, pb, pc, pr); \ break; \ case 8: \ - jl_##name##64(64, pa, pb, pc, pr); \ + jl_##op_name##64(64, pa, pb, pc, pr); \ break; \ default: \ - jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes other than 16, 32 and 64"); \ + jl_error(#op_name ": runtime floating point intrinsics are not implemented for bit sizes other than 16, 32 and 64"); \ } \ return newv; \ } From 3b5248f4e40b4ceb8690b2f450b2e8c49fafcfc1 Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Sat, 13 Jul 2024 10:26:48 -0400 Subject: [PATCH 02/12] IT WORKSjsnjsnjsnjsnjsn! 
--- src/intrinsics.cpp | 21 ++++++++++++--------- src/julia.h | 18 ++++++++++++++++++ src/runtime_intrinsics.c | 18 ------------------ 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index 09916297e16ff..81e427e3aa9e4 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -1287,7 +1287,6 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar return emit_llvmcall(ctx, args, nargs); if (f == cglobal_auto || f == cglobal) return emit_cglobal(ctx, args, nargs); - SmallVector argv(nargs); for (size_t i = 0; i < nargs; ++i) { jl_cgval_t arg = emit_expr(ctx, args[i + 1]); @@ -1409,17 +1408,21 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar default: { assert(nargs >= 1 && "invalid nargs for intrinsic call"); const jl_cgval_t &xinfo = argv[0]; - // verify argument types - if (!jl_is_primitivetype(xinfo.typ)) - return emit_runtime_call(ctx, f, argv, nargs); - Type *xtyp = bitstype_to_llvm(xinfo.typ, ctx.builder.getContext(), true); - if (float_func()[f]) - xtyp = FLOATT(xtyp); + if (jl_is_primitivetype(xinfo.typ)){} + + else if (is_ntuple_type(xinfo.typ) && jl_nparams(xinfo.typ) > 0) + { + jl_value_t *et = jl_tparam0(xinfo.typ); + if (((jl_datatype_t*)et)->name == jl_vecelement_typename && jl_is_primitivetype(jl_tparam(et, 0))) + et = jl_tparam0(et); + else + return emit_runtime_call(ctx, f, argv, nargs); + } else - xtyp = INTT(xtyp, DL); - if (!xtyp) return emit_runtime_call(ctx, f, argv, nargs); + bool isboxed=true; + Type *xtyp = julia_type_to_llvm(ctx, xinfo.typ, &(isboxed)); ////Bool are required to be in the range [0,1] ////so while they are represented as i8, ////the operations need to be done in mod 1 diff --git a/src/julia.h b/src/julia.h index 5b9986a5e68ee..17cb1f6bd7e01 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1717,6 +1717,24 @@ STATIC_INLINE int jl_is_tuple_type(void *t) JL_NOTSAFEPOINT ((jl_datatype_t*)(t))->name == jl_tuple_typename); } +STATIC_INLINE int is_ntuple_type(jl_value_t *tt) +{ + if (!jl_is_tuple_type(tt)) + { + return 0; + } + size_t i, nfields = jl_nparams(tt); + if(!nfields) + return 1; + jl_value_t *t1 = jl_tparam0(tt); + for (i = 1; i < nfields; i++) { + if (jl_tparam(tt, i) != t1) { + return 0; + } + } + return 1; +} + STATIC_INLINE int jl_is_namedtuple_type(void *t) JL_NOTSAFEPOINT { return (jl_is_datatype(t) && diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index ff7de09db5b46..9769e0142bbf9 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -1271,24 +1271,6 @@ static inline jl_value_t *jl_intrinsiclambda_checkeddiv(jl_value_t *ty, void *pa // floating point -static int is_ntuple_type(jl_value_t *tt) -{ - if (!jl_is_tuple_type(tt)) - { - return 0; - } - size_t i, nfields = jl_nparams(tt); - if(!nfields) - return 1; - jl_value_t *t1 = jl_tparam0(tt); - for (i = 1; i < nfields; i++) { - if (jl_tparam(tt, i) != t1) { - return 0; - } - } - return 1; -} - #define bi_fintrinsic(OP, op_name) \ bi_intrinsic_bfloat(OP, op_name) \ bi_intrinsic_half(OP, op_name) \ From 7b868b19a81279920b8b4af209f1e8acc3206d7a Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Sat, 13 Jul 2024 11:05:06 -0400 Subject: [PATCH 03/12] works with bitcast --- src/intrinsics.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index 81e427e3aa9e4..51aeb14a5f4a0 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -144,6 +144,13 @@ static Type *FLOATT(Type *t) { if 
(t->isFloatingPointTy()) return t; + if (auto *tv = dyn_cast(t)) + { + Type *st = FLOATT(tv->getElementType()); + if (!st) + return NULL; + return VectorType::get(st, tv->getElementCount()); + } unsigned nb = (t->isPointerTy() ? sizeof(void*) * 8 : t->getPrimitiveSizeInBits()); auto &ctxt = t->getContext(); if (nb == 64) @@ -165,6 +172,13 @@ static Type *INTT(Type *t, const DataLayout &DL) return t; if (t->isPointerTy()) return DL.getIntPtrType(t); + if (auto *tv = dyn_cast(t)) + { + Type *st = INTT(tv->getElementType(), DL); + if (!st) + return NULL; + return VectorType::get(st, tv->getElementCount()); + } if (t == getDoubleTy(ctxt)) return getInt64Ty(ctxt); if (t == getFloatTy(ctxt)) @@ -1423,6 +1437,12 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar return emit_runtime_call(ctx, f, argv, nargs); bool isboxed=true; Type *xtyp = julia_type_to_llvm(ctx, xinfo.typ, &(isboxed)); + if (float_func()[f]) + xtyp = FLOATT(xtyp); + else + xtyp = INTT(xtyp, DL); + if (!xtyp) + return emit_runtime_call(ctx, f, argv, nargs); ////Bool are required to be in the range [0,1] ////so while they are represented as i8, ////the operations need to be done in mod 1 From 2e9f793b8ad5af06076bc304ee4beebfefa95622 Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Mon, 15 Jul 2024 04:25:54 -0400 Subject: [PATCH 04/12] undo array changes --- src/array.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/array.c b/src/array.c index 53c939a5ccfc2..f0051ec17565a 100644 --- a/src/array.c +++ b/src/array.c @@ -39,6 +39,7 @@ JL_DLLEXPORT int jl_array_validate_dims(size_t *nel, uint32_t ndims, size_t *dim return 0; } +#ifndef JL_NDEBUG static inline int is_ntuple_long(jl_value_t *v) { if (!jl_is_tuple(v)) @@ -52,6 +53,7 @@ static inline int is_ntuple_long(jl_value_t *v) } return 1; } +#endif #define jl_array_elsize(a) (((jl_datatype_t*)jl_typetagof((a)->ref.mem))->layout->size) From ddc8890a5e41d17a3564b32aec0faa089b5dad55 Mon Sep 17 00:00:00 2001 From: Oscar Smith Date: Sun, 28 Jul 2024 08:01:33 -0400 Subject: [PATCH 05/12] Update src/intrinsics.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mosรจ Giordano --- src/intrinsics.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index 51aeb14a5f4a0..4545329f270c2 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -1424,7 +1424,6 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar const jl_cgval_t &xinfo = argv[0]; // verify argument types if (jl_is_primitivetype(xinfo.typ)){} - else if (is_ntuple_type(xinfo.typ) && jl_nparams(xinfo.typ) > 0) { jl_value_t *et = jl_tparam0(xinfo.typ); From e6d6f4a54447de7967609d5b1f298d7e40496970 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 8 Oct 2024 18:48:20 +0200 Subject: [PATCH 06/12] implement vload/vstore! 
and a primitive Vec type --- base/experimental.jl | 2 ++ base/simd.jl | 70 ++++++++++++++++++++++++++++++++++++++++++++ test/choosetests.jl | 2 +- test/simd.jl | 36 +++++++++++++++++++++++ 4 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 base/simd.jl create mode 100644 test/simd.jl diff --git a/base/experimental.jl b/base/experimental.jl index 982ed5e78aa8c..dbf34c8183f7f 100644 --- a/base/experimental.jl +++ b/base/experimental.jl @@ -471,4 +471,6 @@ function entrypoint(@nospecialize(argt::Type)) nothing end +include("simd.jl") + end diff --git a/base/simd.jl b/base/simd.jl new file mode 100644 index 0000000000000..eef40e5f21fea --- /dev/null +++ b/base/simd.jl @@ -0,0 +1,70 @@ +module SIMD + +import Base: VecElement, Memory, MemoryRef +import Base: @propagate_inbounds, @_propagate_inbounds_meta, @_boundscheck, @_noub_if_noinbounds_meta +import Base: memoryrefget, memoryrefnew, memoryrefset! + +export Vec +export vload, vstore!, natural_vecwidth + +# TODO: See C# and Co Vec type +# TODO: Hardware portable vector types... + +struct Vec{N, T} + data::NTuple{N, VecElement{T}} +end + +# Constructors +@inline Vec(v::NTuple{N, T}) where {N, T} = Vec(VecElement.(v)) +@inline Vec(v::Vararg{T, N}) where {N, T} = Vec(v) +@inline Vec(v::Vec) = v + +# Numbers defines this and it is needed in power_by_squaring... +Base.copy(v::Vec) = v + +function Base.show(io::IO, v::Vec{N, T}) where {N, T} + io = IOContext(io, :typeinfo => eltype(v)) + print(io, "<$N x $T>[") + join(io, [sprint(show, x.value; context=io) for x in v.data], ", ") + print(io, "]") +end + +# Breaks with multi-versioning +natural_vecwidth(::Type{Float32}) = 8 +natural_vecwidth(::Type{Float64}) = 4 + +import Base: +, -, * + +# Mocked vload/vstore! relying on SLP + +@inline function vload(::Type{Vec{N, T}}, A::Array{T}, i::Int) where {N, T} + @_noub_if_noinbounds_meta + # TODO: Alignment...; may need an intrinsic for vectorized loads. + # Writting my own boundscheck loop since `inbounds` doesn't propagate through `ntuple` FFS + @boundscheck checkbounds(A, i:(i+ N - 1)) + mem = A.ref + data = ntuple(Val(N)) do j + # why does `@inbounds ref = memoryrefnew(mem, i + j - 1, @_boundscheck)` not work? + ref = memoryrefnew(mem, i + j - 1, false) + VecElement{T}(memoryrefget(ref, :not_atomic, false)) + end + return Vec(data) +end + +@inline function vstore!(A::Array{T}, v::Vec{N, T}, i::Int) where {N, T} + @_noub_if_noinbounds_meta + # TODO: Alignment...; may need an intrinsic for vectorized loads. + # Writting my own boundscheck loop since `inbounds` doesn't propagate through `ntuple` FFS + @boundscheck checkbounds(A, i:(i+ N - 1)) + mem = A.ref + data = v.data + ntuple(Val(N)) do j + # why does `@inbounds ref = memoryrefnew(mem, i + j - 1, @_boundscheck)` not work? 
+ ref = memoryrefnew(mem, i + j - 1, false) + memoryrefset!(ref, data[j].value, :not_atomic, false) + return nothing + end + return nothing +end + +end # module \ No newline at end of file diff --git a/test/choosetests.jl b/test/choosetests.jl index affdee412bd86..a2e9ad5e52c6f 100644 --- a/test/choosetests.jl +++ b/test/choosetests.jl @@ -11,7 +11,7 @@ const TESTNAMES = [ "char", "strings", "triplequote", "unicode", "intrinsics", "dict", "hashing", "iobuffer", "staged", "offsetarray", "arrayops", "tuple", "reduce", "reducedim", "abstractarray", - "intfuncs", "simdloop", "vecelement", "rational", + "intfuncs", "simdloop", "vecelement", "rational", "simd", "bitarray", "copy", "math", "fastmath", "functional", "iterators", "operators", "ordering", "path", "ccall", "parse", "loading", "gmp", "sorting", "spawn", "backtrace", "exceptions", diff --git a/test/simd.jl b/test/simd.jl new file mode 100644 index 0000000000000..d35fadec23b9d --- /dev/null +++ b/test/simd.jl @@ -0,0 +1,36 @@ +using Base.Experimental.SIMD +using Test +using InteractiveUtils + +function vcopyto!(a::Array{T}, b::Array{T}) where T + stride = natural_vecwidth(T) + VT = Vec{stride, T} + @assert length(a) == length(b) + @assert length(a) % stride == 0 + @inbounds for i in 1:stride:length(a) + vec = vload(VT, a, i) + vstore!(b, vec, i) + end +end + +@testset "load/store" begin + A = rand(64) + B = zeros(64) + + vcopyto!(A, B) + @test A == B + + @test_throws BoundsError vload(Vec{4, Float64}, A, 62) + vec = vload(Vec{4, Float64}, A, 1) + @test_throws BoundsError vstore!(A, vec, 62) + + load(A, i) = @inbounds vload(Vec{4, Float64}, A, i) + store(A,v,i) = @inbounds vstore!(A, v, i) + + ir = sprint(io->code_llvm(io, vload, (Type{Vec{4, Float64}}, Vector{Float64}, Int))) + @test contains(ir, "call void @j_throw_boundserror") + + ir = sprint(io->code_llvm(io, load, (Vector{Float64}, Int))) + @test contains(ir, "load <4 x double>") + @test !contains(ir, "call void @j_throw_boundserror") +end From 0a0f7e0074806476ac3013d51e985453a2f30cfd Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 9 Oct 2024 09:42:58 +0200 Subject: [PATCH 07/12] start on preferred_width intrinsic --- base/compiler/tfuncs.jl | 24 +++++++++++++++++++- base/simd.jl | 23 +++++++++++++++----- src/intrinsics.cpp | 47 ++++++++++++++++++++++++++++++++++++++++ src/intrinsics.h | 1 + src/julia_internal.h | 1 + src/llvm-cpufeatures.cpp | 29 +++++++++++++++++++++++++ src/runtime_intrinsics.c | 15 +++++++++++++ test/simd.jl | 16 ++++++++++++-- 8 files changed, 148 insertions(+), 8 deletions(-) diff --git a/base/compiler/tfuncs.jl b/base/compiler/tfuncs.jl index aaa1354fd5e54..5902293c3a8c9 100644 --- a/base/compiler/tfuncs.jl +++ b/base/compiler/tfuncs.jl @@ -52,7 +52,7 @@ end const INT_INF = typemax(Int) # integer infinity -const N_IFUNC = reinterpret(Int32, have_fma) + 1 +const N_IFUNC = reinterpret(Int32, preferred_vector_width) + 1 const T_IFUNC = Vector{Tuple{Int, Int, Any}}(undef, N_IFUNC) const T_IFUNC_COST = Vector{Int}(undef, N_IFUNC) const T_FFUNC_KEY = Vector{Any}() @@ -318,6 +318,28 @@ add_tfunc(Core.Intrinsics.cglobal, 1, 2, cglobal_tfunc, 5) add_tfunc(Core.Intrinsics.have_fma, 1, 1, @nospecs((๐•ƒ::AbstractLattice, x)->Bool), 1) +@nospecs function preferred_vector_width_tfunc(๐•ƒ::AbstractLattice, t) + return preferred_vector_width_tfunc(widenlattice(๐•ƒ), t) +end + +@nospecs function preferred_vector_width_tfunc(๐•ƒ::ConstsLattice, t) + # Want to return Union(Const(1), Const(2)) + # hardcode AVX512 + if sizeof(widenconst(t)) === 1 + return 
Const(32) + elseif sizeof(widenconst(t)) === 2 + return Const(16) + elseif sizeof(widenconst(t)) === 4 + return Const(8) + elseif sizeof(widenconst(t)) === 8 + return Const(4) + elseif sizeof(widenconst(t)) === 16 + return Const(4) + end + return Union{Nothing, Int} +end +add_tfunc(Core.Intrinsics.preferred_vector_width, 1, 1, preferred_vector_width_tfunc, 1) + # builtin functions # ================= diff --git a/base/simd.jl b/base/simd.jl index eef40e5f21fea..e249e04ee69ac 100644 --- a/base/simd.jl +++ b/base/simd.jl @@ -4,16 +4,33 @@ import Base: VecElement, Memory, MemoryRef import Base: @propagate_inbounds, @_propagate_inbounds_meta, @_boundscheck, @_noub_if_noinbounds_meta import Base: memoryrefget, memoryrefnew, memoryrefset! +import Core.Intrinsics: preferred_vector_width + export Vec -export vload, vstore!, natural_vecwidth +export vload, vstore!, preferred_vector, width # TODO: See C# and Co Vec type # TODO: Hardware portable vector types... +# TODO: tfunc support for preferred_vector_width does allow for "constant prop" +# but the intrinsic is not removed just yet during JIT, we should only need +# it for AOT or on a machine with scaleable vector types... + struct Vec{N, T} data::NTuple{N, VecElement{T}} end +width(::Type{<:Vec{N}}) where N = N +width(::Vec{N}) where N = N + +function preferred_vector(::Type{T}) where T + width = preferred_vector_width(T) + if width === nothing + error("$T has no preferred_vector_width") + end + return Vec{width, T} +end + # Constructors @inline Vec(v::NTuple{N, T}) where {N, T} = Vec(VecElement.(v)) @inline Vec(v::Vararg{T, N}) where {N, T} = Vec(v) @@ -29,10 +46,6 @@ function Base.show(io::IO, v::Vec{N, T}) where {N, T} print(io, "]") end -# Breaks with multi-versioning -natural_vecwidth(::Type{Float32}) = 8 -natural_vecwidth(::Type{Float64}) = 4 - import Base: +, -, * # Mocked vload/vstore! 
relying on SLP diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index 4545329f270c2..c95f115da6100 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -37,6 +37,7 @@ STATISTIC(Emitted_fptrunc, "Number of fptrunc calls emitted"); STATISTIC(Emitted_fpext, "Number of fpext calls emitted"); STATISTIC(Emitted_not_int, "Number of not_int calls emitted"); STATISTIC(Emitted_have_fma, "Number of have_fma calls emitted"); +STATISTIC(Emitted_preferred_vector_width, "Number of prefferred_vector_width calls emitted"); STATISTIC(EmittedUntypedIntrinsics, "Number of untyped intrinsics emitted"); using namespace JL_I; @@ -1419,6 +1420,52 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar return mark_julia_type(ctx, ret, false, jl_bool_type); } + case preferred_vector_width: { + ++Emitted_preferred_vector_width; + assert(nargs == 1); + const jl_cgval_t &x = argv[0]; + if (!x.constant || !jl_is_datatype(x.constant)) + return emit_runtime_call(ctx, f, argv, nargs); + jl_datatype_t *dt = (jl_datatype_t*) x.constant; + + // select the appropriated overloaded intrinsic + std::string intr_name = "julia.cpu.preferred_vector_width."; + switch (jl_datatype_size(dt)) { + case 1: { + intr_name += "b1"; + break; + case 2: { + intr_name += "b2"; + break; + } + case 4: { + intr_name += "b4"; + break; + } + case 8: { + intr_name += "b8"; + break; + } + case 16: { + intr_name += "b16"; + break; + } + default: + return emit_runtime_call(ctx, f, argv, nargs); + } + } + +#ifdef _P64 + FunctionCallee intr = jl_Module->getOrInsertFunction(intr_name, getInt64Ty(ctx.builder.getContext())); + auto ret = ctx.builder.CreateCall(intr); + return mark_julia_type(ctx, ret, false, jl_int64_type); +#else + FunctionCallee intr = jl_Module->getOrInsertFunction(intr_name, getInt32Ty(ctx.builder.getContext())); + auto ret = ctx.builder.CreateCall(intr); + return mark_julia_type(ctx, ret, false, jl_int32_type); +#endif + } + default: { assert(nargs >= 1 && "invalid nargs for intrinsic call"); const jl_cgval_t &xinfo = argv[0]; diff --git a/src/intrinsics.h b/src/intrinsics.h index 5b463e3bafe28..1290c4d769154 100644 --- a/src/intrinsics.h +++ b/src/intrinsics.h @@ -102,6 +102,7 @@ ALIAS(llvmcall, llvmcall) \ /* cpu feature tests */ \ ADD_I(have_fma, 1) \ + ADD_I(preferred_vector_width, 1) \ /* hidden intrinsics */ \ ADD_HIDDEN(cglobal_auto, 1) diff --git a/src/julia_internal.h b/src/julia_internal.h index f3959490855c8..8e018ef382593 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -1614,6 +1614,7 @@ JL_DLLEXPORT jl_value_t *jl_copysign_float(jl_value_t *a, jl_value_t *b); JL_DLLEXPORT jl_value_t *jl_flipsign_int(jl_value_t *a, jl_value_t *b); JL_DLLEXPORT jl_value_t *jl_have_fma(jl_value_t *a); +JL_DLLEXPORT jl_value_t *jl_preferred_vector_width(jl_value_t *a); JL_DLLEXPORT int jl_stored_inline(jl_value_t *el_type); JL_DLLEXPORT jl_value_t *(jl_array_data_owner)(jl_array_t *a); JL_DLLEXPORT jl_array_t *jl_array_copy(jl_array_t *ary); diff --git a/src/llvm-cpufeatures.cpp b/src/llvm-cpufeatures.cpp index 05d62adc57926..b7fb94f79668b 100644 --- a/src/llvm-cpufeatures.cpp +++ b/src/llvm-cpufeatures.cpp @@ -86,6 +86,26 @@ void lowerHaveFMA(Function &intr, Function &caller, const Triple &TT, CallInst * return; } +void lowerPreferredVectorWidth(Function &intr, Function &caller, const Triple &TT, CallInst *I) JL_NOTSAFEPOINT { + auto intr_name = intr.getName(); + auto typ = intr_name.substr(strlen("julia.cpu.preferred_vector_width.")); + + size_t width = 0; + if (typ == "b1") + width = 32; 
+ else if (typ == "b2") + width = 16; + else if (typ == "b4") + width = 8; + else if (typ == "b8") + width = 4; + else if (typ == "b16") + width = 2; + + I->replaceAllUsesWith(ConstantInt::get(I->getType(), width)); + return; +} + bool lowerCPUFeatures(Module &M) JL_NOTSAFEPOINT { auto TT = Triple(M.getTargetTriple()); @@ -102,6 +122,15 @@ bool lowerCPUFeatures(Module &M) JL_NOTSAFEPOINT Materialized.push_back(I); } } + + if (FN.starts_with("julia.cpu.preferred_vector_width.")) { + for (Use &U: F.uses()) { + User *RU = U.getUser(); + CallInst *I = cast(RU); + lowerPreferredVectorWidth(F, *I->getParent()->getParent(), TT, I); + Materialized.push_back(I); + } + } } if (!Materialized.empty()) { diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index 9769e0142bbf9..cb86a02bf7548 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -1733,6 +1733,21 @@ JL_DLLEXPORT jl_value_t *jl_have_fma(jl_value_t *typ) return jl_false; } +JL_DLLEXPORT jl_value_t *jl_preferred_vector_width(jl_value_t *typ) +{ + JL_TYPECHK(preferred_vector_width, datatype, typ); // TODO what about float16/bfloat16? + jl_datatype_t* dt = (jl_datatype_t*)typ; + int sz = jl_datatype_size(dt); + int width = 32 / sz; + if (width == 0) + return jl_nothing; +#ifdef _P64 + return jl_box_int64(width); +#else + return jl_box_int32(width); +#endif +} + JL_DLLEXPORT jl_value_t *jl_add_ptr(jl_value_t *ptr, jl_value_t *offset) { JL_TYPECHK(add_ptr, pointer, ptr); diff --git a/test/simd.jl b/test/simd.jl index d35fadec23b9d..dbda2e4d51193 100644 --- a/test/simd.jl +++ b/test/simd.jl @@ -3,8 +3,8 @@ using Test using InteractiveUtils function vcopyto!(a::Array{T}, b::Array{T}) where T - stride = natural_vecwidth(T) - VT = Vec{stride, T} + VT = preferred_vector(T) + stride = width(VT) @assert length(a) == length(b) @assert length(a) % stride == 0 @inbounds for i in 1:stride:length(a) @@ -13,6 +13,18 @@ function vcopyto!(a::Array{T}, b::Array{T}) where T end end +# todo: noninline/mutable types? +primitive type I256 256 end +primitive type I512 512 end + +@testset "preferred_vector_width" begin + for T in (Int8, Int16, Int32, Int64, Int128, I256) + max_width = 32 # avx2 + @test width(preferred_vector(T)) == max_width รท sizeof(T) + end + @test_throws ErrorException preferred_vector(I526) +end + @testset "load/store" begin A = rand(64) B = zeros(64) From 0e95cfd18e0830b95b2484b0b3e9c196b28fb3fd Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 4 Nov 2024 16:23:13 +0100 Subject: [PATCH 08/12] fixup! 
start on preferred_width intrinsic --- base/compiler/tfuncs.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/compiler/tfuncs.jl b/base/compiler/tfuncs.jl index 5902293c3a8c9..a97f2742d874f 100644 --- a/base/compiler/tfuncs.jl +++ b/base/compiler/tfuncs.jl @@ -324,7 +324,7 @@ end @nospecs function preferred_vector_width_tfunc(๐•ƒ::ConstsLattice, t) # Want to return Union(Const(1), Const(2)) - # hardcode AVX512 + # hardcode AVX256 if sizeof(widenconst(t)) === 1 return Const(32) elseif sizeof(widenconst(t)) === 2 @@ -334,7 +334,7 @@ end elseif sizeof(widenconst(t)) === 8 return Const(4) elseif sizeof(widenconst(t)) === 16 - return Const(4) + return Const(2) end return Union{Nothing, Int} end From c8d0ba3952bf8ba8c2e371a3b307077488484dbf Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 4 Nov 2024 17:00:32 +0100 Subject: [PATCH 09/12] add some basic arithmetic support --- base/simd.jl | 25 +++++++++++++++++++++++-- test/simd.jl | 21 +++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/base/simd.jl b/base/simd.jl index e249e04ee69ac..669e1e684f403 100644 --- a/base/simd.jl +++ b/base/simd.jl @@ -1,6 +1,6 @@ module SIMD -import Base: VecElement, Memory, MemoryRef +import Base: VecElement, Memory, MemoryRef, IEEEFloat import Base: @propagate_inbounds, @_propagate_inbounds_meta, @_boundscheck, @_noub_if_noinbounds_meta import Base: memoryrefget, memoryrefnew, memoryrefset! @@ -46,7 +46,6 @@ function Base.show(io::IO, v::Vec{N, T}) where {N, T} print(io, "]") end -import Base: +, -, * # Mocked vload/vstore! relying on SLP @@ -80,4 +79,26 @@ end return nothing end +import Base: +, -, *, /, muladd, promote_rule, widen +import Core.Intrinsics: add_float, sub_float, mul_float, div_float, muladd_float, neg_float + +## floating point promotions ## +promote_rule(::Type{Vec{N, Float32}}, ::Type{Vec{N, Float16}}) where N = Vec{N, Float32} +promote_rule(::Type{Vec{N, Float64}}, ::Type{Vec{N, Float16}}) where N = Vec{N, Float64} +promote_rule(::Type{Vec{N, Float64}}, ::Type{Vec{N, Float32}}) where N = Vec{N, Float64} + +widen(::Type{Vec{N, Float16}}) where N = Vec{N, Float16} +widen(::Type{Vec{N, Float32}}) where N = Vec{N, Float32} + +## floating point arithmetic ## +-(x::Vec{N,T}) where {N,T<:IEEEFloat} = neg_float(x.data) + ++(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = add_float(x.data, y.data) +-(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = sub_float(x.data, y.data) +*(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = mul_float(x.data, y.data) +/(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = div_float(x.data, y.data) + +muladd(x::Vec{N,T}, y::Vec{N,T}, z::Vec{N,T}) where {N, T<:IEEEFloat} = + muladd_float(x.data, y.data, z.data) + end # module \ No newline at end of file diff --git a/test/simd.jl b/test/simd.jl index dbda2e4d51193..7d3c191f797e8 100644 --- a/test/simd.jl +++ b/test/simd.jl @@ -46,3 +46,24 @@ end @test contains(ir, "load <4 x double>") @test !contains(ir, "call void @j_throw_boundserror") end + +@testset "basic arithmetic" begin + ir = sprint(io->code_llvm(io, +, (Vec{4, Float64}, Vec{4, Float64}))) + @test contains(ir, "fadd <4 x double>") + ir = sprint(io->code_llvm(io, -, (Vec{4, Float64}, Vec{4, Float64}))) + @test contains(ir, "fsub <4 x double>") + ir = sprint(io->code_llvm(io, *, (Vec{4, Float64}, Vec{4, Float64}))) + @test contains(ir, "fmul <4 x double>") + ir = sprint(io->code_llvm(io, /, (Vec{4, Float64}, Vec{4, Float64}))) + @test contains(ir, "fdiv <4 x double>") + + ir = 
sprint(io->code_llvm(io, muladd, (Vec{4, Float64}, Vec{4, Float64}, Vec{4, Float64}))) + @test contains(ir, "fmul contract <4 x double>") + @test contains(ir, "fadd contract <4 x double>") + + ir = sprint(io->code_llvm(io, -, (Vec{4, Float64},))) + @test contains(ir, "fneg <4 x double>") + + # TODO: Way to test Intrinsics directly? + #`-v` -> ERROR: neg_float_withtype: value is not a primitive type +end From 799e7e27006f62cbda07851003e1c90f3b11b605 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 4 Nov 2024 17:03:06 +0100 Subject: [PATCH 10/12] add Mask dt --- base/simd.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/base/simd.jl b/base/simd.jl index 669e1e684f403..7701fe734d5b5 100644 --- a/base/simd.jl +++ b/base/simd.jl @@ -46,6 +46,14 @@ function Base.show(io::IO, v::Vec{N, T}) where {N, T} print(io, "]") end +# TODO: llvm.vp expects a mask of i1 +struct Mask{N} + data::NTuple{N, VecElement{Bool}} +end + +function mask_all(::Val{N}, val::Bool) where N + Mask(ntuple(_->VecElement(val),Val(N))) +end # Mocked vload/vstore! relying on SLP From 71dd9abfa59f6d9e0292638848c956daf605f253 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 4 Nov 2024 17:13:48 +0100 Subject: [PATCH 11/12] implement select --- base/simd.jl | 18 +++++++++++++----- test/simd.jl | 6 ++++++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/base/simd.jl b/base/simd.jl index 7701fe734d5b5..ec8dfe52dc935 100644 --- a/base/simd.jl +++ b/base/simd.jl @@ -7,7 +7,7 @@ import Base: memoryrefget, memoryrefnew, memoryrefset! import Core.Intrinsics: preferred_vector_width export Vec -export vload, vstore!, preferred_vector, width +export vload, vstore!, preferred_vector, width, select # TODO: See C# and Co Vec type # TODO: Hardware portable vector types... @@ -47,12 +47,20 @@ function Base.show(io::IO, v::Vec{N, T}) where {N, T} end # TODO: llvm.vp expects a mask of i1 -struct Mask{N} - data::NTuple{N, VecElement{Bool}} -end +const Mask{N} = Vec{N, Bool} function mask_all(::Val{N}, val::Bool) where N - Mask(ntuple(_->VecElement(val),Val(N))) + Vec(ntuple(_->VecElement(val),Val(N))) +end + +# select(m::Mask{N}, a::Vec{N, T}, b::Vec{N,T}) where {N,T} = Core.ifelse(m.data, a.data, b.data) +# ERROR: TypeError: non-boolean (NTuple{4, VecElement{Bool}}) used in boolean context +# Mocked select, relying on SLP +function select(m::Mask{N}, a::Vec{N, T}, b::Vec{N,T}) where {N,T} + data = ntuple(Val(N)) do j + VecElement(Core.ifelse(m.data[j].value, a.data[j].value, b.data[j].value)) + end + return Vec(data) end # Mocked vload/vstore! relying on SLP diff --git a/test/simd.jl b/test/simd.jl index 7d3c191f797e8..34a5cad27b7e9 100644 --- a/test/simd.jl +++ b/test/simd.jl @@ -67,3 +67,9 @@ end # TODO: Way to test Intrinsics directly? 
#`-v` -> ERROR: neg_float_withtype: value is not a primitive type end + +@testset "select" begin + ir = sprint(io->code_llvm(io, select, (Vec{4, Bool}, Vec{4, Float64}, Vec{4, Float64}))) + @test contains(ir, "icmp eq <4 x i8>") + @test contains(ir, "select <4 x i1>") +end From 52aac7d48821b16a36e99f07db8807b747c3fbfe Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 5 Nov 2024 16:37:07 +0100 Subject: [PATCH 12/12] WIP: Implement SIMD functionality for XorishoSIMD --- base/simd.jl | 41 +++++-- src/runtime_intrinsics.c | 27 ++++- stdlib/Random/src/XoshiroSimd.jl | 187 +++++++++++-------------------- test/simd.jl | 20 +++- 4 files changed, 145 insertions(+), 130 deletions(-) diff --git a/base/simd.jl b/base/simd.jl index ec8dfe52dc935..f17076afd5fa0 100644 --- a/base/simd.jl +++ b/base/simd.jl @@ -49,7 +49,7 @@ end # TODO: llvm.vp expects a mask of i1 const Mask{N} = Vec{N, Bool} -function mask_all(::Val{N}, val::Bool) where N +function Vec{N}(val) where N Vec(ntuple(_->VecElement(val),Val(N))) end @@ -107,14 +107,41 @@ widen(::Type{Vec{N, Float16}}) where N = Vec{N, Float16} widen(::Type{Vec{N, Float32}}) where N = Vec{N, Float32} ## floating point arithmetic ## --(x::Vec{N,T}) where {N,T<:IEEEFloat} = neg_float(x.data) +-(x::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(neg_float(x.data)) -+(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = add_float(x.data, y.data) --(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = sub_float(x.data, y.data) -*(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = mul_float(x.data, y.data) -/(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = div_float(x.data, y.data) ++(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(add_float(x.data, y.data)) +-(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(sub_float(x.data, y.data)) +*(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(mul_float(x.data, y.data)) +/(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(div_float(x.data, y.data)) muladd(x::Vec{N,T}, y::Vec{N,T}, z::Vec{N,T}) where {N, T<:IEEEFloat} = - muladd_float(x.data, y.data, z.data) + Vec(muladd_float(x.data, y.data, z.data)) + +## integer arithmetic ## +import Base: รท, BitInteger, BitSigned, BitUnsigned +import Core.Intrinsics: add_int, sub_int, mul_int, sdiv_int, udiv_int, neg_int + ++(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(add_int(x.data, y.data)) +-(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(sub_int(x.data, y.data)) +*(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(mul_int(x.data, y.data)) +# TODO ought we implement div by zero? 
+รท(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitSigned} = Vec(sdiv_int(x.data, y.data)) +รท(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitUnsigned} = Vec(udiv_int(x.data, y.data)) + +## logical ops +import Base: xor, |, & +import Core.Intrinsics: xor_int, and_int, or_int +xor(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(xor_int(x.data, y.data)) +(|)(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(and_int(x.data, y.data)) +(&)(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(or_int(x.data, y.data)) + +## integer shifts +# unsigned shift counts always shift in the same direction +import Base: >>, <<, >>> +import Core.Intrinsics: ashr_int, lshr_int, shl_int, lshr_int +>>(x::Vec{N, <:BitSigned}, y::Vec{N, <:BitUnsigned}) where N = ashr_int(x, y) +>>(x::Vec{N, <:BitUnsigned}, y::Vec{N, <:BitUnsigned}) where N = lshr_int(x, y) +<<(x::Vec{N, <:BitInteger}, y::Vec{N, <:BitUnsigned}) where N = shl_int(x, y) +>>>(x::Vec{N, <:BitInteger}, y::Vec{N, <:BitUnsigned}) where N = lshr_int(x, y) end # module \ No newline at end of file diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index cb86a02bf7548..f50976cc4b10e 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -1195,13 +1195,36 @@ jl_value_t *jl_iintrinsic_2(jl_value_t *a, jl_value_t *b, const char *name, { jl_value_t *ty = jl_typeof(a); jl_value_t *tyb = jl_typeof(b); + jl_value_t *et = NULL; + jl_value_t *np = NULL; + jl_value_t *etb = NULL; + jl_value_t *npb = NULL; if (tyb != ty) { if (!cvtb) jl_errorf("%s: types of a and b must match", name); - if (!jl_is_primitivetype(tyb)) + if (jl_is_primitivetype(tyb)) {} + else if (is_ntuple_type(tyb) && jl_nparams(tyb) > 0) + { + etb = jl_tparam0(tyb); + npb = jl_nparams(tyb); + if (((jl_datatype_t*)etb)->name == jl_vecelement_typename && jl_is_primitivetype(jl_tparam(etb, 0))){} + else + jl_errorf("%s: eltype is not a VecElement of a primitive type", name); + } + else jl_errorf("%s: b is not a primitive type", name); } - if (!jl_is_primitivetype(ty)) + if (jl_is_primitivetype(ty)) {} + else if (is_ntuple_type(ty) && jl_nparams(ty) > 0) + { + et = jl_tparam0(ty); + np = jl_nparams(ty); \ + if (((jl_datatype_t*)et)->name == jl_vecelement_typename && jl_is_primitivetype(jl_tparam(et, 0))){} + else + jl_errorf("%s: eltype is not a VecElement of a primitive type", name); + // TODO cvtb + } + else jl_errorf("%s: a is not a primitive type", name); void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b); unsigned sz = jl_datatype_size(ty); diff --git a/stdlib/Random/src/XoshiroSimd.jl b/stdlib/Random/src/XoshiroSimd.jl index 58544714dd9f5..0b2a85df456d3 100644 --- a/stdlib/Random/src/XoshiroSimd.jl +++ b/stdlib/Random/src/XoshiroSimd.jl @@ -8,27 +8,21 @@ using Base: BitInteger_types using Base.Libc: memcpy using Core.Intrinsics: llvmcall +using Base.Experimental.SIMD + # Vector-width. Influences random stream. xoshiroWidth() = Val(8) # Simd threshold. Influences random stream. 
simdThreshold(::Type{T}) where T = 64 simdThreshold(::Type{Bool}) = 640 -@inline _rotl45(x::UInt64) = (x<<45)|(x>>19) -@inline _shl17(x::UInt64) = x<<17 -@inline _rotl23(x::UInt64) = (x<<23)|(x>>41) -@inline _plus(x::UInt64,y::UInt64) = x+y -@inline _xor(x::UInt64,y::UInt64) = xor(x,y) -@inline _and(x::UInt64, y::UInt64) = x & y -@inline _or(x::UInt64, y::UInt64) = x | y -@inline _lshr(x, y::Int32) = _lshr(x, y % Int64) -@inline _lshr(x::UInt64, y::Int64) = llvmcall(""" - %res = lshr i64 %0, %1 - ret i64 %res - """, - UInt64, - Tuple{UInt64, Int64}, - x, y) +@inline rotl45(x::UInt64) = (x<<45)|(x>>19) +@inline shl17(x::UInt64) = x<<17 +@inline rotl23(x::UInt64) = (x<<23)|(x>>41) + +@inline rotl45(x::Vec{N, UInt64}) where N = (x << Vec{N}(45%UInt)) | (x >> Vec{N}(19%UInt)) +@inline shl17(x::Vec{N, UInt64}) where N = x<>Vec{N}(41%UInt)) # `_bits2float(x::UInt64, T)` takes `x::UInt64` as input, it splits it in `N` parts where # `N = sizeof(UInt64) / sizeof(T)` (`N = 1` for `Float64`, `N = 2` for `Float32, etc...), it @@ -67,51 +61,6 @@ for N in [4,8,16] let code, s, fshl = "llvm.fshl.v$(N)i64", VT = :(NTuple{$N, VecElement{UInt64}}) - s = ntuple(_->VecElement(UInt64(45)), N) - @eval @inline _rotl45(x::$VT) = ccall($fshl, llvmcall, $VT, ($VT, $VT, $VT), x, x, $s) - - s = ntuple(_->VecElement(UInt64(23)), N) - @eval @inline _rotl23(x::$VT) = ccall($fshl, llvmcall, $VT, ($VT, $VT, $VT), x, x, $s) - - code = """ - %lshiftOp = shufflevector <1 x i64> , <1 x i64> undef, <$N x i32> zeroinitializer - %res = shl <$N x i64> %0, %lshiftOp - ret <$N x i64> %res - """ - @eval @inline _shl17(x::$VT) = llvmcall($code, $VT, Tuple{$VT}, x) - - code = """ - %res = add <$N x i64> %1, %0 - ret <$N x i64> %res - """ - @eval @inline _plus(x::$VT, y::$VT) = llvmcall($code, $VT, Tuple{$VT, $VT}, x, y) - - code = """ - %res = xor <$N x i64> %1, %0 - ret <$N x i64> %res - """ - @eval @inline _xor(x::$VT, y::$VT) = llvmcall($code, $VT, Tuple{$VT, $VT}, x, y) - - code = """ - %res = and <$N x i64> %1, %0 - ret <$N x i64> %res - """ - @eval @inline _and(x::$VT, y::$VT) = llvmcall($code, $VT, Tuple{$VT, $VT}, x, y) - - code = """ - %res = or <$N x i64> %1, %0 - ret <$N x i64> %res - """ - @eval @inline _or(x::$VT, y::$VT) = llvmcall($code, $VT, Tuple{$VT, $VT}, x, y) - - code = """ - %tmp = insertelement <1 x i64> undef, i64 %1, i32 0 - %shift = shufflevector <1 x i64> %tmp, <1 x i64> %tmp, <$N x i32> zeroinitializer - %res = lshr <$N x i64> %0, %shift - ret <$N x i64> %res - """ - @eval @inline _lshr(x::$VT, y::Int64) = llvmcall($code, $VT, Tuple{$VT, Int64}, x, y) - code = """ %shiftamt = shufflevector <1 x i64> , <1 x i64> undef, <$N x i32> zeroinitializer %sh = lshr <$N x i64> %0, %shiftamt @@ -156,10 +105,10 @@ function forkRand(rng::Union{TaskLocalRNG, Xoshiro}, ::Val{N}) where N # 0x5a94851fb48a6e05 == hash(UInt(2))|0x01 # 0x3688cf5d48899fa7 == hash(UInt(3))|0x01 # 0x867b4bb4c42e5661 == hash(UInt(4))|0x01 - s0 = ntuple(i->VecElement(0x02011ce34bce797f * rand(rng, UInt64)), Val(N)) - s1 = ntuple(i->VecElement(0x5a94851fb48a6e05 * rand(rng, UInt64)), Val(N)) - s2 = ntuple(i->VecElement(0x3688cf5d48899fa7 * rand(rng, UInt64)), Val(N)) - s3 = ntuple(i->VecElement(0x867b4bb4c42e5661 * rand(rng, UInt64)), Val(N)) + s0 = Vec(ntuple(i->VecElement(0x02011ce34bce797f * rand(rng, UInt64)), Val(N))) + s1 = Vec(ntuple(i->VecElement(0x5a94851fb48a6e05 * rand(rng, UInt64)), Val(N))) + s2 = Vec(ntuple(i->VecElement(0x3688cf5d48899fa7 * rand(rng, UInt64)), Val(N))) + s3 = Vec(ntuple(i->VecElement(0x867b4bb4c42e5661 * rand(rng, 
UInt64)), Val(N))) (s0, s1, s2, s3) end @@ -182,26 +131,26 @@ end s0, s1, s2, s3 = getstate(rng) i = 0 while i+8 <= len - res = _plus(_rotl23(_plus(s0,s3)),s0) + res = rotl23(s0 + s3) + s0 unsafe_store!(reinterpret(Ptr{UInt64}, dst + i), f(res, T)) - t = _shl17(s1) - s2 = _xor(s2, s0) - s3 = _xor(s3, s1) - s1 = _xor(s1, s2) - s0 = _xor(s0, s3) - s2 = _xor(s2, t) - s3 = _rotl45(s3) + t = shl17(s1) + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = rotl45(s3) i += 8 end if i < len - res = _plus(_rotl23(_plus(s0,s3)),s0) - t = _shl17(s1) - s2 = _xor(s2, s0) - s3 = _xor(s3, s1) - s1 = _xor(s1, s2) - s0 = _xor(s0, s3) - s2 = _xor(s2, t) - s3 = _rotl45(s3) + res = rotl23(s0 + s3) + s0 + t = shl17(s1) + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = rotl45(s3) ref = Ref(f(res, T)) # TODO: This may make the random-stream dependent on system endianness GC.@preserve ref memcpy(dst+i, Base.unsafe_convert(Ptr{Cvoid}, ref), len-i) @@ -214,36 +163,36 @@ end s0, s1, s2, s3 = getstate(rng) i = 0 while i+8 <= len - res = _plus(_rotl23(_plus(s0,s3)),s0) - shift = 0 + res = rotl23(s0 + s3) + s0 + shift = UInt(0) while i+8 <= len && shift < 8 - resLoc = _and(_lshr(res, shift), 0x0101010101010101) + resLoc = (res >> shift) & 0x0101010101010101 unsafe_store!(reinterpret(Ptr{UInt64}, dst + i), resLoc) i += 8 - shift += 1 + shift += UInt(1) end - t = _shl17(s1) - s2 = _xor(s2, s0) - s3 = _xor(s3, s1) - s1 = _xor(s1, s2) - s0 = _xor(s0, s3) - s2 = _xor(s2, t) - s3 = _rotl45(s3) + t = shl17(s1) + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = rotl45(s3) end if i < len # we may overgenerate some bytes here, if len mod 64 <= 56 and len mod 8 != 0 - res = _plus(_rotl23(_plus(s0,s3)),s0) - resLoc = _and(res, 0x0101010101010101) + res = rotl23(s0 + s3) + s0 + resLoc = res & 0x0101010101010101 ref = Ref(resLoc) GC.@preserve ref memcpy(dst+i, Base.unsafe_convert(Ptr{Cvoid}, ref), len-i) - t = _shl17(s1) - s2 = _xor(s2, s0) - s3 = _xor(s3, s1) - s1 = _xor(s1, s2) - s0 = _xor(s0, s3) - s2 = _xor(s2, t) - s3 = _rotl45(s3) + t = shl17(s1) + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = rotl45(s3) end setstate!(rng, (s0, s1, s2, s3, nothing)) nothing @@ -255,14 +204,14 @@ end i = 0 while i + 8*N <= len - res = _plus(_rotl23(_plus(s0,s3)),s0) - t = _shl17(s1) - s2 = _xor(s2, s0) - s3 = _xor(s3, s1) - s1 = _xor(s1, s2) - s0 = _xor(s0, s3) - s2 = _xor(s2, t) - s3 = _rotl45(s3) + res = rotl23(s0 + s3) + s0 + t = shl17(s1) + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = rotl45(s3) unsafe_store!(reinterpret(Ptr{NTuple{N,VecElement{UInt64}}}, dst + i), f(res, T)) i += 8*N end @@ -271,20 +220,18 @@ end @noinline function xoshiro_bulk_simd(rng::Union{TaskLocalRNG, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool}, ::Val{N}, f) where {N} s0, s1, s2, s3 = forkRand(rng, Val(N)) - msk = ntuple(i->VecElement(0x0101010101010101), Val(N)) i = 0 while i + 64*N <= len - res = _plus(_rotl23(_plus(s0,s3)),s0) - t = _shl17(s1) - s2 = _xor(s2, s0) - s3 = _xor(s3, s1) - s1 = _xor(s1, s2) - s0 = _xor(s0, s3) - s2 = _xor(s2, t) - s3 = _rotl45(s3) - for k=0:7 - tmp = _lshr(res, k) - toWrite = _and(tmp, msk) + res = rotl23(s0 + s3) +s0 + t = shl17(s1) + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = rotl45(s3) + for k=UInt(0):UInt(7) + toWrite = 
(res >> Vec{N}(k)) & Vec{N}(0x0101010101010101) unsafe_store!(reinterpret(Ptr{NTuple{N,VecElement{UInt64}}}, dst + i + k*N*8), toWrite) end i += 64*N diff --git a/test/simd.jl b/test/simd.jl index 34a5cad27b7e9..fd5b35e460630 100644 --- a/test/simd.jl +++ b/test/simd.jl @@ -47,20 +47,32 @@ end @test !contains(ir, "call void @j_throw_boundserror") end -@testset "basic arithmetic" begin +@testset "basic floating-point arithmetic" begin + A = rand(64) + v = vload(Vec{4, Float64}, A, 1) + + @test v+v isa Vec{4, Float64} ir = sprint(io->code_llvm(io, +, (Vec{4, Float64}, Vec{4, Float64}))) @test contains(ir, "fadd <4 x double>") + + @test v-v isa Vec{4, Float64} ir = sprint(io->code_llvm(io, -, (Vec{4, Float64}, Vec{4, Float64}))) @test contains(ir, "fsub <4 x double>") + + @test v*v isa Vec{4, Float64} ir = sprint(io->code_llvm(io, *, (Vec{4, Float64}, Vec{4, Float64}))) @test contains(ir, "fmul <4 x double>") + + @test v/v isa Vec{4, Float64} ir = sprint(io->code_llvm(io, /, (Vec{4, Float64}, Vec{4, Float64}))) @test contains(ir, "fdiv <4 x double>") + @test muladd(v, v, v) isa Vec{4, Float64} ir = sprint(io->code_llvm(io, muladd, (Vec{4, Float64}, Vec{4, Float64}, Vec{4, Float64}))) @test contains(ir, "fmul contract <4 x double>") @test contains(ir, "fadd contract <4 x double>") + @test -v isa Vec{4, Float64} ir = sprint(io->code_llvm(io, -, (Vec{4, Float64},))) @test contains(ir, "fneg <4 x double>") @@ -73,3 +85,9 @@ end @test contains(ir, "icmp eq <4 x i8>") @test contains(ir, "select <4 x i1>") end + +@test "basic integer arithmetic" begin +end + +@test "basic logical operations" begin +end
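
Usage note (not part of the patches): the pieces introduced across this series — the Vec{N,T} wrapper, preferred_vector/width, vload/vstore!, and the element-wise arithmetic from patch 09 — compose into ordinary SIMD kernels. Below is a minimal sketch of an axpy-style loop written only against that API. The kernel name vaxpy!, the scalar tail loop, and broadcasting `a` through ntuple are illustrative choices, not part of the series, and the Vec{4, Float64} width mentioned in the comment assumes the AVX2 defaults hard-coded in the preferred_vector_width tfunc above.

    using Base.Experimental.SIMD

    # Computes y .= a .* x .+ y one hardware-sized vector at a time.
    function vaxpy!(y::Vector{Float64}, a::Float64, x::Vector{Float64})
        @assert length(x) == length(y)
        VT = preferred_vector(Float64)      # e.g. Vec{4, Float64} with the AVX2 defaults
        N  = width(VT)
        av = Vec(ntuple(_ -> a, N))         # broadcast the scalar into a Vec
        i  = 1
        @inbounds while i + N - 1 <= length(x)
            xv = vload(VT, x, i)
            yv = vload(VT, y, i)
            vstore!(y, muladd(av, xv, yv), i)
            i += N
        end
        @inbounds while i <= length(x)      # scalar tail for lengths not divisible by N
            y[i] = muladd(a, x[i], y[i])
            i += 1
        end
        return y
    end

With the bounds checks elided by @inbounds, the same SLP-based lowering exercised in test/simd.jl should turn the loop body into <4 x double> loads, a contract fmul/fadd pair, and a vector store.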