From aacdb85ed0771ccbf6c14386db79c711023d8b73 Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Sat, 13 Jul 2024 07:14:34 -0400 Subject: [PATCH 01/12] runtime works --- src/array.c | 2 - src/runtime_intrinsics.c | 133 ++++++++++++++++++++++++++------------- 2 files changed, 89 insertions(+), 46 deletions(-) diff --git a/src/array.c b/src/array.c index f0051ec17565a..53c939a5ccfc2 100644 --- a/src/array.c +++ b/src/array.c @@ -39,7 +39,6 @@ JL_DLLEXPORT int jl_array_validate_dims(size_t *nel, uint32_t ndims, size_t *dim return 0; } -#ifndef JL_NDEBUG static inline int is_ntuple_long(jl_value_t *v) { if (!jl_is_tuple(v)) @@ -53,7 +52,6 @@ static inline int is_ntuple_long(jl_value_t *v) } return 1; } -#endif #define jl_array_elsize(a) (((jl_datatype_t*)jl_typetagof((a)->ref.mem))->layout->size) diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index 450096eef5b01..ff7de09db5b46 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -1271,54 +1271,91 @@ static inline jl_value_t *jl_intrinsiclambda_checkeddiv(jl_value_t *ty, void *pa // floating point -#define bi_fintrinsic(OP, name) \ - bi_intrinsic_bfloat(OP, name) \ - bi_intrinsic_half(OP, name) \ - bi_intrinsic_ctype(OP, name, 32, float) \ - bi_intrinsic_ctype(OP, name, 64, double) \ -JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \ +static int is_ntuple_type(jl_value_t *tt) +{ + if (!jl_is_tuple_type(tt)) + { + return 0; + } + size_t i, nfields = jl_nparams(tt); + if(!nfields) + return 1; + jl_value_t *t1 = jl_tparam0(tt); + for (i = 1; i < nfields; i++) { + if (jl_tparam(tt, i) != t1) { + return 0; + } + } + return 1; +} + +#define bi_fintrinsic(OP, op_name) \ + bi_intrinsic_bfloat(OP, op_name) \ + bi_intrinsic_half(OP, op_name) \ + bi_intrinsic_ctype(OP, op_name, 32, float) \ + bi_intrinsic_ctype(OP, op_name, 64, double) \ +JL_DLLEXPORT jl_value_t *jl_##op_name(jl_value_t *a, jl_value_t *b) \ { \ jl_task_t *ct = jl_current_task; \ jl_value_t *ty = jl_typeof(a); \ + jl_value_t *et = ty; \ + int np=1; \ if (jl_typeof(b) != ty) \ - jl_error(#name ": types of a and b must match"); \ - if (!jl_is_primitivetype(ty)) \ - jl_error(#name ": values are not primitive types"); \ - int sz = jl_datatype_size(ty); \ - jl_value_t *newv = jl_gc_alloc(ct->ptls, sz, ty); \ + jl_error(#op_name ": types of a and b must match"); \ + if (jl_is_primitivetype(ty)){}\ + else if (is_ntuple_type(ty) && jl_nparams(ty) > 0) \ + { \ + et = jl_tparam0(ty); \ + np = jl_nparams(ty); \ + if (((jl_datatype_t*)et)->name == jl_vecelement_typename && jl_is_primitivetype(jl_tparam(et, 0))){} \ + else \ + jl_error(#op_name ": eltype is not a VecElement of a primitive type"); \ + }\ + else \ + jl_error(#op_name ": values are not primitive types"); \ + int sz = jl_datatype_size(et); \ + jl_value_t *newv = jl_gc_alloc(ct->ptls, sz*np, ty); \ void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b), *pr = jl_data_ptr(newv); \ switch (sz) { \ /* choose the right size c-type operation */ \ case 2: \ if ((jl_datatype_t*)ty == jl_float16_type) \ - jl_##name##16(16, pa, pb, pr); \ + jl_##op_name##16(16*np, pa, pb, pr); \ else /*if ((jl_datatype_t*)ty == jl_bfloat16_type)*/ \ - jl_##name##bf16(16, pa, pb, pr); \ + jl_##op_name##bf16(16*np, pa, pb, pr); \ break; \ case 4: \ - jl_##name##32(32, pa, pb, pr); \ + jl_##op_name##32(32*np, pa, pb, pr); \ break; \ case 8: \ - jl_##name##64(64, pa, pb, pr); \ + jl_##op_name##64(64*np, pa, pb, pr); \ break; \ default: \ - jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes 
other than 16, 32 and 64"); \ + jl_error(#op_name ": runtime floating point intrinsics are not implemented for bit sizes other than 16, 32 and 64"); \ } \ return newv; \ } -#define bool_fintrinsic(OP, name) \ - bool_intrinsic_bfloat(OP, name) \ - bool_intrinsic_half(OP, name) \ - bool_intrinsic_ctype(OP, name, 32, float) \ - bool_intrinsic_ctype(OP, name, 64, double) \ -JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \ +#define bool_fintrinsic(OP, op_name) \ + bool_intrinsic_bfloat(OP, op_name) \ + bool_intrinsic_half(OP, op_name) \ + bool_intrinsic_ctype(OP, op_name, 32, float) \ + bool_intrinsic_ctype(OP, op_name, 64, double) \ +JL_DLLEXPORT jl_value_t *jl_##op_name(jl_value_t *a, jl_value_t *b) \ { \ jl_value_t *ty = jl_typeof(a); \ if (jl_typeof(b) != ty) \ - jl_error(#name ": types of a and b must match"); \ - if (!jl_is_primitivetype(ty)) \ - jl_error(#name ": values are not primitive types"); \ + jl_error(#op_name ": types of a and b must match"); \ + if (jl_is_primitivetype(ty)) {}\ + else if (is_ntuple_type(ty) && jl_nparams(ty) > 0) \ + { \ + jl_value_t *et = jl_tparam(ty, 0); \ + if (((jl_datatype_t*)et)->name == jl_vecelement_typename && jl_is_primitivetype(jl_tparam(et, 0))){} \ + else \ + jl_error(#op_name ": eltype is not a VecElement of a primitive type"); \ + }\ + else \ + jl_error(#op_name ": values are not primitive types"); \ void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b); \ int sz = jl_datatype_size(ty); \ int cmp; \ @@ -1326,35 +1363,43 @@ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \ /* choose the right size c-type operation */ \ case 2: \ if ((jl_datatype_t*)ty == jl_float16_type) \ - cmp = jl_##name##16(16, pa, pb); \ + cmp = jl_##op_name##16(16, pa, pb); \ else /*if ((jl_datatype_t*)ty == jl_bfloat16_type)*/ \ - cmp = jl_##name##bf16(16, pa, pb); \ + cmp = jl_##op_name##bf16(16, pa, pb); \ break; \ case 4: \ - cmp = jl_##name##32(32, pa, pb); \ + cmp = jl_##op_name##32(32, pa, pb); \ break; \ case 8: \ - cmp = jl_##name##64(64, pa, pb); \ + cmp = jl_##op_name##64(64, pa, pb); \ break; \ default: \ - jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); \ + jl_error(#op_name ": runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); \ } \ return cmp ? 
jl_true : jl_false; \ } -#define ter_fintrinsic(OP, name) \ - ter_intrinsic_bfloat(OP, name) \ - ter_intrinsic_half(OP, name) \ - ter_intrinsic_ctype(OP, name, 32, float) \ - ter_intrinsic_ctype(OP, name, 64, double) \ -JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b, jl_value_t *c) \ +#define ter_fintrinsic(OP, op_name) \ + ter_intrinsic_bfloat(OP, op_name) \ + ter_intrinsic_half(OP, op_name) \ + ter_intrinsic_ctype(OP, op_name, 32, float) \ + ter_intrinsic_ctype(OP, op_name, 64, double) \ +JL_DLLEXPORT jl_value_t *jl_##op_name(jl_value_t *a, jl_value_t *b, jl_value_t *c) \ { \ jl_task_t *ct = jl_current_task; \ jl_value_t *ty = jl_typeof(a); \ if (jl_typeof(b) != ty || jl_typeof(c) != ty) \ - jl_error(#name ": types of a, b, and c must match"); \ - if (!jl_is_primitivetype(ty)) \ - jl_error(#name ": values are not primitive types"); \ + jl_error(#op_name ": types of a, b, and c must match"); \ + if (jl_is_primitivetype(ty)) {}\ + else if (is_ntuple_type(ty) && jl_nparams(ty) > 0) \ + { \ + jl_value_t *et = jl_tparam(ty, 0); \ + if (((jl_datatype_t*)et)->name == jl_vecelement_typename && jl_is_primitivetype(jl_tparam(et, 0))){} \ + else \ + jl_error(#op_name ": eltype is not a VecElement of a primitive type"); \ + }\ + else \ + jl_error(#op_name ": values are not primitive types"); \ int sz = jl_datatype_size(ty); \ jl_value_t *newv = jl_gc_alloc(ct->ptls, sz, ty); \ void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b), *pc = jl_data_ptr(c), *pr = jl_data_ptr(newv); \ @@ -1362,18 +1407,18 @@ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b, jl_value_t *c) /* choose the right size c-type operation */ \ case 2: \ if ((jl_datatype_t*)ty == jl_float16_type) \ - jl_##name##16(16, pa, pb, pc, pr); \ + jl_##op_name##16(16, pa, pb, pc, pr); \ else /*if ((jl_datatype_t*)ty == jl_bfloat16_type)*/ \ - jl_##name##bf16(16, pa, pb, pc, pr); \ + jl_##op_name##bf16(16, pa, pb, pc, pr); \ break; \ case 4: \ - jl_##name##32(32, pa, pb, pc, pr); \ + jl_##op_name##32(32, pa, pb, pc, pr); \ break; \ case 8: \ - jl_##name##64(64, pa, pb, pc, pr); \ + jl_##op_name##64(64, pa, pb, pc, pr); \ break; \ default: \ - jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes other than 16, 32 and 64"); \ + jl_error(#op_name ": runtime floating point intrinsics are not implemented for bit sizes other than 16, 32 and 64"); \ } \ return newv; \ } From 3b5248f4e40b4ceb8690b2f450b2e8c49fafcfc1 Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Sat, 13 Jul 2024 10:26:48 -0400 Subject: [PATCH 02/12] IT WORKSjsnjsnjsnjsnjsn! 
--- src/intrinsics.cpp | 21 ++++++++++++--------- src/julia.h | 18 ++++++++++++++++++ src/runtime_intrinsics.c | 18 ------------------ 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index 09916297e16ff..81e427e3aa9e4 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -1287,7 +1287,6 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar return emit_llvmcall(ctx, args, nargs); if (f == cglobal_auto || f == cglobal) return emit_cglobal(ctx, args, nargs); - SmallVector argv(nargs); for (size_t i = 0; i < nargs; ++i) { jl_cgval_t arg = emit_expr(ctx, args[i + 1]); @@ -1409,17 +1408,21 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar default: { assert(nargs >= 1 && "invalid nargs for intrinsic call"); const jl_cgval_t &xinfo = argv[0]; - // verify argument types - if (!jl_is_primitivetype(xinfo.typ)) - return emit_runtime_call(ctx, f, argv, nargs); - Type *xtyp = bitstype_to_llvm(xinfo.typ, ctx.builder.getContext(), true); - if (float_func()[f]) - xtyp = FLOATT(xtyp); + if (jl_is_primitivetype(xinfo.typ)){} + + else if (is_ntuple_type(xinfo.typ) && jl_nparams(xinfo.typ) > 0) + { + jl_value_t *et = jl_tparam0(xinfo.typ); + if (((jl_datatype_t*)et)->name == jl_vecelement_typename && jl_is_primitivetype(jl_tparam(et, 0))) + et = jl_tparam0(et); + else + return emit_runtime_call(ctx, f, argv, nargs); + } else - xtyp = INTT(xtyp, DL); - if (!xtyp) return emit_runtime_call(ctx, f, argv, nargs); + bool isboxed=true; + Type *xtyp = julia_type_to_llvm(ctx, xinfo.typ, &(isboxed)); ////Bool are required to be in the range [0,1] ////so while they are represented as i8, ////the operations need to be done in mod 1 diff --git a/src/julia.h b/src/julia.h index 5b9986a5e68ee..17cb1f6bd7e01 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1717,6 +1717,24 @@ STATIC_INLINE int jl_is_tuple_type(void *t) JL_NOTSAFEPOINT ((jl_datatype_t*)(t))->name == jl_tuple_typename); } +STATIC_INLINE int is_ntuple_type(jl_value_t *tt) +{ + if (!jl_is_tuple_type(tt)) + { + return 0; + } + size_t i, nfields = jl_nparams(tt); + if(!nfields) + return 1; + jl_value_t *t1 = jl_tparam0(tt); + for (i = 1; i < nfields; i++) { + if (jl_tparam(tt, i) != t1) { + return 0; + } + } + return 1; +} + STATIC_INLINE int jl_is_namedtuple_type(void *t) JL_NOTSAFEPOINT { return (jl_is_datatype(t) && diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index ff7de09db5b46..9769e0142bbf9 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -1271,24 +1271,6 @@ static inline jl_value_t *jl_intrinsiclambda_checkeddiv(jl_value_t *ty, void *pa // floating point -static int is_ntuple_type(jl_value_t *tt) -{ - if (!jl_is_tuple_type(tt)) - { - return 0; - } - size_t i, nfields = jl_nparams(tt); - if(!nfields) - return 1; - jl_value_t *t1 = jl_tparam0(tt); - for (i = 1; i < nfields; i++) { - if (jl_tparam(tt, i) != t1) { - return 0; - } - } - return 1; -} - #define bi_fintrinsic(OP, op_name) \ bi_intrinsic_bfloat(OP, op_name) \ bi_intrinsic_half(OP, op_name) \ From 7b868b19a81279920b8b4af209f1e8acc3206d7a Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Sat, 13 Jul 2024 11:05:06 -0400 Subject: [PATCH 03/12] works with bitcast --- src/intrinsics.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index 81e427e3aa9e4..51aeb14a5f4a0 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -144,6 +144,13 @@ static Type *FLOATT(Type *t) { if 
(t->isFloatingPointTy()) return t; + if (auto *tv = dyn_cast(t)) + { + Type *st = FLOATT(tv->getElementType()); + if (!st) + return NULL; + return VectorType::get(st, tv->getElementCount()); + } unsigned nb = (t->isPointerTy() ? sizeof(void*) * 8 : t->getPrimitiveSizeInBits()); auto &ctxt = t->getContext(); if (nb == 64) @@ -165,6 +172,13 @@ static Type *INTT(Type *t, const DataLayout &DL) return t; if (t->isPointerTy()) return DL.getIntPtrType(t); + if (auto *tv = dyn_cast(t)) + { + Type *st = INTT(tv->getElementType(), DL); + if (!st) + return NULL; + return VectorType::get(st, tv->getElementCount()); + } if (t == getDoubleTy(ctxt)) return getInt64Ty(ctxt); if (t == getFloatTy(ctxt)) @@ -1423,6 +1437,12 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar return emit_runtime_call(ctx, f, argv, nargs); bool isboxed=true; Type *xtyp = julia_type_to_llvm(ctx, xinfo.typ, &(isboxed)); + if (float_func()[f]) + xtyp = FLOATT(xtyp); + else + xtyp = INTT(xtyp, DL); + if (!xtyp) + return emit_runtime_call(ctx, f, argv, nargs); ////Bool are required to be in the range [0,1] ////so while they are represented as i8, ////the operations need to be done in mod 1 From 2e9f793b8ad5af06076bc304ee4beebfefa95622 Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Mon, 15 Jul 2024 04:25:54 -0400 Subject: [PATCH 04/12] undo array changes --- src/array.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/array.c b/src/array.c index 53c939a5ccfc2..f0051ec17565a 100644 --- a/src/array.c +++ b/src/array.c @@ -39,6 +39,7 @@ JL_DLLEXPORT int jl_array_validate_dims(size_t *nel, uint32_t ndims, size_t *dim return 0; } +#ifndef JL_NDEBUG static inline int is_ntuple_long(jl_value_t *v) { if (!jl_is_tuple(v)) @@ -52,6 +53,7 @@ static inline int is_ntuple_long(jl_value_t *v) } return 1; } +#endif #define jl_array_elsize(a) (((jl_datatype_t*)jl_typetagof((a)->ref.mem))->layout->size) From ddc8890a5e41d17a3564b32aec0faa089b5dad55 Mon Sep 17 00:00:00 2001 From: Oscar Smith Date: Sun, 28 Jul 2024 08:01:33 -0400 Subject: [PATCH 05/12] Update src/intrinsics.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mosรจ Giordano --- src/intrinsics.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index 51aeb14a5f4a0..4545329f270c2 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -1424,7 +1424,6 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar const jl_cgval_t &xinfo = argv[0]; // verify argument types if (jl_is_primitivetype(xinfo.typ)){} - else if (is_ntuple_type(xinfo.typ) && jl_nparams(xinfo.typ) > 0) { jl_value_t *et = jl_tparam0(xinfo.typ); From e6d6f4a54447de7967609d5b1f298d7e40496970 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 8 Oct 2024 18:48:20 +0200 Subject: [PATCH 06/12] implement vload/vstore! 
and a primitive Vec type --- base/experimental.jl | 2 ++ base/simd.jl | 70 ++++++++++++++++++++++++++++++++++++++++++++ test/choosetests.jl | 2 +- test/simd.jl | 36 +++++++++++++++++++++++ 4 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 base/simd.jl create mode 100644 test/simd.jl diff --git a/base/experimental.jl b/base/experimental.jl index 982ed5e78aa8c..dbf34c8183f7f 100644 --- a/base/experimental.jl +++ b/base/experimental.jl @@ -471,4 +471,6 @@ function entrypoint(@nospecialize(argt::Type)) nothing end +include("simd.jl") + end diff --git a/base/simd.jl b/base/simd.jl new file mode 100644 index 0000000000000..eef40e5f21fea --- /dev/null +++ b/base/simd.jl @@ -0,0 +1,70 @@ +module SIMD + +import Base: VecElement, Memory, MemoryRef +import Base: @propagate_inbounds, @_propagate_inbounds_meta, @_boundscheck, @_noub_if_noinbounds_meta +import Base: memoryrefget, memoryrefnew, memoryrefset! + +export Vec +export vload, vstore!, natural_vecwidth + +# TODO: See C# and Co Vec type +# TODO: Hardware portable vector types... + +struct Vec{N, T} + data::NTuple{N, VecElement{T}} +end + +# Constructors +@inline Vec(v::NTuple{N, T}) where {N, T} = Vec(VecElement.(v)) +@inline Vec(v::Vararg{T, N}) where {N, T} = Vec(v) +@inline Vec(v::Vec) = v + +# Numbers defines this and it is needed in power_by_squaring... +Base.copy(v::Vec) = v + +function Base.show(io::IO, v::Vec{N, T}) where {N, T} + io = IOContext(io, :typeinfo => eltype(v)) + print(io, "<$N x $T>[") + join(io, [sprint(show, x.value; context=io) for x in v.data], ", ") + print(io, "]") +end + +# Breaks with multi-versioning +natural_vecwidth(::Type{Float32}) = 8 +natural_vecwidth(::Type{Float64}) = 4 + +import Base: +, -, * + +# Mocked vload/vstore! relying on SLP + +@inline function vload(::Type{Vec{N, T}}, A::Array{T}, i::Int) where {N, T} + @_noub_if_noinbounds_meta + # TODO: Alignment...; may need an intrinsic for vectorized loads. + # Writting my own boundscheck loop since `inbounds` doesn't propagate through `ntuple` FFS + @boundscheck checkbounds(A, i:(i+ N - 1)) + mem = A.ref + data = ntuple(Val(N)) do j + # why does `@inbounds ref = memoryrefnew(mem, i + j - 1, @_boundscheck)` not work? + ref = memoryrefnew(mem, i + j - 1, false) + VecElement{T}(memoryrefget(ref, :not_atomic, false)) + end + return Vec(data) +end + +@inline function vstore!(A::Array{T}, v::Vec{N, T}, i::Int) where {N, T} + @_noub_if_noinbounds_meta + # TODO: Alignment...; may need an intrinsic for vectorized loads. + # Writting my own boundscheck loop since `inbounds` doesn't propagate through `ntuple` FFS + @boundscheck checkbounds(A, i:(i+ N - 1)) + mem = A.ref + data = v.data + ntuple(Val(N)) do j + # why does `@inbounds ref = memoryrefnew(mem, i + j - 1, @_boundscheck)` not work? 
+ ref = memoryrefnew(mem, i + j - 1, false) + memoryrefset!(ref, data[j].value, :not_atomic, false) + return nothing + end + return nothing +end + +end # module \ No newline at end of file diff --git a/test/choosetests.jl b/test/choosetests.jl index affdee412bd86..a2e9ad5e52c6f 100644 --- a/test/choosetests.jl +++ b/test/choosetests.jl @@ -11,7 +11,7 @@ const TESTNAMES = [ "char", "strings", "triplequote", "unicode", "intrinsics", "dict", "hashing", "iobuffer", "staged", "offsetarray", "arrayops", "tuple", "reduce", "reducedim", "abstractarray", - "intfuncs", "simdloop", "vecelement", "rational", + "intfuncs", "simdloop", "vecelement", "rational", "simd", "bitarray", "copy", "math", "fastmath", "functional", "iterators", "operators", "ordering", "path", "ccall", "parse", "loading", "gmp", "sorting", "spawn", "backtrace", "exceptions", diff --git a/test/simd.jl b/test/simd.jl new file mode 100644 index 0000000000000..d35fadec23b9d --- /dev/null +++ b/test/simd.jl @@ -0,0 +1,36 @@ +using Base.Experimental.SIMD +using Test +using InteractiveUtils + +function vcopyto!(a::Array{T}, b::Array{T}) where T + stride = natural_vecwidth(T) + VT = Vec{stride, T} + @assert length(a) == length(b) + @assert length(a) % stride == 0 + @inbounds for i in 1:stride:length(a) + vec = vload(VT, a, i) + vstore!(b, vec, i) + end +end + +@testset "load/store" begin + A = rand(64) + B = zeros(64) + + vcopyto!(A, B) + @test A == B + + @test_throws BoundsError vload(Vec{4, Float64}, A, 62) + vec = vload(Vec{4, Float64}, A, 1) + @test_throws BoundsError vstore!(A, vec, 62) + + load(A, i) = @inbounds vload(Vec{4, Float64}, A, i) + store(A,v,i) = @inbounds vstore!(A, v, i) + + ir = sprint(io->code_llvm(io, vload, (Type{Vec{4, Float64}}, Vector{Float64}, Int))) + @test contains(ir, "call void @j_throw_boundserror") + + ir = sprint(io->code_llvm(io, load, (Vector{Float64}, Int))) + @test contains(ir, "load <4 x double>") + @test !contains(ir, "call void @j_throw_boundserror") +end From 0a0f7e0074806476ac3013d51e985453a2f30cfd Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 9 Oct 2024 09:42:58 +0200 Subject: [PATCH 07/12] start on preferred_width intrinsic --- base/compiler/tfuncs.jl | 24 +++++++++++++++++++- base/simd.jl | 23 +++++++++++++++----- src/intrinsics.cpp | 47 ++++++++++++++++++++++++++++++++++++++++ src/intrinsics.h | 1 + src/julia_internal.h | 1 + src/llvm-cpufeatures.cpp | 29 +++++++++++++++++++++++++ src/runtime_intrinsics.c | 15 +++++++++++++ test/simd.jl | 16 ++++++++++++-- 8 files changed, 148 insertions(+), 8 deletions(-) diff --git a/base/compiler/tfuncs.jl b/base/compiler/tfuncs.jl index aaa1354fd5e54..5902293c3a8c9 100644 --- a/base/compiler/tfuncs.jl +++ b/base/compiler/tfuncs.jl @@ -52,7 +52,7 @@ end const INT_INF = typemax(Int) # integer infinity -const N_IFUNC = reinterpret(Int32, have_fma) + 1 +const N_IFUNC = reinterpret(Int32, preferred_vector_width) + 1 const T_IFUNC = Vector{Tuple{Int, Int, Any}}(undef, N_IFUNC) const T_IFUNC_COST = Vector{Int}(undef, N_IFUNC) const T_FFUNC_KEY = Vector{Any}() @@ -318,6 +318,28 @@ add_tfunc(Core.Intrinsics.cglobal, 1, 2, cglobal_tfunc, 5) add_tfunc(Core.Intrinsics.have_fma, 1, 1, @nospecs((๐•ƒ::AbstractLattice, x)->Bool), 1) +@nospecs function preferred_vector_width_tfunc(๐•ƒ::AbstractLattice, t) + return preferred_vector_width_tfunc(widenlattice(๐•ƒ), t) +end + +@nospecs function preferred_vector_width_tfunc(๐•ƒ::ConstsLattice, t) + # Want to return Union(Const(1), Const(2)) + # hardcode AVX512 + if sizeof(widenconst(t)) === 1 + return 
Const(32) + elseif sizeof(widenconst(t)) === 2 + return Const(16) + elseif sizeof(widenconst(t)) === 4 + return Const(8) + elseif sizeof(widenconst(t)) === 8 + return Const(4) + elseif sizeof(widenconst(t)) === 16 + return Const(4) + end + return Union{Nothing, Int} +end +add_tfunc(Core.Intrinsics.preferred_vector_width, 1, 1, preferred_vector_width_tfunc, 1) + # builtin functions # ================= diff --git a/base/simd.jl b/base/simd.jl index eef40e5f21fea..e249e04ee69ac 100644 --- a/base/simd.jl +++ b/base/simd.jl @@ -4,16 +4,33 @@ import Base: VecElement, Memory, MemoryRef import Base: @propagate_inbounds, @_propagate_inbounds_meta, @_boundscheck, @_noub_if_noinbounds_meta import Base: memoryrefget, memoryrefnew, memoryrefset! +import Core.Intrinsics: preferred_vector_width + export Vec -export vload, vstore!, natural_vecwidth +export vload, vstore!, preferred_vector, width # TODO: See C# and Co Vec type # TODO: Hardware portable vector types... +# TODO: tfunc support for preferred_vector_width does allow for "constant prop" +# but the intrinsic is not removed just yet during JIT, we should only need +# it for AOT or on a machine with scaleable vector types... + struct Vec{N, T} data::NTuple{N, VecElement{T}} end +width(::Type{<:Vec{N}}) where N = N +width(::Vec{N}) where N = N + +function preferred_vector(::Type{T}) where T + width = preferred_vector_width(T) + if width === nothing + error("$T has no preferred_vector_width") + end + return Vec{width, T} +end + # Constructors @inline Vec(v::NTuple{N, T}) where {N, T} = Vec(VecElement.(v)) @inline Vec(v::Vararg{T, N}) where {N, T} = Vec(v) @@ -29,10 +46,6 @@ function Base.show(io::IO, v::Vec{N, T}) where {N, T} print(io, "]") end -# Breaks with multi-versioning -natural_vecwidth(::Type{Float32}) = 8 -natural_vecwidth(::Type{Float64}) = 4 - import Base: +, -, * # Mocked vload/vstore! 
relying on SLP diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index 4545329f270c2..c95f115da6100 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -37,6 +37,7 @@ STATISTIC(Emitted_fptrunc, "Number of fptrunc calls emitted"); STATISTIC(Emitted_fpext, "Number of fpext calls emitted"); STATISTIC(Emitted_not_int, "Number of not_int calls emitted"); STATISTIC(Emitted_have_fma, "Number of have_fma calls emitted"); +STATISTIC(Emitted_preferred_vector_width, "Number of prefferred_vector_width calls emitted"); STATISTIC(EmittedUntypedIntrinsics, "Number of untyped intrinsics emitted"); using namespace JL_I; @@ -1419,6 +1420,52 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar return mark_julia_type(ctx, ret, false, jl_bool_type); } + case preferred_vector_width: { + ++Emitted_preferred_vector_width; + assert(nargs == 1); + const jl_cgval_t &x = argv[0]; + if (!x.constant || !jl_is_datatype(x.constant)) + return emit_runtime_call(ctx, f, argv, nargs); + jl_datatype_t *dt = (jl_datatype_t*) x.constant; + + // select the appropriated overloaded intrinsic + std::string intr_name = "julia.cpu.preferred_vector_width."; + switch (jl_datatype_size(dt)) { + case 1: { + intr_name += "b1"; + break; + case 2: { + intr_name += "b2"; + break; + } + case 4: { + intr_name += "b4"; + break; + } + case 8: { + intr_name += "b8"; + break; + } + case 16: { + intr_name += "b16"; + break; + } + default: + return emit_runtime_call(ctx, f, argv, nargs); + } + } + +#ifdef _P64 + FunctionCallee intr = jl_Module->getOrInsertFunction(intr_name, getInt64Ty(ctx.builder.getContext())); + auto ret = ctx.builder.CreateCall(intr); + return mark_julia_type(ctx, ret, false, jl_int64_type); +#else + FunctionCallee intr = jl_Module->getOrInsertFunction(intr_name, getInt32Ty(ctx.builder.getContext())); + auto ret = ctx.builder.CreateCall(intr); + return mark_julia_type(ctx, ret, false, jl_int32_type); +#endif + } + default: { assert(nargs >= 1 && "invalid nargs for intrinsic call"); const jl_cgval_t &xinfo = argv[0]; diff --git a/src/intrinsics.h b/src/intrinsics.h index 5b463e3bafe28..1290c4d769154 100644 --- a/src/intrinsics.h +++ b/src/intrinsics.h @@ -102,6 +102,7 @@ ALIAS(llvmcall, llvmcall) \ /* cpu feature tests */ \ ADD_I(have_fma, 1) \ + ADD_I(preferred_vector_width, 1) \ /* hidden intrinsics */ \ ADD_HIDDEN(cglobal_auto, 1) diff --git a/src/julia_internal.h b/src/julia_internal.h index f3959490855c8..8e018ef382593 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -1614,6 +1614,7 @@ JL_DLLEXPORT jl_value_t *jl_copysign_float(jl_value_t *a, jl_value_t *b); JL_DLLEXPORT jl_value_t *jl_flipsign_int(jl_value_t *a, jl_value_t *b); JL_DLLEXPORT jl_value_t *jl_have_fma(jl_value_t *a); +JL_DLLEXPORT jl_value_t *jl_preferred_vector_width(jl_value_t *a); JL_DLLEXPORT int jl_stored_inline(jl_value_t *el_type); JL_DLLEXPORT jl_value_t *(jl_array_data_owner)(jl_array_t *a); JL_DLLEXPORT jl_array_t *jl_array_copy(jl_array_t *ary); diff --git a/src/llvm-cpufeatures.cpp b/src/llvm-cpufeatures.cpp index 05d62adc57926..b7fb94f79668b 100644 --- a/src/llvm-cpufeatures.cpp +++ b/src/llvm-cpufeatures.cpp @@ -86,6 +86,26 @@ void lowerHaveFMA(Function &intr, Function &caller, const Triple &TT, CallInst * return; } +void lowerPreferredVectorWidth(Function &intr, Function &caller, const Triple &TT, CallInst *I) JL_NOTSAFEPOINT { + auto intr_name = intr.getName(); + auto typ = intr_name.substr(strlen("julia.cpu.preferred_vector_width.")); + + size_t width = 0; + if (typ == "b1") + width = 32; 
+ else if (typ == "b2") + width = 16; + else if (typ == "b4") + width = 8; + else if (typ == "b8") + width = 4; + else if (typ == "b16") + width = 2; + + I->replaceAllUsesWith(ConstantInt::get(I->getType(), width)); + return; +} + bool lowerCPUFeatures(Module &M) JL_NOTSAFEPOINT { auto TT = Triple(M.getTargetTriple()); @@ -102,6 +122,15 @@ bool lowerCPUFeatures(Module &M) JL_NOTSAFEPOINT Materialized.push_back(I); } } + + if (FN.starts_with("julia.cpu.preferred_vector_width.")) { + for (Use &U: F.uses()) { + User *RU = U.getUser(); + CallInst *I = cast(RU); + lowerPreferredVectorWidth(F, *I->getParent()->getParent(), TT, I); + Materialized.push_back(I); + } + } } if (!Materialized.empty()) { diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index 9769e0142bbf9..cb86a02bf7548 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -1733,6 +1733,21 @@ JL_DLLEXPORT jl_value_t *jl_have_fma(jl_value_t *typ) return jl_false; } +JL_DLLEXPORT jl_value_t *jl_preferred_vector_width(jl_value_t *typ) +{ + JL_TYPECHK(preferred_vector_width, datatype, typ); // TODO what about float16/bfloat16? + jl_datatype_t* dt = (jl_datatype_t*)typ; + int sz = jl_datatype_size(dt); + int width = 32 / sz; + if (width == 0) + return jl_nothing; +#ifdef _P64 + return jl_box_int64(width); +#else + return jl_box_int32(width); +#endif +} + JL_DLLEXPORT jl_value_t *jl_add_ptr(jl_value_t *ptr, jl_value_t *offset) { JL_TYPECHK(add_ptr, pointer, ptr); diff --git a/test/simd.jl b/test/simd.jl index d35fadec23b9d..dbda2e4d51193 100644 --- a/test/simd.jl +++ b/test/simd.jl @@ -3,8 +3,8 @@ using Test using InteractiveUtils function vcopyto!(a::Array{T}, b::Array{T}) where T - stride = natural_vecwidth(T) - VT = Vec{stride, T} + VT = preferred_vector(T) + stride = width(VT) @assert length(a) == length(b) @assert length(a) % stride == 0 @inbounds for i in 1:stride:length(a) @@ -13,6 +13,18 @@ function vcopyto!(a::Array{T}, b::Array{T}) where T end end +# todo: noninline/mutable types? +primitive type I256 256 end +primitive type I512 512 end + +@testset "preferred_vector_width" begin + for T in (Int8, Int16, Int32, Int64, Int128, I256) + max_width = 32 # avx2 + @test width(preferred_vector(T)) == max_width รท sizeof(T) + end + @test_throws ErrorException preferred_vector(I526) +end + @testset "load/store" begin A = rand(64) B = zeros(64) From 0e95cfd18e0830b95b2484b0b3e9c196b28fb3fd Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 4 Nov 2024 16:23:13 +0100 Subject: [PATCH 08/12] fixup! 
start on preferred_width intrinsic --- base/compiler/tfuncs.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/compiler/tfuncs.jl b/base/compiler/tfuncs.jl index 5902293c3a8c9..a97f2742d874f 100644 --- a/base/compiler/tfuncs.jl +++ b/base/compiler/tfuncs.jl @@ -324,7 +324,7 @@ end @nospecs function preferred_vector_width_tfunc(๐•ƒ::ConstsLattice, t) # Want to return Union(Const(1), Const(2)) - # hardcode AVX512 + # hardcode AVX256 if sizeof(widenconst(t)) === 1 return Const(32) elseif sizeof(widenconst(t)) === 2 @@ -334,7 +334,7 @@ end elseif sizeof(widenconst(t)) === 8 return Const(4) elseif sizeof(widenconst(t)) === 16 - return Const(4) + return Const(2) end return Union{Nothing, Int} end From c8d0ba3952bf8ba8c2e371a3b307077488484dbf Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 4 Nov 2024 17:00:32 +0100 Subject: [PATCH 09/12] add some basic arithmetic support --- base/simd.jl | 25 +++++++++++++++++++++++-- test/simd.jl | 21 +++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/base/simd.jl b/base/simd.jl index e249e04ee69ac..669e1e684f403 100644 --- a/base/simd.jl +++ b/base/simd.jl @@ -1,6 +1,6 @@ module SIMD -import Base: VecElement, Memory, MemoryRef +import Base: VecElement, Memory, MemoryRef, IEEEFloat import Base: @propagate_inbounds, @_propagate_inbounds_meta, @_boundscheck, @_noub_if_noinbounds_meta import Base: memoryrefget, memoryrefnew, memoryrefset! @@ -46,7 +46,6 @@ function Base.show(io::IO, v::Vec{N, T}) where {N, T} print(io, "]") end -import Base: +, -, * # Mocked vload/vstore! relying on SLP @@ -80,4 +79,26 @@ end return nothing end +import Base: +, -, *, /, muladd, promote_rule, widen +import Core.Intrinsics: add_float, sub_float, mul_float, div_float, muladd_float, neg_float + +## floating point promotions ## +promote_rule(::Type{Vec{N, Float32}}, ::Type{Vec{N, Float16}}) where N = Vec{N, Float32} +promote_rule(::Type{Vec{N, Float64}}, ::Type{Vec{N, Float16}}) where N = Vec{N, Float64} +promote_rule(::Type{Vec{N, Float64}}, ::Type{Vec{N, Float32}}) where N = Vec{N, Float64} + +widen(::Type{Vec{N, Float16}}) where N = Vec{N, Float16} +widen(::Type{Vec{N, Float32}}) where N = Vec{N, Float32} + +## floating point arithmetic ## +-(x::Vec{N,T}) where {N,T<:IEEEFloat} = neg_float(x.data) + ++(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = add_float(x.data, y.data) +-(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = sub_float(x.data, y.data) +*(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = mul_float(x.data, y.data) +/(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = div_float(x.data, y.data) + +muladd(x::Vec{N,T}, y::Vec{N,T}, z::Vec{N,T}) where {N, T<:IEEEFloat} = + muladd_float(x.data, y.data, z.data) + end # module \ No newline at end of file diff --git a/test/simd.jl b/test/simd.jl index dbda2e4d51193..7d3c191f797e8 100644 --- a/test/simd.jl +++ b/test/simd.jl @@ -46,3 +46,24 @@ end @test contains(ir, "load <4 x double>") @test !contains(ir, "call void @j_throw_boundserror") end + +@testset "basic arithmetic" begin + ir = sprint(io->code_llvm(io, +, (Vec{4, Float64}, Vec{4, Float64}))) + @test contains(ir, "fadd <4 x double>") + ir = sprint(io->code_llvm(io, -, (Vec{4, Float64}, Vec{4, Float64}))) + @test contains(ir, "fsub <4 x double>") + ir = sprint(io->code_llvm(io, *, (Vec{4, Float64}, Vec{4, Float64}))) + @test contains(ir, "fmul <4 x double>") + ir = sprint(io->code_llvm(io, /, (Vec{4, Float64}, Vec{4, Float64}))) + @test contains(ir, "fdiv <4 x double>") + + ir = 
sprint(io->code_llvm(io, muladd, (Vec{4, Float64}, Vec{4, Float64}, Vec{4, Float64}))) + @test contains(ir, "fmul contract <4 x double>") + @test contains(ir, "fadd contract <4 x double>") + + ir = sprint(io->code_llvm(io, -, (Vec{4, Float64},))) + @test contains(ir, "fneg <4 x double>") + + # TODO: Way to test Intrinsics directly? + #`-v` -> ERROR: neg_float_withtype: value is not a primitive type +end From 799e7e27006f62cbda07851003e1c90f3b11b605 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 4 Nov 2024 17:03:06 +0100 Subject: [PATCH 10/12] add Mask dt --- base/simd.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/base/simd.jl b/base/simd.jl index 669e1e684f403..7701fe734d5b5 100644 --- a/base/simd.jl +++ b/base/simd.jl @@ -46,6 +46,14 @@ function Base.show(io::IO, v::Vec{N, T}) where {N, T} print(io, "]") end +# TODO: llvm.vp expects a mask of i1 +struct Mask{N} + data::NTuple{N, VecElement{Bool}} +end + +function mask_all(::Val{N}, val::Bool) where N + Mask(ntuple(_->VecElement(val),Val(N))) +end # Mocked vload/vstore! relying on SLP From 71dd9abfa59f6d9e0292638848c956daf605f253 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 4 Nov 2024 17:13:48 +0100 Subject: [PATCH 11/12] implement select --- base/simd.jl | 18 +++++++++++++----- test/simd.jl | 6 ++++++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/base/simd.jl b/base/simd.jl index 7701fe734d5b5..ec8dfe52dc935 100644 --- a/base/simd.jl +++ b/base/simd.jl @@ -7,7 +7,7 @@ import Base: memoryrefget, memoryrefnew, memoryrefset! import Core.Intrinsics: preferred_vector_width export Vec -export vload, vstore!, preferred_vector, width +export vload, vstore!, preferred_vector, width, select # TODO: See C# and Co Vec type # TODO: Hardware portable vector types... @@ -47,12 +47,20 @@ function Base.show(io::IO, v::Vec{N, T}) where {N, T} end # TODO: llvm.vp expects a mask of i1 -struct Mask{N} - data::NTuple{N, VecElement{Bool}} -end +const Mask{N} = Vec{N, Bool} function mask_all(::Val{N}, val::Bool) where N - Mask(ntuple(_->VecElement(val),Val(N))) + Vec(ntuple(_->VecElement(val),Val(N))) +end + +# select(m::Mask{N}, a::Vec{N, T}, b::Vec{N,T}) where {N,T} = Core.ifelse(m.data, a.data, b.data) +# ERROR: TypeError: non-boolean (NTuple{4, VecElement{Bool}}) used in boolean context +# Mocked select, relying on SLP +function select(m::Mask{N}, a::Vec{N, T}, b::Vec{N,T}) where {N,T} + data = ntuple(Val(N)) do j + VecElement(Core.ifelse(m.data[j].value, a.data[j].value, b.data[j].value)) + end + return Vec(data) end # Mocked vload/vstore! relying on SLP diff --git a/test/simd.jl b/test/simd.jl index 7d3c191f797e8..34a5cad27b7e9 100644 --- a/test/simd.jl +++ b/test/simd.jl @@ -67,3 +67,9 @@ end # TODO: Way to test Intrinsics directly? 
#`-v` -> ERROR: neg_float_withtype: value is not a primitive type end + +@testset "select" begin + ir = sprint(io->code_llvm(io, select, (Vec{4, Bool}, Vec{4, Float64}, Vec{4, Float64}))) + @test contains(ir, "icmp eq <4 x i8>") + @test contains(ir, "select <4 x i1>") +end From 52aac7d48821b16a36e99f07db8807b747c3fbfe Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 5 Nov 2024 16:37:07 +0100 Subject: [PATCH 12/12] WIP: Implement SIMD functionality for XorishoSIMD --- base/simd.jl | 41 +++++-- src/runtime_intrinsics.c | 27 ++++- stdlib/Random/src/XoshiroSimd.jl | 187 +++++++++++-------------------- test/simd.jl | 20 +++- 4 files changed, 145 insertions(+), 130 deletions(-) diff --git a/base/simd.jl b/base/simd.jl index ec8dfe52dc935..f17076afd5fa0 100644 --- a/base/simd.jl +++ b/base/simd.jl @@ -49,7 +49,7 @@ end # TODO: llvm.vp expects a mask of i1 const Mask{N} = Vec{N, Bool} -function mask_all(::Val{N}, val::Bool) where N +function Vec{N}(val) where N Vec(ntuple(_->VecElement(val),Val(N))) end @@ -107,14 +107,41 @@ widen(::Type{Vec{N, Float16}}) where N = Vec{N, Float16} widen(::Type{Vec{N, Float32}}) where N = Vec{N, Float32} ## floating point arithmetic ## --(x::Vec{N,T}) where {N,T<:IEEEFloat} = neg_float(x.data) +-(x::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(neg_float(x.data)) -+(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = add_float(x.data, y.data) --(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = sub_float(x.data, y.data) -*(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = mul_float(x.data, y.data) -/(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = div_float(x.data, y.data) ++(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(add_float(x.data, y.data)) +-(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(sub_float(x.data, y.data)) +*(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(mul_float(x.data, y.data)) +/(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(div_float(x.data, y.data)) muladd(x::Vec{N,T}, y::Vec{N,T}, z::Vec{N,T}) where {N, T<:IEEEFloat} = - muladd_float(x.data, y.data, z.data) + Vec(muladd_float(x.data, y.data, z.data)) + +## integer arithmetic ## +import Base: รท, BitInteger, BitSigned, BitUnsigned +import Core.Intrinsics: add_int, sub_int, mul_int, sdiv_int, udiv_int, neg_int + ++(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(add_int(x.data, y.data)) +-(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(sub_int(x.data, y.data)) +*(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(mul_int(x.data, y.data)) +# TODO ought we implement div by zero? 
+รท(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitSigned} = Vec(sdiv_int(x.data, y.data)) +รท(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitUnsigned} = Vec(udiv_int(x.data, y.data)) + +## logical ops +import Base: xor, |, & +import Core.Intrinsics: xor_int, and_int, or_int +xor(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(xor_int(x.data, y.data)) +(|)(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(and_int(x.data, y.data)) +(&)(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(or_int(x.data, y.data)) + +## integer shifts +# unsigned shift counts always shift in the same direction +import Base: >>, <<, >>> +import Core.Intrinsics: ashr_int, lshr_int, shl_int, lshr_int +>>(x::Vec{N, <:BitSigned}, y::Vec{N, <:BitUnsigned}) where N = ashr_int(x, y) +>>(x::Vec{N, <:BitUnsigned}, y::Vec{N, <:BitUnsigned}) where N = lshr_int(x, y) +<<(x::Vec{N, <:BitInteger}, y::Vec{N, <:BitUnsigned}) where N = shl_int(x, y) +>>>(x::Vec{N, <:BitInteger}, y::Vec{N, <:BitUnsigned}) where N = lshr_int(x, y) end # module \ No newline at end of file diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index cb86a02bf7548..f50976cc4b10e 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -1195,13 +1195,36 @@ jl_value_t *jl_iintrinsic_2(jl_value_t *a, jl_value_t *b, const char *name, { jl_value_t *ty = jl_typeof(a); jl_value_t *tyb = jl_typeof(b); + jl_value_t *et = NULL; + jl_value_t *np = NULL; + jl_value_t *etb = NULL; + jl_value_t *npb = NULL; if (tyb != ty) { if (!cvtb) jl_errorf("%s: types of a and b must match", name); - if (!jl_is_primitivetype(tyb)) + if (jl_is_primitivetype(tyb)) {} + else if (is_ntuple_type(tyb) && jl_nparams(tyb) > 0) + { + etb = jl_tparam0(tyb); + npb = jl_nparams(tyb); + if (((jl_datatype_t*)etb)->name == jl_vecelement_typename && jl_is_primitivetype(jl_tparam(etb, 0))){} + else + jl_errorf("%s: eltype is not a VecElement of a primitive type", name); + } + else jl_errorf("%s: b is not a primitive type", name); } - if (!jl_is_primitivetype(ty)) + if (jl_is_primitivetype(ty)) {} + else if (is_ntuple_type(ty) && jl_nparams(ty) > 0) + { + et = jl_tparam0(ty); + np = jl_nparams(ty); \ + if (((jl_datatype_t*)et)->name == jl_vecelement_typename && jl_is_primitivetype(jl_tparam(et, 0))){} + else + jl_errorf("%s: eltype is not a VecElement of a primitive type", name); + // TODO cvtb + } + else jl_errorf("%s: a is not a primitive type", name); void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b); unsigned sz = jl_datatype_size(ty); diff --git a/stdlib/Random/src/XoshiroSimd.jl b/stdlib/Random/src/XoshiroSimd.jl index 58544714dd9f5..0b2a85df456d3 100644 --- a/stdlib/Random/src/XoshiroSimd.jl +++ b/stdlib/Random/src/XoshiroSimd.jl @@ -8,27 +8,21 @@ using Base: BitInteger_types using Base.Libc: memcpy using Core.Intrinsics: llvmcall +using Base.Experimental.SIMD + # Vector-width. Influences random stream. xoshiroWidth() = Val(8) # Simd threshold. Influences random stream. 
simdThreshold(::Type{T}) where T = 64 simdThreshold(::Type{Bool}) = 640 -@inline _rotl45(x::UInt64) = (x<<45)|(x>>19) -@inline _shl17(x::UInt64) = x<<17 -@inline _rotl23(x::UInt64) = (x<<23)|(x>>41) -@inline _plus(x::UInt64,y::UInt64) = x+y -@inline _xor(x::UInt64,y::UInt64) = xor(x,y) -@inline _and(x::UInt64, y::UInt64) = x & y -@inline _or(x::UInt64, y::UInt64) = x | y -@inline _lshr(x, y::Int32) = _lshr(x, y % Int64) -@inline _lshr(x::UInt64, y::Int64) = llvmcall(""" - %res = lshr i64 %0, %1 - ret i64 %res - """, - UInt64, - Tuple{UInt64, Int64}, - x, y) +@inline rotl45(x::UInt64) = (x<<45)|(x>>19) +@inline shl17(x::UInt64) = x<<17 +@inline rotl23(x::UInt64) = (x<<23)|(x>>41) + +@inline rotl45(x::Vec{N, UInt64}) where N = (x << Vec{N}(45%UInt)) | (x >> Vec{N}(19%UInt)) +@inline shl17(x::Vec{N, UInt64}) where N = x<>Vec{N}(41%UInt)) # `_bits2float(x::UInt64, T)` takes `x::UInt64` as input, it splits it in `N` parts where # `N = sizeof(UInt64) / sizeof(T)` (`N = 1` for `Float64`, `N = 2` for `Float32, etc...), it @@ -67,51 +61,6 @@ for N in [4,8,16] let code, s, fshl = "llvm.fshl.v$(N)i64", VT = :(NTuple{$N, VecElement{UInt64}}) - s = ntuple(_->VecElement(UInt64(45)), N) - @eval @inline _rotl45(x::$VT) = ccall($fshl, llvmcall, $VT, ($VT, $VT, $VT), x, x, $s) - - s = ntuple(_->VecElement(UInt64(23)), N) - @eval @inline _rotl23(x::$VT) = ccall($fshl, llvmcall, $VT, ($VT, $VT, $VT), x, x, $s) - - code = """ - %lshiftOp = shufflevector <1 x i64> , <1 x i64> undef, <$N x i32> zeroinitializer - %res = shl <$N x i64> %0, %lshiftOp - ret <$N x i64> %res - """ - @eval @inline _shl17(x::$VT) = llvmcall($code, $VT, Tuple{$VT}, x) - - code = """ - %res = add <$N x i64> %1, %0 - ret <$N x i64> %res - """ - @eval @inline _plus(x::$VT, y::$VT) = llvmcall($code, $VT, Tuple{$VT, $VT}, x, y) - - code = """ - %res = xor <$N x i64> %1, %0 - ret <$N x i64> %res - """ - @eval @inline _xor(x::$VT, y::$VT) = llvmcall($code, $VT, Tuple{$VT, $VT}, x, y) - - code = """ - %res = and <$N x i64> %1, %0 - ret <$N x i64> %res - """ - @eval @inline _and(x::$VT, y::$VT) = llvmcall($code, $VT, Tuple{$VT, $VT}, x, y) - - code = """ - %res = or <$N x i64> %1, %0 - ret <$N x i64> %res - """ - @eval @inline _or(x::$VT, y::$VT) = llvmcall($code, $VT, Tuple{$VT, $VT}, x, y) - - code = """ - %tmp = insertelement <1 x i64> undef, i64 %1, i32 0 - %shift = shufflevector <1 x i64> %tmp, <1 x i64> %tmp, <$N x i32> zeroinitializer - %res = lshr <$N x i64> %0, %shift - ret <$N x i64> %res - """ - @eval @inline _lshr(x::$VT, y::Int64) = llvmcall($code, $VT, Tuple{$VT, Int64}, x, y) - code = """ %shiftamt = shufflevector <1 x i64> , <1 x i64> undef, <$N x i32> zeroinitializer %sh = lshr <$N x i64> %0, %shiftamt @@ -156,10 +105,10 @@ function forkRand(rng::Union{TaskLocalRNG, Xoshiro}, ::Val{N}) where N # 0x5a94851fb48a6e05 == hash(UInt(2))|0x01 # 0x3688cf5d48899fa7 == hash(UInt(3))|0x01 # 0x867b4bb4c42e5661 == hash(UInt(4))|0x01 - s0 = ntuple(i->VecElement(0x02011ce34bce797f * rand(rng, UInt64)), Val(N)) - s1 = ntuple(i->VecElement(0x5a94851fb48a6e05 * rand(rng, UInt64)), Val(N)) - s2 = ntuple(i->VecElement(0x3688cf5d48899fa7 * rand(rng, UInt64)), Val(N)) - s3 = ntuple(i->VecElement(0x867b4bb4c42e5661 * rand(rng, UInt64)), Val(N)) + s0 = Vec(ntuple(i->VecElement(0x02011ce34bce797f * rand(rng, UInt64)), Val(N))) + s1 = Vec(ntuple(i->VecElement(0x5a94851fb48a6e05 * rand(rng, UInt64)), Val(N))) + s2 = Vec(ntuple(i->VecElement(0x3688cf5d48899fa7 * rand(rng, UInt64)), Val(N))) + s3 = Vec(ntuple(i->VecElement(0x867b4bb4c42e5661 * rand(rng, 
UInt64)), Val(N))) (s0, s1, s2, s3) end @@ -182,26 +131,26 @@ end s0, s1, s2, s3 = getstate(rng) i = 0 while i+8 <= len - res = _plus(_rotl23(_plus(s0,s3)),s0) + res = rotl23(s0 + s3) + s0 unsafe_store!(reinterpret(Ptr{UInt64}, dst + i), f(res, T)) - t = _shl17(s1) - s2 = _xor(s2, s0) - s3 = _xor(s3, s1) - s1 = _xor(s1, s2) - s0 = _xor(s0, s3) - s2 = _xor(s2, t) - s3 = _rotl45(s3) + t = shl17(s1) + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = rotl45(s3) i += 8 end if i < len - res = _plus(_rotl23(_plus(s0,s3)),s0) - t = _shl17(s1) - s2 = _xor(s2, s0) - s3 = _xor(s3, s1) - s1 = _xor(s1, s2) - s0 = _xor(s0, s3) - s2 = _xor(s2, t) - s3 = _rotl45(s3) + res = rotl23(s0 + s3) + s0 + t = shl17(s1) + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = rotl45(s3) ref = Ref(f(res, T)) # TODO: This may make the random-stream dependent on system endianness GC.@preserve ref memcpy(dst+i, Base.unsafe_convert(Ptr{Cvoid}, ref), len-i) @@ -214,36 +163,36 @@ end s0, s1, s2, s3 = getstate(rng) i = 0 while i+8 <= len - res = _plus(_rotl23(_plus(s0,s3)),s0) - shift = 0 + res = rotl23(s0 + s3) + s0 + shift = UInt(0) while i+8 <= len && shift < 8 - resLoc = _and(_lshr(res, shift), 0x0101010101010101) + resLoc = (res >> shift) & 0x0101010101010101 unsafe_store!(reinterpret(Ptr{UInt64}, dst + i), resLoc) i += 8 - shift += 1 + shift += UInt(1) end - t = _shl17(s1) - s2 = _xor(s2, s0) - s3 = _xor(s3, s1) - s1 = _xor(s1, s2) - s0 = _xor(s0, s3) - s2 = _xor(s2, t) - s3 = _rotl45(s3) + t = shl17(s1) + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = rotl45(s3) end if i < len # we may overgenerate some bytes here, if len mod 64 <= 56 and len mod 8 != 0 - res = _plus(_rotl23(_plus(s0,s3)),s0) - resLoc = _and(res, 0x0101010101010101) + res = rotl23(s0 + s3) + s0 + resLoc = res & 0x0101010101010101 ref = Ref(resLoc) GC.@preserve ref memcpy(dst+i, Base.unsafe_convert(Ptr{Cvoid}, ref), len-i) - t = _shl17(s1) - s2 = _xor(s2, s0) - s3 = _xor(s3, s1) - s1 = _xor(s1, s2) - s0 = _xor(s0, s3) - s2 = _xor(s2, t) - s3 = _rotl45(s3) + t = shl17(s1) + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = rotl45(s3) end setstate!(rng, (s0, s1, s2, s3, nothing)) nothing @@ -255,14 +204,14 @@ end i = 0 while i + 8*N <= len - res = _plus(_rotl23(_plus(s0,s3)),s0) - t = _shl17(s1) - s2 = _xor(s2, s0) - s3 = _xor(s3, s1) - s1 = _xor(s1, s2) - s0 = _xor(s0, s3) - s2 = _xor(s2, t) - s3 = _rotl45(s3) + res = rotl23(s0 + s3) + s0 + t = shl17(s1) + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = rotl45(s3) unsafe_store!(reinterpret(Ptr{NTuple{N,VecElement{UInt64}}}, dst + i), f(res, T)) i += 8*N end @@ -271,20 +220,18 @@ end @noinline function xoshiro_bulk_simd(rng::Union{TaskLocalRNG, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool}, ::Val{N}, f) where {N} s0, s1, s2, s3 = forkRand(rng, Val(N)) - msk = ntuple(i->VecElement(0x0101010101010101), Val(N)) i = 0 while i + 64*N <= len - res = _plus(_rotl23(_plus(s0,s3)),s0) - t = _shl17(s1) - s2 = _xor(s2, s0) - s3 = _xor(s3, s1) - s1 = _xor(s1, s2) - s0 = _xor(s0, s3) - s2 = _xor(s2, t) - s3 = _rotl45(s3) - for k=0:7 - tmp = _lshr(res, k) - toWrite = _and(tmp, msk) + res = rotl23(s0 + s3) +s0 + t = shl17(s1) + s2 = xor(s2, s0) + s3 = xor(s3, s1) + s1 = xor(s1, s2) + s0 = xor(s0, s3) + s2 = xor(s2, t) + s3 = rotl45(s3) + for k=UInt(0):UInt(7) + toWrite = 
(res >> Vec{N}(k)) & Vec{N}(0x0101010101010101) unsafe_store!(reinterpret(Ptr{NTuple{N,VecElement{UInt64}}}, dst + i + k*N*8), toWrite) end i += 64*N diff --git a/test/simd.jl b/test/simd.jl index 34a5cad27b7e9..fd5b35e460630 100644 --- a/test/simd.jl +++ b/test/simd.jl @@ -47,20 +47,32 @@ end @test !contains(ir, "call void @j_throw_boundserror") end -@testset "basic arithmetic" begin +@testset "basic floating-point arithmetic" begin + A = rand(64) + v = vload(Vec{4, Float64}, A, 1) + + @test v+v isa Vec{4, Float64} ir = sprint(io->code_llvm(io, +, (Vec{4, Float64}, Vec{4, Float64}))) @test contains(ir, "fadd <4 x double>") + + @test v-v isa Vec{4, Float64} ir = sprint(io->code_llvm(io, -, (Vec{4, Float64}, Vec{4, Float64}))) @test contains(ir, "fsub <4 x double>") + + @test v*v isa Vec{4, Float64} ir = sprint(io->code_llvm(io, *, (Vec{4, Float64}, Vec{4, Float64}))) @test contains(ir, "fmul <4 x double>") + + @test v/v isa Vec{4, Float64} ir = sprint(io->code_llvm(io, /, (Vec{4, Float64}, Vec{4, Float64}))) @test contains(ir, "fdiv <4 x double>") + @test muladd(v, v, v) isa Vec{4, Float64} ir = sprint(io->code_llvm(io, muladd, (Vec{4, Float64}, Vec{4, Float64}, Vec{4, Float64}))) @test contains(ir, "fmul contract <4 x double>") @test contains(ir, "fadd contract <4 x double>") + @test -v isa Vec{4, Float64} ir = sprint(io->code_llvm(io, -, (Vec{4, Float64},))) @test contains(ir, "fneg <4 x double>") @@ -73,3 +85,9 @@ end @test contains(ir, "icmp eq <4 x i8>") @test contains(ir, "select <4 x i1>") end + +@test "basic integer arithmetic" begin +end + +@test "basic logical operations" begin +end
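
Usage note (not part of the patches): the pieces introduced across this series — the Vec{N,T} wrapper, preferred_vector/width, vload/vstore!, and the element-wise arithmetic from patch 09 — compose into ordinary SIMD kernels. Below is a minimal sketch of an axpy-style loop written only against that API. The kernel name vaxpy!, the scalar tail loop, and broadcasting `a` through ntuple are illustrative choices, not part of the series, and the Vec{4, Float64} width mentioned in the comment assumes the AVX2 defaults hard-coded in the preferred_vector_width tfunc above.

    using Base.Experimental.SIMD

    # Computes y .= a .* x .+ y one hardware-sized vector at a time.
    function vaxpy!(y::Vector{Float64}, a::Float64, x::Vector{Float64})
        @assert length(x) == length(y)
        VT = preferred_vector(Float64)      # e.g. Vec{4, Float64} with the AVX2 defaults
        N  = width(VT)
        av = Vec(ntuple(_ -> a, N))         # broadcast the scalar into a Vec
        i  = 1
        @inbounds while i + N - 1 <= length(x)
            xv = vload(VT, x, i)
            yv = vload(VT, y, i)
            vstore!(y, muladd(av, xv, yv), i)
            i += N
        end
        @inbounds while i <= length(x)      # scalar tail for lengths not divisible by N
            y[i] = muladd(a, x[i], y[i])
            i += 1
        end
        return y
    end

With the bounds checks elided by @inbounds, the same SLP-based lowering exercised in test/simd.jl should turn the loop body into <4 x double> loads, a contract fmul/fadd pair, and a vector store.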