From 90c829221f1c074b41b603848d10bb12b034510f Mon Sep 17 00:00:00 2001
From: MasonProtter <mason.protter@icloud.com>
Date: Wed, 11 Mar 2020 13:08:01 -0600
Subject: [PATCH 1/8] early setup

---
 src/Gaius.jl         |  3 +++
 src/matmul.jl        | 23 +++++++++++------------
 src/pointermatrix.jl | 31 +++++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/src/Gaius.jl b/src/Gaius.jl
index 88543a6..5055f93 100644
--- a/src/Gaius.jl
+++ b/src/Gaius.jl
@@ -14,6 +14,9 @@ const MatTypesC{T <: Eltypes} = Union{Matrix{T}, SubArray{T, 2, <: Array}} # C f
 const MatTypesR{T <: Eltypes} = Union{LinearAlgebra.Adjoint{T,<:MatTypesC{T}}, LinearAlgebra.Transpose{T,<:MatTypesC{T}}} # R for Row Major
 const MatTypes{T <: Eltypes} = Union{MatTypesC{T}, MatTypesR{T}}
 
+const   VecTypes{T <: Eltypes} = Union{Vector{T}, SubArray{T, 1, <:Array}}
+const CoVecTypes{T <: Eltypes} = Union{LinearAlgebra.Adjoint{T,<:VecTypes{T}}, LinearAlgebra.Transpose{T, <:VecTypes{T}}}
+
 # Note this does not support changing the number of threads at runtime
 macro _spawn(ex)
     if Threads.nthreads() > 1
diff --git a/src/matmul.jl b/src/matmul.jl
index b20ffff..7f8374b 100644
--- a/src/matmul.jl
+++ b/src/matmul.jl
@@ -13,11 +13,19 @@ function blocked_mul(A::MatTypes, B::MatTypes)
     C
 end
 
+function blocked_mul(A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes}
+    C = StructArray{Complex{T}}((Matrix{T}(undef, size(A, 1), size(B,2)),
+                                 Matrix{T}(undef, size(A, 1), size(B,2))))
+    blocked_mul!(C, A, B)
+    C
+end
+
 choose_block_size(C::AbstractMatrix, ::Nothing) = size(C, 1) >= (3DEFAULT_BLOCK_SIZE) >>> 1 ? DEFAULT_BLOCK_SIZE : 32
 choose_block_size(::Any, block_size::Integer) = block_size
 
+
 function blocked_mul!(C::MatTypes{T}, A::MatTypes{T}, B::MatTypes{T};
-              block_size = nothing, sizecheck=true) where {T <: Eltypes}
+                      block_size = nothing, sizecheck=true) where {T <: Eltypes}
     sizecheck && check_compatible_sizes(C, A, B)
 
     _block_size = choose_block_size(C, block_size)
@@ -26,15 +34,8 @@ function blocked_mul!(C::MatTypes{T}, A::MatTypes{T}, B::MatTypes{T};
     C
 end
 
-function blocked_mul(A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes}
-    C = StructArray{Complex{T}}((Matrix{T}(undef, size(A, 1), size(B,2)),
-                                 Matrix{T}(undef, size(A, 1), size(B,2))))
-    blocked_mul!(C, A, B)
-    C
-end
-
 function blocked_mul!(C::StructArray{Complex{T}, 2}, A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 2};
-              block_size = DEFAULT_BLOCK_SIZE, sizecheck=true) where {T <: Eltypes}
+                      block_size = DEFAULT_BLOCK_SIZE, sizecheck=true) where {T <: Eltypes}
     sizecheck && check_compatible_sizes(C, A, B)
     
     _block_size = choose_block_size(C, block_size)
@@ -43,9 +44,6 @@ function blocked_mul!(C::StructArray{Complex{T}, 2}, A::StructArray{Complex{T},
         Cre, Cim = PtrMatrix(C.re), PtrMatrix(C.im)
         Are, Aim = PtrMatrix(A.re), PtrMatrix(A.im)
         Bre, Bim = PtrMatrix(B.re), PtrMatrix(B.im)
-        # Cre, Cim = C.re, C.im
-        # Are, Aim = A.re, A.im
-        # Bre, Bim = B.re, B.im
         _mul!(    Cre, Are, Bre, _block_size)          # C.re = A.re * B.re
         _mul_add!(Cre, Aim, Bim, _block_size, Val(-1)) # C.re = C.re - A.im * B.im
         _mul!(    Cim, Are, Bim, _block_size)          # C.im = A.re * B.im
@@ -87,3 +85,4 @@ function _mul_add!(C, A, B, sz, ::Val{factor} = Val(1)) where {factor}
         add_gemm_kernel!(C, A, B, Val(factor))
     end
 end
+
diff --git a/src/pointermatrix.jl b/src/pointermatrix.jl
index 96d49a0..058f0b2 100644
--- a/src/pointermatrix.jl
+++ b/src/pointermatrix.jl
@@ -28,3 +28,34 @@ Base.@propagate_inbounds function Base.getindex(A::PointerMatrix, v, i::Integer,
 end
 Base.IndexStyle(::Type{<:PointerMatrix}) = IndexCartesian()
 
+
+struct PointerVector{T,P <: AbstractStridedPointer} <: AbstractVector{T}
+    ptr::P
+    length::Int
+    PointerVector(ptr::P, length::Int) where {T, P <: AbstractStridedPointer{T}} = new{T,P}(ptr, size)
+end
+@inline PtrVector(v::AbstractVector) = PointerVector(stridedpointer(v), size(v))
+@inline Base.pointer(v::PointerVector) = pointer(v.ptr)
+@inline Base.size(v::PointerMatrix) = (v.length,)
+@inline Base.length(v::PointerMatrix) = v.length
+@inline Base.strides(v::PointerVector) = strides(v.ptr)
+@inline stridedpointer(v::PointerVector) = v.ptr
+
+@inline Base.maybeview(A::PointerVector, r::UnitRange, c::UnitRange) = PointerMatrix(gesp(A.ptr, (first(r) - 1, (first(c) - 1))), (length(r), length(c)))
+# getindex is important for the sake of printing the AbstractPointerMatrix. If we call something a Matrix, it's nice to support the interface if possible.
+Base.@propagate_inbounds function Base.getindex(A::PointerMatrix, i::Integer, j::Integer)
+    @boundscheck begin
+        M, N = size(A)
+        (M < i || N < j) && throw(BoundsError(A, (i,j)))
+    end
+    vload(stridedpointer(A), (i-1, j-1))
+end
+Base.@propagate_inbounds function Base.getindex(A::PointerMatrix, v, i::Integer, j::Integer)
+    @boundscheck begin
+        M, N = size(A)
+        (M < i || N < j) && throw(BoundsError(A, (i,j)))
+    end
+    vstore!(stridedpointer(A), v, (i-1, j-1))
+end
+Base.IndexStyle(::Type{<:PointerMatrix}) = IndexCartesian()
+

From b5c2bd152ef72cea2cf5e297d898ba5e28590beb Mon Sep 17 00:00:00 2001
From: MasonProtter <mason.protter@gmail.com>
Date: Wed, 18 Mar 2020 09:23:53 -0600
Subject: [PATCH 2/8] BLAS2 stuff

---
 Project.toml            |  4 ++-
 src/Gaius.jl            | 28 ++++++++++------
 src/block_operations.jl | 66 +++++++++++++++++++++++++++++++++++-
 src/kernels.jl          | 66 ++++++++++++++++++++++++++++--------
 src/matmul.jl           | 74 +++++++++++++++++++++++++++++++++++------
 src/pointermatrix.jl    | 33 ++++++++----------
 test/runtests.jl        |  8 ++++-
 7 files changed, 222 insertions(+), 57 deletions(-)

diff --git a/Project.toml b/Project.toml
index d80556c..b41027d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,12 +8,14 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
+UnsafeArrays = "c4a57d5a-5b31-53a6-b365-19f8c011fbd6"
 
 [compat]
 julia = "1.3"
 
 [extras]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test"]
+test = ["Test", "BenchmarkTools"]
diff --git a/src/Gaius.jl b/src/Gaius.jl
index 5055f93..ea5469d 100644
--- a/src/Gaius.jl
+++ b/src/Gaius.jl
@@ -1,22 +1,16 @@
 module Gaius
 
-using LoopVectorization: @avx, VectorizationBase.AbstractStridedPointer, VectorizationBase.gesp, VectorizationBase.vload, VectorizationBase.vstore!, VectorizationBase.AVX512F
+using LoopVectorization: @avx, VectorizationBase.AbstractStridedPointer, VectorizationBase.gesp, VectorizationBase.gep, VectorizationBase.vload, VectorizationBase.vstore!, VectorizationBase.AVX512F
 import LoopVectorization: @avx, VectorizationBase.stridedpointer
 using StructArrays: StructArray
-using LinearAlgebra: LinearAlgebra
+using LinearAlgebra: LinearAlgebra, Adjoint, Transpose
+
+using UnsafeArrays: UnsafeArrays, @uviews, UnsafeArray
 
 export blocked_mul, blocked_mul!
 
 const DEFAULT_BLOCK_SIZE = AVX512F ? 96 : 64
 
-const Eltypes  = Union{Float64, Float32, Int64, Int32, Int16}
-const MatTypesC{T <: Eltypes} = Union{Matrix{T}, SubArray{T, 2, <: Array}} # C for Column Major
-const MatTypesR{T <: Eltypes} = Union{LinearAlgebra.Adjoint{T,<:MatTypesC{T}}, LinearAlgebra.Transpose{T,<:MatTypesC{T}}} # R for Row Major
-const MatTypes{T <: Eltypes} = Union{MatTypesC{T}, MatTypesR{T}}
-
-const   VecTypes{T <: Eltypes} = Union{Vector{T}, SubArray{T, 1, <:Array}}
-const CoVecTypes{T <: Eltypes} = Union{LinearAlgebra.Adjoint{T,<:VecTypes{T}}, LinearAlgebra.Transpose{T, <:VecTypes{T}}}
-
 # Note this does not support changing the number of threads at runtime
 macro _spawn(ex)
     if Threads.nthreads() > 1
@@ -34,6 +28,20 @@ macro _sync(ex)
 end
 
 include("pointermatrix.jl")
+
+const Eltypes  = Union{Float64, Float32, Int64, Int32, Int16}
+const MatTypesC{T} = Union{Matrix{T},
+                           SubArray{T, 2, <: AbstractArray},
+                           PointerMatrix{T},
+                           UnsafeArray{T, 2}} # C for Column Major
+const MatTypesR{T} = Union{Adjoint{T,<:MatTypesC{T}},
+                           Transpose{T,<:MatTypesC{T}}} # R for Row Major
+const MatTypes{ T} = Union{MatTypesC{T}, MatTypesR{T}}
+
+const   VecTypes{T} = Union{Vector{T}, SubArray{T, 1, <:Array}, PointerVector{T}}
+const CoVecTypes{T} = Union{Adjoint{T,<:VecTypes{T}},
+                            Transpose{T, <:VecTypes{T}}}
+
 include("matmul.jl")
 include("block_operations.jl")
 include("kernels.jl")
diff --git a/src/block_operations.jl b/src/block_operations.jl
index 2a7dc80..70b7b38 100644
--- a/src/block_operations.jl
+++ b/src/block_operations.jl
@@ -53,6 +53,26 @@ function block_mat_vec_mul!(C, A, B, sz)
         _mul_add!(C21, A22, B21, sz)
     end
 end
+function block_mat_vec_mul!(C::VecTypes, A, B::VecTypes, sz)
+    @inbounds @views begin 
+        C11 = C[1:sz    ]; 
+        C21 = C[sz+1:end];
+
+        A11 = A[1:sz,     1:sz]; A12 = A[1:sz,     sz+1:end] 
+        A21 = A[sz+1:end, 1:sz]; A22 = A[sz+1:end, sz+1:end]
+
+        B11 = B[1:sz    ]; 
+        B21 = B[sz+1:end];
+    end
+    @_sync begin
+        @_spawn begin
+            gemm_kernel!(C11, A11, B11)
+            _mul_add!(C11, A12, B21, sz)
+        end
+        _mul!(    C21, A21, B11, sz)
+        _mul_add!(C21, A22, B21, sz)
+    end
+end
 
 function block_covec_mat_mul!(C, A, B, sz)
     @inbounds @views begin 
@@ -99,9 +119,10 @@ function block_vec_covec_mul!(C, A, B, sz)
     end
 end
 
+
 function block_covec_vec_mul!(C, A, B, sz)
     @inbounds @views begin 
-        A11 = A[1:end,    1:sz]; A12 = A[1:sz,     sz+1:end] 
+        A11 = A[1:end,    1:sz]; A12 = A[1:end,     sz+1:end] 
 
         B11 = B[1:sz,     1:end];
         B21 = B[sz+1:end, 1:end];
@@ -111,6 +132,17 @@ function block_covec_vec_mul!(C, A, B, sz)
     _mul_add!(C, A12, B21, sz)
 end
 
+function block_covec_vec_mul!(C::VecTypes, A, B::VecTypes, sz)
+    @inbounds @views begin 
+        A11 = A[1:end,    1:sz]; A12 = A[1:end,     sz+1:end] 
+
+        B11 = B[1:sz    ];
+        B21 = B[sz+1:end];
+    end
+    gemm_kernel!(C, A11, B11)
+    _mul_add!(C, A12, B21, sz)
+end
+
 #----------------------------------------------------------------
 #----------------------------------------------------------------
 # Block Matrix addition-multiplication
@@ -165,6 +197,27 @@ function block_mat_vec_mul_add!(C, A, B, sz, ::Val{factor} = Val(1)) where {fact
     end
 end
 
+function block_mat_vec_mul_add!(C::VecTypes, A, B::VecTypes, sz, ::Val{factor} = Val(1)) where {factor}
+    @inbounds @views begin 
+        C11 = C[1:sz    ]; 
+        C21 = C[sz+1:end];
+
+        A11 = A[1:sz,     1:sz]; A12 = A[1:sz,     sz+1:end] 
+        A21 = A[sz+1:end, 1:sz]; A22 = A[sz+1:end, sz+1:end]
+
+        B11 = B[1:sz    ]; 
+        B21 = B[sz+1:end];
+    end
+    @_sync begin
+        @_spawn begin
+            add_gemm_kernel!(C11, A11, B11, Val(factor))
+            _mul_add!(C11, A12, B21, sz, Val(factor))
+        end
+        _mul_add!(C21, A21, B11, sz, Val(factor))
+        _mul_add!(C21, A22, B21, sz, Val(factor))
+    end
+end
+
 function block_covec_mat_mul_add!(C, A, B, sz, ::Val{factor} = Val(1)) where {factor}
     @inbounds @views begin 
         C11 = C[1:end,    1:sz]; C12 = C[1:end,     sz+1:end] 
@@ -218,3 +271,14 @@ function block_covec_vec_mul_add!(C, A, B, sz, ::Val{factor} = Val(1)) where {fa
     add_gemm_kernel!(C, A11, B11, Val(factor))
     _mul_add!(C, A12, B21, sz, Val(factor))
 end
+
+function block_covec_vec_mul_add!(C::VecTypes, A, B::VecTypes, sz, ::Val{factor} = Val(1)) where {factor}
+    @inbounds @views begin 
+        A11 = A[1:end,    1:sz]; A12 = A[1:end,    sz+1:end] 
+
+        B11 = B[1:sz    ];
+        B21 = B[sz+1:end];
+    end
+    add_gemm_kernel!(C, A11, B11, Val(factor))
+    _mul_add!(C, A12, B21, sz, Val(factor))
+end
diff --git a/src/kernels.jl b/src/kernels.jl
index 397d06e..5c31c57 100644
--- a/src/kernels.jl
+++ b/src/kernels.jl
@@ -8,36 +8,76 @@ function gemm_kernel!(C, A, B)
     end
 end
 
-function add_gemm_kernel!(C, A, B, ::Val{factor}) where {factor}
-    if factor == 1
-        _add_gemm_kernel!(C, A, B)
-    elseif factor == -1
-        _sub_gemm_kernel!(C, A, B)
-    else
-        _add_gemm_kernel!(C, C, B, Val(factor))
+function add_gemm_kernel!(C::MatTypes, A::MatTypes, B::MatTypes)
+    @avx for n ∈ 1:size(A, 1), m ∈ 1:size(B, 2)
+        Cₙₘ = zero(eltype(C))
+        for k ∈ 1:size(A, 2)
+            Cₙₘ += A[n,k] * B[k,m]
+        end
+        C[n,m] += Cₙₘ
     end
 end
 
-function _add_gemm_kernel!(C, A, B)
+add_gemm_kernel!(C::MatTypes, A::MatTypes, B::MatTypes, ::Val{1}) = add_gemm_kernel!(C, A, B)
+
+function add_gemm_kernel!(C::MatTypes, A::MatTypes, B::MatTypes, ::Val{-1})
     @avx for n ∈ 1:size(A, 1), m ∈ 1:size(B, 2)
         Cₙₘ = zero(eltype(C))
         for k ∈ 1:size(A, 2)
-            Cₙₘ += A[n,k] * B[k,m]
+            Cₙₘ -= A[n,k] * B[k,m]
         end
         C[n,m] += Cₙₘ
     end
 end
 
-function _sub_gemm_kernel!(C, A, B)
+function add_gemm_kernel!(C::MatTypes, A::MatTypes, B::MatTypes, ::Val{factor}) where {factor}
     @avx for n ∈ 1:size(A, 1), m ∈ 1:size(B, 2)
         Cₙₘ = zero(eltype(C))
         for k ∈ 1:size(A, 2)
-            Cₙₘ -= A[n,k] * B[k,m]
+            Cₙₘ += factor * A[n,k] * B[k,m]
         end
         C[n,m] += Cₙₘ
     end
 end
 
-_add_gemm_kernel!(C, A, B, ::Val{1}) = _add_gemm_kernel!(C, A, B)
-_add_gemm_kernel!(C, A, B, ::Val{-1}) = _sub_gemm_kernel!(C, A, B)
+#____________
+
+function gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes)
+    @avx for n ∈ 1:size(A, 1)
+        uₙ = zero(eltype(u))
+        for k ∈ 1:size(A, 2)
+            uₙ += A[n,k] * v[k]
+        end
+        u[n] = uₙ
+    end
+end
 
+function add_gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes)
+    @avx for n ∈ 1:size(A, 1)
+        uₙ = zero(eltype(u))
+        for k ∈ 1:size(A, 2)
+            uₙ += A[n,k] * v[k]
+        end
+        u[n] += uₙ
+    end
+end
+
+function _dd_gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes, ::Val{-1})
+    @avx for n ∈ 1:size(A, 1)
+        uₙ = zero(eltype(u))
+        for k ∈ 1:size(A, 2)
+            uₙ -= A[n,k] * v[k]
+        end
+        u[n] += uₙ
+    end
+end
+
+function add_gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes, ::Val{factor}) where {factor}
+    @avx for n ∈ 1:size(A, 1)
+        uₙ = zero(eltype(u))
+        for k ∈ 1:size(A, 2)
+            uₙ += factor * A[n,k] * v[k]
+        end
+        u[n] += uₙ
+    end
+end
diff --git a/src/matmul.jl b/src/matmul.jl
index 7f8374b..ae872ea 100644
--- a/src/matmul.jl
+++ b/src/matmul.jl
@@ -20,30 +20,35 @@ function blocked_mul(A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 2
     C
 end
 
-choose_block_size(C::AbstractMatrix, ::Nothing) = size(C, 1) >= (3DEFAULT_BLOCK_SIZE) >>> 1 ? DEFAULT_BLOCK_SIZE : 32
-choose_block_size(::Any, block_size::Integer) = block_size
-
+function choose_block_size(C, A, B, ::Nothing)
+    if (*)(length(C) |> UInt128, length(A) |> UInt128, length(B) |> UInt128) >= ((3DEFAULT_BLOCK_SIZE) >>> 1)^6
+        DEFAULT_BLOCK_SIZE
+    else
+        32
+    end
+end
+choose_block_size(C, A, B, block_size::Integer) = block_size
 
-function blocked_mul!(C::MatTypes{T}, A::MatTypes{T}, B::MatTypes{T};
+function blocked_mul!(C::AbstractArray{T}, A::AbstractArray{T}, B::AbstractArray{T};
                       block_size = nothing, sizecheck=true) where {T <: Eltypes}
     sizecheck && check_compatible_sizes(C, A, B)
 
-    _block_size = choose_block_size(C, block_size)
+    _block_size = choose_block_size(C, A, B, block_size)
 
-    GC.@preserve C A B _mul!(PtrMatrix(C), PtrMatrix(A), PtrMatrix(B), _block_size)
+    GC.@preserve C A B _mul!(PtrArray(C), PtrArray(A), PtrArray(B), _block_size)    
     C
 end
 
-function blocked_mul!(C::StructArray{Complex{T}, 2}, A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 2};
+function blocked_mul!(C::StructArray{Complex{T}}, A::StructArray{Complex{T}}, B::StructArray{Complex{T}};
                       block_size = DEFAULT_BLOCK_SIZE, sizecheck=true) where {T <: Eltypes}
     sizecheck && check_compatible_sizes(C, A, B)
     
-    _block_size = choose_block_size(C, block_size)
+    _block_size = choose_block_size(C, A, B, block_size)
     
     GC.@preserve C A B begin
-        Cre, Cim = PtrMatrix(C.re), PtrMatrix(C.im)
-        Are, Aim = PtrMatrix(A.re), PtrMatrix(A.im)
-        Bre, Bim = PtrMatrix(B.re), PtrMatrix(B.im)
+        Cre, Cim = PtrArray(C.re), PtrArray(C.im)
+        Are, Aim = PtrArray(A.re), PtrArray(A.im)
+        Bre, Bim = PtrArray(B.re), PtrArray(B.im)
         _mul!(    Cre, Are, Bre, _block_size)          # C.re = A.re * B.re
         _mul_add!(Cre, Aim, Bim, _block_size, Val(-1)) # C.re = C.re - A.im * B.im
         _mul!(    Cim, Are, Bim, _block_size)          # C.im = A.re * B.im
@@ -86,3 +91,50 @@ function _mul_add!(C, A, B, sz, ::Val{factor} = Val(1)) where {factor}
     end
 end
 
+#-----------------------------
+# matvec
+
+function check_compatible_sizes(C::VecTypes, A, B::VecTypes)
+    n    = length(C)
+    a, k = size(A)
+    b    = length(B)
+    @assert (n == a) && (k == b) "matrices of size $(size(C)), $(size(A)), $(size(B)) are incompatible"
+    nothing
+end
+
+function blocked_mul(A::MatTypes, B::VecTypes)
+    T = promote_type(eltype(A), eltype(B))
+    C = Vector{T}(undef, size(A,1))
+    blocked_mul!(C, A, B)
+    C
+end
+
+function blocked_mul(A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 1}) where {T <: Eltypes}
+    C = StructArray{Complex{T}}((Matrix{T}(undef, size(A, 1)),
+                                 Matrix{T}(undef, size(A, 1))))
+    blocked_mul!(C, A, B)
+    C
+end
+
+
+function _mul!(C::VecTypes{T}, A::MatTypes{T}, B::VecTypes{T}, sz) where {T<:Eltypes}
+    n, k = size(A)
+    if     n >= sz+8 && k >= sz+8
+        block_mat_vec_mul!(C, A, B, sz)
+    elseif n <  sz+8 && k >= sz+8
+        block_covec_vec_mul!(C, A, B, sz)
+    else
+        gemm_kernel!(C, A, B)
+    end
+end
+
+function _mul_add!(C::VecTypes{T}, A::MatTypes{T}, B::VecTypes{T}, sz ::Val{factor} = Val(1)) where {factor, T<:Eltypes}
+    n, k = size(A)
+    if     n >= sz+8 && k >= sz+8
+        block_mat_vec_mul_add!(C, A, B, sz, Val(factor))
+    elseif n <  sz+8 && k >= sz+8
+        block_covec_vec_mul_add!(C, A, B, sz, Val(factor))
+    else
+        add_gemm_kernel!(C, A, B, Val(factor))
+    end
+end
diff --git a/src/pointermatrix.jl b/src/pointermatrix.jl
index 058f0b2..f649d38 100644
--- a/src/pointermatrix.jl
+++ b/src/pointermatrix.jl
@@ -1,4 +1,8 @@
 
+PtrArray(M::AbstractMatrix) = PtrMatrix(M)
+PtrArray(v::AbstractVector) = PtrVector(v)
+
+
 struct PointerMatrix{T,P <: AbstractStridedPointer} <: AbstractMatrix{T}
     ptr::P
     size::Tuple{Int,Int}
@@ -29,33 +33,22 @@ end
 Base.IndexStyle(::Type{<:PointerMatrix}) = IndexCartesian()
 
 
+
+
 struct PointerVector{T,P <: AbstractStridedPointer} <: AbstractVector{T}
     ptr::P
     length::Int
-    PointerVector(ptr::P, length::Int) where {T, P <: AbstractStridedPointer{T}} = new{T,P}(ptr, size)
+    PointerVector(ptr::P, length::Int) where {T, P <: AbstractStridedPointer{T}} = new{T,P}(ptr, length)
 end
-@inline PtrVector(v::AbstractVector) = PointerVector(stridedpointer(v), size(v))
+@inline PtrVector(v::AbstractVector) = PointerVector(stridedpointer(v), length(v))
 @inline Base.pointer(v::PointerVector) = pointer(v.ptr)
-@inline Base.size(v::PointerMatrix) = (v.length,)
-@inline Base.length(v::PointerMatrix) = v.length
+@inline Base.size(v::PointerVector) = (v.length,)
+@inline Base.length(v::PointerVector) = v.length
 @inline Base.strides(v::PointerVector) = strides(v.ptr)
 @inline stridedpointer(v::PointerVector) = v.ptr
 
-@inline Base.maybeview(A::PointerVector, r::UnitRange, c::UnitRange) = PointerMatrix(gesp(A.ptr, (first(r) - 1, (first(c) - 1))), (length(r), length(c)))
-# getindex is important for the sake of printing the AbstractPointerMatrix. If we call something a Matrix, it's nice to support the interface if possible.
-Base.@propagate_inbounds function Base.getindex(A::PointerMatrix, i::Integer, j::Integer)
-    @boundscheck begin
-        M, N = size(A)
-        (M < i || N < j) && throw(BoundsError(A, (i,j)))
-    end
-    vload(stridedpointer(A), (i-1, j-1))
+@inline function Base.maybeview(v::PointerVector, i::UnitRange)
+    PointerVector(gesp(v.ptr, (first(i) - 1,)), length(i))
 end
-Base.@propagate_inbounds function Base.getindex(A::PointerMatrix, v, i::Integer, j::Integer)
-    @boundscheck begin
-        M, N = size(A)
-        (M < i || N < j) && throw(BoundsError(A, (i,j)))
-    end
-    vstore!(stridedpointer(A), v, (i-1, j-1))
-end
-Base.IndexStyle(::Type{<:PointerMatrix}) = IndexCartesian()
 
+Base.IndexStyle(::Type{<:PointerVector}) = IndexLinear()
diff --git a/test/runtests.jl b/test/runtests.jl
index bd7d539..4fad037 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,7 +2,13 @@ using Test, LinearAlgebra, Random
 using Gaius
 using StructArrays
 
-@testset "ComplexFloat64 Multiplication" begin
+# @testset "Matrix-Vector products" begin
+    
+
+# end
+
+
+@testset "ComplexFloat64 Matrix Multiplication" begin
     for sz ∈ [10, 50, 100, 200, 400, 1000]
         n, k, m = (sz .+ rand(-5:5, 3))
         @testset "($n × $k) × ($k × $m)" begin

From 1d5bcb18bf6b84760dd379d898bca5e3b12df4a5 Mon Sep 17 00:00:00 2001
From: MasonProtter <mason.protter@icloud.com>
Date: Wed, 18 Mar 2020 12:36:56 -0600
Subject: [PATCH 3/8] fix up and test matrix-vector products

---
 src/matmul.jl    | 25 +++++++++----------
 test/runtests.jl | 62 +++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 72 insertions(+), 15 deletions(-)

diff --git a/src/matmul.jl b/src/matmul.jl
index ae872ea..a0166cd 100644
--- a/src/matmul.jl
+++ b/src/matmul.jl
@@ -6,6 +6,16 @@ function check_compatible_sizes(C, A, B)
     nothing
 end
 
+function choose_block_size(C, A, B, ::Nothing)
+    if (*)(length(C) |> Int128, length(A) |> Int128, length(B) |> Int128) >= ((3DEFAULT_BLOCK_SIZE) >>> 1)^6
+        DEFAULT_BLOCK_SIZE
+    else
+        32
+    end
+end
+choose_block_size(C, A, B, block_size::Integer) = block_size
+
+
 function blocked_mul(A::MatTypes, B::MatTypes)
     T = promote_type(eltype(A), eltype(B))
     C = Matrix{T}(undef, size(A,1), size(B,2))
@@ -20,15 +30,6 @@ function blocked_mul(A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 2
     C
 end
 
-function choose_block_size(C, A, B, ::Nothing)
-    if (*)(length(C) |> UInt128, length(A) |> UInt128, length(B) |> UInt128) >= ((3DEFAULT_BLOCK_SIZE) >>> 1)^6
-        DEFAULT_BLOCK_SIZE
-    else
-        32
-    end
-end
-choose_block_size(C, A, B, block_size::Integer) = block_size
-
 function blocked_mul!(C::AbstractArray{T}, A::AbstractArray{T}, B::AbstractArray{T};
                       block_size = nothing, sizecheck=true) where {T <: Eltypes}
     sizecheck && check_compatible_sizes(C, A, B)
@@ -41,7 +42,7 @@ end
 
 function blocked_mul!(C::StructArray{Complex{T}}, A::StructArray{Complex{T}}, B::StructArray{Complex{T}};
                       block_size = DEFAULT_BLOCK_SIZE, sizecheck=true) where {T <: Eltypes}
-    sizecheck && check_compatible_sizes(C, A, B)
+    sizecheck && check_compatible_sizes(C.re, A.re, B.re)
     
     _block_size = choose_block_size(C, A, B, block_size)
     
@@ -110,8 +111,8 @@ function blocked_mul(A::MatTypes, B::VecTypes)
 end
 
 function blocked_mul(A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 1}) where {T <: Eltypes}
-    C = StructArray{Complex{T}}((Matrix{T}(undef, size(A, 1)),
-                                 Matrix{T}(undef, size(A, 1))))
+    C = StructArray{Complex{T}}((Vector{T}(undef, size(A, 1)),
+                                 Vector{T}(undef, size(A, 1))))
     blocked_mul!(C, A, B)
     C
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 4fad037..bd247a2 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,10 +2,66 @@ using Test, LinearAlgebra, Random
 using Gaius
 using StructArrays
 
-# @testset "Matrix-Vector products" begin
-    
+@testset "Matrix-Vector Float64" begin
+    for n ∈ [10, 100, 500, 2000]
+        for m ∈ [10, 100, 2000]
+            u = zeros(n)
+            A = rand(n, m)
+            v = rand(m)
+            @test blocked_mul!(u, A, v) ≈ A * v
+            @test u ≈ blocked_mul(A, v)
+        end
+    end
+end
+
+@testset "Matrix-Vector ComplexF64" begin
+    for n ∈ [10, 100, 500, 2000]
+        for m ∈ [10, 100, 2000]
+            u = zeros(ComplexF64, n)   |> StructArray
+            A = rand(ComplexF64, n, m) |> StructArray
+            v = rand(ComplexF64, m)    |> StructArray
+            @test blocked_mul!(u, A, v) ≈ collect(A) * collect(v)
+            @test u ≈ blocked_mul(A, v)
+        end
+    end
+end
+
+@testset "Matrix-Vector Float32" begin
+    for n ∈ [10, 100, 500, 2000]
+        for m ∈ [10, 100, 2000]
+            u = zeros(Float32, n)
+            A = rand(Float32, n, m)
+            v = rand(Float32, m)
+            @test blocked_mul!(u, A, v) ≈ A * v
+            @test u ≈ blocked_mul(A, v)
+        end
+    end
+end
+
+@testset "Matrix-Vector Int64" begin
+    for n ∈ [10, 100, 500, 2000]
+        for m ∈ [10, 100, 2000]
+            u = zeros(Int, n)
+            A = rand(-20:20, n, m)
+            v = rand(-20:20, m)
+            @test blocked_mul!(u, A, v) ≈ A * v
+            @test u ≈ blocked_mul(A, v)
+        end
+    end
+end
+
+@testset "Matrix-Vector ComplexInt32" begin
+    for n ∈ [10, 100, 500, 2000]
+        for m ∈ [10, 100, 2000]
+            u = zeros(Complex{Int32}, n) |> StructArray       
+            A = StructArray{Complex{Int32}}((rand(Int32.(-10:10), n, m), rand(Int32.(-10:10), n, m)))
+            v = StructArray{Complex{Int32}}((rand(Int32.(-10:10),    m), rand(Int32.(-10:10),    m)))   
+            @test blocked_mul!(u, A, v) ≈ collect(A) * collect(v)
+            @test u ≈ blocked_mul(A, v)
+        end
+    end
+end
 
-# end
 
 
 @testset "ComplexFloat64 Matrix Multiplication" begin

From 7f5fced33ef5aa6c8fbc49d6a1ae00ac10607870 Mon Sep 17 00:00:00 2001
From: MasonProtter <mason.protter@icloud.com>
Date: Fri, 3 Apr 2020 16:03:18 -0600
Subject: [PATCH 4/8] update tests

---
 test/runtests.jl | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index bd247a2..8c489e0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,7 +2,7 @@ using Test, LinearAlgebra, Random
 using Gaius
 using StructArrays
 
-@testset "Matrix-Vector Float64" begin
+@testset "Matrix-Vector Float64       " begin
     for n ∈ [10, 100, 500, 2000]
         for m ∈ [10, 100, 2000]
             u = zeros(n)
@@ -14,7 +14,7 @@ using StructArrays
     end
 end
 
-@testset "Matrix-Vector ComplexF64" begin
+@testset "Matrix-Vector ComplexF64    " begin
     for n ∈ [10, 100, 500, 2000]
         for m ∈ [10, 100, 2000]
             u = zeros(ComplexF64, n)   |> StructArray
@@ -26,7 +26,7 @@ end
     end
 end
 
-@testset "Matrix-Vector Float32" begin
+@testset "Matrix-Vector Float32       " begin
     for n ∈ [10, 100, 500, 2000]
         for m ∈ [10, 100, 2000]
             u = zeros(Float32, n)
@@ -38,7 +38,7 @@ end
     end
 end
 
-@testset "Matrix-Vector Int64" begin
+@testset "Matrix-Vector Int64         " begin
     for n ∈ [10, 100, 500, 2000]
         for m ∈ [10, 100, 2000]
             u = zeros(Int, n)
@@ -50,7 +50,7 @@ end
     end
 end
 
-@testset "Matrix-Vector ComplexInt32" begin
+@testset "Matrix-Vector ComplexInt32  " begin
     for n ∈ [10, 100, 500, 2000]
         for m ∈ [10, 100, 2000]
             u = zeros(Complex{Int32}, n) |> StructArray       
@@ -64,7 +64,7 @@ end
 
 
 
-@testset "ComplexFloat64 Matrix Multiplication" begin
+@testset "ComplexFloat64 Matrix-Matrix" begin
     for sz ∈ [10, 50, 100, 200, 400, 1000]
         n, k, m = (sz .+ rand(-5:5, 3))
         @testset "($n × $k) × ($k × $m)" begin
@@ -91,7 +91,7 @@ end
     end
 end
 
-@testset "Float64 Multiplication" begin
+@testset "Float64 Matrix-Matrix       " begin
     for sz ∈ [10, 50, 100, 200, 400, 1000]
         n, k, m = (sz .+ rand(-5:5, 3))
         @testset "($n × $k) × ($k × $m)" begin
@@ -129,7 +129,7 @@ end
     end
 end
 
-@testset "Float32 Multiplication" begin
+@testset "Float32 Matrix-Matrix       " begin
     for sz ∈ [10, 50, 100, 200, 400, 1000]
         n, k, m = (sz .+ rand(-5:5, 3))
         @testset "($n × $k) × ($k × $m)" begin
@@ -150,7 +150,7 @@ end
 end
 
 
-@testset "Int64 Multiplication" begin
+@testset "Int64 Matrix-Matrix         " begin
     for sz ∈ [10, 50, 100, 200, 400, 1000]
         n, k, m = (sz .+ rand(-5:5, 3))
         @testset "($n × $k) × ($k × $m)" begin
@@ -171,7 +171,7 @@ end
 end
 
 
-@testset "Int32 Multiplication" begin
+@testset "Int32 Matrix-Matrix         " begin
     for sz ∈ [10, 50, 100, 200, 400, 1000]
         n, k, m = (sz .+ rand(-5:5, 3))
         @testset "($n × $k) × ($k × $m)" begin

From 6a3b9a6c0f124d48a6c408020ca80d923b648cff Mon Sep 17 00:00:00 2001
From: MasonProtter <mason.protter@icloud.com>
Date: Fri, 3 Apr 2020 17:18:29 -0600
Subject: [PATCH 5/8] support Matrix-Vector and CoVector-Matrix products

---
 src/Gaius.jl         |  6 +--
 src/kernels.jl       | 44 +++++++++++++++++++++-
 src/matmul.jl        | 57 ++++++++++++++++++++++++++++-
 src/pointermatrix.jl | 87 +++++++++++++++++++++++++++++++++++---------
 test/runtests.jl     | 23 ++++++++++--
 5 files changed, 190 insertions(+), 27 deletions(-)

diff --git a/src/Gaius.jl b/src/Gaius.jl
index ea5469d..1d89554 100644
--- a/src/Gaius.jl
+++ b/src/Gaius.jl
@@ -38,9 +38,9 @@ const MatTypesR{T} = Union{Adjoint{T,<:MatTypesC{T}},
                            Transpose{T,<:MatTypesC{T}}} # R for Row Major
 const MatTypes{ T} = Union{MatTypesC{T}, MatTypesR{T}}
 
-const   VecTypes{T} = Union{Vector{T}, SubArray{T, 1, <:Array}, PointerVector{T}}
-const CoVecTypes{T} = Union{Adjoint{T,<:VecTypes{T}},
-                            Transpose{T, <:VecTypes{T}}}
+const   VecTypes{T} = Union{Vector{T}, SubArray{T, 1, <:Array}}#, PointerVector{T}}
+const CoVecTypes{T} = Union{Adjoint{T,   <:VecTypes{T}},
+                            Transpose{T, <:VecTypes{T}}}#, PointerAdjointVector{T}}
 
 include("matmul.jl")
 include("block_operations.jl")
diff --git a/src/kernels.jl b/src/kernels.jl
index 5c31c57..8a570bc 100644
--- a/src/kernels.jl
+++ b/src/kernels.jl
@@ -62,7 +62,7 @@ function add_gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes)
     end
 end
 
-function _dd_gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes, ::Val{-1})
+function add_gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes, ::Val{-1})
     @avx for n ∈ 1:size(A, 1)
         uₙ = zero(eltype(u))
         for k ∈ 1:size(A, 2)
@@ -81,3 +81,45 @@ function add_gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes, ::Val{factor})
         u[n] += uₙ
     end
 end
+
+#____________
+
+function gemm_kernel!(u::CoVecTypes, v::CoVecTypes, A::MatTypes)
+    @avx for m ∈ 1:size(A, 2)
+        uₘ = zero(eltype(u))
+        for k ∈ 1:size(A, 1)
+            uₘ += v[k] * A[k, m]
+        end
+        u[m] = uₘ
+    end
+end
+
+function add_gemm_kernel!(u::CoVecTypes, v::CoVecTypes, A::MatTypes)
+    @avx for m ∈ 1:size(A, 2)
+        uₘ = zero(eltype(u))
+        for k ∈ 1:size(A, 1)
+            uₘ += v[k] * A[k, m]
+        end
+        u[m] += uₘ
+    end
+end
+
+function add_gemm_kernel(u::CoVecTypes, v::CoVecTypes, A::MatTypes, ::Val{-1})
+    @avx for m ∈ 1:size(A, 2)
+        uₘ = zero(eltype(u))
+        for k ∈ 1:size(A, 1)
+            uₘ -= v[k] * A[k, m]
+        end
+        u[m] += uₘ
+    end
+end
+
+function add_gemm_kernel!(u::CoVecTypes, v::CoVecTypes, A::MatTypes, ::Val{factor}) where {factor}
+    @avx for m ∈ 1:size(A, 2)
+        uₘ = zero(eltype(u))
+        for k ∈ 1:size(A, 1)
+            uₘ += factor * v[k] * A[k, m]
+        end
+        u[m] += uₘ
+    end
+end
diff --git a/src/matmul.jl b/src/matmul.jl
index a0166cd..6f536dc 100644
--- a/src/matmul.jl
+++ b/src/matmul.jl
@@ -129,7 +129,7 @@ function _mul!(C::VecTypes{T}, A::MatTypes{T}, B::VecTypes{T}, sz) where {T<:Elt
     end
 end
 
-function _mul_add!(C::VecTypes{T}, A::MatTypes{T}, B::VecTypes{T}, sz ::Val{factor} = Val(1)) where {factor, T<:Eltypes}
+function _mul_add!(C::VecTypes{T}, A::MatTypes{T}, B::VecTypes{T}, sz, ::Val{factor} = Val(1)) where {factor, T<:Eltypes}
     n, k = size(A)
     if     n >= sz+8 && k >= sz+8
         block_mat_vec_mul_add!(C, A, B, sz, Val(factor))
@@ -139,3 +139,58 @@ function _mul_add!(C::VecTypes{T}, A::MatTypes{T}, B::VecTypes{T}, sz ::Val{fact
         add_gemm_kernel!(C, A, B, Val(factor))
     end
 end
+
+
+#-----------------------------
+# covec-mat
+
+function check_compatible_sizes(C::CoVecTypes, A::CoVecTypes, B::MatTypes)
+    m    = length(C)
+    n    = length(A)
+    a, b = size(B)
+    @assert (n == a) && (m == b) "matrices of size $(size(C)), $(size(A)), $(size(B)) are incompatible"
+    nothing
+end
+
+function blocked_mul(A::CoVecTypes, B::MatTypes)
+    T = promote_type(eltype(A), eltype(B))
+    C = Vector{T}(undef, size(B,2))'
+    blocked_mul!(C, A, B)
+    C
+end
+
+function blocked_mul(A::Adjoint{T, <:StructArray{Complex{T}, 1}}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes}
+    C = StructArray{Complex{T}}((Vector{T}(undef, size(B, 2)),
+                                 Vector{T}(undef, size(B, 2))))'
+    blocked_mul!(C, A, B)
+    C
+end
+
+function blocked_mul(A::Transpose{T, <:StructArray{Complex{T}, 1}}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes}
+    C = StructArray{Complex{T}}((Vector{T}(undef, size(B, 2)),
+                                 Vector{T}(undef, size(B, 2)))) |> transpose
+    blocked_mul!(C, A, B)
+    C
+end
+
+function _mul!(C::CoVecTypes{T}, A::CoVecTypes{T}, B::MatTypes{T}, sz) where {T<:Eltypes}
+    n, k, m = 1, size(A, 1), length(C)
+    if     k >= sz+8 && m >= sz+8
+        block_covec_mat_mul!(C, A, B, sz)
+    elseif k >= sz+8 && m <  sz+8
+        block_covec_vec_mul!(C, A, B, sz)
+    else
+        gemm_kernel!(C, A, B)
+    end
+end
+
+function _mul_add!(C::CoVecTypes{T}, A::CoVecTypes{T}, B::MatTypes{T}, sz, ::Val{factor} = Val(1)) where {factor, T<:Eltypes}
+    n, k, m = 1, size(A, 1), length(C)
+    if     k >= sz+8 && m >= sz+8
+        block_covec_mat_mul!(C, A, B, sz, Val(factor))
+    elseif k >= sz+8 && m <  sz+8
+        block_covec_vec_mul!(C, A, B, sz, Val(factor))
+    else
+        add_gemm_kernel!(C, A, B, Val(factor))
+    end
+end
diff --git a/src/pointermatrix.jl b/src/pointermatrix.jl
index f649d38..9e3826e 100644
--- a/src/pointermatrix.jl
+++ b/src/pointermatrix.jl
@@ -1,7 +1,8 @@
 
 PtrArray(M::AbstractMatrix) = PtrMatrix(M)
-PtrArray(v::AbstractVector) = PtrVector(v)
-
+PtrArray(v::AbstractVector) = v#PtrVector(v)
+PtrArray(v::Adjoint{T, <:AbstractVector}) where {T} = v#PointerAdjointVector(v)
+PtrArray(v::Transpose{T, <:AbstractVector}) where {T} = v#PointerTransposeVector(v)
 
 struct PointerMatrix{T,P <: AbstractStridedPointer} <: AbstractMatrix{T}
     ptr::P
@@ -34,21 +35,71 @@ Base.IndexStyle(::Type{<:PointerMatrix}) = IndexCartesian()
 
 
 
+# This machinery below might not actually be needed.
 
-struct PointerVector{T,P <: AbstractStridedPointer} <: AbstractVector{T}
-    ptr::P
-    length::Int
-    PointerVector(ptr::P, length::Int) where {T, P <: AbstractStridedPointer{T}} = new{T,P}(ptr, length)
-end
-@inline PtrVector(v::AbstractVector) = PointerVector(stridedpointer(v), length(v))
-@inline Base.pointer(v::PointerVector) = pointer(v.ptr)
-@inline Base.size(v::PointerVector) = (v.length,)
-@inline Base.length(v::PointerVector) = v.length
-@inline Base.strides(v::PointerVector) = strides(v.ptr)
-@inline stridedpointer(v::PointerVector) = v.ptr
-
-@inline function Base.maybeview(v::PointerVector, i::UnitRange)
-    PointerVector(gesp(v.ptr, (first(i) - 1,)), length(i))
-end
+# struct PointerVector{T,P <: AbstractStridedPointer} <: AbstractVector{T}
+#     ptr::P
+#     length::Int
+#     PointerVector(ptr::P, length::Int) where {T, P <: AbstractStridedPointer{T}} = new{T,P}(ptr, length)
+# end
+# @inline PtrVector(v::AbstractVector) = PointerVector(stridedpointer(v), length(v))
+# @inline Base.pointer(v::PointerVector) = pointer(v.ptr)
+# @inline Base.size(v::PointerVector) = (v.length,)
+# @inline Base.length(v::PointerVector) = v.length
+# @inline Base.strides(v::PointerVector) = strides(v.ptr)
+# @inline stridedpointer(v::PointerVector) = v.ptr
+
+# @inline function Base.maybeview(v::PointerVector, i::UnitRange)
+#     PointerVector(gesp(v.ptr, (first(i) - 1,)), length(i))
+# end
+
+# Base.IndexStyle(::Type{<:PointerVector}) = IndexLinear()
+
+
+
+# struct PointerAdjointVector{T,P <: AbstractStridedPointer} <: AbstractMatrix{T}
+#     ptr::P
+#     length::Int
+#     PointerAdjointVector(ptr::P, length::Int) where {T, P <: AbstractStridedPointer{T}} = new{T,P}(ptr, length)
+# end
+# @inline function PointerAdjointVector(v::Adjoint{T, <:AbstractVector{T}}) where {T}
+#     vp = parent(v)
+#     if T <: Complex
+#         vp = conj.(p) #this copy is unfortunate, but I'm scared of mutating the user's data
+#     end
+#     PointerAdjointVector(stridedpointer(vp), length(v))
+# end
+# @inline Base.pointer(v::PointerAdjointVector) = pointer(v.ptr)
+# @inline Base.size(v::PointerAdjointVector) = (1,v.length)
+# @inline Base.length(v::PointerAdjointVector) = v.length
+# @inline Base.strides(v::PointerAdjointVector) = strides(v.ptr)
+# @inline stridedpointer(v::PointerAdjointVector) = v.ptr
+
+# @inline function Base.maybeview(v::PointerAdjointVector, i::UnitRange)
+#     PointerVector(gesp(v.ptr, (first(i) - 1,)), length(i))
+# end
+
+# Base.IndexStyle(::Type{<:PointerAdjointVector}) = IndexLinear()
+
+
+
+# struct PointerTransposeVector{T,P <: AbstractStridedPointer} <: AbstractMatrix{T}
+#     ptr::P
+#     length::Int
+#     PointerTransposeVector(ptr::P, length::Int) where {T, P <: AbstractStridedPointer{T}} = new{T,P}(ptr, length)
+# end
+# @inline function PointerTransposeVector(v::Transpose{T, <:AbstractVector{T}}) where {T}
+#     vp = parent(v)
+#     PointerTransposeVector(stridedpointer(vp), length(v))
+# end
+# @inline Base.pointer(v::PointerTransposeVector) = pointer(v.ptr)
+# @inline Base.size(v::PointerTransposeVector) = (1,v.length)
+# @inline Base.length(v::PointerTransposeVector) = v.length
+# @inline Base.strides(v::PointerTransposeVector) = strides(v.ptr)
+# @inline stridedpointer(v::PointerTransposeVector) = v.ptr
+
+# @inline function Base.maybeview(v::PointerTransposeVector, i::UnitRange)
+#     PointerVector(gesp(v.ptr, (first(i) - 1,)), length(i))
+# end
 
-Base.IndexStyle(::Type{<:PointerVector}) = IndexLinear()
+# Base.IndexStyle(::Type{<:PointerTransposeVector}) = IndexLinear()
diff --git a/test/runtests.jl b/test/runtests.jl
index 8c489e0..3543d50 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,7 +2,22 @@ using Test, LinearAlgebra, Random
 using Gaius
 using StructArrays
 
-@testset "Matrix-Vector Float64       " begin
+@testset "Float64 Matrix-CoVector     " begin
+    for n ∈ [10, 100, 500, 2000]
+        for m ∈ [10, 100, 2000]
+            u = zeros(m)
+            A = rand(n, m)
+            v = rand(n)
+            @test blocked_mul!(u', v', A) ≈ v' * A
+            @test u' ≈ blocked_mul(v', A)
+
+            @test blocked_mul!(transpose(u), transpose(v), A) ≈ transpose(v) * A
+            @test transpose(u) ≈ blocked_mul(transpose(v), A)
+        end
+    end
+end
+
+@testset "Float64 Matrix-Vector       " begin
     for n ∈ [10, 100, 500, 2000]
         for m ∈ [10, 100, 2000]
             u = zeros(n)
@@ -26,7 +41,7 @@ end
     end
 end
 
-@testset "Matrix-Vector Float32       " begin
+@testset "Float32 Matrix-Vector       " begin
     for n ∈ [10, 100, 500, 2000]
         for m ∈ [10, 100, 2000]
             u = zeros(Float32, n)
@@ -38,7 +53,7 @@ end
     end
 end
 
-@testset "Matrix-Vector Int64         " begin
+@testset "Int64 Matrix-Vector         " begin
     for n ∈ [10, 100, 500, 2000]
         for m ∈ [10, 100, 2000]
             u = zeros(Int, n)
@@ -50,7 +65,7 @@ end
     end
 end
 
-@testset "Matrix-Vector ComplexInt32  " begin
+@testset "ComplexInt32 Matrix-Vector  " begin
     for n ∈ [10, 100, 500, 2000]
         for m ∈ [10, 100, 2000]
             u = zeros(Complex{Int32}, n) |> StructArray       

From 767bc978ef46f80f486b9da3e20f3934110dc114 Mon Sep 17 00:00:00 2001
From: MasonProtter <mason.protter@icloud.com>
Date: Sat, 4 Apr 2020 00:54:26 -0600
Subject: [PATCH 6/8] Complex BLAS2 support with row major storage

---
 src/Gaius.jl     | 25 ++++++++++++-----------
 src/matmul.jl    | 47 ++++++++++++++++++++++++++++++++++++++++--
 test/runtests.jl | 53 ++++++++++++++++++++++++++++++++----------------
 3 files changed, 93 insertions(+), 32 deletions(-)

diff --git a/src/Gaius.jl b/src/Gaius.jl
index 1d89554..239f68b 100644
--- a/src/Gaius.jl
+++ b/src/Gaius.jl
@@ -29,18 +29,19 @@ end
 
 include("pointermatrix.jl")
 
-const Eltypes  = Union{Float64, Float32, Int64, Int32, Int16}
-const MatTypesC{T} = Union{Matrix{T},
-                           SubArray{T, 2, <: AbstractArray},
-                           PointerMatrix{T},
-                           UnsafeArray{T, 2}} # C for Column Major
-const MatTypesR{T} = Union{Adjoint{T,<:MatTypesC{T}},
-                           Transpose{T,<:MatTypesC{T}}} # R for Row Major
-const MatTypes{ T} = Union{MatTypesC{T}, MatTypesR{T}}
-
-const   VecTypes{T} = Union{Vector{T}, SubArray{T, 1, <:Array}}#, PointerVector{T}}
-const CoVecTypes{T} = Union{Adjoint{T,   <:VecTypes{T}},
-                            Transpose{T, <:VecTypes{T}}}#, PointerAdjointVector{T}}
+Eltypes  = Union{Float64, Float32, Int64, Int32, Int16}
+MatTypesC{T} = Union{Matrix{T},
+                     SubArray{T, 2, <: AbstractArray},
+                     PointerMatrix{T},
+                     UnsafeArray{T, 2}} # C for Column Major
+MatTypesR{T} = Union{Adjoint{T,<:MatTypesC{T}},
+                     Transpose{T,<:MatTypesC{T}}} # R for Row Major
+MatTypes{ T} = Union{MatTypesC{T}, MatTypesR{T}}
+
+VecTypes{T}   = Union{Vector{T}, SubArray{T, 1, <:Array}}
+CoVecTypes{T} = Union{Adjoint{T,   <:VecTypes{T}},
+                      Transpose{T, <:VecTypes{T}}}
+
 
 include("matmul.jl")
 include("block_operations.jl")
diff --git a/src/matmul.jl b/src/matmul.jl
index 6f536dc..3d48d47 100644
--- a/src/matmul.jl
+++ b/src/matmul.jl
@@ -58,6 +58,49 @@ function blocked_mul!(C::StructArray{Complex{T}}, A::StructArray{Complex{T}}, B:
     C
 end
 
+function blocked_mul!(C::Adjoint{Complex{T}, <:StructArray{Complex{T}}},
+                      A::Adjoint{Complex{T}, <:StructArray{Complex{T}}},
+                      B::StructArray{Complex{T}};
+                      block_size = DEFAULT_BLOCK_SIZE, sizecheck=true) where {T <: Eltypes}
+    sizecheck && check_compatible_sizes(C.parent.re', A.parent.re', B.re)
+    
+    _block_size = choose_block_size(C, A, B, block_size)
+    A.parent.im .= (-).(A.parent.im) #ugly hack
+    GC.@preserve C A B begin
+        Cre, Cim = PtrArray(C.parent.re'), PtrArray(C.parent.im')
+        Are, Aim = PtrArray(A.parent.re'), PtrArray(A.parent.im')
+        Bre, Bim = PtrArray(B.re), PtrArray(B.im)
+        _mul!(    Cre, Are, Bre, _block_size)          # C.re = A.re * B.re
+        _mul_add!(Cre, Aim, Bim, _block_size, Val(-1)) # C.re = C.re - A.im * B.im
+        _mul!(    Cim, Are, Bim, _block_size)          # C.im = A.re * B.im
+        _mul_add!(Cim, Aim, Bre, _block_size)          # C.im = C.im + A.im * B.re
+    end
+    A.parent.im .= (-).(A.parent.im)
+    C.parent.im .= (-).(C.parent.im) # ugly hack
+    C
+end
+
+function blocked_mul!(C::Transpose{Complex{T}, <:StructArray{Complex{T}}},
+                      A::Transpose{Complex{T}, <:StructArray{Complex{T}}},
+                      B::StructArray{Complex{T}};
+                      block_size = DEFAULT_BLOCK_SIZE, sizecheck=true) where {T <: Eltypes}
+    sizecheck && check_compatible_sizes(C.parent.re |> transpose, A.parent.re |> transpose, B.re)
+    
+    _block_size = choose_block_size(C, A, B, block_size)
+    
+    GC.@preserve C A B begin
+        Cre, Cim = PtrArray(C.parent.re |> transpose), PtrArray(C.parent.im |> transpose)
+        Are, Aim = PtrArray(A.parent.re |> transpose), PtrArray(A.parent.im |> transpose)
+        Bre, Bim = PtrArray(B.re), PtrArray(B.im)
+        _mul!(    Cre, Are, Bre, _block_size)          # C.re = A.re * B.re
+        _mul_add!(Cre, Aim, Bim, _block_size, Val(-1)) # C.re = C.re - A.im * B.im
+        _mul!(    Cim, Are, Bim, _block_size)          # C.im = A.re * B.im
+        _mul_add!(Cim, Aim, Bre, _block_size)          # C.im = C.im + A.im * B.re
+    end
+    C
+end
+
+
 function _mul!(C, A, B, sz)
     n, k, m = size(C, 1), size(A, 2), size(C, 2)
     if n >= sz+8 && m >= sz+8 && k >= sz+8
@@ -159,14 +202,14 @@ function blocked_mul(A::CoVecTypes, B::MatTypes)
     C
 end
 
-function blocked_mul(A::Adjoint{T, <:StructArray{Complex{T}, 1}}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes}
+function blocked_mul(A::Adjoint{Complex{T}, <:StructArray{Complex{T}, 1}}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes}
     C = StructArray{Complex{T}}((Vector{T}(undef, size(B, 2)),
                                  Vector{T}(undef, size(B, 2))))'
     blocked_mul!(C, A, B)
     C
 end
 
-function blocked_mul(A::Transpose{T, <:StructArray{Complex{T}, 1}}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes}
+function blocked_mul(A::Transpose{Complex{T}, <:StructArray{Complex{T}, 1}}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes}
     C = StructArray{Complex{T}}((Vector{T}(undef, size(B, 2)),
                                  Vector{T}(undef, size(B, 2)))) |> transpose
     blocked_mul!(C, A, B)
diff --git a/test/runtests.jl b/test/runtests.jl
index 3543d50..dd5a4b4 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,24 +2,9 @@ using Test, LinearAlgebra, Random
 using Gaius
 using StructArrays
 
-@testset "Float64 Matrix-CoVector     " begin
-    for n ∈ [10, 100, 500, 2000]
-        for m ∈ [10, 100, 2000]
-            u = zeros(m)
-            A = rand(n, m)
-            v = rand(n)
-            @test blocked_mul!(u', v', A) ≈ v' * A
-            @test u' ≈ blocked_mul(v', A)
-
-            @test blocked_mul!(transpose(u), transpose(v), A) ≈ transpose(v) * A
-            @test transpose(u) ≈ blocked_mul(transpose(v), A)
-        end
-    end
-end
-
 @testset "Float64 Matrix-Vector       " begin
-    for n ∈ [10, 100, 500, 2000]
-        for m ∈ [10, 100, 2000]
+    for n ∈ [10, 100, 500, 10000]
+        for m ∈ [10, 100, 10000]
             u = zeros(n)
             A = rand(n, m)
             v = rand(m)
@@ -29,7 +14,7 @@ end
     end
 end
 
-@testset "Matrix-Vector ComplexF64    " begin
+@testset "ComplexF64  Matrix-Vector   " begin
     for n ∈ [10, 100, 500, 2000]
         for m ∈ [10, 100, 2000]
             u = zeros(ComplexF64, n)   |> StructArray
@@ -78,6 +63,38 @@ end
 end
 
 
+@testset "Float64 CoVector-Matrix     " begin
+    for n ∈ [10, 100, 500, 2000]
+        for m ∈ [10, 100, 2000]
+            u = zeros(m)
+            A = rand(n, m)
+            v = rand(n)
+            @test blocked_mul!(u', v', A) ≈ v' * A
+            @test u' ≈ blocked_mul(v', A)
+
+            @test blocked_mul!(transpose(u), transpose(v), A) ≈ transpose(v) * A
+            @test transpose(u) ≈ blocked_mul(transpose(v), A)
+        end
+    end
+end
+
+@testset "ComplexF32 CoVector-Matrix  " begin
+    for n ∈ [10, 100, 500, 2000]
+        for m ∈ [10, 100, 2000]
+            u = zeros(Complex{Float32}, m)   |> StructArray
+            A = rand(Complex{Float32}, n, m) |> StructArray
+            v = rand(Complex{Float32}, n)    |> StructArray
+            @test blocked_mul!(u', v', A) ≈ v' * A
+            @test u' ≈ blocked_mul(v', A)
+
+            @test blocked_mul!(transpose(u), transpose(v), A) ≈ transpose(v) * A
+            @test transpose(u) ≈ blocked_mul(transpose(v), A)
+        end
+    end
+end
+
+
+
 
 @testset "ComplexFloat64 Matrix-Matrix" begin
     for sz ∈ [10, 50, 100, 200, 400, 1000]

From 134d01b6afc6af5f962f2f4870c89432a30a8b3f Mon Sep 17 00:00:00 2001
From: MasonProtter <mason.protter@icloud.com>
Date: Sat, 4 Apr 2020 00:55:35 -0600
Subject: [PATCH 7/8] increment version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index b41027d..3b34f2d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "Gaius"
 uuid = "bffe22d1-cb55-4f4e-ac2c-f4dd4bf58912"
 authors = ["MasonProtter <mason.protter@icloud.com>"]
-version = "0.1.0"
+version = "0.2.0"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

From a45343a3cb503e63ee02c68a3b5215386e413d18 Mon Sep 17 00:00:00 2001
From: MasonProtter <mason.protter@icloud.com>
Date: Sat, 4 Apr 2020 01:05:38 -0600
Subject: [PATCH 8/8] test on 1.4

---
 .travis.yml  | 1 +
 Project.toml | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 2179931..2f77c41 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,7 @@ os:
   - linux
 julia:
   - 1.3
+  - 1.4
   - nightly
 
 matrix:
diff --git a/Project.toml b/Project.toml
index 3b34f2d..2b4afd2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,11 +11,11 @@ StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
 UnsafeArrays = "c4a57d5a-5b31-53a6-b365-19f8c011fbd6"
 
 [compat]
-julia = "1.3"
+julia = "1.3, 1.4"
 
 [extras]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "BenchmarkTools"]
+test = ["Test", "BenchmarkTools"]
\ No newline at end of file