From 90c829221f1c074b41b603848d10bb12b034510f Mon Sep 17 00:00:00 2001 From: MasonProtter Date: Wed, 11 Mar 2020 13:08:01 -0600 Subject: [PATCH 1/8] early setup --- src/Gaius.jl | 3 +++ src/matmul.jl | 23 +++++++++++------------ src/pointermatrix.jl | 31 +++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/src/Gaius.jl b/src/Gaius.jl index 88543a6..5055f93 100644 --- a/src/Gaius.jl +++ b/src/Gaius.jl @@ -14,6 +14,9 @@ const MatTypesC{T <: Eltypes} = Union{Matrix{T}, SubArray{T, 2, <: Array}} # C f const MatTypesR{T <: Eltypes} = Union{LinearAlgebra.Adjoint{T,<:MatTypesC{T}}, LinearAlgebra.Transpose{T,<:MatTypesC{T}}} # R for Row Major const MatTypes{T <: Eltypes} = Union{MatTypesC{T}, MatTypesR{T}} +const VecTypes{T <: Eltypes} = Union{Vector{T}, SubArray{T, 1, <:Array}} +const CoVecTypes{T <: Eltypes} = Union{LinearAlgebra.Adjoint{T,<:VecTypes{T}}, LinearAlgebra.Transpose{T, <:VecTypes{T}}} + # Note this does not support changing the number of threads at runtime macro _spawn(ex) if Threads.nthreads() > 1 diff --git a/src/matmul.jl b/src/matmul.jl index b20ffff..7f8374b 100644 --- a/src/matmul.jl +++ b/src/matmul.jl @@ -13,11 +13,19 @@ function blocked_mul(A::MatTypes, B::MatTypes) C end +function blocked_mul(A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes} + C = StructArray{Complex{T}}((Matrix{T}(undef, size(A, 1), size(B,2)), + Matrix{T}(undef, size(A, 1), size(B,2)))) + blocked_mul!(C, A, B) + C +end + choose_block_size(C::AbstractMatrix, ::Nothing) = size(C, 1) >= (3DEFAULT_BLOCK_SIZE) >>> 1 ? DEFAULT_BLOCK_SIZE : 32 choose_block_size(::Any, block_size::Integer) = block_size + function blocked_mul!(C::MatTypes{T}, A::MatTypes{T}, B::MatTypes{T}; - block_size = nothing, sizecheck=true) where {T <: Eltypes} + block_size = nothing, sizecheck=true) where {T <: Eltypes} sizecheck && check_compatible_sizes(C, A, B) _block_size = choose_block_size(C, block_size) @@ -26,15 +34,8 @@ function blocked_mul!(C::MatTypes{T}, A::MatTypes{T}, B::MatTypes{T}; C end -function blocked_mul(A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes} - C = StructArray{Complex{T}}((Matrix{T}(undef, size(A, 1), size(B,2)), - Matrix{T}(undef, size(A, 1), size(B,2)))) - blocked_mul!(C, A, B) - C -end - function blocked_mul!(C::StructArray{Complex{T}, 2}, A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 2}; - block_size = DEFAULT_BLOCK_SIZE, sizecheck=true) where {T <: Eltypes} + block_size = DEFAULT_BLOCK_SIZE, sizecheck=true) where {T <: Eltypes} sizecheck && check_compatible_sizes(C, A, B) _block_size = choose_block_size(C, block_size) @@ -43,9 +44,6 @@ function blocked_mul!(C::StructArray{Complex{T}, 2}, A::StructArray{Complex{T}, Cre, Cim = PtrMatrix(C.re), PtrMatrix(C.im) Are, Aim = PtrMatrix(A.re), PtrMatrix(A.im) Bre, Bim = PtrMatrix(B.re), PtrMatrix(B.im) - # Cre, Cim = C.re, C.im - # Are, Aim = A.re, A.im - # Bre, Bim = B.re, B.im _mul!( Cre, Are, Bre, _block_size) # C.re = A.re * B.re _mul_add!(Cre, Aim, Bim, _block_size, Val(-1)) # C.re = C.re - A.im * B.im _mul!( Cim, Are, Bim, _block_size) # C.im = A.re * B.im @@ -87,3 +85,4 @@ function _mul_add!(C, A, B, sz, ::Val{factor} = Val(1)) where {factor} add_gemm_kernel!(C, A, B, Val(factor)) end end + diff --git a/src/pointermatrix.jl b/src/pointermatrix.jl index 96d49a0..058f0b2 100644 --- a/src/pointermatrix.jl +++ b/src/pointermatrix.jl @@ -28,3 +28,34 @@ Base.@propagate_inbounds function Base.getindex(A::PointerMatrix, v, i::Integer, end Base.IndexStyle(::Type{<:PointerMatrix}) = IndexCartesian() + +struct PointerVector{T,P <: AbstractStridedPointer} <: AbstractVector{T} + ptr::P + length::Int + PointerVector(ptr::P, length::Int) where {T, P <: AbstractStridedPointer{T}} = new{T,P}(ptr, size) +end +@inline PtrVector(v::AbstractVector) = PointerVector(stridedpointer(v), size(v)) +@inline Base.pointer(v::PointerVector) = pointer(v.ptr) +@inline Base.size(v::PointerMatrix) = (v.length,) +@inline Base.length(v::PointerMatrix) = v.length +@inline Base.strides(v::PointerVector) = strides(v.ptr) +@inline stridedpointer(v::PointerVector) = v.ptr + +@inline Base.maybeview(A::PointerVector, r::UnitRange, c::UnitRange) = PointerMatrix(gesp(A.ptr, (first(r) - 1, (first(c) - 1))), (length(r), length(c))) +# getindex is important for the sake of printing the AbstractPointerMatrix. If we call something a Matrix, it's nice to support the interface if possible. +Base.@propagate_inbounds function Base.getindex(A::PointerMatrix, i::Integer, j::Integer) + @boundscheck begin + M, N = size(A) + (M < i || N < j) && throw(BoundsError(A, (i,j))) + end + vload(stridedpointer(A), (i-1, j-1)) +end +Base.@propagate_inbounds function Base.getindex(A::PointerMatrix, v, i::Integer, j::Integer) + @boundscheck begin + M, N = size(A) + (M < i || N < j) && throw(BoundsError(A, (i,j))) + end + vstore!(stridedpointer(A), v, (i-1, j-1)) +end +Base.IndexStyle(::Type{<:PointerMatrix}) = IndexCartesian() + From b5c2bd152ef72cea2cf5e297d898ba5e28590beb Mon Sep 17 00:00:00 2001 From: MasonProtter Date: Wed, 18 Mar 2020 09:23:53 -0600 Subject: [PATCH 2/8] BLAS2 stuff --- Project.toml | 4 ++- src/Gaius.jl | 28 ++++++++++------ src/block_operations.jl | 66 +++++++++++++++++++++++++++++++++++- src/kernels.jl | 66 ++++++++++++++++++++++++++++-------- src/matmul.jl | 74 +++++++++++++++++++++++++++++++++++------ src/pointermatrix.jl | 33 ++++++++---------- test/runtests.jl | 8 ++++- 7 files changed, 222 insertions(+), 57 deletions(-) diff --git a/Project.toml b/Project.toml index d80556c..b41027d 100644 --- a/Project.toml +++ b/Project.toml @@ -8,12 +8,14 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" +UnsafeArrays = "c4a57d5a-5b31-53a6-b365-19f8c011fbd6" [compat] julia = "1.3" [extras] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test"] +test = ["Test", "BenchmarkTools"] diff --git a/src/Gaius.jl b/src/Gaius.jl index 5055f93..ea5469d 100644 --- a/src/Gaius.jl +++ b/src/Gaius.jl @@ -1,22 +1,16 @@ module Gaius -using LoopVectorization: @avx, VectorizationBase.AbstractStridedPointer, VectorizationBase.gesp, VectorizationBase.vload, VectorizationBase.vstore!, VectorizationBase.AVX512F +using LoopVectorization: @avx, VectorizationBase.AbstractStridedPointer, VectorizationBase.gesp, VectorizationBase.gep, VectorizationBase.vload, VectorizationBase.vstore!, VectorizationBase.AVX512F import LoopVectorization: @avx, VectorizationBase.stridedpointer using StructArrays: StructArray -using LinearAlgebra: LinearAlgebra +using LinearAlgebra: LinearAlgebra, Adjoint, Transpose + +using UnsafeArrays: UnsafeArrays, @uviews, UnsafeArray export blocked_mul, blocked_mul! const DEFAULT_BLOCK_SIZE = AVX512F ? 96 : 64 -const Eltypes = Union{Float64, Float32, Int64, Int32, Int16} -const MatTypesC{T <: Eltypes} = Union{Matrix{T}, SubArray{T, 2, <: Array}} # C for Column Major -const MatTypesR{T <: Eltypes} = Union{LinearAlgebra.Adjoint{T,<:MatTypesC{T}}, LinearAlgebra.Transpose{T,<:MatTypesC{T}}} # R for Row Major -const MatTypes{T <: Eltypes} = Union{MatTypesC{T}, MatTypesR{T}} - -const VecTypes{T <: Eltypes} = Union{Vector{T}, SubArray{T, 1, <:Array}} -const CoVecTypes{T <: Eltypes} = Union{LinearAlgebra.Adjoint{T,<:VecTypes{T}}, LinearAlgebra.Transpose{T, <:VecTypes{T}}} - # Note this does not support changing the number of threads at runtime macro _spawn(ex) if Threads.nthreads() > 1 @@ -34,6 +28,20 @@ macro _sync(ex) end include("pointermatrix.jl") + +const Eltypes = Union{Float64, Float32, Int64, Int32, Int16} +const MatTypesC{T} = Union{Matrix{T}, + SubArray{T, 2, <: AbstractArray}, + PointerMatrix{T}, + UnsafeArray{T, 2}} # C for Column Major +const MatTypesR{T} = Union{Adjoint{T,<:MatTypesC{T}}, + Transpose{T,<:MatTypesC{T}}} # R for Row Major +const MatTypes{ T} = Union{MatTypesC{T}, MatTypesR{T}} + +const VecTypes{T} = Union{Vector{T}, SubArray{T, 1, <:Array}, PointerVector{T}} +const CoVecTypes{T} = Union{Adjoint{T,<:VecTypes{T}}, + Transpose{T, <:VecTypes{T}}} + include("matmul.jl") include("block_operations.jl") include("kernels.jl") diff --git a/src/block_operations.jl b/src/block_operations.jl index 2a7dc80..70b7b38 100644 --- a/src/block_operations.jl +++ b/src/block_operations.jl @@ -53,6 +53,26 @@ function block_mat_vec_mul!(C, A, B, sz) _mul_add!(C21, A22, B21, sz) end end +function block_mat_vec_mul!(C::VecTypes, A, B::VecTypes, sz) + @inbounds @views begin + C11 = C[1:sz ]; + C21 = C[sz+1:end]; + + A11 = A[1:sz, 1:sz]; A12 = A[1:sz, sz+1:end] + A21 = A[sz+1:end, 1:sz]; A22 = A[sz+1:end, sz+1:end] + + B11 = B[1:sz ]; + B21 = B[sz+1:end]; + end + @_sync begin + @_spawn begin + gemm_kernel!(C11, A11, B11) + _mul_add!(C11, A12, B21, sz) + end + _mul!( C21, A21, B11, sz) + _mul_add!(C21, A22, B21, sz) + end +end function block_covec_mat_mul!(C, A, B, sz) @inbounds @views begin @@ -99,9 +119,10 @@ function block_vec_covec_mul!(C, A, B, sz) end end + function block_covec_vec_mul!(C, A, B, sz) @inbounds @views begin - A11 = A[1:end, 1:sz]; A12 = A[1:sz, sz+1:end] + A11 = A[1:end, 1:sz]; A12 = A[1:end, sz+1:end] B11 = B[1:sz, 1:end]; B21 = B[sz+1:end, 1:end]; @@ -111,6 +132,17 @@ function block_covec_vec_mul!(C, A, B, sz) _mul_add!(C, A12, B21, sz) end +function block_covec_vec_mul!(C::VecTypes, A, B::VecTypes, sz) + @inbounds @views begin + A11 = A[1:end, 1:sz]; A12 = A[1:end, sz+1:end] + + B11 = B[1:sz ]; + B21 = B[sz+1:end]; + end + gemm_kernel!(C, A11, B11) + _mul_add!(C, A12, B21, sz) +end + #---------------------------------------------------------------- #---------------------------------------------------------------- # Block Matrix addition-multiplication @@ -165,6 +197,27 @@ function block_mat_vec_mul_add!(C, A, B, sz, ::Val{factor} = Val(1)) where {fact end end +function block_mat_vec_mul_add!(C::VecTypes, A, B::VecTypes, sz, ::Val{factor} = Val(1)) where {factor} + @inbounds @views begin + C11 = C[1:sz ]; + C21 = C[sz+1:end]; + + A11 = A[1:sz, 1:sz]; A12 = A[1:sz, sz+1:end] + A21 = A[sz+1:end, 1:sz]; A22 = A[sz+1:end, sz+1:end] + + B11 = B[1:sz ]; + B21 = B[sz+1:end]; + end + @_sync begin + @_spawn begin + add_gemm_kernel!(C11, A11, B11, Val(factor)) + _mul_add!(C11, A12, B21, sz, Val(factor)) + end + _mul_add!(C21, A21, B11, sz, Val(factor)) + _mul_add!(C21, A22, B21, sz, Val(factor)) + end +end + function block_covec_mat_mul_add!(C, A, B, sz, ::Val{factor} = Val(1)) where {factor} @inbounds @views begin C11 = C[1:end, 1:sz]; C12 = C[1:end, sz+1:end] @@ -218,3 +271,14 @@ function block_covec_vec_mul_add!(C, A, B, sz, ::Val{factor} = Val(1)) where {fa add_gemm_kernel!(C, A11, B11, Val(factor)) _mul_add!(C, A12, B21, sz, Val(factor)) end + +function block_covec_vec_mul_add!(C::VecTypes, A, B::VecTypes, sz, ::Val{factor} = Val(1)) where {factor} + @inbounds @views begin + A11 = A[1:end, 1:sz]; A12 = A[1:end, sz+1:end] + + B11 = B[1:sz ]; + B21 = B[sz+1:end]; + end + add_gemm_kernel!(C, A11, B11, Val(factor)) + _mul_add!(C, A12, B21, sz, Val(factor)) +end diff --git a/src/kernels.jl b/src/kernels.jl index 397d06e..5c31c57 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -8,36 +8,76 @@ function gemm_kernel!(C, A, B) end end -function add_gemm_kernel!(C, A, B, ::Val{factor}) where {factor} - if factor == 1 - _add_gemm_kernel!(C, A, B) - elseif factor == -1 - _sub_gemm_kernel!(C, A, B) - else - _add_gemm_kernel!(C, C, B, Val(factor)) +function add_gemm_kernel!(C::MatTypes, A::MatTypes, B::MatTypes) + @avx for n ∈ 1:size(A, 1), m ∈ 1:size(B, 2) + Cₙₘ = zero(eltype(C)) + for k ∈ 1:size(A, 2) + Cₙₘ += A[n,k] * B[k,m] + end + C[n,m] += Cₙₘ end end -function _add_gemm_kernel!(C, A, B) +add_gemm_kernel!(C::MatTypes, A::MatTypes, B::MatTypes, ::Val{1}) = add_gemm_kernel!(C, A, B) + +function add_gemm_kernel!(C::MatTypes, A::MatTypes, B::MatTypes, ::Val{-1}) @avx for n ∈ 1:size(A, 1), m ∈ 1:size(B, 2) Cₙₘ = zero(eltype(C)) for k ∈ 1:size(A, 2) - Cₙₘ += A[n,k] * B[k,m] + Cₙₘ -= A[n,k] * B[k,m] end C[n,m] += Cₙₘ end end -function _sub_gemm_kernel!(C, A, B) +function add_gemm_kernel!(C::MatTypes, A::MatTypes, B::MatTypes, ::Val{factor}) where {factor} @avx for n ∈ 1:size(A, 1), m ∈ 1:size(B, 2) Cₙₘ = zero(eltype(C)) for k ∈ 1:size(A, 2) - Cₙₘ -= A[n,k] * B[k,m] + Cₙₘ += factor * A[n,k] * B[k,m] end C[n,m] += Cₙₘ end end -_add_gemm_kernel!(C, A, B, ::Val{1}) = _add_gemm_kernel!(C, A, B) -_add_gemm_kernel!(C, A, B, ::Val{-1}) = _sub_gemm_kernel!(C, A, B) +#____________ + +function gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes) + @avx for n ∈ 1:size(A, 1) + uₙ = zero(eltype(u)) + for k ∈ 1:size(A, 2) + uₙ += A[n,k] * v[k] + end + u[n] = uₙ + end +end +function add_gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes) + @avx for n ∈ 1:size(A, 1) + uₙ = zero(eltype(u)) + for k ∈ 1:size(A, 2) + uₙ += A[n,k] * v[k] + end + u[n] += uₙ + end +end + +function _dd_gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes, ::Val{-1}) + @avx for n ∈ 1:size(A, 1) + uₙ = zero(eltype(u)) + for k ∈ 1:size(A, 2) + uₙ -= A[n,k] * v[k] + end + u[n] += uₙ + end +end + +function add_gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes, ::Val{factor}) where {factor} + @avx for n ∈ 1:size(A, 1) + uₙ = zero(eltype(u)) + for k ∈ 1:size(A, 2) + uₙ += factor * A[n,k] * v[k] + end + u[n] += uₙ + end +end diff --git a/src/matmul.jl b/src/matmul.jl index 7f8374b..ae872ea 100644 --- a/src/matmul.jl +++ b/src/matmul.jl @@ -20,30 +20,35 @@ function blocked_mul(A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 2 C end -choose_block_size(C::AbstractMatrix, ::Nothing) = size(C, 1) >= (3DEFAULT_BLOCK_SIZE) >>> 1 ? DEFAULT_BLOCK_SIZE : 32 -choose_block_size(::Any, block_size::Integer) = block_size - +function choose_block_size(C, A, B, ::Nothing) + if (*)(length(C) |> UInt128, length(A) |> UInt128, length(B) |> UInt128) >= ((3DEFAULT_BLOCK_SIZE) >>> 1)^6 + DEFAULT_BLOCK_SIZE + else + 32 + end +end +choose_block_size(C, A, B, block_size::Integer) = block_size -function blocked_mul!(C::MatTypes{T}, A::MatTypes{T}, B::MatTypes{T}; +function blocked_mul!(C::AbstractArray{T}, A::AbstractArray{T}, B::AbstractArray{T}; block_size = nothing, sizecheck=true) where {T <: Eltypes} sizecheck && check_compatible_sizes(C, A, B) - _block_size = choose_block_size(C, block_size) + _block_size = choose_block_size(C, A, B, block_size) - GC.@preserve C A B _mul!(PtrMatrix(C), PtrMatrix(A), PtrMatrix(B), _block_size) + GC.@preserve C A B _mul!(PtrArray(C), PtrArray(A), PtrArray(B), _block_size) C end -function blocked_mul!(C::StructArray{Complex{T}, 2}, A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 2}; +function blocked_mul!(C::StructArray{Complex{T}}, A::StructArray{Complex{T}}, B::StructArray{Complex{T}}; block_size = DEFAULT_BLOCK_SIZE, sizecheck=true) where {T <: Eltypes} sizecheck && check_compatible_sizes(C, A, B) - _block_size = choose_block_size(C, block_size) + _block_size = choose_block_size(C, A, B, block_size) GC.@preserve C A B begin - Cre, Cim = PtrMatrix(C.re), PtrMatrix(C.im) - Are, Aim = PtrMatrix(A.re), PtrMatrix(A.im) - Bre, Bim = PtrMatrix(B.re), PtrMatrix(B.im) + Cre, Cim = PtrArray(C.re), PtrArray(C.im) + Are, Aim = PtrArray(A.re), PtrArray(A.im) + Bre, Bim = PtrArray(B.re), PtrArray(B.im) _mul!( Cre, Are, Bre, _block_size) # C.re = A.re * B.re _mul_add!(Cre, Aim, Bim, _block_size, Val(-1)) # C.re = C.re - A.im * B.im _mul!( Cim, Are, Bim, _block_size) # C.im = A.re * B.im @@ -86,3 +91,50 @@ function _mul_add!(C, A, B, sz, ::Val{factor} = Val(1)) where {factor} end end +#----------------------------- +# matvec + +function check_compatible_sizes(C::VecTypes, A, B::VecTypes) + n = length(C) + a, k = size(A) + b = length(B) + @assert (n == a) && (k == b) "matrices of size $(size(C)), $(size(A)), $(size(B)) are incompatible" + nothing +end + +function blocked_mul(A::MatTypes, B::VecTypes) + T = promote_type(eltype(A), eltype(B)) + C = Vector{T}(undef, size(A,1)) + blocked_mul!(C, A, B) + C +end + +function blocked_mul(A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 1}) where {T <: Eltypes} + C = StructArray{Complex{T}}((Matrix{T}(undef, size(A, 1)), + Matrix{T}(undef, size(A, 1)))) + blocked_mul!(C, A, B) + C +end + + +function _mul!(C::VecTypes{T}, A::MatTypes{T}, B::VecTypes{T}, sz) where {T<:Eltypes} + n, k = size(A) + if n >= sz+8 && k >= sz+8 + block_mat_vec_mul!(C, A, B, sz) + elseif n < sz+8 && k >= sz+8 + block_covec_vec_mul!(C, A, B, sz) + else + gemm_kernel!(C, A, B) + end +end + +function _mul_add!(C::VecTypes{T}, A::MatTypes{T}, B::VecTypes{T}, sz ::Val{factor} = Val(1)) where {factor, T<:Eltypes} + n, k = size(A) + if n >= sz+8 && k >= sz+8 + block_mat_vec_mul_add!(C, A, B, sz, Val(factor)) + elseif n < sz+8 && k >= sz+8 + block_covec_vec_mul_add!(C, A, B, sz, Val(factor)) + else + add_gemm_kernel!(C, A, B, Val(factor)) + end +end diff --git a/src/pointermatrix.jl b/src/pointermatrix.jl index 058f0b2..f649d38 100644 --- a/src/pointermatrix.jl +++ b/src/pointermatrix.jl @@ -1,4 +1,8 @@ +PtrArray(M::AbstractMatrix) = PtrMatrix(M) +PtrArray(v::AbstractVector) = PtrVector(v) + + struct PointerMatrix{T,P <: AbstractStridedPointer} <: AbstractMatrix{T} ptr::P size::Tuple{Int,Int} @@ -29,33 +33,22 @@ end Base.IndexStyle(::Type{<:PointerMatrix}) = IndexCartesian() + + struct PointerVector{T,P <: AbstractStridedPointer} <: AbstractVector{T} ptr::P length::Int - PointerVector(ptr::P, length::Int) where {T, P <: AbstractStridedPointer{T}} = new{T,P}(ptr, size) + PointerVector(ptr::P, length::Int) where {T, P <: AbstractStridedPointer{T}} = new{T,P}(ptr, length) end -@inline PtrVector(v::AbstractVector) = PointerVector(stridedpointer(v), size(v)) +@inline PtrVector(v::AbstractVector) = PointerVector(stridedpointer(v), length(v)) @inline Base.pointer(v::PointerVector) = pointer(v.ptr) -@inline Base.size(v::PointerMatrix) = (v.length,) -@inline Base.length(v::PointerMatrix) = v.length +@inline Base.size(v::PointerVector) = (v.length,) +@inline Base.length(v::PointerVector) = v.length @inline Base.strides(v::PointerVector) = strides(v.ptr) @inline stridedpointer(v::PointerVector) = v.ptr -@inline Base.maybeview(A::PointerVector, r::UnitRange, c::UnitRange) = PointerMatrix(gesp(A.ptr, (first(r) - 1, (first(c) - 1))), (length(r), length(c))) -# getindex is important for the sake of printing the AbstractPointerMatrix. If we call something a Matrix, it's nice to support the interface if possible. -Base.@propagate_inbounds function Base.getindex(A::PointerMatrix, i::Integer, j::Integer) - @boundscheck begin - M, N = size(A) - (M < i || N < j) && throw(BoundsError(A, (i,j))) - end - vload(stridedpointer(A), (i-1, j-1)) +@inline function Base.maybeview(v::PointerVector, i::UnitRange) + PointerVector(gesp(v.ptr, (first(i) - 1,)), length(i)) end -Base.@propagate_inbounds function Base.getindex(A::PointerMatrix, v, i::Integer, j::Integer) - @boundscheck begin - M, N = size(A) - (M < i || N < j) && throw(BoundsError(A, (i,j))) - end - vstore!(stridedpointer(A), v, (i-1, j-1)) -end -Base.IndexStyle(::Type{<:PointerMatrix}) = IndexCartesian() +Base.IndexStyle(::Type{<:PointerVector}) = IndexLinear() diff --git a/test/runtests.jl b/test/runtests.jl index bd7d539..4fad037 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,7 +2,13 @@ using Test, LinearAlgebra, Random using Gaius using StructArrays -@testset "ComplexFloat64 Multiplication" begin +# @testset "Matrix-Vector products" begin + + +# end + + +@testset "ComplexFloat64 Matrix Multiplication" begin for sz ∈ [10, 50, 100, 200, 400, 1000] n, k, m = (sz .+ rand(-5:5, 3)) @testset "($n × $k) × ($k × $m)" begin From 1d5bcb18bf6b84760dd379d898bca5e3b12df4a5 Mon Sep 17 00:00:00 2001 From: MasonProtter Date: Wed, 18 Mar 2020 12:36:56 -0600 Subject: [PATCH 3/8] fix up and test matrix-vector products --- src/matmul.jl | 25 +++++++++---------- test/runtests.jl | 62 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 72 insertions(+), 15 deletions(-) diff --git a/src/matmul.jl b/src/matmul.jl index ae872ea..a0166cd 100644 --- a/src/matmul.jl +++ b/src/matmul.jl @@ -6,6 +6,16 @@ function check_compatible_sizes(C, A, B) nothing end +function choose_block_size(C, A, B, ::Nothing) + if (*)(length(C) |> Int128, length(A) |> Int128, length(B) |> Int128) >= ((3DEFAULT_BLOCK_SIZE) >>> 1)^6 + DEFAULT_BLOCK_SIZE + else + 32 + end +end +choose_block_size(C, A, B, block_size::Integer) = block_size + + function blocked_mul(A::MatTypes, B::MatTypes) T = promote_type(eltype(A), eltype(B)) C = Matrix{T}(undef, size(A,1), size(B,2)) @@ -20,15 +30,6 @@ function blocked_mul(A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 2 C end -function choose_block_size(C, A, B, ::Nothing) - if (*)(length(C) |> UInt128, length(A) |> UInt128, length(B) |> UInt128) >= ((3DEFAULT_BLOCK_SIZE) >>> 1)^6 - DEFAULT_BLOCK_SIZE - else - 32 - end -end -choose_block_size(C, A, B, block_size::Integer) = block_size - function blocked_mul!(C::AbstractArray{T}, A::AbstractArray{T}, B::AbstractArray{T}; block_size = nothing, sizecheck=true) where {T <: Eltypes} sizecheck && check_compatible_sizes(C, A, B) @@ -41,7 +42,7 @@ end function blocked_mul!(C::StructArray{Complex{T}}, A::StructArray{Complex{T}}, B::StructArray{Complex{T}}; block_size = DEFAULT_BLOCK_SIZE, sizecheck=true) where {T <: Eltypes} - sizecheck && check_compatible_sizes(C, A, B) + sizecheck && check_compatible_sizes(C.re, A.re, B.re) _block_size = choose_block_size(C, A, B, block_size) @@ -110,8 +111,8 @@ function blocked_mul(A::MatTypes, B::VecTypes) end function blocked_mul(A::StructArray{Complex{T}, 2}, B::StructArray{Complex{T}, 1}) where {T <: Eltypes} - C = StructArray{Complex{T}}((Matrix{T}(undef, size(A, 1)), - Matrix{T}(undef, size(A, 1)))) + C = StructArray{Complex{T}}((Vector{T}(undef, size(A, 1)), + Vector{T}(undef, size(A, 1)))) blocked_mul!(C, A, B) C end diff --git a/test/runtests.jl b/test/runtests.jl index 4fad037..bd247a2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,10 +2,66 @@ using Test, LinearAlgebra, Random using Gaius using StructArrays -# @testset "Matrix-Vector products" begin - +@testset "Matrix-Vector Float64" begin + for n ∈ [10, 100, 500, 2000] + for m ∈ [10, 100, 2000] + u = zeros(n) + A = rand(n, m) + v = rand(m) + @test blocked_mul!(u, A, v) ≈ A * v + @test u ≈ blocked_mul(A, v) + end + end +end + +@testset "Matrix-Vector ComplexF64" begin + for n ∈ [10, 100, 500, 2000] + for m ∈ [10, 100, 2000] + u = zeros(ComplexF64, n) |> StructArray + A = rand(ComplexF64, n, m) |> StructArray + v = rand(ComplexF64, m) |> StructArray + @test blocked_mul!(u, A, v) ≈ collect(A) * collect(v) + @test u ≈ blocked_mul(A, v) + end + end +end + +@testset "Matrix-Vector Float32" begin + for n ∈ [10, 100, 500, 2000] + for m ∈ [10, 100, 2000] + u = zeros(Float32, n) + A = rand(Float32, n, m) + v = rand(Float32, m) + @test blocked_mul!(u, A, v) ≈ A * v + @test u ≈ blocked_mul(A, v) + end + end +end + +@testset "Matrix-Vector Int64" begin + for n ∈ [10, 100, 500, 2000] + for m ∈ [10, 100, 2000] + u = zeros(Int, n) + A = rand(-20:20, n, m) + v = rand(-20:20, m) + @test blocked_mul!(u, A, v) ≈ A * v + @test u ≈ blocked_mul(A, v) + end + end +end + +@testset "Matrix-Vector ComplexInt32" begin + for n ∈ [10, 100, 500, 2000] + for m ∈ [10, 100, 2000] + u = zeros(Complex{Int32}, n) |> StructArray + A = StructArray{Complex{Int32}}((rand(Int32.(-10:10), n, m), rand(Int32.(-10:10), n, m))) + v = StructArray{Complex{Int32}}((rand(Int32.(-10:10), m), rand(Int32.(-10:10), m))) + @test blocked_mul!(u, A, v) ≈ collect(A) * collect(v) + @test u ≈ blocked_mul(A, v) + end + end +end -# end @testset "ComplexFloat64 Matrix Multiplication" begin From 7f5fced33ef5aa6c8fbc49d6a1ae00ac10607870 Mon Sep 17 00:00:00 2001 From: MasonProtter Date: Fri, 3 Apr 2020 16:03:18 -0600 Subject: [PATCH 4/8] update tests --- test/runtests.jl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index bd247a2..8c489e0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,7 +2,7 @@ using Test, LinearAlgebra, Random using Gaius using StructArrays -@testset "Matrix-Vector Float64" begin +@testset "Matrix-Vector Float64 " begin for n ∈ [10, 100, 500, 2000] for m ∈ [10, 100, 2000] u = zeros(n) @@ -14,7 +14,7 @@ using StructArrays end end -@testset "Matrix-Vector ComplexF64" begin +@testset "Matrix-Vector ComplexF64 " begin for n ∈ [10, 100, 500, 2000] for m ∈ [10, 100, 2000] u = zeros(ComplexF64, n) |> StructArray @@ -26,7 +26,7 @@ end end end -@testset "Matrix-Vector Float32" begin +@testset "Matrix-Vector Float32 " begin for n ∈ [10, 100, 500, 2000] for m ∈ [10, 100, 2000] u = zeros(Float32, n) @@ -38,7 +38,7 @@ end end end -@testset "Matrix-Vector Int64" begin +@testset "Matrix-Vector Int64 " begin for n ∈ [10, 100, 500, 2000] for m ∈ [10, 100, 2000] u = zeros(Int, n) @@ -50,7 +50,7 @@ end end end -@testset "Matrix-Vector ComplexInt32" begin +@testset "Matrix-Vector ComplexInt32 " begin for n ∈ [10, 100, 500, 2000] for m ∈ [10, 100, 2000] u = zeros(Complex{Int32}, n) |> StructArray @@ -64,7 +64,7 @@ end -@testset "ComplexFloat64 Matrix Multiplication" begin +@testset "ComplexFloat64 Matrix-Matrix" begin for sz ∈ [10, 50, 100, 200, 400, 1000] n, k, m = (sz .+ rand(-5:5, 3)) @testset "($n × $k) × ($k × $m)" begin @@ -91,7 +91,7 @@ end end end -@testset "Float64 Multiplication" begin +@testset "Float64 Matrix-Matrix " begin for sz ∈ [10, 50, 100, 200, 400, 1000] n, k, m = (sz .+ rand(-5:5, 3)) @testset "($n × $k) × ($k × $m)" begin @@ -129,7 +129,7 @@ end end end -@testset "Float32 Multiplication" begin +@testset "Float32 Matrix-Matrix " begin for sz ∈ [10, 50, 100, 200, 400, 1000] n, k, m = (sz .+ rand(-5:5, 3)) @testset "($n × $k) × ($k × $m)" begin @@ -150,7 +150,7 @@ end end -@testset "Int64 Multiplication" begin +@testset "Int64 Matrix-Matrix " begin for sz ∈ [10, 50, 100, 200, 400, 1000] n, k, m = (sz .+ rand(-5:5, 3)) @testset "($n × $k) × ($k × $m)" begin @@ -171,7 +171,7 @@ end end -@testset "Int32 Multiplication" begin +@testset "Int32 Matrix-Matrix " begin for sz ∈ [10, 50, 100, 200, 400, 1000] n, k, m = (sz .+ rand(-5:5, 3)) @testset "($n × $k) × ($k × $m)" begin From 6a3b9a6c0f124d48a6c408020ca80d923b648cff Mon Sep 17 00:00:00 2001 From: MasonProtter Date: Fri, 3 Apr 2020 17:18:29 -0600 Subject: [PATCH 5/8] support Matrix-Vector and CoVector-Matrix products --- src/Gaius.jl | 6 +-- src/kernels.jl | 44 +++++++++++++++++++++- src/matmul.jl | 57 ++++++++++++++++++++++++++++- src/pointermatrix.jl | 87 +++++++++++++++++++++++++++++++++++--------- test/runtests.jl | 23 ++++++++++-- 5 files changed, 190 insertions(+), 27 deletions(-) diff --git a/src/Gaius.jl b/src/Gaius.jl index ea5469d..1d89554 100644 --- a/src/Gaius.jl +++ b/src/Gaius.jl @@ -38,9 +38,9 @@ const MatTypesR{T} = Union{Adjoint{T,<:MatTypesC{T}}, Transpose{T,<:MatTypesC{T}}} # R for Row Major const MatTypes{ T} = Union{MatTypesC{T}, MatTypesR{T}} -const VecTypes{T} = Union{Vector{T}, SubArray{T, 1, <:Array}, PointerVector{T}} -const CoVecTypes{T} = Union{Adjoint{T,<:VecTypes{T}}, - Transpose{T, <:VecTypes{T}}} +const VecTypes{T} = Union{Vector{T}, SubArray{T, 1, <:Array}}#, PointerVector{T}} +const CoVecTypes{T} = Union{Adjoint{T, <:VecTypes{T}}, + Transpose{T, <:VecTypes{T}}}#, PointerAdjointVector{T}} include("matmul.jl") include("block_operations.jl") diff --git a/src/kernels.jl b/src/kernels.jl index 5c31c57..8a570bc 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -62,7 +62,7 @@ function add_gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes) end end -function _dd_gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes, ::Val{-1}) +function add_gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes, ::Val{-1}) @avx for n ∈ 1:size(A, 1) uₙ = zero(eltype(u)) for k ∈ 1:size(A, 2) @@ -81,3 +81,45 @@ function add_gemm_kernel!(u::VecTypes, A::MatTypes, v::VecTypes, ::Val{factor}) u[n] += uₙ end end + +#____________ + +function gemm_kernel!(u::CoVecTypes, v::CoVecTypes, A::MatTypes) + @avx for m ∈ 1:size(A, 2) + uₘ = zero(eltype(u)) + for k ∈ 1:size(A, 1) + uₘ += v[k] * A[k, m] + end + u[m] = uₘ + end +end + +function add_gemm_kernel!(u::CoVecTypes, v::CoVecTypes, A::MatTypes) + @avx for m ∈ 1:size(A, 2) + uₘ = zero(eltype(u)) + for k ∈ 1:size(A, 1) + uₘ += v[k] * A[k, m] + end + u[m] += uₘ + end +end + +function add_gemm_kernel(u::CoVecTypes, v::CoVecTypes, A::MatTypes, ::Val{-1}) + @avx for m ∈ 1:size(A, 2) + uₘ = zero(eltype(u)) + for k ∈ 1:size(A, 1) + uₘ -= v[k] * A[k, m] + end + u[m] += uₘ + end +end + +function add_gemm_kernel!(u::CoVecTypes, v::CoVecTypes, A::MatTypes, ::Val{factor}) where {factor} + @avx for m ∈ 1:size(A, 2) + uₘ = zero(eltype(u)) + for k ∈ 1:size(A, 1) + uₘ += factor * v[k] * A[k, m] + end + u[m] += uₘ + end +end diff --git a/src/matmul.jl b/src/matmul.jl index a0166cd..6f536dc 100644 --- a/src/matmul.jl +++ b/src/matmul.jl @@ -129,7 +129,7 @@ function _mul!(C::VecTypes{T}, A::MatTypes{T}, B::VecTypes{T}, sz) where {T<:Elt end end -function _mul_add!(C::VecTypes{T}, A::MatTypes{T}, B::VecTypes{T}, sz ::Val{factor} = Val(1)) where {factor, T<:Eltypes} +function _mul_add!(C::VecTypes{T}, A::MatTypes{T}, B::VecTypes{T}, sz, ::Val{factor} = Val(1)) where {factor, T<:Eltypes} n, k = size(A) if n >= sz+8 && k >= sz+8 block_mat_vec_mul_add!(C, A, B, sz, Val(factor)) @@ -139,3 +139,58 @@ function _mul_add!(C::VecTypes{T}, A::MatTypes{T}, B::VecTypes{T}, sz ::Val{fact add_gemm_kernel!(C, A, B, Val(factor)) end end + + +#----------------------------- +# covec-mat + +function check_compatible_sizes(C::CoVecTypes, A::CoVecTypes, B::MatTypes) + m = length(C) + n = length(A) + a, b = size(B) + @assert (n == a) && (m == b) "matrices of size $(size(C)), $(size(A)), $(size(B)) are incompatible" + nothing +end + +function blocked_mul(A::CoVecTypes, B::MatTypes) + T = promote_type(eltype(A), eltype(B)) + C = Vector{T}(undef, size(B,2))' + blocked_mul!(C, A, B) + C +end + +function blocked_mul(A::Adjoint{T, <:StructArray{Complex{T}, 1}}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes} + C = StructArray{Complex{T}}((Vector{T}(undef, size(B, 2)), + Vector{T}(undef, size(B, 2))))' + blocked_mul!(C, A, B) + C +end + +function blocked_mul(A::Transpose{T, <:StructArray{Complex{T}, 1}}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes} + C = StructArray{Complex{T}}((Vector{T}(undef, size(B, 2)), + Vector{T}(undef, size(B, 2)))) |> transpose + blocked_mul!(C, A, B) + C +end + +function _mul!(C::CoVecTypes{T}, A::CoVecTypes{T}, B::MatTypes{T}, sz) where {T<:Eltypes} + n, k, m = 1, size(A, 1), length(C) + if k >= sz+8 && m >= sz+8 + block_covec_mat_mul!(C, A, B, sz) + elseif k >= sz+8 && m < sz+8 + block_covec_vec_mul!(C, A, B, sz) + else + gemm_kernel!(C, A, B) + end +end + +function _mul_add!(C::CoVecTypes{T}, A::CoVecTypes{T}, B::MatTypes{T}, sz, ::Val{factor} = Val(1)) where {factor, T<:Eltypes} + n, k, m = 1, size(A, 1), length(C) + if k >= sz+8 && m >= sz+8 + block_covec_mat_mul!(C, A, B, sz, Val(factor)) + elseif k >= sz+8 && m < sz+8 + block_covec_vec_mul!(C, A, B, sz, Val(factor)) + else + add_gemm_kernel!(C, A, B, Val(factor)) + end +end diff --git a/src/pointermatrix.jl b/src/pointermatrix.jl index f649d38..9e3826e 100644 --- a/src/pointermatrix.jl +++ b/src/pointermatrix.jl @@ -1,7 +1,8 @@ PtrArray(M::AbstractMatrix) = PtrMatrix(M) -PtrArray(v::AbstractVector) = PtrVector(v) - +PtrArray(v::AbstractVector) = v#PtrVector(v) +PtrArray(v::Adjoint{T, <:AbstractVector}) where {T} = v#PointerAdjointVector(v) +PtrArray(v::Transpose{T, <:AbstractVector}) where {T} = v#PointerTransposeVector(v) struct PointerMatrix{T,P <: AbstractStridedPointer} <: AbstractMatrix{T} ptr::P @@ -34,21 +35,71 @@ Base.IndexStyle(::Type{<:PointerMatrix}) = IndexCartesian() +# This machinery below might not actually be needed. -struct PointerVector{T,P <: AbstractStridedPointer} <: AbstractVector{T} - ptr::P - length::Int - PointerVector(ptr::P, length::Int) where {T, P <: AbstractStridedPointer{T}} = new{T,P}(ptr, length) -end -@inline PtrVector(v::AbstractVector) = PointerVector(stridedpointer(v), length(v)) -@inline Base.pointer(v::PointerVector) = pointer(v.ptr) -@inline Base.size(v::PointerVector) = (v.length,) -@inline Base.length(v::PointerVector) = v.length -@inline Base.strides(v::PointerVector) = strides(v.ptr) -@inline stridedpointer(v::PointerVector) = v.ptr - -@inline function Base.maybeview(v::PointerVector, i::UnitRange) - PointerVector(gesp(v.ptr, (first(i) - 1,)), length(i)) -end +# struct PointerVector{T,P <: AbstractStridedPointer} <: AbstractVector{T} +# ptr::P +# length::Int +# PointerVector(ptr::P, length::Int) where {T, P <: AbstractStridedPointer{T}} = new{T,P}(ptr, length) +# end +# @inline PtrVector(v::AbstractVector) = PointerVector(stridedpointer(v), length(v)) +# @inline Base.pointer(v::PointerVector) = pointer(v.ptr) +# @inline Base.size(v::PointerVector) = (v.length,) +# @inline Base.length(v::PointerVector) = v.length +# @inline Base.strides(v::PointerVector) = strides(v.ptr) +# @inline stridedpointer(v::PointerVector) = v.ptr + +# @inline function Base.maybeview(v::PointerVector, i::UnitRange) +# PointerVector(gesp(v.ptr, (first(i) - 1,)), length(i)) +# end + +# Base.IndexStyle(::Type{<:PointerVector}) = IndexLinear() + + + +# struct PointerAdjointVector{T,P <: AbstractStridedPointer} <: AbstractMatrix{T} +# ptr::P +# length::Int +# PointerAdjointVector(ptr::P, length::Int) where {T, P <: AbstractStridedPointer{T}} = new{T,P}(ptr, length) +# end +# @inline function PointerAdjointVector(v::Adjoint{T, <:AbstractVector{T}}) where {T} +# vp = parent(v) +# if T <: Complex +# vp = conj.(p) #this copy is unfortunate, but I'm scared of mutating the user's data +# end +# PointerAdjointVector(stridedpointer(vp), length(v)) +# end +# @inline Base.pointer(v::PointerAdjointVector) = pointer(v.ptr) +# @inline Base.size(v::PointerAdjointVector) = (1,v.length) +# @inline Base.length(v::PointerAdjointVector) = v.length +# @inline Base.strides(v::PointerAdjointVector) = strides(v.ptr) +# @inline stridedpointer(v::PointerAdjointVector) = v.ptr + +# @inline function Base.maybeview(v::PointerAdjointVector, i::UnitRange) +# PointerVector(gesp(v.ptr, (first(i) - 1,)), length(i)) +# end + +# Base.IndexStyle(::Type{<:PointerAdjointVector}) = IndexLinear() + + + +# struct PointerTransposeVector{T,P <: AbstractStridedPointer} <: AbstractMatrix{T} +# ptr::P +# length::Int +# PointerTransposeVector(ptr::P, length::Int) where {T, P <: AbstractStridedPointer{T}} = new{T,P}(ptr, length) +# end +# @inline function PointerTransposeVector(v::Transpose{T, <:AbstractVector{T}}) where {T} +# vp = parent(v) +# PointerTransposeVector(stridedpointer(vp), length(v)) +# end +# @inline Base.pointer(v::PointerTransposeVector) = pointer(v.ptr) +# @inline Base.size(v::PointerTransposeVector) = (1,v.length) +# @inline Base.length(v::PointerTransposeVector) = v.length +# @inline Base.strides(v::PointerTransposeVector) = strides(v.ptr) +# @inline stridedpointer(v::PointerTransposeVector) = v.ptr + +# @inline function Base.maybeview(v::PointerTransposeVector, i::UnitRange) +# PointerVector(gesp(v.ptr, (first(i) - 1,)), length(i)) +# end -Base.IndexStyle(::Type{<:PointerVector}) = IndexLinear() +# Base.IndexStyle(::Type{<:PointerTransposeVector}) = IndexLinear() diff --git a/test/runtests.jl b/test/runtests.jl index 8c489e0..3543d50 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,7 +2,22 @@ using Test, LinearAlgebra, Random using Gaius using StructArrays -@testset "Matrix-Vector Float64 " begin +@testset "Float64 Matrix-CoVector " begin + for n ∈ [10, 100, 500, 2000] + for m ∈ [10, 100, 2000] + u = zeros(m) + A = rand(n, m) + v = rand(n) + @test blocked_mul!(u', v', A) ≈ v' * A + @test u' ≈ blocked_mul(v', A) + + @test blocked_mul!(transpose(u), transpose(v), A) ≈ transpose(v) * A + @test transpose(u) ≈ blocked_mul(transpose(v), A) + end + end +end + +@testset "Float64 Matrix-Vector " begin for n ∈ [10, 100, 500, 2000] for m ∈ [10, 100, 2000] u = zeros(n) @@ -26,7 +41,7 @@ end end end -@testset "Matrix-Vector Float32 " begin +@testset "Float32 Matrix-Vector " begin for n ∈ [10, 100, 500, 2000] for m ∈ [10, 100, 2000] u = zeros(Float32, n) @@ -38,7 +53,7 @@ end end end -@testset "Matrix-Vector Int64 " begin +@testset "Int64 Matrix-Vector " begin for n ∈ [10, 100, 500, 2000] for m ∈ [10, 100, 2000] u = zeros(Int, n) @@ -50,7 +65,7 @@ end end end -@testset "Matrix-Vector ComplexInt32 " begin +@testset "ComplexInt32 Matrix-Vector " begin for n ∈ [10, 100, 500, 2000] for m ∈ [10, 100, 2000] u = zeros(Complex{Int32}, n) |> StructArray From 767bc978ef46f80f486b9da3e20f3934110dc114 Mon Sep 17 00:00:00 2001 From: MasonProtter Date: Sat, 4 Apr 2020 00:54:26 -0600 Subject: [PATCH 6/8] Complex BLAS2 support with row major storage --- src/Gaius.jl | 25 ++++++++++++----------- src/matmul.jl | 47 ++++++++++++++++++++++++++++++++++++++++-- test/runtests.jl | 53 ++++++++++++++++++++++++++++++++---------------- 3 files changed, 93 insertions(+), 32 deletions(-) diff --git a/src/Gaius.jl b/src/Gaius.jl index 1d89554..239f68b 100644 --- a/src/Gaius.jl +++ b/src/Gaius.jl @@ -29,18 +29,19 @@ end include("pointermatrix.jl") -const Eltypes = Union{Float64, Float32, Int64, Int32, Int16} -const MatTypesC{T} = Union{Matrix{T}, - SubArray{T, 2, <: AbstractArray}, - PointerMatrix{T}, - UnsafeArray{T, 2}} # C for Column Major -const MatTypesR{T} = Union{Adjoint{T,<:MatTypesC{T}}, - Transpose{T,<:MatTypesC{T}}} # R for Row Major -const MatTypes{ T} = Union{MatTypesC{T}, MatTypesR{T}} - -const VecTypes{T} = Union{Vector{T}, SubArray{T, 1, <:Array}}#, PointerVector{T}} -const CoVecTypes{T} = Union{Adjoint{T, <:VecTypes{T}}, - Transpose{T, <:VecTypes{T}}}#, PointerAdjointVector{T}} +Eltypes = Union{Float64, Float32, Int64, Int32, Int16} +MatTypesC{T} = Union{Matrix{T}, + SubArray{T, 2, <: AbstractArray}, + PointerMatrix{T}, + UnsafeArray{T, 2}} # C for Column Major +MatTypesR{T} = Union{Adjoint{T,<:MatTypesC{T}}, + Transpose{T,<:MatTypesC{T}}} # R for Row Major +MatTypes{ T} = Union{MatTypesC{T}, MatTypesR{T}} + +VecTypes{T} = Union{Vector{T}, SubArray{T, 1, <:Array}} +CoVecTypes{T} = Union{Adjoint{T, <:VecTypes{T}}, + Transpose{T, <:VecTypes{T}}} + include("matmul.jl") include("block_operations.jl") diff --git a/src/matmul.jl b/src/matmul.jl index 6f536dc..3d48d47 100644 --- a/src/matmul.jl +++ b/src/matmul.jl @@ -58,6 +58,49 @@ function blocked_mul!(C::StructArray{Complex{T}}, A::StructArray{Complex{T}}, B: C end +function blocked_mul!(C::Adjoint{Complex{T}, <:StructArray{Complex{T}}}, + A::Adjoint{Complex{T}, <:StructArray{Complex{T}}}, + B::StructArray{Complex{T}}; + block_size = DEFAULT_BLOCK_SIZE, sizecheck=true) where {T <: Eltypes} + sizecheck && check_compatible_sizes(C.parent.re', A.parent.re', B.re) + + _block_size = choose_block_size(C, A, B, block_size) + A.parent.im .= (-).(A.parent.im) #ugly hack + GC.@preserve C A B begin + Cre, Cim = PtrArray(C.parent.re'), PtrArray(C.parent.im') + Are, Aim = PtrArray(A.parent.re'), PtrArray(A.parent.im') + Bre, Bim = PtrArray(B.re), PtrArray(B.im) + _mul!( Cre, Are, Bre, _block_size) # C.re = A.re * B.re + _mul_add!(Cre, Aim, Bim, _block_size, Val(-1)) # C.re = C.re - A.im * B.im + _mul!( Cim, Are, Bim, _block_size) # C.im = A.re * B.im + _mul_add!(Cim, Aim, Bre, _block_size) # C.im = C.im + A.im * B.re + end + A.parent.im .= (-).(A.parent.im) + C.parent.im .= (-).(C.parent.im) # ugly hack + C +end + +function blocked_mul!(C::Transpose{Complex{T}, <:StructArray{Complex{T}}}, + A::Transpose{Complex{T}, <:StructArray{Complex{T}}}, + B::StructArray{Complex{T}}; + block_size = DEFAULT_BLOCK_SIZE, sizecheck=true) where {T <: Eltypes} + sizecheck && check_compatible_sizes(C.parent.re |> transpose, A.parent.re |> transpose, B.re) + + _block_size = choose_block_size(C, A, B, block_size) + + GC.@preserve C A B begin + Cre, Cim = PtrArray(C.parent.re |> transpose), PtrArray(C.parent.im |> transpose) + Are, Aim = PtrArray(A.parent.re |> transpose), PtrArray(A.parent.im |> transpose) + Bre, Bim = PtrArray(B.re), PtrArray(B.im) + _mul!( Cre, Are, Bre, _block_size) # C.re = A.re * B.re + _mul_add!(Cre, Aim, Bim, _block_size, Val(-1)) # C.re = C.re - A.im * B.im + _mul!( Cim, Are, Bim, _block_size) # C.im = A.re * B.im + _mul_add!(Cim, Aim, Bre, _block_size) # C.im = C.im + A.im * B.re + end + C +end + + function _mul!(C, A, B, sz) n, k, m = size(C, 1), size(A, 2), size(C, 2) if n >= sz+8 && m >= sz+8 && k >= sz+8 @@ -159,14 +202,14 @@ function blocked_mul(A::CoVecTypes, B::MatTypes) C end -function blocked_mul(A::Adjoint{T, <:StructArray{Complex{T}, 1}}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes} +function blocked_mul(A::Adjoint{Complex{T}, <:StructArray{Complex{T}, 1}}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes} C = StructArray{Complex{T}}((Vector{T}(undef, size(B, 2)), Vector{T}(undef, size(B, 2))))' blocked_mul!(C, A, B) C end -function blocked_mul(A::Transpose{T, <:StructArray{Complex{T}, 1}}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes} +function blocked_mul(A::Transpose{Complex{T}, <:StructArray{Complex{T}, 1}}, B::StructArray{Complex{T}, 2}) where {T <: Eltypes} C = StructArray{Complex{T}}((Vector{T}(undef, size(B, 2)), Vector{T}(undef, size(B, 2)))) |> transpose blocked_mul!(C, A, B) diff --git a/test/runtests.jl b/test/runtests.jl index 3543d50..dd5a4b4 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,24 +2,9 @@ using Test, LinearAlgebra, Random using Gaius using StructArrays -@testset "Float64 Matrix-CoVector " begin - for n ∈ [10, 100, 500, 2000] - for m ∈ [10, 100, 2000] - u = zeros(m) - A = rand(n, m) - v = rand(n) - @test blocked_mul!(u', v', A) ≈ v' * A - @test u' ≈ blocked_mul(v', A) - - @test blocked_mul!(transpose(u), transpose(v), A) ≈ transpose(v) * A - @test transpose(u) ≈ blocked_mul(transpose(v), A) - end - end -end - @testset "Float64 Matrix-Vector " begin - for n ∈ [10, 100, 500, 2000] - for m ∈ [10, 100, 2000] + for n ∈ [10, 100, 500, 10000] + for m ∈ [10, 100, 10000] u = zeros(n) A = rand(n, m) v = rand(m) @@ -29,7 +14,7 @@ end end end -@testset "Matrix-Vector ComplexF64 " begin +@testset "ComplexF64 Matrix-Vector " begin for n ∈ [10, 100, 500, 2000] for m ∈ [10, 100, 2000] u = zeros(ComplexF64, n) |> StructArray @@ -78,6 +63,38 @@ end end +@testset "Float64 CoVector-Matrix " begin + for n ∈ [10, 100, 500, 2000] + for m ∈ [10, 100, 2000] + u = zeros(m) + A = rand(n, m) + v = rand(n) + @test blocked_mul!(u', v', A) ≈ v' * A + @test u' ≈ blocked_mul(v', A) + + @test blocked_mul!(transpose(u), transpose(v), A) ≈ transpose(v) * A + @test transpose(u) ≈ blocked_mul(transpose(v), A) + end + end +end + +@testset "ComplexF32 CoVector-Matrix " begin + for n ∈ [10, 100, 500, 2000] + for m ∈ [10, 100, 2000] + u = zeros(Complex{Float32}, m) |> StructArray + A = rand(Complex{Float32}, n, m) |> StructArray + v = rand(Complex{Float32}, n) |> StructArray + @test blocked_mul!(u', v', A) ≈ v' * A + @test u' ≈ blocked_mul(v', A) + + @test blocked_mul!(transpose(u), transpose(v), A) ≈ transpose(v) * A + @test transpose(u) ≈ blocked_mul(transpose(v), A) + end + end +end + + + @testset "ComplexFloat64 Matrix-Matrix" begin for sz ∈ [10, 50, 100, 200, 400, 1000] From 134d01b6afc6af5f962f2f4870c89432a30a8b3f Mon Sep 17 00:00:00 2001 From: MasonProtter Date: Sat, 4 Apr 2020 00:55:35 -0600 Subject: [PATCH 7/8] increment version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index b41027d..3b34f2d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Gaius" uuid = "bffe22d1-cb55-4f4e-ac2c-f4dd4bf58912" authors = ["MasonProtter "] -version = "0.1.0" +version = "0.2.0" [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" From a45343a3cb503e63ee02c68a3b5215386e413d18 Mon Sep 17 00:00:00 2001 From: MasonProtter Date: Sat, 4 Apr 2020 01:05:38 -0600 Subject: [PATCH 8/8] test on 1.4 --- .travis.yml | 1 + Project.toml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 2179931..2f77c41 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ os: - linux julia: - 1.3 + - 1.4 - nightly matrix: diff --git a/Project.toml b/Project.toml index 3b34f2d..2b4afd2 100644 --- a/Project.toml +++ b/Project.toml @@ -11,11 +11,11 @@ StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" UnsafeArrays = "c4a57d5a-5b31-53a6-b365-19f8c011fbd6" [compat] -julia = "1.3" +julia = "1.3, 1.4" [extras] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "BenchmarkTools"] +test = ["Test", "BenchmarkTools"] \ No newline at end of file