diff --git a/Project.toml b/Project.toml index f390121..4e762ed 100644 --- a/Project.toml +++ b/Project.toml @@ -4,7 +4,6 @@ authors = ["Fredrik Bagge Carlson"] version = "0.1.1" [deps] -DSP = "717857b8-e6f2-59f4-9121-6e50c889abd2" Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" @@ -14,11 +13,10 @@ SlidingDistancesBase = "25b0cc0c-38e4-462f-a11d-8564868c562d" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] -DSP = "0.6" LoopVectorization = "0.7.4" ProgressMeter = "1.2" RecipesBase = "0.8, 1.0" -SlidingDistancesBase = "0.1" +SlidingDistancesBase = "0.1.3" julia = "1" [extras] diff --git a/examples/mixed_bag.jl b/examples/mixed_bag.jl index 21fac0f..bd230ed 100644 --- a/examples/mixed_bag.jl +++ b/examples/mixed_bag.jl @@ -4,14 +4,14 @@ datapath = "/home/fredrikb/Downloads/MixedBag/" cd(datapath) ## Some data -T = parse.(Int, split(join(Char.(read("01911m_02019m_III_7680_200.txt"))), ',')) +T = parse.(Float64, split(join(Char.(read("01911m_02019m_III_7680_200.txt"))), ',')) snips = snippets(T, 3, 200, m=50) plot(snips) ## Power demand -T = parse.(Float32, readlines("Powerdemand_12_4500_200.txt")) -snips = snippets(T, 4, 24, m=6) +T = parse.(Float64, readlines("Powerdemand_12_4500_200.txt")) +snips = snippets(T, 4, 24, m=24) plot(snips) profile = matrix_profile(T, 24) @@ -21,7 +21,7 @@ plot(mot) ## Walking styles -T = parse.(Float32, readlines("PAMAP_Subject4_NormalWalking_NordicWalking_17001_500.txt")) +T = parse.(Float64, readlines("PAMAP_Subject4_NormalWalking_NordicWalking_17001_500.txt")) T = T[1:2:end] snips = snippets(T, 4, 200) plot(snips) @@ -33,7 +33,7 @@ plot(profile, mot, legend=false) ## Robot dog -T = parse.(Float32, readlines("RoboticDogActivityY_64_4000_400.txt")) +T = parse.(Float64, readlines("RoboticDogActivityY_64_4000_400.txt")) snips = snippets(T, 4, 100) plot(snips) diff --git a/src/MatrixProfile.jl b/src/MatrixProfile.jl index a000917..3580274 100644 --- a/src/MatrixProfile.jl +++ b/src/MatrixProfile.jl @@ -1,15 +1,14 @@ module MatrixProfile using Statistics, LinearAlgebra -using DSP using LoopVectorization using ProgressMeter using RecipesBase using Distances -export Euclidean using SlidingDistancesBase -import SlidingDistancesBase: floattype, lastlength, distance_profile, distance_profile! +import SlidingDistancesBase: floattype, lastlength, distance_profile, distance_profile!, window_dot +export Euclidean, ZEuclidean export matrix_profile, distance_profile, motifs, anomalies, mpdist, mpdist_profile, onsets, snippets, seqs, resample @@ -23,9 +22,9 @@ struct Profile{TT,TP,QT} end """ - profile = matrix_profile(T, m; showprogress=true) + profile = matrix_profile(T, m, [dist = ZEuclidean()]; showprogress=true) -Return the matrix profile and the profile indices of time series `T` with window length `m`. See fields `profile.P, profile.I`. You can also plot the profile. Uses the STOMP algorithm. +Return the matrix profile and the profile indices of time series `T` with window length `m`. See fields `profile.P, profile.I`. You can also plot the profile. If `dist = ZEuclidean()` the STOMP algorithm will be used. Reference: [Matrix profile II](https://www.cs.ucr.edu/~eamonn/STOMP_GPU_final_submission_camera_ready.pdf). """ @@ -33,10 +32,10 @@ function matrix_profile(T::AbstractVector{<:Number}, m::Int; showprogress=true) n = lastlength(T) l = n-m+1 n > 2m+1 || throw(ArgumentError("Window length too long, maximum length is $((n+1)÷2)")) - μ,σ = running_mean_std(T, m) + μ,σ = sliding_mean_std(T, m) QT = window_dot(getwindow(T, m, 1), T) - QT₀ = copy(QT) - D = distance_profile(Euclidean(), QT, μ, σ, m) + QT₀ = copy(QT) + D = distance_profile(ZEuclidean(), QT, μ, σ, m) P = copy(D) I = ones(Int, l) prog = Progress((l - 1) ÷ 5, dt=1, desc="Matrix profile", barglyphs = BarGlyphs("[=> ]"), color=:blue) @@ -47,7 +46,7 @@ function matrix_profile(T::AbstractVector{<:Number}, m::Int; showprogress=true) # The expression with fastmath appears to be both more accurate and faster than both muladd and fma end QT[1] = QT₀[i] - distance_profile!(D, Euclidean(), QT, μ, σ, m, i) + distance_profile!(D, ZEuclidean(), QT, μ, σ, m, i) update_min!(P, I, D, i) showprogress && i % 5 == 0 && next!(prog) end @@ -60,12 +59,12 @@ function matrix_profile(A::AbstractVector{<:Number}, T::AbstractVector{<:Number} l = n-m+1 lT = lastlength(T)-m+1 # n > 2m+1 || throw(ArgumentError("Window length too long, maximum length is $((n+1)÷2)")) - μT,σT = running_mean_std(T, m) - μA,σA = running_mean_std(A, m) + μT,σT = sliding_mean_std(T, m) + μA,σA = sliding_mean_std(A, m) QT = window_dot(getwindow(A, m, 1), T) @assert length(QT) == lT QT₀ = copy(QT) - D = distance_profile(Euclidean(), QT, μA, σA, μT, σT, m) + D = distance_profile(ZEuclidean(), QT, μA, σA, μT, σT, m) P = copy(D) I = ones(Int, lT) prog = Progress((l - 1) ÷ 5, dt=1, desc="Matrix profile", barglyphs = BarGlyphs("[=> ]"), color=:blue) @@ -74,49 +73,19 @@ function matrix_profile(A::AbstractVector{<:Number}, T::AbstractVector{<:Number} @fastmath QT[j] = QT[j-1] - T[j-1] * A[i-1] + T[j+m-1] * A[i+m-1] end QT[1] = dot(getwindow(A, m, i), getwindow(T,m,1)) #QT₀[i] - distance_profile!(D, Euclidean(), QT, μA, σA, μT, σT, m, i) + distance_profile!(D, ZEuclidean(), QT, μA, σA, μT, σT, m, i) update_min!(P, I, D, i, false) showprogress && i % 5 == 0 && next!(prog) end Profile(T, P, I, m, A) end -matrix_profile(T::AbstractVector{<:Number}, m::Int, ::Euclidean; kwargs...) = +matrix_profile(T::AbstractVector{<:Number}, m::Int, ::ZEuclidean; kwargs...) = matrix_profile(T, m; kwargs...) -matrix_profile(A::AbstractVector{<:Number}, T::AbstractVector{<:Number}, m::Int, ::Euclidean; kwargs...) = +matrix_profile(A::AbstractVector{<:Number}, T::AbstractVector{<:Number}, m::Int, ::ZEuclidean; kwargs...) = matrix_profile(A, T, m; kwargs...) -function distance_profile!(D::AbstractVector{S},::Euclidean, QT::AbstractVector{S}, μ, σ, m::Int, i::Int) where S <: Number - @assert i <= length(D) - @avx for j = eachindex(D) - frac = (QT[j] - m*μ[i]*μ[j]) / (m*σ[i]*σ[j]) - D[j] = sqrt(max(2m*(1-frac), 0)) - end - D[i] = typemax(eltype(D)) - D -end - - -function distance_profile!(D::AbstractVector{S},::Euclidean, QT::AbstractVector{S}, μA, σA, μT, σT, m::Int, i::Int) where S <: Number - @assert i <= length(μA) - @avx for j = eachindex(D,QT,μT,σT) - frac = (QT[j] - m*μA[i]*μT[j]) / (m*σA[i]*σT[j]) - D[j] = sqrt(max(2m*(1-frac), 0)) - end - D -end - -distance_profile(::Euclidean, - QT::AbstractVector{S}, - μ::AbstractVector{S}, - σ::AbstractVector{S}, - m::Int, -) where {S<:Number} = distance_profile!(similar(μ), Euclidean(), QT, μ, σ, m, 1) - -distance_profile(::Euclidean, QT::AbstractVector{S}, μA, σA, μT, σT, m::Int) where {S<:Number} = - distance_profile!(similar(μT), Euclidean(), QT, μA, σA, μT, σT, m, 1) - function update_min!(P, I, D, i, sym=true) n = lastlength(P) @@ -130,82 +99,6 @@ function update_min!(P, I, D, i, sym=true) end end -""" -Input: A query Q, and a user provided time series T -Output: The dot product between Q and all subsequences in T -""" -function window_dot(Q, T) - n = length(T) - m = length(Q) - QT = conv(reverse(Q), T) - return QT[m:n] -end - -function running_mean_std(x::AbstractArray{T}, m) where T - @assert length(x) >= m - n = length(x)-m+1 - s = ss = zero(T) - μ = similar(x, n) - σ = similar(x, n) - @fastmath @inbounds for i = 1:m - s += x[i] - ss += x[i]^2 - end - μ[1] = s/m - σ[1] = sqrt(ss/m - μ[1]^2) - @fastmath @inbounds for i = 1:n-1 # fastmath making it more accurate here as well, but not faster - s -= x[i] - ss -= x[i]^2 - s += x[i+m] - ss += x[i+m]^2 - μ[i+1] = s/m - σ[i+1] = sqrt(ss/m - μ[i+1]^2) - end - μ,σ -end - -function moving_mean!(μ,x::AbstractArray{T}, m) where T - # filt(ones(m), [m], x) - @assert length(x) >= m - n = length(x)-m+1 - s = zero(T) - @fastmath @inbounds for i = 1:m - s += x[i] - end - μ[1] = s/m - @fastmath @inbounds for i = 1:n-1 # fastmath making it more accurate here as well, but not faster - s -= x[i] - s += x[i+m] - μ[i+1] = s/m - end - μ -end - -function znorm(x::AbstractVector) - x = x .- mean(x) - x ./= std(x, mean=0, corrected=false) -end - - - - -""" - distance_profile(::Euclidean, Q, T) - -Compute the z-normalized Euclidean distance profile corresponding to sliding `Q` over `T` -""" -function distance_profile!(D::AbstractVector{S}, ::Euclidean, Q::AbstractVector{S}, T::AbstractVector{S}) where S <: Number - m = length(Q) - μ,σ = running_mean_std(T, m) - QT = window_dot(znorm(Q), T) - @avx for j = eachindex(D) - frac = QT[j] / (m*σ[j]) - D[j] = sqrt(max(2m*(1-frac), 0)) - end - D -end -distance_profile(::Euclidean, Q::AbstractArray{S}, T::AbstractArray{S}) where {S} = - distance_profile!(similar(T, length(T) - length(Q) + 1), Euclidean(), Q, T) ## General data and distance =================================================== diff --git a/src/mpdist.jl b/src/mpdist.jl index 4919fc4..ef0cd34 100644 --- a/src/mpdist.jl +++ b/src/mpdist.jl @@ -1,11 +1,11 @@ """ - mpdist(A, B, m, k = ((length(A) + length(B)) - 2m) ÷ 20) + mpdist(A, B, m, d = ZEuclidean, k = ((length(A) + length(B)) - 2m) ÷ 20) The MP distance between `A` and `B` using window length `M` and returning the `k`th smallest value. """ -function mpdist(A,B,m,d=Euclidean(),k=(length(A)+length(B)-2m)÷20) +function mpdist(A,B,m,d=ZEuclidean(),k=(length(A)+length(B)-2m)÷20) p1 = matrix_profile(A,B,m,d) p2 = matrix_profile(B,A,m,d) partialsort([p1.P; p2.P], k) @@ -28,29 +28,32 @@ function mpdist(MP::AbstractArray, th::Real, N::Int) end """ - mpdist_profile(T::AbstractVector, S::Int, m::Int) + mpdist_profile(T::AbstractVector, S::Int, m::Int, [d]) All MP distance profiles between subsequences of length `S` in `T` using internal window length `m`. """ -function mpdist_profile(T::AbstractVector,S::Int, m::Int, d=Euclidean()) +function mpdist_profile(T::AbstractVector,S::Int, m::Int, d::DT = ZEuclidean()) where DT S >= m || throw(ArgumentError("S should be > m")) n = lastlength(T) pad = S * ceil(Int, n / S) - n - T = [T;zeros(pad)] + T = [T;zeros(eltype(T), pad)] N = S-m+1 - D = similar(T, lastlength(T)-m+1, N) - moving_means = similar(D) - m_profile = similar(T, 2N) - - @showprogress 1 "MP dist profile" map(1:S:n-S) do i - d = mpdist_profile(getwindow(T,S,i), T, m, d; D=D, moving_means=moving_means, m_profile=m_profile) + D = Matrix{float(eltype(T))}(undef, lastlength(T)-m+1, N) + sliding_means = similar(D) + m_profile = similar(D, 2N) + + prog = Progress((n-S)÷S, dt=1, desc="MP dist profile") + map(1:S:n-S) do i + dp = mpdist_profile(getwindow(T,S,i), T, m, d, D, sliding_means, m_profile) + next!(prog) + dp end end """ - mpdist_profile(A::AbstractVector, B::AbstractVector, m::Int) + mpdist_profile(A::AbstractVector, B::AbstractVector, m::Int, [d]) MP distance profile between two time series using window length `m`. """ @@ -58,13 +61,13 @@ function mpdist_profile( B::AbstractVector, A::AbstractVector, m::Int, - d = Euclidean(); - D = similar(A, lastlength(A)-m+1, lastlength(B)-m+1), - moving_means = similar(D), + d = ZEuclidean(), + D = Matrix{float(eltype(A))}(undef, lastlength(A)-m+1, lastlength(B)-m+1), + sliding_means = similar(D), m_profile = similar(D, 2size(D,2)) ) mpdistmat!(D,A,B,m,d) - th = 0.05 + th = floattype(B)(0.05) lA = lastlength(A) lB = lastlength(B) lA >= lB || @@ -73,12 +76,12 @@ function mpdist_profile( rows, cols = size(D) right_margs = vec(minimum(D, dims = 2)) for i = 1:cols - moving_mean!(moving_means[!, i], D[!, i], cols) + sliding_mean!(sliding_means[!, i], D[!, i], cols) end l = lA - lB + 1 map(1:l) do i - m_profile[1:cols] .= @view moving_means[i+cols÷2, :] # left marg + m_profile[1:cols] .= @view sliding_means[i+cols÷2, :] # left marg copyto!(m_profile, cols+1, right_margs, i, cols) mpdist(m_profile, th, 2 * lastlength(B)) end @@ -87,7 +90,7 @@ end function mpdistmat!(D, A::AbstractVector, B::AbstractVector, m::Int, d) N = lastlength(B)-m +1 for i = 1:N # Not thread safe - distance_profile!(D[!,i], Euclidean(), getwindow(B, m, i), A) + distance_profile!(D[!,i], ZEuclidean(), getwindow(B, m, i), A) end D end @@ -100,12 +103,12 @@ end Summarize time series `T` by extracting `k` snippets of length `S` The parameter `m` controls the window length used internally. """ -function snippets(T, k, S, d=Euclidean(); m = max(S÷10, 4), th=S) - D = mpdist_profile(T,S,m,d) - INF = typemax(floattype(T)) - Q = fill(INF, length(D[1])) - n = lastlength(T) - minI = 0 +function snippets(T, k, S, d=ZEuclidean(); m = max(S÷10, 4), th=S) + D = mpdist_profile(T,S,m,d) + INF = typemax(floattype(T)) + Q = fill(INF, length(D[1])) + n = lastlength(T) + minI = 0 onsets = zeros(Int, k) snippet_profiles = similar(D, k) for j = 1:k diff --git a/test/runtests.jl b/test/runtests.jl index 14aa467..726a002 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,13 +2,13 @@ using MatrixProfile using Test, Statistics, LinearAlgebra, Plots using SlidingDistancesBase -using MatrixProfile: znorm, window_dot, Profile +using MatrixProfile: Profile normdist = (x,y)->norm(znorm(x)-znorm(y)) function naive_matrix_profile(A,B,m) res = map(1:length(B)-m+1) do i - findmin(distance_profile(Euclidean(), getwindow(B,m,i), A)) + findmin(distance_profile(ZEuclidean(), getwindow(B,m,i), A)) end Profile(B,getindex.(res, 1), getindex.(res, 2), m, A) end @@ -23,44 +23,6 @@ end @testset "MatrixProfile.jl" begin - - @testset "running stats" begin - @info "Testing running stats" - for l1 = [20,30] - for l2 = [20,30] - x = randn(l1) - y = randn(l2) - for w = 2:10 - mx,sx = MatrixProfile.running_mean_std(x, w) - @test length(mx) == length(sx) == length(x)-w+1 - @test mx[1] ≈ mean(x[1:w]) - @test sx[1] ≈ std(x[1:w], corrected=false) - - @test mx[2] ≈ mean(x[2:w+1]) - @test sx[2] ≈ std(x[2:w+1], corrected=false) - - my,sy = MatrixProfile.running_mean_std(y, w) - QT = window_dot(getwindow(x,w,1), y) - D = distance_profile(Euclidean(), QT, mx,sx,my,sy,w) - @test D[1] ≈ normdist(x[1:w], y[1:w]) atol=1e-5 - @test D[2] ≈ normdist(x[1:w], y[1 .+ (1:w)]) atol=1e-5 - @test D[5] ≈ normdist(x[1:w], y[4 .+ (1:w)]) atol=1e-5 - - end - end - end - end - - @testset "distance profile" begin - @info "Testing distance profile" - - Q = randn(5) - T = [randn(5); Q; randn(5)] - D = MatrixProfile.distance_profile(Euclidean(), Q, T) - @test D[6] < 1e-6 - @test D[1] ≈ norm(znorm(Q) - znorm(T[1:5])) - end - t = range(0, stop=1, step=1/10) y0 = sin.(2pi .* t) T = [randn(50); y0; randn(50); y0; randn(50)] @@ -112,13 +74,9 @@ end profile2.I[i] ∈ (51,112) || profile2.I[i] == profile.I[i] end - - # Test generic between two series profile4 = matrix_profile(T, T, length(y0), normdist) @test all(profile4.P .< 1e-6) - - end @@ -156,7 +114,6 @@ end @testset "mpdist" begin @info "Testing mpdist" - t = 1:0.01:3 A = @. sin(2pi * t) + 0.1 * randn() B = @. sign(sin(2pi * t)) + 0.1 * randn()