diff --git a/Project.toml b/Project.toml index b62720e..0237bb1 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MatrixProfile" uuid = "24e37439-14ec-4097-bda3-6a65822e2305" authors = ["Fredrik Bagge Carlson"] -version = "0.1.3" +version = "1.0.0" [deps] Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" diff --git a/README.md b/README.md index 2c82100..38cd964 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,9 @@ If `T` is a high-dimensional array, time is considered to be the last axis. `T` ## MP distance See `mpdist(A,B,m)`. +## Segmentation / change-point detection +The most likely segmentation of a time series into two is calculated using `segment(p::Profile)`. A more detailed analysis can be performed using `sp = segment_profile(p::Profile)` which returns a vector of the same length as `p`, where a low value at index `i` indicates that few nearest-neighbor arcs pass over index `i`, `sp` thus form sort-of a "segmentation profile". + ## Time series snippets To summarize a time series in the form of a small number of snippets, we have the function `snippets`. @@ -88,5 +91,6 @@ This function can take a while to run for long time-series, for `length(T) = 15k ## References - The STOMP algorithm used in `matrix_profile` is detailed in the paper [Matrix profile II](https://www.cs.ucr.edu/~eamonn/STOMP_GPU_final_submission_camera_ready.pdf). +- The algorithm used in `segment` and `segment_profile` comes from [Matrix Profile VIII](https://www.cs.ucr.edu/~eamonn/Segmentation_ICDM.pdf) - The MP distance is described in [Matrix profile XII](https://www.cs.ucr.edu/~eamonn/MPdist_Expanded.pdf) -- The algorithm for extraction of time-series snippets comes from [Matrix profile XIII](https://www.cs.ucr.edu/~eamonn/Time_Series_Snippets_10pages.pdf) +- The algorithm for extraction of time-series snippets comes from [Matrix profile XIII](https://www.cs.ucr.edu/~eamonn/Time_Series_Snippets_10pages.pdf) \ No newline at end of file diff --git a/src/MatrixProfile.jl b/src/MatrixProfile.jl index 2bb8aba..6bedac5 100644 --- a/src/MatrixProfile.jl +++ b/src/MatrixProfile.jl @@ -10,7 +10,7 @@ using SlidingDistancesBase import SlidingDistancesBase: floattype, lastlength, distance_profile, distance_profile!, window_dot export Euclidean, ZEuclidean -export matrix_profile, distance_profile, motifs, anomalies, mpdist, mpdist_profile, onsets, snippets, seqs, resample +export matrix_profile, distance_profile, motifs, anomalies, mpdist, mpdist_profile, onsets, snippets, seqs, resample, segment, segment_profile struct Profile{TT,TP,QT} @@ -146,6 +146,7 @@ end include("motifs.jl") include("mpdist.jl") +include("segment.jl") include("plotting.jl") diff --git a/src/motifs.jl b/src/motifs.jl index 0ea32e1..e874988 100644 --- a/src/motifs.jl +++ b/src/motifs.jl @@ -27,6 +27,15 @@ seqs(m::Array{<:Subsequence}) = getfield.(m, :seq) seqlength(m::Array{<:Subsequence}) = length(m[1].seq) subseqtype(m::Array{<:Subsequence}) = subseqtype(m[1]) +""" + motifs(p::Profile, k, found_motifs = Motif[]; r=2, th = p.m, dist = ZEuclidean()) + +- `k` is the number of motifs to extract +- `r` controls how similar two windows must be to belong to the same motif. A higher value leads to more windows being grouped together. +- `th` is a threshold on how nearby in time two motifs are allowed to be. + +Also see the function `anomalies(profile)` to find anomalies in the data, sometimes called *discords*. +""" function motifs(p::Profile, k, found_motifs = Motif[]; r=2, th = p.m, dist = ZEuclidean()) length(found_motifs) == k && return found_motifs m = p.m @@ -38,7 +47,7 @@ function motifs(p::Profile, k, found_motifs = Motif[]; r=2, th = p.m, dist = ZEu perm = sortperm(P) onsets = [perm[1]] j = 2 - while P[perm[j]] < (d + 1e-5)*r + while P[perm[j]] < (d + 1e-5)*r && j < length(perm) all(abs.(perm[j] .- onsets) .> th) && push!(onsets, perm[j]) j += 1 end diff --git a/src/segment.jl b/src/segment.jl new file mode 100644 index 0000000..d9fa910 --- /dev/null +++ b/src/segment.jl @@ -0,0 +1,40 @@ +"expected_arc(i,n) this polynomial indicates the expected number of NN-arcs that would pass over index `i` for a series of length`n` if the series was completely random. This is used to normalize the `segment_profile` to account for bias towards the edges." +expected_arc(i,n) = 2*(-i^2*n/(n-1)^2 + i*n/(n-1)) # polynomial coefficients solved for by using constraints e(0) = 0, e(n-1) = 0, e(n/2) = n/2 + +""" + segment_profile(p::Profile) + +Calculate the MP-index of a matrix profile. This index has the same length as the profile, and tells you how many nearest-neighbor arcs passes over index `i`. It's normalized by the expected number of arcs for a completely random time series, so that a value of 1 indicates a poor segmentation point, and a value close to 0 indicates a likely segmentation point. +""" +function segment_profile(p::Profile) + I = p.I + n = length(I) + mpi = zeros(eltype(p.P), n) + for i in 1:n + Ii = I[i] + if Ii > i + for j = i:Ii-1 + mpi[j] += 1 + end + else + for j = Ii+1:i + mpi[j] += 1 + end + end + end + mpi .= min.(mpi ./ expected_arc.(0:n-1, n), 1) + mpi +end + +""" + i = segment(p::Profile) + +Returns an index `i` indicating the most likely segmentation point of profile `p`, i.e., the point which the fewst nearest-neighbor arcs passes over. + +Ref: Matrix Profile VIII: Domain Agnostic Online Semantic +Segmentation at Superhuman Performance Levels +""" +function segment(p::Profile) + argmin(segment_profile(p)) +end + diff --git a/test/runtests.jl b/test/runtests.jl index 4cc0b46..f11aa8c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -136,6 +136,30 @@ end end + @testset "segment_profile and segmentation" begin + @info "Testing segment_profile and segmentation" + + @test MatrixProfile.expected_arc(0,10) == 0 + @test MatrixProfile.expected_arc(9,10) == 0 + @test MatrixProfile.expected_arc(9/2,10) ≈ 5 + + x = randn(1000) + p = matrix_profile(x, 20) + s = segment_profile(p) + @test minimum(s) > 0.5 + + x = [sin.(1:0.1:100); sign.(sin.(1:0.1:100))] + x .+= 0.01 .* randn.() + p = p = matrix_profile(x, 20) + s = segment_profile(p) + val, ind = findmin(s) + @test abs(ind-length(s)÷2) < 10 + @test val < 0.01 + + @test segment(p) == ind + end + + end