From 4f45a44160189788de8124edce314658b5b13b0f Mon Sep 17 00:00:00 2001
From: Fredrik Bagge Carlson <baggepinnen@gmail.com>
Date: Sun, 22 Nov 2020 15:41:44 +0100
Subject: [PATCH] add segmentation

---
 Project.toml         |  2 +-
 README.md            |  6 +++++-
 src/MatrixProfile.jl |  3 ++-
 src/motifs.jl        | 11 ++++++++++-
 src/segment.jl       | 40 ++++++++++++++++++++++++++++++++++++++++
 test/runtests.jl     | 24 ++++++++++++++++++++++++
 6 files changed, 82 insertions(+), 4 deletions(-)
 create mode 100644 src/segment.jl

diff --git a/Project.toml b/Project.toml
index b62720e..0237bb1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "MatrixProfile"
 uuid = "24e37439-14ec-4097-bda3-6a65822e2305"
 authors = ["Fredrik Bagge Carlson"]
-version = "0.1.3"
+version = "1.0.0"
 
 [deps]
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
diff --git a/README.md b/README.md
index 2c82100..38cd964 100644
--- a/README.md
+++ b/README.md
@@ -68,6 +68,9 @@ If `T` is a high-dimensional array, time is considered to be the last axis. `T`
 ## MP distance
 See `mpdist(A,B,m)`.
 
+## Segmentation / change-point detection
+The most likely segmentation of a time series into two is calculated using `segment(p::Profile)`. A more detailed analysis can be performed using `sp = segment_profile(p::Profile)` which returns a vector of the same length as `p`, where a low value at index `i` indicates that few nearest-neighbor arcs pass over index `i`, `sp` thus form sort-of a "segmentation profile".
+
 
 ## Time series snippets
 To summarize a time series in the form of a small number of snippets, we have the function `snippets`.
@@ -88,5 +91,6 @@ This function can take a while to run for long time-series, for `length(T) = 15k
 
 ## References
 - The STOMP algorithm used in `matrix_profile` is detailed in the paper [Matrix profile II](https://www.cs.ucr.edu/~eamonn/STOMP_GPU_final_submission_camera_ready.pdf).
+- The algorithm used in `segment` and `segment_profile` comes from [Matrix Profile VIII](https://www.cs.ucr.edu/~eamonn/Segmentation_ICDM.pdf)
 - The MP distance is described in [Matrix profile XII](https://www.cs.ucr.edu/~eamonn/MPdist_Expanded.pdf)
-- The algorithm for extraction of time-series snippets comes from [Matrix profile XIII](https://www.cs.ucr.edu/~eamonn/Time_Series_Snippets_10pages.pdf)
+- The algorithm for extraction of time-series snippets comes from [Matrix profile XIII](https://www.cs.ucr.edu/~eamonn/Time_Series_Snippets_10pages.pdf)
\ No newline at end of file
diff --git a/src/MatrixProfile.jl b/src/MatrixProfile.jl
index 2bb8aba..6bedac5 100644
--- a/src/MatrixProfile.jl
+++ b/src/MatrixProfile.jl
@@ -10,7 +10,7 @@ using SlidingDistancesBase
 import SlidingDistancesBase: floattype, lastlength, distance_profile, distance_profile!, window_dot
 export Euclidean, ZEuclidean
 
-export matrix_profile, distance_profile, motifs, anomalies, mpdist, mpdist_profile, onsets, snippets, seqs, resample
+export matrix_profile, distance_profile, motifs, anomalies, mpdist, mpdist_profile, onsets, snippets, seqs, resample, segment, segment_profile
 
 
 struct Profile{TT,TP,QT}
@@ -146,6 +146,7 @@ end
 
 include("motifs.jl")
 include("mpdist.jl")
+include("segment.jl")
 include("plotting.jl")
 
 
diff --git a/src/motifs.jl b/src/motifs.jl
index 0ea32e1..e874988 100644
--- a/src/motifs.jl
+++ b/src/motifs.jl
@@ -27,6 +27,15 @@ seqs(m::Array{<:Subsequence}) = getfield.(m, :seq)
 seqlength(m::Array{<:Subsequence}) = length(m[1].seq)
 subseqtype(m::Array{<:Subsequence}) = subseqtype(m[1])
 
+"""
+    motifs(p::Profile, k, found_motifs = Motif[]; r=2, th = p.m, dist = ZEuclidean())
+
+- `k` is the number of motifs to extract
+- `r` controls how similar two windows must be to belong to the same motif. A higher value leads to more windows being grouped together.
+- `th` is a threshold on how nearby in time two motifs are allowed to be.
+
+Also see the function `anomalies(profile)` to find anomalies in the data, sometimes called *discords*.
+"""
 function motifs(p::Profile, k, found_motifs = Motif[]; r=2, th = p.m, dist = ZEuclidean())
     length(found_motifs) == k && return found_motifs
     m = p.m
@@ -38,7 +47,7 @@ function motifs(p::Profile, k, found_motifs = Motif[]; r=2, th = p.m, dist = ZEu
     perm = sortperm(P)
     onsets = [perm[1]]
     j = 2
-    while P[perm[j]] < (d + 1e-5)*r
+    while P[perm[j]] < (d + 1e-5)*r && j < length(perm)
         all(abs.(perm[j] .- onsets) .> th) && push!(onsets, perm[j])
         j += 1
     end
diff --git a/src/segment.jl b/src/segment.jl
new file mode 100644
index 0000000..d9fa910
--- /dev/null
+++ b/src/segment.jl
@@ -0,0 +1,40 @@
+"expected_arc(i,n) this polynomial indicates the expected number of NN-arcs that would pass over index `i` for a series of length`n` if the series was completely random. This is used to normalize the `segment_profile` to account for bias towards the edges."
+expected_arc(i,n) = 2*(-i^2*n/(n-1)^2 + i*n/(n-1)) # polynomial coefficients solved for by using constraints e(0) = 0, e(n-1) = 0, e(n/2) = n/2
+
+"""
+    segment_profile(p::Profile)
+
+Calculate the MP-index of a matrix profile. This index has the same length as the profile, and tells you how many nearest-neighbor arcs passes over index `i`. It's normalized by the expected number of arcs for a completely random time series, so that a value of 1 indicates a poor segmentation point, and a value close to 0 indicates a likely segmentation point.
+"""
+function segment_profile(p::Profile)
+    I = p.I
+    n = length(I)
+    mpi = zeros(eltype(p.P), n)
+    for i in 1:n
+        Ii = I[i]
+        if Ii > i
+            for j = i:Ii-1
+                mpi[j] += 1
+            end
+        else
+            for j = Ii+1:i
+                mpi[j] += 1
+            end
+        end
+    end
+    mpi .= min.(mpi ./ expected_arc.(0:n-1, n), 1)
+    mpi
+end
+
+"""
+    i = segment(p::Profile)
+
+Returns an index `i` indicating the most likely segmentation point of profile `p`, i.e., the point which the fewst nearest-neighbor arcs passes over.
+
+Ref: Matrix Profile VIII: Domain Agnostic Online Semantic
+Segmentation at Superhuman Performance Levels
+"""
+function segment(p::Profile)
+    argmin(segment_profile(p))
+end
+
diff --git a/test/runtests.jl b/test/runtests.jl
index 4cc0b46..f11aa8c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -136,6 +136,30 @@ end
    end
 
 
+   @testset "segment_profile and segmentation" begin
+       @info "Testing segment_profile and segmentation"
+
+        @test MatrixProfile.expected_arc(0,10) == 0
+        @test MatrixProfile.expected_arc(9,10) == 0
+        @test MatrixProfile.expected_arc(9/2,10) ≈ 5
+
+        x = randn(1000)
+        p = matrix_profile(x, 20)
+        s = segment_profile(p)
+        @test minimum(s) > 0.5
+
+        x = [sin.(1:0.1:100); sign.(sin.(1:0.1:100))]
+        x .+= 0.01 .* randn.()
+        p = p = matrix_profile(x, 20)
+        s = segment_profile(p)
+        val, ind = findmin(s)
+        @test abs(ind-length(s)÷2) < 10
+        @test val < 0.01
+
+        @test segment(p) == ind
+   end
+
+
  end