
Commit

Merge branch 'master' into deploy
wannesm committed Nov 8, 2023
2 parents 078ccb6 + 93887ba commit a457816
Showing 64 changed files with 5,988 additions and 1,602 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/deploy.yml
@@ -10,8 +10,8 @@ name: Python package
on:
push:
branches: [ deploy ]
# pull_request:
# branches: [ deploy ]
# pull_request:
# branches: [ deploy ]

jobs:
# Explore-GitHub-Actions:
@@ -83,7 +83,7 @@ jobs:
matrix:
# os: [ubuntu-20.04,ubuntu-18.04]
# python-version: ["3.8","3.9","3.10"]
os: [ubuntu-18.04]
os: [ubuntu-22.04]
# cibuildwheel will automatically provide all Python versions
python-version: ["3.9"]
steps:
14 changes: 10 additions & 4 deletions .readthedocs.yml
@@ -1,9 +1,15 @@
version: 2

requirements_file: requirements.txt

build:
image: latest
os: ubuntu-22.04
tools:
python: "3.11"

python:
version: 3.6
setup_py_install: true
sphinx:
configuration: docs/conf.py

python:
install:
- requirements: docs/requirements.txt
2 changes: 2 additions & 0 deletions AUTHORS
@@ -5,7 +5,9 @@ Other contributors, listed alphabetically, are:
* Aras Yurtman (KU Leuven)
* Erlend Kvinge Jørgensen (Equanostic.com)
* Gust Verbruggen (KU Leuven)
* Hendrik Blockeel (KU Leuven)
* HendrikHuel (github.com/HendrikHuel)
* Jesse Davis (KU Leuven)
* Killian Hendrickx (Siemens PLM Software, KU Leuven)
* Lars Haalck (University of Münster)
* Marco Rossi (github.com/m-rossi)
6 changes: 6 additions & 0 deletions CITATION.cff
@@ -13,6 +13,12 @@ authors:
- family-names: "Robberechts"
given-names: "Pieter"
orcid: "https://orcid.org/0000-0002-3734-0047"
- family-names: "Blockeel"
given-names: "Hendrik"
orcid: "https://orcid.org/0000-0003-0378-3699"
- family-names: "Davis"
given-names: "Jesse"
orcid: "https://orcid.org/0000-0002-3748-9263"
title: "DTAIDistance"
version: 2
doi: 10.5281/zenodo.3981067
14 changes: 12 additions & 2 deletions Makefile
@@ -57,6 +57,10 @@ benchmark-matrixc:
benchmark-clustering:
export PYTHONPATH=.;py.test -k cluster ${BENCHMARKSETTINGS}

.PHONY: benchmark-subseqsearch
benchmark-subseqsearch:
export PYTHONPATH=.;py.test -k test_dtw_subseqsearch_eeg_lb ${BENCHMARKSETTINGS}


.PHONY: clean
clean:
@@ -80,8 +84,14 @@ clean:
rm -f dtaidistance/*.pyc
rm -rf dtaidistance/__pycache__

.PHONY: use-venv
use-venv:
$(eval $@_TMP := $(shell python3 -c 'import sys; print(sys.prefix)'))
@#@echo $($@_TMP)
@if [ -f "use_venv.txt" ]; then grep '$($@_TMP)' use_venv.txt || (echo "venv does not appear in use_venv.txt: $($@_TMP)"; exit 1) ;fi

.PHONY: build
build:
build: use-venv
python3 setup.py build_ext --inplace

.PHONY: pypy-build
@@ -108,7 +118,7 @@ prepare_tag:
@echo "Check whether repo is clean"
git diff-index --quiet HEAD
@echo "Check correct branch"
if [[ "$$(git rev-parse --abbrev-ref HEAD)" != "master" ]]; then echo 'Not master branch'; exit 1; fi
if [[ "$$(git rev-parse --abbrev-ref HEAD)" != "deploy" ]]; then echo 'Not deploy branch'; exit 1; fi
@echo "Add tag"
git tag "v$$(python3 setup.py --version)"
git push --tags
2 changes: 1 addition & 1 deletion README.md
@@ -23,7 +23,7 @@ Example:

Citing this work:

> Wannes Meert, Kilian Hendrickx, Toon Van Craenendonck & Pieter Robberechts.
> Wannes Meert, Kilian Hendrickx, Toon Van Craenendonck, Pieter Robberechts, Hendrik Blockeel & Jesse Davis.
> DTAIDistance (Version v2). Zenodo.
> http://doi.org/10.5281/zenodo.5901139
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -68,7 +68,7 @@
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
language = "en"

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
1 change: 1 addition & 0 deletions docs/index.rst
@@ -29,6 +29,7 @@ Source available on https://github.com/wannesm/dtaidistance.
usage/clustering
usage/subsequence
usage/sequence
usage/similarity
usage/changelist


2 changes: 2 additions & 0 deletions docs/requirements.txt
@@ -0,0 +1,2 @@
Cython
sphinx_rtd_theme
22 changes: 14 additions & 8 deletions docs/usage/dtw.rst
@@ -11,7 +11,7 @@ Dynamic Time Warping (DTW)
path = dtw.warping_path(s1, s2)
dtwvis.plot_warping(s1, s2, path, filename="warp.png")

.. figure:: https://people.cs.kuleuven.be/wannes.meert/dtw/dtw_example.png?v=5
.. figure:: /_static/dtw_example.png
:alt: DTW Example


@@ -269,19 +269,20 @@ documentation for a visual example).
Multi-dimensional DTW
^^^^^^^^^^^^^^^^^^^^^^

Compare two multi-dimensional sequences.
To compare two multivariate sequences, each multivariate time series, with n_timesteps
timesteps and a vector of n_values values at each timestep, is stored in a two-dimensional
array of size (n_timesteps, n_values). The first dimension of the data structure is the
sequence item index (i.e., the time step within the series) and the second dimension
is the index of the value in the vector.

Assumes the first dimension of the data structure to be the sequence item index
(or time series index).

For example, two 2-dimensional series with five timesteps:
For example, two 2-dimensional multivariate series with five timesteps:

::

from dtaidistance import dtw_ndim

series1 = np.array([[0, 0], # first 2-dim point at t=0
[0, 1], # second 2-dim point at t=1
series1 = np.array([[0, 0], # first point at t=0
[0, 1], # second point at t=1
[2, 1],
[0, 1],
[0, 0]], dtype=np.double)
@@ -302,3 +303,8 @@ n-dimensional sequences. If you want to compute the independent DTW
dtw_i = 0
for dim in range(ndim):
dtw_i += dtw.distance(s1[:,dim], s2[:,dim])

To compute a distance matrix between multivariate time series, the same
data structures as for univariate DTW are supported. The only difference
is that when all data is stored in a Numpy array, this is now a 3-dimensional
array of size (n_series, n_timesteps, n_values).
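
A minimal sketch of this layout (the values are made up for illustration, and it is
assumed that dtw_ndim.distance_matrix accepts the 3-dimensional array directly):

::

    import numpy as np
    from dtaidistance import dtw_ndim

    # shape is (n_series, n_timesteps, n_values) = (3, 5, 2)
    series = np.array(
        [[[0, 0], [0, 1], [2, 1], [0, 1], [0, 0]],
         [[0, 1], [2, 0], [0, 0], [0, 0], [0, 1]],
         [[1, 2], [0, 0], [0, 0], [0, 1], [1, 1]]], dtype=np.double)

    ds = dtw_ndim.distance_matrix(series)  # pairwise DTW distances between the 3 series
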
103 changes: 103 additions & 0 deletions docs/usage/similarity.rst
@@ -0,0 +1,103 @@
Similarity vs Distance
----------------------

Distances such as Euclidean distance or Dynamic Time Warping (DTW)
return a value that expresses *how far two instances are apart*.
Such a distance is equal to zero when the instances are equal, and larger than
zero otherwise. In certain cases you might need to translate this distance to:

- A *similarity measure* that inverts the meaning of the returned
  values and expresses *how close two instances are*. Typically also
bounded between 0 and 1, where now 1 means that two instances are equal.

- A *bounded distance* that limits the range of the distance between
0 and 1, where 0 means that two instances are equal. This can be achieved
  by squashing the distance between 0 and 1.

The DTAIDistance toolbox provides a number of transformations to
translate a distance to a similarity measure or to a squashed distance.

Similarity
~~~~~~~~~~

Some methods require as input a similarity instead of a distance
(e.g., spectral clustering). Therefore, it might be useful to translate
the computed distances to a similarity. There are different approaches
to achieve this that are supported by dtaidistance: exponential,
Gaussian, reciprocal, reverse.

For example, given a set of series (the rows) for which we want to compute the
pairwise similarity based on dynamic time warping:


.. code-block:: python

    import numpy as np
    from dtaidistance import dtw, similarity

    s = np.array([[0., 0, 1, 2, 1, 0, 1, 0, 0],
                  [0., 1, 2, 0, 0, 0, 0, 0, 0],
                  [1., 2, 0, 0, 0, 0, 0, 1, 1],
                  [0., 0, 1, 2, 1, 0, 1, 0, 0],
                  [0., 1, 2, 0, 0, 0, 0, 0, 0],
                  [1., 2, 0, 0, 0, 0, 0, 1, 1]])
    sim = similarity.distance_to_similarity(dtw.distance_matrix(s))

The result is:

.. code-block:: python

    [[1.00 0.53 0.37 1.00 0.53 0.37]
     [0.53 1.00 0.46 0.53 1.00 0.46]
     [0.37 0.46 1.00 0.37 0.46 1.00]
     [1.00 0.53 0.37 1.00 0.53 0.37]
     [0.53 1.00 0.46 0.53 1.00 0.46]
     [0.37 0.46 1.00 0.37 0.46 1.00]]

You can observe that the diagonal is all ones because each series
is similar to itself. And the series at index 0 and 3 are identical,
thus also resulting in a similarity of 1.

If you want to use a different conversion than the default exponential,
you can select it with the method argument:

.. code-block:: python

    distance_to_similarity(distances, method='exponential')
    distance_to_similarity(distances, method='gaussian')
    distance_to_similarity(distances, method='reciprocal')
    distance_to_similarity(distances, method='reverse')

When reapplying the distance_to_similarity function over multiple matrices, it is advised
to set the r argument manually (or extract them using the return_params
option). Otherwise they are computed based on
the given distance matrix and will be different from call to call.
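
For example, a minimal sketch that reuses one value of r across two hypothetical
distance matrices dists_a and dists_b (r=10 is an arbitrary value for illustration):

.. code-block:: python

    # Fixing r keeps the conversion identical for both matrices
    sim_a = similarity.distance_to_similarity(dists_a, r=10)
    sim_b = similarity.distance_to_similarity(dists_b, r=10)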

Squashing
~~~~~~~~~

Similarity reverses high values to low and low to high. If you want to
maintain the direction but squash the distances between 0 and 1, you can
use the squash function (based on Vercruyssen et al., Semi-supervised anomaly detection with an application to
water analytics, ICDM, 2018).

.. code-block:: python

    similarity.squash(dtw.distance_matrix(s))

This results in:

.. code-block:: python

    [[0.00 0.75 0.99 0.00 0.75 0.99]
     [0.75 0.00 0.94 0.75 0.00 0.94]
     [0.99 0.94 0.00 0.99 0.94 0.00]
     [0.00 0.75 0.99 0.00 0.75 0.99]
     [0.75 0.00 0.94 0.75 0.00 0.94]
     [0.99 0.94 0.00 0.99 0.94 0.00]]

You can observe the diagonal is all zeros again (when rounded, the values
are slightly larger than zero because logistic squashing is used). And
the most different series are close to 1.

When reapplying the squash function over multiple matrices, it is advised
to set the x0 and r arguments manually (or extract them using the return_params
option). Otherwise they are computed based on
the given distance matrix and will be different from call to call.
6 changes: 3 additions & 3 deletions docs/usage/subsequence.rst
@@ -52,11 +52,11 @@ If you want to find all matches (or the k best):
:alt: Subsequence alignment k-best matches


DTW subsequence search
~~~~~~~~~~~~~~~~~~~~~~
DTW subsequence search (KNN)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Similar to using alignment, we can also iterate over a sequence of series or windows
to search for the best match:
to search for the best match, or best k matches (k-Nearest Neighbors):

::

2 changes: 1 addition & 1 deletion dtaidistance/__init__.py
@@ -32,7 +32,7 @@
# "then run `cd {};python3 setup.py build_ext --inplace`.".format(dtaidistance_dir))
dtw_cc = None

__version__ = "2.3.10"
__version__ = "2.3.11"
__author__ = "Wannes Meert"
__copyright__ = "Copyright 2017-2022 KU Leuven, DTAI Research Group"
__license__ = "Apache License, Version 2.0"
19 changes: 16 additions & 3 deletions dtaidistance/clustering/kmeans.py
@@ -235,15 +235,22 @@ def kmeansplusplus_centers(self, series, use_c=False):
logger.debug('... Done')
return means

def fit_fast(self, series):
return self.fit(series, use_c=True, use_parallel=True)
def fit_fast(self, series, monitor_distances=None):
return self.fit(series, use_c=True, use_parallel=True, monitor_distances=monitor_distances)

def fit(self, series, use_c=False, use_parallel=True):
def fit(self, series, use_c=False, use_parallel=True, monitor_distances=None):
"""Perform K-means clustering.
:param series: Container with series
:param use_c: Use the C-library (only available if package is compiled)
:param use_parallel: Use multipool for parallelization
:param monitor_distances: This function is called with two arguments:
(1) a list of (cluster, distance) for each instance;
(2) a boolean indicating whether the clustering has been stopped or not.
From this one can compute inertia or other metrics
to monitor the clustering. If the boolean argument is true, this is the
final assignment. If this function returns True, the clustering
continues; if it returns False, the clustering is stopped.
:return: cluster indices, number of iterations
If the number of iterations is equal to max_it, the clustering
did not converge.
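
A minimal usage sketch for this callback (the KMeans construction, the value of k,
and the series container are assumptions for illustration, not part of this change):

    from dtaidistance.clustering.kmeans import KMeans

    def monitor(clusters_distances, is_final):
        # clusters_distances is a list of (cluster, distance) pairs, one per instance
        inertia = sum(d ** 2 for _, d in clusters_distances)
        print("inertia:", inertia, "final:", is_final)
        return True  # returning False would stop the clustering early

    model = KMeans(k=3)
    cluster_idx, n_iter = model.fit(series, monitor_distances=monitor)
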
@@ -299,6 +306,10 @@ def fit(self, series, use_c=False, use_parallel=True):
else:
clusters_distances = list(map(fn, [(self.series[idx], self.means, self.dists_options) for idx in
range(len(self.series))]))
if monitor_distances is not None:
cont = monitor_distances(clusters_distances, False)
if cont is False:
break
clusters, distances = zip(*clusters_distances)
distances = list(distances)

@@ -393,6 +404,8 @@ def fit(self, series, use_c=False, use_parallel=True):
else:
clusters_distances = list(map(fn, [(self.series[idx], self.means, self.dists_options) for idx in
range(len(self.series))]))
if monitor_distances is not None:
monitor_distances(clusters_distances, True)
clusters, distances = zip(*clusters_distances)

# self.cluster_idx = {medoid: {inst for inst in instances}
17 changes: 9 additions & 8 deletions dtaidistance/dtaidistancec_dtw.pxd
@@ -15,6 +15,7 @@ cdef extern from "dd_dtw.h":
Py_ssize_t psi_2e
bint use_pruning
bint only_ub
int inner_dist

ctypedef struct DTWBlock:
Py_ssize_t rb
@@ -68,8 +69,8 @@ cdef extern from "dd_dtw.h":
Py_ssize_t ce, DTWSettings *settings)
void dtw_wps_negativize_value(DTWWps *p, seq_t *wps, Py_ssize_t l1, Py_ssize_t l2, Py_ssize_t r, Py_ssize_t c)
void dtw_wps_positivize_value(DTWWps *p, seq_t *wps, Py_ssize_t l1, Py_ssize_t l2, Py_ssize_t r, Py_ssize_t c)
void dtw_wps_positivize(DTWWps *p, seq_t *wps, Py_ssize_t rb, Py_ssize_t re)
void dtw_wps_negativize(DTWWps *p, seq_t *wps, Py_ssize_t rb, Py_ssize_t re)
void dtw_wps_positivize(DTWWps *p, seq_t *wps, Py_ssize_t l1, Py_ssize_t l2, Py_ssize_t rb, Py_ssize_t re, Py_ssize_t cb, Py_ssize_t ce)
void dtw_wps_negativize(DTWWps *p, seq_t *wps, Py_ssize_t l1, Py_ssize_t l2, Py_ssize_t rb, Py_ssize_t re, Py_ssize_t cb, Py_ssize_t ce)
Py_ssize_t dtw_wps_loc(DTWWps *p, Py_ssize_t r, Py_ssize_t c, Py_ssize_t l1, Py_ssize_t l2)
Py_ssize_t dtw_wps_max(DTWWps * p, seq_t *wps, Py_ssize_t *r, Py_ssize_t *c, Py_ssize_t l1, Py_ssize_t l2)
Py_ssize_t dtw_best_path(seq_t *wps, Py_ssize_t *i1, Py_ssize_t *i2, Py_ssize_t l1, Py_ssize_t l2,
@@ -79,13 +80,13 @@ cdef extern from "dd_dtw.h":
DTWSettings *settings)
Py_ssize_t dtw_best_path_prob(seq_t *wps, Py_ssize_t *i1, Py_ssize_t *i2, Py_ssize_t l1, Py_ssize_t l2,
seq_t avg, DTWSettings *settings);
Py_ssize_t warping_path(seq_t *from_s, Py_ssize_t from_l, seq_t* to_s, Py_ssize_t to_l,
Py_ssize_t *from_i, Py_ssize_t *to_i, DTWSettings * settings)
Py_ssize_t warping_path_ndim(seq_t *from_s, Py_ssize_t from_l, seq_t * to_s, Py_ssize_t to_l,
Py_ssize_t *from_i, Py_ssize_t *to_i, int ndim, DTWSettings * settings)
seq_t dtw_warping_path(seq_t *from_s, Py_ssize_t from_l, seq_t* to_s, Py_ssize_t to_l,
Py_ssize_t *from_i, Py_ssize_t *to_i, Py_ssize_t *length_i, DTWSettings * settings)
seq_t dtw_warping_path_ndim(seq_t *from_s, Py_ssize_t from_l, seq_t * to_s, Py_ssize_t to_l,
Py_ssize_t *from_i, Py_ssize_t *to_i, Py_ssize_t *length_i, int ndim, DTWSettings * settings)
void dtw_srand(unsigned int seed)
Py_ssize_t warping_path_prob_ndim(seq_t *from_s, Py_ssize_t from_l, seq_t* to_s, Py_ssize_t to_l,
Py_ssize_t *from_i, Py_ssize_t *to_i, seq_t avg, int ndim, DTWSettings * settings)
seq_t dtw_warping_path_prob_ndim(seq_t *from_s, Py_ssize_t from_l, seq_t* to_s, Py_ssize_t to_l,
Py_ssize_t *from_i, Py_ssize_t *to_i, Py_ssize_t *length_i, seq_t avg, int ndim, DTWSettings * settings)
DTWWps dtw_wps_parts(Py_ssize_t l1, Py_ssize_t l2, DTWSettings * settings)

seq_t ub_euclidean(seq_t *s1, Py_ssize_t l1, seq_t *s2, Py_ssize_t l2)
2 changes: 2 additions & 0 deletions dtaidistance/dtaidistancec_ed.pxd
@@ -4,4 +4,6 @@ from dtaidistancec_globals cimport seq_t

cdef extern from "dd_ed.h":
seq_t euclidean_distance(seq_t *s1, Py_ssize_t l1, seq_t *s2, Py_ssize_t l2)
seq_t euclidean_distance_euclidean(seq_t *s1, Py_ssize_t l1, seq_t *s2, Py_ssize_t l2)
seq_t euclidean_distance_ndim(seq_t *s1, Py_ssize_t l1, seq_t *s2, Py_ssize_t l2, int ndim)
seq_t euclidean_distance_ndim_euclidean(seq_t *s1, Py_ssize_t l1, seq_t *s2, Py_ssize_t l2, int ndim)