#659: [Hotfix] Empty clusters and their medoids are erased for K-Medoids.
annoviko · Nov 25, 2020 · 19a1d6c · 19a1d6c
1 parent 878c199
commit 19a1d6c
Show file tree

Hide file tree

Showing 13 changed files with 261 additions and 47 deletions.
diff --git a/CHANGES b/CHANGES
@@ -1,6 +1,6 @@
 ------------------------------------------------------------------------
 
-CHANGE NOTES FOR 0.10.1.1 (STARTED Aug 24, 2020), (RELEASED: Nov 24, 2020)
+CHANGE NOTES FOR 0.10.1.2 (STARTED Nov 25, 2020), (RELEASED: Nov 25, 2020)
 
 ------------------------------------------------------------------------
 
@@ -10,6 +10,18 @@ CORRECTED MAJOR BUGS:
   See: https://github.com/annoviko/pyclustering/issues/659
 
 
+------------------------------------------------------------------------
+
+CHANGE NOTES FOR 0.10.1.1 (STARTED Nov 24, 2020), (RELEASED: Nov 24, 2020)
+
+------------------------------------------------------------------------
+
+CORRECTED MAJOR BUGS:
+
+- Corrected bug with incorrect cluster allocation for K-Medoids (C++ `pyclustering::clst::kmedoids`).
+  See: https://github.com/annoviko/pyclustering/issues/659
+
+
 ------------------------------------------------------------------------
 
 CHANGE NOTES FOR 0.10.1 (STARTED Aug 17, 2020), (RELEASED: Nov 19, 2020)

diff --git a/README.rst b/README.rst
@@ -9,7 +9,7 @@ Python and C++ implementations (C++ pyclustering library) of each algorithm or
 model. C++ pyclustering library is a part of pyclustering and supported for
 Linux, Windows and MacOS operating systems.
 
-**Version**: 0.10.1.1
+**Version**: 0.10.1.2
 
 **License**: The 3-Clause BSD License
 

diff --git a/ccore/CMakeLists.txt b/ccore/CMakeLists.txt
@@ -8,7 +8,7 @@
 cmake_minimum_required(VERSION 3.10)
 
 
-project(pyclustering VERSION 0.10.1.1 LANGUAGES CXX)
+project(pyclustering VERSION 0.10.1.2 LANGUAGES CXX)
 
 
 file(MAKE_DIRECTORY build)

diff --git a/ccore/include/pyclustering/cluster/kmedoids.hpp b/ccore/include/pyclustering/cluster/kmedoids.hpp
@@ -199,6 +199,15 @@ class kmedoids {
     
     */
     double calculate_swap_cost(const std::size_t p_index_candidate, const std::size_t p_index_cluster) const;
+
+    /*!
+
+    @brief      Erase empty clusters and their medoids.
+    @details    Input data may contain many identical points; as a result, several medoids may correspond to points
+                  that are totally identical, which can leave their clusters empty.
+
+    */
+    void erase_empty_clusters();
 };
 
 

diff --git a/ccore/src/cluster/kmedoids.cpp b/ccore/src/cluster/kmedoids.cpp
@@ -95,6 +95,8 @@ void kmedoids::process(const dataset & p_data, const kmedoids_data_t p_type, kme
         }
     }
 
+    erase_empty_clusters();
+
     m_data_ptr = nullptr;
     m_result_ptr = nullptr;
 }
@@ -222,6 +224,19 @@ double kmedoids::calculate_swap_cost(const std::size_t p_index_candidate, const
 }
 
 
+void kmedoids::erase_empty_clusters() {
+    auto & clusters = m_result_ptr->clusters();
+    auto & medoids = m_result_ptr->medoids();
+
+    for (std::size_t index_cluster = clusters.size() - 1; index_cluster != static_cast<std::size_t>(-1); index_cluster--) {
+        if (clusters[index_cluster].empty()) {
+            clusters.erase(clusters.begin() + index_cluster);
+            medoids.erase(medoids.begin() + index_cluster);
+        }
+    }
+}
+
+
 }
 
 }
diff --git a/ccore/src/interface/interface_property.cpp b/ccore/src/interface/interface_property.cpp
@@ -11,7 +11,7 @@
 
 
 const char * INTERFACE_DESCRIPTION  = "pyclustering library is a C/C++ part of python pyclustering library";
-const char * INTERFACE_VERSION      = "0.10.1.1";
+const char * INTERFACE_VERSION      = "0.10.1.2";
 
 
 void * get_interface_description() {

diff --git a/pyclustering/__init__.py b/pyclustering/__init__.py
@@ -283,7 +283,7 @@
 
 
 ## The current version of pyclustering library.
-__version__ = '0.10.1.1'
+__version__ = '0.10.1.2'
 
 ## The current root directory of pyclustering library.
 __PYCLUSTERING_ROOT_DIRECTORY__ = str(pathlib.Path(__file__).parent)
diff --git a/pyclustering/cluster/examples/kmedoids_examples.py b/pyclustering/cluster/examples/kmedoids_examples.py
@@ -13,15 +13,21 @@
 from pyclustering.cluster import cluster_visualizer
 from pyclustering.cluster.kmedoids import kmedoids
 
-from pyclustering.utils import read_sample
+from pyclustering.utils import read_sample, calculate_distance_matrix
 from pyclustering.utils import timedcall, distance_metric, type_metric
 
 
-def template_clustering(start_medoids, path, tolerance=0.25, show=True, ccore=True):
-    sample = read_sample(path)
+def template_clustering(start_medoids, path, tolerance=0.25, show=True, **kwargs):
+    ccore = kwargs.get('ccore', True)
+    data_type = kwargs.get('data_type', 'points')
+
+    original_data = read_sample(path)
+    sample = original_data
+    if data_type == 'distance_matrix':
+        sample = calculate_distance_matrix(sample)
 
     metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, data=sample)
-    kmedoids_instance = kmedoids(sample, start_medoids, tolerance, metric=metric, ccore=ccore)
+    kmedoids_instance = kmedoids(sample, start_medoids, tolerance, metric=metric, ccore=ccore, data_type=data_type)
     (ticks, result) = timedcall(kmedoids_instance.process)
 
     clusters = kmedoids_instance.get_clusters()
@@ -31,12 +37,12 @@ def template_clustering(start_medoids, path, tolerance=0.25, show=True, ccore=Tr
 
     if show is True:
         visualizer = cluster_visualizer(1)
-        visualizer.append_clusters(clusters, sample, 0)
-        visualizer.append_cluster([sample[index] for index in start_medoids], marker='*', markersize=15)
-        visualizer.append_cluster(medoids, data=sample, marker='*', markersize=15)
+        visualizer.append_clusters(clusters, original_data, 0)
+        visualizer.append_cluster([original_data[index] for index in start_medoids], marker='*', markersize=15)
+        visualizer.append_cluster(medoids, data=original_data, marker='*', markersize=15)
         visualizer.show()
 
-    return sample, clusters
+    return original_data, clusters
 
 
 def cluster_sample1():
@@ -58,7 +64,8 @@ def cluster_elongate():
     template_clustering([8, 56], SIMPLE_SAMPLES.SAMPLE_ELONGATE)
 
 def cluster_lsun():
-    template_clustering([10, 275, 385], FCPS_SAMPLES.SAMPLE_LSUN)
+    #template_clustering([10, 275, 385], FCPS_SAMPLES.SAMPLE_LSUN)
+    template_clustering([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], FCPS_SAMPLES.SAMPLE_LSUN, data_type='distance_matrix')
 
 def cluster_target():
     template_clustering([10, 160, 310, 460, 560, 700], FCPS_SAMPLES.SAMPLE_TARGET)

diff --git a/pyclustering/cluster/kmedoids.py b/pyclustering/cluster/kmedoids.py
@@ -105,11 +105,11 @@ def __init__(self, data, initial_index_medoids, tolerance=0.0001, ccore=True, **
         @param[in] initial_index_medoids (list): Indexes of intial medoids (indexes of points in input data).
         @param[in] tolerance (double): Stop condition: if maximum value of distance change of medoids of clusters is less than tolerance than algorithm will stop processing.
         @param[in] ccore (bool): If specified than CCORE library (C++ pyclustering library) is used for clustering instead of Python code.
-        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'data_type', 'itermax').
+        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `metric`, `data_type`, `itermax`).
 
         <b>Keyword Args:</b><br>
             - metric (distance_metric): Metric that is used for distance calculation between two points.
-            - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
+            - data_type (string): Data type of input sample `data` that is processed by the algorithm (`points`, `distance_matrix`).
             - itermax (uint): Maximum number of iteration for cluster analysis.
 
         """
@@ -172,6 +172,8 @@ def process(self):
 
                 iterations += 1
 
+            self.__erase_empty_clusters()
+
         return self
 
 
@@ -316,7 +318,7 @@ def __update_clusters(self):
 
             for index in range(len(self.__medoid_indexes)):
                 dist = self.__distance_calculator(index_point, self.__medoid_indexes[index])
-                
+
                 if dist < dist_optim_first:
                     dist_optim_second = dist_optim_first
                     index_optim = index
@@ -386,3 +388,36 @@ def __calculate_swap_cost(self, index_candidate, cluster_index):
                 cost += candidate_distance - self.__distance_first_medoid[index_point]
 
         return cost - self.__distance_first_medoid[index_candidate]
+
+
+    def __erase_empty_clusters(self):
+        """!
+        @brief Erase empty clusters and their medoids.
+        @details Input data may contain many identical points; as a result, several medoids may correspond to points
+                  that are totally identical, which can leave their clusters empty.
+
+        """
+
+        erase_required = False
+
+        # Before processing check if there are empty clusters
+        for cluster in self.__clusters:
+            if len(cluster) == 0:
+                erase_required = True
+                break
+
+        if erase_required is False:
+            return
+
+        none_empty_clusters = []
+        none_empty_medoids = []
+
+        for index in range(len(self.__clusters)):
+            if len(self.__clusters[index]) == 0:
+                continue
+
+            none_empty_clusters.append(self.__clusters[index])
+            none_empty_medoids.append(self.__medoid_indexes[index])
+
+        self.__clusters = none_empty_clusters
+        self.__medoid_indexes = none_empty_medoids
diff --git a/pyclustering/cluster/tests/integration/it_kmedoids.py b/pyclustering/cluster/tests/integration/it_kmedoids.py
@@ -18,7 +18,7 @@
 from pyclustering.cluster.tests.kmedoids_templates import kmedoids_test_template
 from pyclustering.cluster.kmedoids import kmedoids
 
-from pyclustering.samples.definitions import SIMPLE_SAMPLES
+from pyclustering.samples.definitions import SIMPLE_SAMPLES, SIMPLE_ANSWERS
 
 from pyclustering.utils import read_sample
 from pyclustering.utils.metric import type_metric, distance_metric
@@ -139,7 +139,7 @@ def testClusterAllocationTheSameObjectsThreeInitialMedoidsByCore(self):
         kmedoids_test_template.templateClusterAllocationTheSameObjects(25, 3, True)
 
     def testCoreInterfaceIntInputData(self):
-        kmedoids_instance = kmedoids([ [1], [2], [3], [20], [21], [22] ], [ 2, 5 ], 0.025, True)
+        kmedoids_instance = kmedoids([[1], [2], [3], [20], [21], [22]], [2, 5], 0.025, True)
         kmedoids_instance.process()
         assert len(kmedoids_instance.get_clusters()) == 2
 
@@ -153,18 +153,21 @@ def testAllocatedRequestedClustersSampleSimple04ByCore(self):
 
     def testAllocatedRequestedClustersWithTheSamePointsByCore(self):
         # Bug issue #366 - Kmedoids returns incorrect number of clusters.
-        sample = [ [0.0, 0.0], [0.1, 0.1], [0.0, 0.0], [0.1, 0.2] ]
-        kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 4, None, True)
+        sample = [[0.0, 0.0], [0.1, 0.1], [0.0, 0.0], [0.1, 0.2]]
+        kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 3, None, True)
         kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 3, None, True)
         kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 2, None, True)
         kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 1, None, True)
 
+    def testAllocatedRequestedClustersWithTheSamePoints2(self):
+        sample = [[0.23, 0.2], [-0.1, 0.1], [0.0, 0.9], [0.1, -0.2], [0.8, 0.1], [-0.1, 0.1], [-0.4, -0.2], [0.0, 0.9]]
+        answers = [1, 2, 3, 4, 5, 6, 6, 6]
+        for expected_amount in answers:
+            kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, expected_amount, None, True)
+
     def testAllocatedRequestedClustersWithTotallyTheSamePointsByCore(self):
         # Bug issue #366 - Kmedoids returns incorrect number of clusters.
-        sample = [ [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0] ]
-        kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 4, None, True)
-        kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 3, None, True)
-        kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 2, None, True)
+        sample = [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]
         kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 1, None, True)
 
 
@@ -184,3 +187,82 @@ def testItermax10Simple01(self):
 
     def testItermax10Simple02(self):
         kmedoids_test_template.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [3, 12, 20], [10, 5, 8], True, itermax=10)
+
+
+    def testSimple01AnswerByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, SIMPLE_ANSWERS.ANSWER_SIMPLE1, True, random_state=1000)
+
+    def testSimple01AnswerDistanceMatrixByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, SIMPLE_ANSWERS.ANSWER_SIMPLE1, True, random_state=1000, data_type='distance_matrix')
+
+    def testSimple02AnswerByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, SIMPLE_ANSWERS.ANSWER_SIMPLE2, True, random_state=1000)
+
+    def testSimple02AnswerDistanceMatrixByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, SIMPLE_ANSWERS.ANSWER_SIMPLE2, True, random_state=1000, data_type='distance_matrix')
+
+    def testSimple03AnswerByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, SIMPLE_ANSWERS.ANSWER_SIMPLE3, True, random_state=1000)
+
+    def testSimple03AnswerDistanceMatrixByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, SIMPLE_ANSWERS.ANSWER_SIMPLE3, True, random_state=1000, data_type='distance_matrix')
+
+    def testSimple04AnswerByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, SIMPLE_ANSWERS.ANSWER_SIMPLE4, True, random_state=1000)
+
+    def testSimple04AnswerDistanceMatrixByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, SIMPLE_ANSWERS.ANSWER_SIMPLE4, True, random_state=1000, data_type='distance_matrix')
+
+    def testSimple05AnswerByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE5, SIMPLE_ANSWERS.ANSWER_SIMPLE5, True, random_state=1000)
+
+    def testSimple05AnswerDistanceMatrixByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE5, SIMPLE_ANSWERS.ANSWER_SIMPLE5, True, random_state=1000, data_type='distance_matrix')
+
+    def testSimple06AnswerByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE6, SIMPLE_ANSWERS.ANSWER_SIMPLE6, True, random_state=1000)
+
+    def testSimple06AnswerDistanceMatrixByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE6, SIMPLE_ANSWERS.ANSWER_SIMPLE6, True, random_state=1000, data_type='distance_matrix')
+
+    def testSimple07AnswerByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE7, SIMPLE_ANSWERS.ANSWER_SIMPLE7, True, random_state=1000)
+
+    def testSimple07AnswerDistanceMatrixByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE7, SIMPLE_ANSWERS.ANSWER_SIMPLE7, True, random_state=1000, data_type='distance_matrix')
+
+    def testSimple08AnswerByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE8, SIMPLE_ANSWERS.ANSWER_SIMPLE8, True, random_state=1000)
+
+    def testSimple08AnswerDistanceMatrixByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE8, SIMPLE_ANSWERS.ANSWER_SIMPLE8, True, random_state=1000, data_type='distance_matrix')
+
+    def testSimple09AnswerByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE9, SIMPLE_ANSWERS.ANSWER_SIMPLE9, True, random_state=1000)
+
+    def testSimple09AnswerDistanceMatrixByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE9, SIMPLE_ANSWERS.ANSWER_SIMPLE9, True, random_state=1000, data_type='distance_matrix')
+
+    def testSimple10AnswerByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE10, SIMPLE_ANSWERS.ANSWER_SIMPLE10, True, random_state=1000)
+
+    def testSimple10AnswerDistanceMatrixByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE10, SIMPLE_ANSWERS.ANSWER_SIMPLE10, True, random_state=1000, data_type='distance_matrix')
+
+    def testSimple11AnswerByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE11, SIMPLE_ANSWERS.ANSWER_SIMPLE11, True, random_state=1000)
+
+    def testSimple11AnswerDistanceMatrixByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE11, SIMPLE_ANSWERS.ANSWER_SIMPLE11, True, random_state=1000, data_type='distance_matrix')
+
+    def testSimple12AnswerByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE12, SIMPLE_ANSWERS.ANSWER_SIMPLE12, True, random_state=1000)
+
+    def testSimple12AnswerDistanceMatrixByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE12, SIMPLE_ANSWERS.ANSWER_SIMPLE12, True, random_state=1000, data_type='distance_matrix')
+
+    def testSimple13AnswerByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE13, SIMPLE_ANSWERS.ANSWER_SIMPLE13, True, random_state=1000)
+
+    def testSimple13AnswerDistanceMatrixByCore(self):
+        kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE13, SIMPLE_ANSWERS.ANSWER_SIMPLE13, True, random_state=1000, data_type='distance_matrix')
diff --git a/pyclustering/cluster/tests/kmedoids_templates.py b/pyclustering/cluster/tests/kmedoids_templates.py
@@ -79,6 +79,10 @@ def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_clus
             if len(sample) != sum(obtained_cluster_sizes):
                 continue
 
+            for cluster in clusters:
+                if len(cluster) == 0:
+                    continue
+
             if expected_cluster_length is not None:
                 obtained_cluster_sizes.sort()
                 expected_cluster_length.sort()
@@ -143,10 +147,10 @@ def templateClusterAllocationTheSameObjects(number_objects, number_clusters, cco
 
         assertion.eq(len(clusters), len(medoids))
         assertion.eq(len(set(medoids)), len(medoids))
-        
+
         object_mark = [False] * number_objects
         allocated_number_objects = 0
-        
+
         for cluster in clusters:
             for index_object in cluster: 
                 assertion.eq(False, object_mark[index_object])    # one object can be in only one cluster.
@@ -174,7 +178,15 @@ def templatePredict(path_to_file, initial_medoids, points, expected_closest_clus
 
     @staticmethod
     def clustering_with_answer(data_file, answer_file, ccore, **kwargs):
-        data = read_sample(data_file)
+        data_type = kwargs.get('data_type', 'points')
+        metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))
+
+        original_data = read_sample(data_file)
+        data = original_data
+
+        if data_type == 'distance_matrix':
+            data = calculate_distance_matrix(original_data, metric)
+
         reader = answer_reader(answer_file)
 
         amount_medoids = len(reader.get_clusters())