Skip to content

Commit

Permalink
#659: [Hotfix] Empty clusters and their medoids are erased for K-Medo…
Browse files Browse the repository at this point in the history
…ids.
  • Loading branch information
annoviko committed Nov 25, 2020
1 parent 878c199 commit 19a1d6c
Show file tree
Hide file tree
Showing 13 changed files with 261 additions and 47 deletions.
14 changes: 13 additions & 1 deletion CHANGES
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
------------------------------------------------------------------------

CHANGE NOTES FOR 0.10.1.1 (STARTED Aug 24, 2020), (RELEASED: Nov 24, 2020)
CHANGE NOTES FOR 0.10.1.2 (STARTED Nov 25, 2020), (RELEASED: Nov 25, 2020)

------------------------------------------------------------------------

Expand All @@ -10,6 +10,18 @@ CORRECTED MAJOR BUGS:
See: https://github.com/annoviko/pyclustering/issues/659


------------------------------------------------------------------------

CHANGE NOTES FOR 0.10.1.1 (STARTED Nov 24, 2020), (RELEASED: Nov 24, 2020)

------------------------------------------------------------------------

CORRECTED MAJOR BUGS:

- Corrected bug with incorrect cluster allocation for K-Medoids (C++ `pyclustering::clst::kmedoids`).
See: https://github.com/annoviko/pyclustering/issues/659


------------------------------------------------------------------------

CHANGE NOTES FOR 0.10.1 (STARTED Aug 17, 2020), (RELEASED: Nov 19, 2020)
Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Python and C++ implementations (C++ pyclustering library) of each algorithm or
model. C++ pyclustering library is a part of pyclustering and supported for
Linux, Windows and MacOS operating systems.

**Version**: 0.10.1.1
**Version**: 0.10.1.2

**License**: The 3-Clause BSD License

Expand Down
2 changes: 1 addition & 1 deletion ccore/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
cmake_minimum_required(VERSION 3.10)


project(pyclustering VERSION 0.10.1.1 LANGUAGES CXX)
project(pyclustering VERSION 0.10.1.2 LANGUAGES CXX)


file(MAKE_DIRECTORY build)
Expand Down
9 changes: 9 additions & 0 deletions ccore/include/pyclustering/cluster/kmedoids.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,15 @@ class kmedoids {
*/
double calculate_swap_cost(const std::size_t p_index_candidate, const std::size_t p_index_cluster) const;

/*!
@brief Erase empty clusters and their medoids.
@details Input data might contain many identical points, and as a result several medoids might correspond
to points that are totally identical.
*/
void erase_empty_clusters();
};


Expand Down
15 changes: 15 additions & 0 deletions ccore/src/cluster/kmedoids.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ void kmedoids::process(const dataset & p_data, const kmedoids_data_t p_type, kme
}
}

erase_empty_clusters();

m_data_ptr = nullptr;
m_result_ptr = nullptr;
}
Expand Down Expand Up @@ -222,6 +224,19 @@ double kmedoids::calculate_swap_cost(const std::size_t p_index_candidate, const
}


/* Removes every empty cluster from the result together with the medoid stored at the same position. */
void kmedoids::erase_empty_clusters() {
    auto & allocated_clusters = m_result_ptr->clusters();
    auto & allocated_medoids = m_result_ptr->medoids();

    /* Walk from the back so that erasing an entry never shifts the positions that are still to be visited. */
    for (std::size_t remaining = allocated_clusters.size(); remaining > 0; --remaining) {
        const std::size_t position = remaining - 1;

        if (!allocated_clusters[position].empty()) {
            continue;
        }

        allocated_clusters.erase(allocated_clusters.begin() + position);
        allocated_medoids.erase(allocated_medoids.begin() + position);
    }
}


}

}
2 changes: 1 addition & 1 deletion ccore/src/interface/interface_property.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


const char * INTERFACE_DESCRIPTION = "pyclustering library is a C/C++ part of python pyclustering library";
const char * INTERFACE_VERSION = "0.10.1.1";
const char * INTERFACE_VERSION = "0.10.1.2";


void * get_interface_description() {
Expand Down
2 changes: 1 addition & 1 deletion pyclustering/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@


## The current version of pyclustering library.
__version__ = '0.10.1.1'
__version__ = '0.10.1.2'

## The current root directory of pyclustering library.
__PYCLUSTERING_ROOT_DIRECTORY__ = str(pathlib.Path(__file__).parent)
25 changes: 16 additions & 9 deletions pyclustering/cluster/examples/kmedoids_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,21 @@
from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster.kmedoids import kmedoids

from pyclustering.utils import read_sample
from pyclustering.utils import read_sample, calculate_distance_matrix
from pyclustering.utils import timedcall, distance_metric, type_metric


def template_clustering(start_medoids, path, tolerance=0.25, show=True, ccore=True):
sample = read_sample(path)
def template_clustering(start_medoids, path, tolerance=0.25, show=True, **kwargs):
ccore = kwargs.get('ccore', True)
data_type = kwargs.get('data_type', 'points')

original_data = read_sample(path)
sample = original_data
if data_type == 'distance_matrix':
sample = calculate_distance_matrix(sample)

metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, data=sample)
kmedoids_instance = kmedoids(sample, start_medoids, tolerance, metric=metric, ccore=ccore)
kmedoids_instance = kmedoids(sample, start_medoids, tolerance, metric=metric, ccore=ccore, data_type=data_type)
(ticks, result) = timedcall(kmedoids_instance.process)

clusters = kmedoids_instance.get_clusters()
Expand All @@ -31,12 +37,12 @@ def template_clustering(start_medoids, path, tolerance=0.25, show=True, ccore=Tr

if show is True:
visualizer = cluster_visualizer(1)
visualizer.append_clusters(clusters, sample, 0)
visualizer.append_cluster([sample[index] for index in start_medoids], marker='*', markersize=15)
visualizer.append_cluster(medoids, data=sample, marker='*', markersize=15)
visualizer.append_clusters(clusters, original_data, 0)
visualizer.append_cluster([original_data[index] for index in start_medoids], marker='*', markersize=15)
visualizer.append_cluster(medoids, data=original_data, marker='*', markersize=15)
visualizer.show()

return sample, clusters
return original_data, clusters


def cluster_sample1():
Expand All @@ -58,7 +64,8 @@ def cluster_elongate():
template_clustering([8, 56], SIMPLE_SAMPLES.SAMPLE_ELONGATE)

def cluster_lsun():
template_clustering([10, 275, 385], FCPS_SAMPLES.SAMPLE_LSUN)
#template_clustering([10, 275, 385], FCPS_SAMPLES.SAMPLE_LSUN)
template_clustering([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], FCPS_SAMPLES.SAMPLE_LSUN, data_type='distance_matrix')

def cluster_target():
template_clustering([10, 160, 310, 460, 560, 700], FCPS_SAMPLES.SAMPLE_TARGET)
Expand Down
41 changes: 38 additions & 3 deletions pyclustering/cluster/kmedoids.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,11 @@ def __init__(self, data, initial_index_medoids, tolerance=0.0001, ccore=True, **
@param[in] initial_index_medoids (list): Indexes of initial medoids (indexes of points in input data).
@param[in] tolerance (double): Stop condition: if maximum value of distance change of medoids of clusters is less than tolerance then the algorithm will stop processing.
@param[in] ccore (bool): If specified then CCORE library (C++ pyclustering library) is used for clustering instead of Python code.
@param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'data_type', 'itermax').
@param[in] **kwargs: Arbitrary keyword arguments (available arguments: `metric`, `data_type`, `itermax`).
<b>Keyword Args:</b><br>
- metric (distance_metric): Metric that is used for distance calculation between two points.
- data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
- data_type (string): Data type of input sample `data` that is processed by the algorithm (`points`, `distance_matrix`).
- itermax (uint): Maximum number of iteration for cluster analysis.
"""
Expand Down Expand Up @@ -172,6 +172,8 @@ def process(self):

iterations += 1

self.__erase_empty_clusters()

return self


Expand Down Expand Up @@ -316,7 +318,7 @@ def __update_clusters(self):

for index in range(len(self.__medoid_indexes)):
dist = self.__distance_calculator(index_point, self.__medoid_indexes[index])

if dist < dist_optim_first:
dist_optim_second = dist_optim_first
index_optim = index
Expand Down Expand Up @@ -386,3 +388,36 @@ def __calculate_swap_cost(self, index_candidate, cluster_index):
cost += candidate_distance - self.__distance_first_medoid[index_point]

return cost - self.__distance_first_medoid[index_candidate]


def __erase_empty_clusters(self):
    """!
    @brief Drops clusters that contain no points together with the medoids stored at the same positions.
    @details When no cluster is empty the cluster and medoid lists are left untouched. Otherwise both lists
              are rebuilt from the non-empty entries, preserving their original order.

    """

    # Fast path: nothing to erase, keep the existing list objects as they are.
    if all(len(cluster) != 0 for cluster in self.__clusters):
        return

    kept_clusters, kept_medoids = [], []

    for position, cluster in enumerate(self.__clusters):
        if len(cluster) != 0:
            kept_clusters.append(cluster)
            kept_medoids.append(self.__medoid_indexes[position])

    # Reassign only after both lists are fully built.
    self.__clusters = kept_clusters
    self.__medoid_indexes = kept_medoids
98 changes: 90 additions & 8 deletions pyclustering/cluster/tests/integration/it_kmedoids.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from pyclustering.cluster.tests.kmedoids_templates import kmedoids_test_template
from pyclustering.cluster.kmedoids import kmedoids

from pyclustering.samples.definitions import SIMPLE_SAMPLES
from pyclustering.samples.definitions import SIMPLE_SAMPLES, SIMPLE_ANSWERS

from pyclustering.utils import read_sample
from pyclustering.utils.metric import type_metric, distance_metric
Expand Down Expand Up @@ -139,7 +139,7 @@ def testClusterAllocationTheSameObjectsThreeInitialMedoidsByCore(self):
kmedoids_test_template.templateClusterAllocationTheSameObjects(25, 3, True)

def testCoreInterfaceIntInputData(self):
kmedoids_instance = kmedoids([ [1], [2], [3], [20], [21], [22] ], [ 2, 5 ], 0.025, True)
kmedoids_instance = kmedoids([[1], [2], [3], [20], [21], [22]], [2, 5], 0.025, True)
kmedoids_instance.process()
assert len(kmedoids_instance.get_clusters()) == 2

Expand All @@ -153,18 +153,21 @@ def testAllocatedRequestedClustersSampleSimple04ByCore(self):

def testAllocatedRequestedClustersWithTheSamePointsByCore(self):
# Bug issue #366 - Kmedoids returns incorrect number of clusters.
sample = [ [0.0, 0.0], [0.1, 0.1], [0.0, 0.0], [0.1, 0.2] ]
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 4, None, True)
sample = [[0.0, 0.0], [0.1, 0.1], [0.0, 0.0], [0.1, 0.2]]
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 3, None, True)
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 3, None, True)
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 2, None, True)
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 1, None, True)

def testAllocatedRequestedClustersWithTheSamePoints2(self):
sample = [[0.23, 0.2], [-0.1, 0.1], [0.0, 0.9], [0.1, -0.2], [0.8, 0.1], [-0.1, 0.1], [-0.4, -0.2], [0.0, 0.9]]
answers = [1, 2, 3, 4, 5, 6, 6, 6]
for expected_amount in answers:
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, expected_amount, None, True)

def testAllocatedRequestedClustersWithTotallyTheSamePointsByCore(self):
# Bug issue #366 - Kmedoids returns incorrect number of clusters.
sample = [ [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0] ]
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 4, None, True)
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 3, None, True)
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 2, None, True)
sample = [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 1, None, True)


Expand All @@ -184,3 +187,82 @@ def testItermax10Simple01(self):

def testItermax10Simple02(self):
kmedoids_test_template.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [3, 12, 20], [10, 5, 8], True, itermax=10)


def testSimple01AnswerByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, SIMPLE_ANSWERS.ANSWER_SIMPLE1, True, random_state=1000)

def testSimple01AnswerDistanceMatrixByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, SIMPLE_ANSWERS.ANSWER_SIMPLE1, True, random_state=1000, data_type='distance_matrix')

def testSimple02AnswerByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, SIMPLE_ANSWERS.ANSWER_SIMPLE2, True, random_state=1000)

def testSimple02AnswerDistanceMatrixByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, SIMPLE_ANSWERS.ANSWER_SIMPLE2, True, random_state=1000, data_type='distance_matrix')

def testSimple03AnswerByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, SIMPLE_ANSWERS.ANSWER_SIMPLE3, True, random_state=1000)

def testSimple03AnswerDistanceMatrixByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, SIMPLE_ANSWERS.ANSWER_SIMPLE3, True, random_state=1000, data_type='distance_matrix')

def testSimple04AnswerByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, SIMPLE_ANSWERS.ANSWER_SIMPLE4, True, random_state=1000)

def testSimple04AnswerDistanceMatrixByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, SIMPLE_ANSWERS.ANSWER_SIMPLE4, True, random_state=1000, data_type='distance_matrix')

def testSimple05AnswerByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE5, SIMPLE_ANSWERS.ANSWER_SIMPLE5, True, random_state=1000)

def testSimple05AnswerDistanceMatrixByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE5, SIMPLE_ANSWERS.ANSWER_SIMPLE5, True, random_state=1000, data_type='distance_matrix')

def testSimple06AnswerByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE6, SIMPLE_ANSWERS.ANSWER_SIMPLE6, True, random_state=1000)

def testSimple06AnswerDistanceMatrixByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE6, SIMPLE_ANSWERS.ANSWER_SIMPLE6, True, random_state=1000, data_type='distance_matrix')

def testSimple07AnswerByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE7, SIMPLE_ANSWERS.ANSWER_SIMPLE7, True, random_state=1000)

def testSimple07AnswerDistanceMatrixByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE7, SIMPLE_ANSWERS.ANSWER_SIMPLE7, True, random_state=1000, data_type='distance_matrix')

def testSimple08AnswerByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE8, SIMPLE_ANSWERS.ANSWER_SIMPLE8, True, random_state=1000)

def testSimple08AnswerDistanceMatrixByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE8, SIMPLE_ANSWERS.ANSWER_SIMPLE8, True, random_state=1000, data_type='distance_matrix')

def testSimple09AnswerByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE9, SIMPLE_ANSWERS.ANSWER_SIMPLE9, True, random_state=1000)

def testSimple09AnswerDistanceMatrixByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE9, SIMPLE_ANSWERS.ANSWER_SIMPLE9, True, random_state=1000, data_type='distance_matrix')

def testSimple10AnswerByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE10, SIMPLE_ANSWERS.ANSWER_SIMPLE10, True, random_state=1000)

def testSimple10AnswerDistanceMatrixByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE10, SIMPLE_ANSWERS.ANSWER_SIMPLE10, True, random_state=1000, data_type='distance_matrix')

def testSimple11AnswerByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE11, SIMPLE_ANSWERS.ANSWER_SIMPLE11, True, random_state=1000)

def testSimple11AnswerDistanceMatrixByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE11, SIMPLE_ANSWERS.ANSWER_SIMPLE11, True, random_state=1000, data_type='distance_matrix')

def testSimple12AnswerByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE12, SIMPLE_ANSWERS.ANSWER_SIMPLE12, True, random_state=1000)

def testSimple12AnswerDistanceMatrixByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE12, SIMPLE_ANSWERS.ANSWER_SIMPLE12, True, random_state=1000, data_type='distance_matrix')

def testSimple13AnswerByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE13, SIMPLE_ANSWERS.ANSWER_SIMPLE13, True, random_state=1000)

def testSimple13AnswerDistanceMatrixByCore(self):
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE13, SIMPLE_ANSWERS.ANSWER_SIMPLE13, True, random_state=1000, data_type='distance_matrix')
18 changes: 15 additions & 3 deletions pyclustering/cluster/tests/kmedoids_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_clus
if len(sample) != sum(obtained_cluster_sizes):
continue

for cluster in clusters:
if len(cluster) == 0:
continue

if expected_cluster_length is not None:
obtained_cluster_sizes.sort()
expected_cluster_length.sort()
Expand Down Expand Up @@ -143,10 +147,10 @@ def templateClusterAllocationTheSameObjects(number_objects, number_clusters, cco

assertion.eq(len(clusters), len(medoids))
assertion.eq(len(set(medoids)), len(medoids))

object_mark = [False] * number_objects
allocated_number_objects = 0

for cluster in clusters:
for index_object in cluster:
assertion.eq(False, object_mark[index_object]) # one object can be in only one cluster.
Expand Down Expand Up @@ -174,7 +178,15 @@ def templatePredict(path_to_file, initial_medoids, points, expected_closest_clus

@staticmethod
def clustering_with_answer(data_file, answer_file, ccore, **kwargs):
data = read_sample(data_file)
data_type = kwargs.get('data_type', 'points')
metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))

original_data = read_sample(data_file)
data = original_data

if data_type == 'distance_matrix':
data = calculate_distance_matrix(original_data, metric)

reader = answer_reader(answer_file)

amount_medoids = len(reader.get_clusters())
Expand Down
Loading

0 comments on commit 19a1d6c

Please sign in to comment.