diff --git a/CHANGES b/CHANGES
index 0b4f8c33..dd84c3a0 100755
--- a/CHANGES
+++ b/CHANGES
@@ -6,6 +6,9 @@ CHANGE NOTES FOR 0.11.0 (STARTED Nov 26, 2020), (RELEASED: -)
GENERAL CHANGES:
+- Added support for the `distance_matrix` data type to the K-Means++ center initializer (Python: `pyclustering.cluster.center_initializer`).
+ See: https://github.com/annoviko/pyclustering/issues/622
+
- Introduced PAM BUILD algorithm to generate initial medoids (Python: `pyclustering.cluster.kmedoids`, C++: `pyclustering::clst::pam_build`).
See: https://github.com/annoviko/pyclustering/issues/667
diff --git a/ccore/include/pyclustering/cluster/pam_build.hpp b/ccore/include/pyclustering/cluster/pam_build.hpp
index 5bd2730e..381bc093 100755
--- a/ccore/include/pyclustering/cluster/pam_build.hpp
+++ b/ccore/include/pyclustering/cluster/pam_build.hpp
@@ -106,7 +106,7 @@ class pam_build {
@param[in] p_amount: amount of medoids that should be initialized.
*/
- pam_build(const std::size_t p_amount);
+ explicit pam_build(const std::size_t p_amount);
/*
diff --git a/pyclustering/cluster/center_initializer.py b/pyclustering/cluster/center_initializer.py
index 16164d97..a85551f5 100755
--- a/pyclustering/cluster/center_initializer.py
+++ b/pyclustering/cluster/center_initializer.py
@@ -16,6 +16,8 @@
import random
import warnings
+from pyclustering.utils.metric import distance_metric, type_metric
+
class random_center_initializer:
"""!
@@ -171,11 +173,12 @@ def __init__(self, data, amount_centers, amount_candidates=None, **kwargs):
@param[in] amount_candidates (uint): Amount of candidates that is considered as a center, if the farthest points
(with the highest probability) should be considered as centers then special constant should be used
'FARTHEST_CENTER_CANDIDATE'. By default the amount of candidates is 3.
- @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `random_state`, `data_type`).
+ @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `random_state`, `data_type`, `metric`).
Keyword Args:
- random_state (int): Seed for random state (by default is `None`, current system time is used).
- data_type (str): Data type of input sample `data` (`points`, `distance_matrix`).
+ - metric (distance_metric): Metric that is used for distance calculation between two points (by default is squared Euclidean distance).
@see FARTHEST_CENTER_CANDIDATE
@@ -192,9 +195,14 @@ def __init__(self, data, amount_centers, amount_candidates=None, **kwargs):
else:
self.__candidates = amount_candidates
- random.seed(kwargs.get('random_state', None))
+ random_seed = kwargs.get('random_state', None)
+ numpy.random.seed(random_seed)  # NOTE(review): seeds NumPy's global generator, so other numpy.random users in the process are affected too
+ random.seed(random_seed)  # the global `random` module generator receives the same seed
+
+ self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))  # falls back to squared Euclidean when no `metric` is passed
self.__data_type = kwargs.get('data_type', 'points')
+ self.__metric.enable_numpy_usage()  # NOTE(review): also invoked on a caller-supplied metric; presumably changes that object in place - confirm this is intended
self.__check_parameters()
@@ -231,7 +239,10 @@ def __calculate_shortest_distances(self, data, centers):
for index_center in range(len(centers)):
center = data[centers[index_center]]
- dataset_differences[index_center] = numpy.sum(numpy.square(data - center), axis=1).T
+ if self.__data_type == 'points':
+ dataset_differences[index_center] = self.__metric(data, center)  # presumably distances from each row of `data` to `center` (numpy mode) - TODO confirm
+ elif self.__data_type == 'distance_matrix':  # NOTE(review): no else branch - any other data_type leaves this row unassigned in this loop
+ dataset_differences[index_center] = numpy.array(self.__data[centers[index_center]])  # matrix row of the center; NOTE(review): reads self.__data while the 'points' branch uses the `data` argument - confirm both are the same object
with warnings.catch_warnings():
numpy.warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')
diff --git a/pyclustering/cluster/tests/unit/ut_center_initializer.py b/pyclustering/cluster/tests/unit/ut_center_initializer.py
index ba108262..9daebed7 100755
--- a/pyclustering/cluster/tests/unit/ut_center_initializer.py
+++ b/pyclustering/cluster/tests/unit/ut_center_initializer.py
@@ -23,7 +23,7 @@
from pyclustering.samples.definitions import SIMPLE_SAMPLES
-from pyclustering.utils import read_sample
+from pyclustering.utils import read_sample, calculate_distance_matrix, type_metric, distance_metric
from pyclustering.tests.assertion import assertion
@@ -271,37 +271,49 @@ def testKmeansPlusPlusUniqueCentersSeveralCandidatesSimple02(self):
self.templateKmeansPlusPlusUnique(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 23, 10)
- def templateKmeansPlusPlusSeveralRuns(self, path_sample, amount, candidates):
- sample = read_sample(path_sample)
+ def template_compare_output(self, path, k, candidates, random_state, metric):  # runs K-Means++ on 'points' and on 'distance_matrix' input and compares the results
+ sample = read_sample(path)
+ matrix = calculate_distance_matrix(sample, metric=metric)  # matrix derived from the same sample with the same metric
- attempts = 10
- for _ in range(attempts):
- medoids = kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True)
- medoids += kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True)
- medoids += kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True)
+ result1 = kmeans_plusplus_initializer(sample, k, candidates, random_state=random_state, data_type='points', metric=metric).initialize(return_index=True)
+ result2 = kmeans_plusplus_initializer(matrix, k, candidates, random_state=random_state, data_type='distance_matrix', metric=metric).initialize(return_index=True)
- unique_medoids = set(medoids)
- if len(unique_medoids) != len(medoids):
- continue
+ assertion.eq(result1, result2)  # both runs receive the same random_state, so the returned center indexes are expected to be equal
+
+ def test_various_data_type_simple1(self):
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 3, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 4, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 3, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 3, 3, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 4, 3, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+
+ def test_various_data_type_simple1_euclidean(self):
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN))
- return
+ def test_various_data_type_simple1_euclidean_square(self):
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
- self.assertTrue(False, "K-Means++ does not return unique medoids during %d attempts." % attempts)
+ def test_various_data_type_simple1_euclidean_manhattan(self):  # NOTE(review): name keeps a stale 'euclidean' prefix; the metric exercised below is MANHATTAN
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.MANHATTAN))
- def templateKmeansPlusPlusVariousCentersSimple01(self):
- self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1)
- self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 5, 1)
+ def test_various_data_type_simple1_euclidean_chebyshev(self):  # NOTE(review): name keeps a stale 'euclidean' prefix; the metric exercised below is CHEBYSHEV
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.CHEBYSHEV))
- def templateKmeansPlusPlusVariousCentersSeveralCandidatesSimple01(self):
- self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 3)
+ def test_various_data_type_simple2(self):
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 3, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 3, 5, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
- def templateKmeansPlusPlusVariousCentersFarthestCandidatesSimple01(self):
- self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 'farthest')
+ def test_various_data_type_simple3(self):
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, 4, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, 4, 5, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
- def templateKmeansPlusPlusVariousCentersSimple02(self):
- self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 3, 1)
- self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 6, 1)
+ def test_various_data_type_simple4(self):
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, 5, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, 5, 2, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, 5, 5, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
- def templateKmeansPlusPlusVariousCentersSimple03(self):
- self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 4, 1)
- self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 8, 1)
+ def test_various_data_type_simple5(self):  # compares both input types on SAMPLE_SIMPLE5 for several center counts
+ for i in range(10):  # k = i + 2, i.e. 2 through 11 centers
+ self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE5, i + 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))