From a475f11d1fc347f6d1d41df6b024fa39021179da Mon Sep 17 00:00:00 2001 From: annoviko Date: Wed, 10 Feb 2021 21:20:01 +0100 Subject: [PATCH] #622: Supported 'distance_matrix' data type for K-Means++. --- CHANGES | 3 + .../pyclustering/cluster/pam_build.hpp | 2 +- pyclustering/cluster/center_initializer.py | 17 ++++- .../tests/unit/ut_center_initializer.py | 64 +++++++++++-------- 4 files changed, 56 insertions(+), 30 deletions(-) diff --git a/CHANGES b/CHANGES index 0b4f8c33..dd84c3a0 100755 --- a/CHANGES +++ b/CHANGES @@ -6,6 +6,9 @@ CHANGE NOTES FOR 0.11.0 (STARTED Nov 26, 2020), (RELEASED: -) GENERAL CHANGES: +- Supported `distance_matrix` data type for K-Means++ (Python: `pyclustering.cluster.center_initializer`). + See: https://github.com/annoviko/pyclustering/issues/622 + - Introduced PAM BUILD algorithm to generate initial medoids (Python: `pyclustering.cluster.kmedoids`, C++: `pyclustering::clst::pam_build`). See: https://github.com/annoviko/pyclustering/issues/667 diff --git a/ccore/include/pyclustering/cluster/pam_build.hpp b/ccore/include/pyclustering/cluster/pam_build.hpp index 5bd2730e..381bc093 100755 --- a/ccore/include/pyclustering/cluster/pam_build.hpp +++ b/ccore/include/pyclustering/cluster/pam_build.hpp @@ -106,7 +106,7 @@ class pam_build { @param[in] p_amount: amount of medoids that should be initialized. */ - pam_build(const std::size_t p_amount); + explicit pam_build(const std::size_t p_amount); /* diff --git a/pyclustering/cluster/center_initializer.py b/pyclustering/cluster/center_initializer.py index 16164d97..a85551f5 100755 --- a/pyclustering/cluster/center_initializer.py +++ b/pyclustering/cluster/center_initializer.py @@ -16,6 +16,8 @@ import random import warnings +from pyclustering.utils.metric import distance_metric, type_metric + class random_center_initializer: """! @@ -171,11 +173,12 @@ def __init__(self, data, amount_centers, amount_candidates=None, **kwargs): @param[in] amount_candidates (uint): Amount of candidates that is considered as a center, if the farthest points (with the highest probability) should be considered as centers then special constant should be used 'FARTHEST_CENTER_CANDIDATE'. By default the amount of candidates is 3. - @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `random_state`, `data_type`). + @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `random_state`, `data_type`, `metric`). Keyword Args:
- random_state (int): Seed for random state (by default is `None`, current system time is used). - data_type (str): Data type of input sample `data` (`points`, `distance_matrix`). + - metric (distance_metric): Metric that is used for distance calculation between two points. @see FARTHEST_CENTER_CANDIDATE @@ -192,9 +195,14 @@ def __init__(self, data, amount_centers, amount_candidates=None, **kwargs): else: self.__candidates = amount_candidates - random.seed(kwargs.get('random_state', None)) + random_seed = kwargs.get('random_state', None) + numpy.random.seed(random_seed) + random.seed(random_seed) + + self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE)) self.__data_type = kwargs.get('data_type', 'points') + self.__metric.enable_numpy_usage() self.__check_parameters() @@ -231,7 +239,10 @@ def __calculate_shortest_distances(self, data, centers): for index_center in range(len(centers)): center = data[centers[index_center]] - dataset_differences[index_center] = numpy.sum(numpy.square(data - center), axis=1).T + if self.__data_type == 'points': + dataset_differences[index_center] = self.__metric(data, center) + elif self.__data_type == 'distance_matrix': + dataset_differences[index_center] = numpy.array(self.__data[centers[index_center]]) with warnings.catch_warnings(): numpy.warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered') diff --git a/pyclustering/cluster/tests/unit/ut_center_initializer.py b/pyclustering/cluster/tests/unit/ut_center_initializer.py index ba108262..9daebed7 100755 --- a/pyclustering/cluster/tests/unit/ut_center_initializer.py +++ b/pyclustering/cluster/tests/unit/ut_center_initializer.py @@ -23,7 +23,7 @@ from pyclustering.samples.definitions import SIMPLE_SAMPLES -from pyclustering.utils import read_sample +from pyclustering.utils import read_sample, calculate_distance_matrix, type_metric, distance_metric from pyclustering.tests.assertion import assertion @@ -271,37 +271,49 @@ def testKmeansPlusPlusUniqueCentersSeveralCandidatesSimple02(self): self.templateKmeansPlusPlusUnique(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 23, 10) - def templateKmeansPlusPlusSeveralRuns(self, path_sample, amount, candidates): - sample = read_sample(path_sample) + def template_compare_output(self, path, k, candidates, random_state, metric): + sample = read_sample(path) + matrix = calculate_distance_matrix(sample, metric=metric) - attempts = 10 - for _ in range(attempts): - medoids = kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True) - medoids += kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True) - medoids += kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True) + result1 = kmeans_plusplus_initializer(sample, k, candidates, random_state=random_state, data_type='points', metric=metric).initialize(return_index=True) + result2 = kmeans_plusplus_initializer(matrix, k, candidates, random_state=random_state, data_type='distance_matrix', metric=metric).initialize(return_index=True) - unique_medoids = set(medoids) - if len(unique_medoids) != len(medoids): - continue + assertion.eq(result1, result2) + + def test_various_data_type_simple1(self): + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE)) + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 3, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE)) + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 4, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE)) + + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 3, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE)) + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 3, 3, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE)) + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 4, 3, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE)) + + def test_various_data_type_simple1_euclidean(self): + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN)) - return + def test_various_data_type_simple1_euclidean_square(self): + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE)) - self.assertTrue(False, "K-Means++ does not return unique medoids during %d attempts." % attempts) + def test_various_data_type_simple1_euclidean_manhattan(self): + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.MANHATTAN)) - def templateKmeansPlusPlusVariousCentersSimple01(self): - self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1) - self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 5, 1) + def test_various_data_type_simple1_euclidean_chebyshev(self): + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.CHEBYSHEV)) - def templateKmeansPlusPlusVariousCentersSeveralCandidatesSimple01(self): - self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 3) + def test_various_data_type_simple2(self): + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 3, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE)) + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 3, 5, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE)) - def templateKmeansPlusPlusVariousCentersFarthestCandidatesSimple01(self): - self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 'farthest') + def test_various_data_type_simple3(self): + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, 4, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE)) + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, 4, 5, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE)) - def templateKmeansPlusPlusVariousCentersSimple02(self): - self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 3, 1) - self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 6, 1) + def test_various_data_type_simple4(self): + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, 5, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE)) + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, 5, 2, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE)) + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, 5, 5, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE)) - def templateKmeansPlusPlusVariousCentersSimple03(self): - self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 4, 1) - self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 8, 1) + def test_various_data_type_simple5(self): + for i in range(10): + self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE5, i + 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))