From 25f8bb61eb005b4aea6c5106f5c8ce207f3cde3e Mon Sep 17 00:00:00 2001
From: Ales Erjavec
Date: Tue, 19 Sep 2017 16:54:01 +0200
Subject: [PATCH 1/2] preprocess.randomization: Do not use the same seed for X, Y, and meta

---
 Orange/preprocess/preprocess.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/Orange/preprocess/preprocess.py b/Orange/preprocess/preprocess.py
index 653030efdee..480d3de0c32 100644
--- a/Orange/preprocess/preprocess.py
+++ b/Orange/preprocess/preprocess.py
@@ -354,16 +354,19 @@ def __call__(self, data):
             Randomized data table.
         """
         new_data = data.copy()
+        rstate = np.random.RandomState(self.rand_seed)
+        # ensure the same seed is not used to shuffle X and Y at the same time
+        r1, r2, r3 = rstate.randint(0, 2 ** 32 - 1, size=3, dtype=np.int64)
         if self.rand_type & Randomize.RandomizeClasses:
-            new_data.Y = self.randomize(new_data.Y)
+            new_data.Y = self.randomize(new_data.Y, r1)
         if self.rand_type & Randomize.RandomizeAttributes:
-            new_data.X = self.randomize(new_data.X)
+            new_data.X = self.randomize(new_data.X, r2)
         if self.rand_type & Randomize.RandomizeMetas:
-            new_data.metas = self.randomize(new_data.metas)
+            new_data.metas = self.randomize(new_data.metas, r3)
         return new_data
 
-    def randomize(self, table):
-        np.random.seed(self.rand_seed)
+    def randomize(self, table, rand_state=None):
+        rstate = np.random.RandomState(rand_state)
         if sp.issparse(table):
             table = table.tocsc()
             rnd_indices = np.arange(table.shape[0], dtype=table.indices.dtype)
@@ -371,13 +374,13 @@ def randomize(self, table):
                 col_indices = \
                     table.indices[table.indptr[i]: table.indptr[i + 1]]
                 new_indices = rnd_indices[:len(col_indices)]
-                np.random.shuffle(new_indices)
+                rstate.shuffle(new_indices)
                 col_indices[:] = new_indices
         elif len(table.shape) > 1:
             for i in range(table.shape[1]):
-                np.random.shuffle(table[:, i])
+                rstate.shuffle(table[:, i])
         else:
-            np.random.shuffle(table)
+            rstate.shuffle(table)
         return table
 
 

From 8bdcb3e2518c1743abe5f5017e8ba5d4a678b95d Mon Sep 17 00:00:00 2001
From: Ales Erjavec
Date: Wed, 20 Sep 2017 13:40:48 +0200
Subject: [PATCH 2/2] tests/owpreprocess: Test against Randomize implementation

---
 Orange/widgets/data/tests/test_owpreprocess.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/Orange/widgets/data/tests/test_owpreprocess.py b/Orange/widgets/data/tests/test_owpreprocess.py
index 0ae6810df52..317013626f5 100644
--- a/Orange/widgets/data/tests/test_owpreprocess.py
+++ b/Orange/widgets/data/tests/test_owpreprocess.py
@@ -25,11 +25,16 @@ def test_randomize(self):
         self.widget.set_model(model)
         self.send_signal(self.widget.Inputs.data, self.zoo)
         output = self.get_output(self.widget.Outputs.preprocessed_data)
-        np.random.seed(1)
-        np.random.shuffle(self.zoo.Y)
+        r = Randomize(Randomize.RandomizeClasses, rand_seed=1)
+        expected = r(self.zoo)
+
+        np.testing.assert_array_equal(expected.X, output.X)
+        np.testing.assert_array_equal(expected.Y, output.Y)
+        np.testing.assert_array_equal(expected.metas, output.metas)
+
         np.testing.assert_array_equal(self.zoo.X, output.X)
-        np.testing.assert_array_equal(self.zoo.Y, output.Y)
         np.testing.assert_array_equal(self.zoo.metas, output.metas)
+        self.assertFalse(np.array_equal(self.zoo.Y, output.Y))
 
     def test_normalize(self):
         data = Table("iris")