From b2e27b6e586a78748e29cb2acb4dc4d4bced3fb9 Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sat, 23 Apr 2022 21:29:13 +0300 Subject: [PATCH 01/30] Implement group time series cross-validator --- mlxtend/evaluate/__init__.py | 4 +- mlxtend/evaluate/time_series.py | 197 ++++++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+), 1 deletion(-) create mode 100644 mlxtend/evaluate/time_series.py diff --git a/mlxtend/evaluate/__init__.py b/mlxtend/evaluate/__init__.py index 18ea36e6b..8aa99469d 100644 --- a/mlxtend/evaluate/__init__.py +++ b/mlxtend/evaluate/__init__.py @@ -28,6 +28,7 @@ from .bias_variance_decomp import bias_variance_decomp from .accuracy import accuracy_score from .counterfactual import create_counterfactual +from .time_series import GroupTimeSeriesSplit __all__ = ["scoring", "confusion_matrix", "mcnemar_table", "mcnemar_tables", @@ -40,4 +41,5 @@ "RandomHoldoutSplit", "PredefinedHoldoutSplit", "ftest", "combined_ftest_5x2cv", "proportion_difference", "bias_variance_decomp", - "accuracy_score", "create_counterfactual"] + "accuracy_score", "create_counterfactual", + "time_series"] diff --git a/mlxtend/evaluate/time_series.py b/mlxtend/evaluate/time_series.py new file mode 100644 index 000000000..c564050f9 --- /dev/null +++ b/mlxtend/evaluate/time_series.py @@ -0,0 +1,197 @@ +# mlxtend Machine Learning Library Extensions +# +# Time series cross validation with grouping. +# Author: Dmitry Labazkin +# +# License: BSD 3 clause + +from itertools import groupby + +import numpy as np +from sklearn.utils import indexable + + +class GroupTimeSeriesSplit: + """Group time series cross-validator. + + Parameters + ---------- + test_size : int + Size of test dataset. + train_size : int (default=None) + Size of train dataset. + n_splits : int (default=None) + Number of the splits. + gap_size : int (default=0) + Gap size between train and test datasets. + shift_size : int (default=1) + Step to shift for the next fold. 
+ window_type : str (default='rolling') + Type of the window. Possible values: 'rolling', 'expanding'. + + Examples + ----------- + For usage examples, please see + http://rasbt.github.io/mlxtend/user_guide/evaluate/GroupTimeSeriesSplit/ + """ + + def __init__(self, + test_size, + train_size=None, + n_splits=None, + gap_size=0, + shift_size=1, + window_type='rolling'): + + if (train_size is None) and (n_splits is None): + raise ValueError( + 'Either train_size or n_splits have to be defined') + + if window_type not in ['rolling', 'expanding']: + raise ValueError( + 'Window type can be either "rolling" or "expanding"') + + if (train_size is not None) and (window_type == 'expanding'): + raise ValueError( + 'Train size can be specified only with rolling window') + + self.test_size = test_size + self.train_size = train_size + self.n_splits = n_splits + self.gap_size = gap_size + self.shift_size = shift_size + self.window_type = window_type + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like + Training data. + y : array-like (default=None) + Always ignored, exists for compatibility. + groups : array-like (default=None) + Array with group names or sequence numbers. + + Yields + ------ + train : ndarray + The training set indices for that split. + test : ndarray + The testing set indices for that split. 
+ """ + test_size = self.test_size + gap = self.gap_size + shift_size = self.shift_size + X, y, groups = indexable(X, y, groups) + + if groups is None: + raise ValueError('The groups should be specified') + + group_seqs = [group[0] for group in groupby(groups)] + unique_groups, group_starts_idx = np.unique(groups, return_index=True) + + if group_seqs != sorted(unique_groups): + raise ValueError('The groups should be sorted in increasing order') + + n_groups = len(unique_groups) + self._n_groups = n_groups + groups_dict = dict(zip(unique_groups, group_starts_idx)) + n_samples = len(X) + + self._calculate_split_params() + train_size = self.train_size + n_splits = self.n_splits + train_start_idx = self._train_start_idx + train_end_idx = train_start_idx + train_size + test_start_idx = train_end_idx + gap + test_end_idx = test_start_idx + test_size + + for _ in range(n_splits): + train_idx = np.r_[slice(groups_dict[group_seqs[train_start_idx]], + groups_dict[group_seqs[train_end_idx]])] + + if test_end_idx < n_groups: + test_idx = np.r_[slice(groups_dict[group_seqs[test_start_idx]], + groups_dict[group_seqs[test_end_idx]])] + else: + test_idx = np.r_[slice(groups_dict[group_seqs[test_start_idx]], + n_samples)] + + yield train_idx, test_idx + + if self.window_type == 'rolling': + train_start_idx = train_start_idx + shift_size + + train_end_idx = train_end_idx + shift_size + test_start_idx = test_start_idx + shift_size + test_end_idx = test_end_idx + shift_size + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + y : object + Always ignored, exists for compatibility. + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. 
+ """ + return self.n_splits + + def _calculate_split_params(self): + train_size = self.train_size + test_size = self.test_size + n_splits = self.n_splits + gap = self.gap_size + shift_size = self.shift_size + n_groups = self._n_groups + + not_enough_data_error = ( + 'Not enough data to split number of groups ({0})' + ' for number splits ({1})' + ' with train size ({2}), test size ({3}),' + ' gap size ({4}), shift size ({5})') + + if (train_size is None) and (n_splits is not None): + train_size = n_groups - gap - test_size - (n_splits - + 1) * shift_size + self.train_size = train_size + + if train_size <= 0: + raise ValueError( + not_enough_data_error.format(n_groups, n_splits, + train_size, test_size, gap, + shift_size)) + train_start_idx = 0 + elif (n_splits is None) and (train_size is not None): + n_splits = (n_groups - train_size - gap - + test_size) // shift_size + 1 + self.n_splits = n_splits + + if self.n_splits <= 0: + raise ValueError( + not_enough_data_error.format(n_groups, n_splits, + train_size, test_size, gap, + shift_size)) + train_start_idx = n_groups - train_size - gap - test_size - ( + n_splits - 1) * shift_size + else: + train_start_idx = n_groups - train_size - gap - test_size - ( + n_splits - 1) * shift_size + + if train_start_idx < 0: + raise ValueError( + not_enough_data_error.format(n_groups, n_splits, + train_size, test_size, gap, + shift_size)) + + self._train_start_idx = train_start_idx From ea68fd0ec4f8cbe330e7213a74993cee4fa8df51 Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sat, 23 Apr 2022 21:29:45 +0300 Subject: [PATCH 02/30] Add tests --- mlxtend/evaluate/tests/test_time_series.py | 260 +++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 mlxtend/evaluate/tests/test_time_series.py diff --git a/mlxtend/evaluate/tests/test_time_series.py b/mlxtend/evaluate/tests/test_time_series.py new file mode 100644 index 000000000..f5e4c766e --- /dev/null +++ b/mlxtend/evaluate/tests/test_time_series.py @@ -0,0 +1,260 
@@ +# mlxtend Machine Learning Library Extensions +# +# Time series cross validation with grouping. +# Author: Dmitry Labazkin +# +# License: BSD 3 clause + +import numpy as np +import pytest +from mlxtend.evaluate import GroupTimeSeriesSplit + + +@pytest.fixture +def X(): + return np.array([[0], [7], [6], [4], [4], [8], [0], [6], [2], [0], [5], + [9], [7], [7], [7], [7]]) + + +@pytest.fixture +def y(): + return np.array([1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0]) + + +@pytest.fixture +def group_numbers(): + return np.array([0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5]) + + +@pytest.fixture +def not_sorted_group_numbers(): + return np.array([5, 5, 5, 5, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 0]) + + +@pytest.fixture +def not_consecutive_group_numbers(): + return np.array([0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 2, 2, 2, 2]) + + +@pytest.fixture +def group_names(): + return np.array([ + '2021-01', '2021-02', '2021-02', '2021-02', '2021-02', '2021-03', + '2021-03', '2021-03', '2021-04', '2021-04', '2021-05', '2021-05', + '2021-06', '2021-06', '2021-06', '2021-06' + ]) + + +def check_splits(X, y, groups, cv_args, expected_results): + cv = GroupTimeSeriesSplit(**cv_args) + results = list(cv.split(X, y, groups)) + + assert len(results) == len(expected_results) + + for split, expected_split in zip(results, expected_results): + assert np.array_equal(split[0], expected_split[0]) + assert np.array_equal(split[1], expected_split[1]) + + return cv + + +def test_get_n_splits(X, y, group_numbers): + cv_args = {'test_size': 1, 'train_size': 3} + expected_results = [(np.array([0, 1, 2, 3, 4, 5, 6, 7]), np.array([8, 9])), + (np.array([1, 2, 3, 4, 5, 6, 7, 8, + 9]), np.array([10, 11])), + (np.array([5, 6, 7, 8, 9, 10, + 11]), np.array([12, 13, 14, 15]))] + cv = check_splits(X, y, group_numbers, cv_args, expected_results) + + assert cv.get_n_splits() == len(expected_results) + + +def test_train_size(X, y, group_numbers): + cv_args = {'test_size': 1, 'train_size': 3} + 
expected_results = [(np.array([0, 1, 2, 3, 4, 5, 6, 7]), np.array([8, 9])), + (np.array([1, 2, 3, 4, 5, 6, 7, 8, + 9]), np.array([10, 11])), + (np.array([5, 6, 7, 8, 9, 10, + 11]), np.array([12, 13, 14, 15]))] + check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_train_size_group_names(X, y, group_names): + cv_args = {'test_size': 1, 'train_size': 3} + expected_results = [(np.array([0, 1, 2, 3, 4, 5, 6, 7]), np.array([8, 9])), + (np.array([1, 2, 3, 4, 5, 6, 7, 8, + 9]), np.array([10, 11])), + (np.array([5, 6, 7, 8, 9, 10, + 11]), np.array([12, 13, 14, 15]))] + check_splits(X, y, group_names, cv_args, expected_results) + + +def test_n_splits(X, y, group_numbers): + cv_args = {'test_size': 2, 'n_splits': 3} + expected_results = [(np.array([0, 1, 2, 3, 4]), np.array([5, 6, 7, 8, 9])), + (np.array([1, 2, 3, 4, 5, 6, + 7]), np.array([8, 9, 10, 11])), + (np.array([5, 6, 7, 8, + 9]), np.array([10, 11, 12, 13, 14, 15]))] + check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_n_splits_gap_size(X, y, group_numbers): + cv_args = {'test_size': 1, 'n_splits': 3, 'gap_size': 1} + expected_results = [(np.array([0, 1, 2, 3, 4]), np.array([8, 9])), + (np.array([1, 2, 3, 4, 5, 6, 7]), np.array([10, 11])), + (np.array([5, 6, 7, 8, 9]), np.array([12, 13, 14, + 15]))] + check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_n_splits_shift_size(X, y, group_numbers): + cv_args = {'test_size': 1, 'n_splits': 3, 'gap_size': 1} + expected_results = [(np.array([0, 1, 2, 3, 4]), np.array([8, 9])), + (np.array([1, 2, 3, 4, 5, 6, 7]), np.array([10, 11])), + (np.array([5, 6, 7, 8, 9]), np.array([12, 13, 14, + 15]))] + check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_n_splits_expanding_window(X, y, group_numbers): + cv_args = {'test_size': 3, 'n_splits': 3, 'window_type': 'expanding'} + expected_results = [(np.array([0]), np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])), + (np.array([0, 1, 2, 3, + 4]), np.array([5, 
6, 7, 8, 9, 10, 11])), + (np.array([0, 1, 2, 3, 4, 5, 6, 7]), + np.array([8, 9, 10, 11, 12, 13, 14, 15]))] + check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_full_usage_of_data(X, y, group_numbers): + cv_args = {'test_size': 3, 'train_size': 2, 'n_splits': 2} + expected_results = [(np.array([0, 1, 2, 3, + 4]), np.array([5, 6, 7, 8, 9, 10, 11])), + (np.array([1, 2, 3, 4, 5, 6, 7]), + np.array([8, 9, 10, 11, 12, 13, 14, 15]))] + check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_partial_usage_of_data(X, y, group_numbers): + cv_args = {'test_size': 2, 'train_size': 2, 'n_splits': 2} + expected_results = [(np.array([1, 2, 3, 4, 5, 6, + 7]), np.array([8, 9, 10, 11])), + (np.array([5, 6, 7, 8, + 9]), np.array([10, 11, 12, 13, 14, 15]))] + check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_not_specified_train_size_n_splits(X, y, group_numbers): + cv_args = {'test_size': 1} + expected_results = None + error_message = 'Either train_size or n_splits have to be defined' + + with pytest.raises(ValueError, match=error_message): + check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_bad_window_type(X, y, group_numbers): + cv_args = { + 'test_size': 1, + 'train_size': 3, + 'window_type': 'incorrect_window_type' + } + expected_results = None + error_message = 'Window type can be either "rolling" or "expanding"' + + with pytest.raises(ValueError, match=error_message): + check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_train_size_with_expanding_window(X, y, group_numbers): + cv_args = {'test_size': 1, 'train_size': 3, 'window_type': 'expanding'} + expected_results = None + error_message = 'Train size can be specified only with rolling window' + + with pytest.raises(ValueError, match=error_message): + check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_not_specified_groups(X, y): + cv_args = {'test_size': 1, 'train_size': 3} + 
expected_results = None + error_message = 'The groups should be specified' + + with pytest.raises(ValueError, match=error_message): + check_splits(X, y, None, cv_args, expected_results) + + +def test_not_sorted_groups(X, y, not_sorted_group_numbers): + cv_args = {'test_size': 1, 'train_size': 3} + expected_results = None + error_message = 'The groups should be sorted in increasing order' + + with pytest.raises(ValueError, match=error_message): + check_splits(X, y, not_sorted_group_numbers, cv_args, expected_results) + + +def test_not_consecutive_groups(X, y, not_consecutive_group_numbers): + cv_args = {'test_size': 1, 'train_size': 3} + expected_results = None + error_message = 'The groups should be sorted in increasing order' + + with pytest.raises(ValueError, match=error_message): + check_splits(X, y, not_consecutive_group_numbers, cv_args, + expected_results) + + +def test_too_large_train_size_(X, y, group_numbers): + cv_args = {'test_size': 1, 'train_size': 10} + expected_results = None + error_message = (r'Not enough data to split number of groups \(6\)' + r' for number splits \(-4\) with train size \(10\),' + r' test size \(1\), gap size \(0\), shift size \(1\)') + + with pytest.raises(ValueError, match=error_message): + check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_too_many_splits(X, y, group_numbers): + cv_args = {'test_size': 1, 'n_splits': 10} + expected_results = None + error_message = (r'Not enough data to split number of groups \(6\)' + r' for number splits \(10\) with train size \(-4\),' + r' test size \(1\), gap size \(0\), shift size \(1\)') + + with pytest.raises(ValueError, match=error_message): + check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_too_large_train_size_and_many_n_splits(X, y, group_numbers): + cv_args = {'test_size': 1, 'train_size': 10, 'n_splits': 10} + expected_results = None + error_message = (r'Not enough data to split number of groups \(6\)' + r' for number splits \(10\) 
with train size \(10\),' + r' test size \(1\), gap size \(0\), shift size \(1\)') + + with pytest.raises(ValueError, match=error_message): + check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_too_large_shift_size(X, y, group_numbers): + cv_args = {'test_size': 1, 'n_splits': 3, 'shift_size': 10} + expected_results = None + error_message = (r'Not enough data to split number of groups \(6\)' + r' for number splits \(3\) with train size \(-15\),' + r' test size \(1\), gap size \(0\), shift size \(10\)') + + with pytest.raises(ValueError, match=error_message): + check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_too_large_gap_size(X, y, group_numbers): + cv_args = {'test_size': 1, 'n_splits': 3, 'gap_size': 10} + expected_results = None + error_message = (r'Not enough data to split number of groups \(6\)' + r' for number splits \(3\) with train size \(-7\),' + r' test size \(1\), gap size \(10\), shift size \(1\)') + + with pytest.raises(ValueError, match=error_message): + check_splits(X, y, group_numbers, cv_args, expected_results) From 7db32bb871b94a4497fb19729fed633ab904d229 Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sat, 23 Apr 2022 22:27:18 +0300 Subject: [PATCH 03/30] Change non_sorted_group_numbers fixture for tests --- .vscode/settings.json | 4 ++++ conda_requirements.txt | 8 ++++++++ mlxtend/evaluate/tests/test_time_series.py | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json create mode 100644 conda_requirements.txt diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..5671e3b2e --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "autoDocstring.docstringFormat": "numpy", + "editor.formatOnSave": true +} \ No newline at end of file diff --git a/conda_requirements.txt b/conda_requirements.txt new file mode 100644 index 000000000..b5a770223 --- /dev/null +++ b/conda_requirements.txt @@ -0,0 +1,8 @@ 
+flake8>=4.0.1 +pandas>=1.3.4 +pip>=21.3.1 +pytest>=6.2.5 +scikit-learn>=1.0.1 +scipy>=1.7.3 +setuptools>=59.4.0 +markdown>=3.3.6 \ No newline at end of file diff --git a/mlxtend/evaluate/tests/test_time_series.py b/mlxtend/evaluate/tests/test_time_series.py index f5e4c766e..c88c3b559 100644 --- a/mlxtend/evaluate/tests/test_time_series.py +++ b/mlxtend/evaluate/tests/test_time_series.py @@ -28,7 +28,7 @@ def group_numbers(): @pytest.fixture def not_sorted_group_numbers(): - return np.array([5, 5, 5, 5, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 0]) + return np.array([5, 5, 5, 5, 1, 1, 1, 1, 3, 3, 2, 2, 2, 4, 4, 0]) @pytest.fixture From 06b767273e4dcbd422147a6c714c0f5b85841d38 Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sat, 23 Apr 2022 22:34:39 +0300 Subject: [PATCH 04/30] Delete mistakenly added settings.json --- .vscode/settings.json | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 5671e3b2e..000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "autoDocstring.docstringFormat": "numpy", - "editor.formatOnSave": true -} \ No newline at end of file From e6c9c630fdc488995c5d13923a89a5c91c7ed6e6 Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sat, 23 Apr 2022 22:35:55 +0300 Subject: [PATCH 05/30] Delete mistakenly added conda_requirements.txt --- conda_requirements.txt | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 conda_requirements.txt diff --git a/conda_requirements.txt b/conda_requirements.txt deleted file mode 100644 index b5a770223..000000000 --- a/conda_requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -flake8>=4.0.1 -pandas>=1.3.4 -pip>=21.3.1 -pytest>=6.2.5 -scikit-learn>=1.0.1 -scipy>=1.7.3 -setuptools>=59.4.0 -markdown>=3.3.6 \ No newline at end of file From 09b7d05d0ca042320487a76cc0952aae00454eea Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sat, 23 Apr 2022 22:43:55 +0300 Subject: [PATCH 
06/30] Change error tests naming to be more consistent --- mlxtend/evaluate/tests/test_time_series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlxtend/evaluate/tests/test_time_series.py b/mlxtend/evaluate/tests/test_time_series.py index c88c3b559..c3cd24c79 100644 --- a/mlxtend/evaluate/tests/test_time_series.py +++ b/mlxtend/evaluate/tests/test_time_series.py @@ -216,7 +216,7 @@ def test_too_large_train_size_(X, y, group_numbers): check_splits(X, y, group_numbers, cv_args, expected_results) -def test_too_many_splits(X, y, group_numbers): +def test_too_large_n_splits(X, y, group_numbers): cv_args = {'test_size': 1, 'n_splits': 10} expected_results = None error_message = (r'Not enough data to split number of groups \(6\)' @@ -227,7 +227,7 @@ def test_too_many_splits(X, y, group_numbers): check_splits(X, y, group_numbers, cv_args, expected_results) -def test_too_large_train_size_and_many_n_splits(X, y, group_numbers): +def test_too_large_train_size_n_splits(X, y, group_numbers): cv_args = {'test_size': 1, 'train_size': 10, 'n_splits': 10} expected_results = None error_message = (r'Not enough data to split number of groups \(6\)' From cf01967baea1c622b5957833320021ba3850ab3b Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Thu, 28 Apr 2022 15:23:20 +0300 Subject: [PATCH 07/30] Change imported object to correct one --- mlxtend/evaluate/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlxtend/evaluate/__init__.py b/mlxtend/evaluate/__init__.py index 8aa99469d..9f899c1b9 100644 --- a/mlxtend/evaluate/__init__.py +++ b/mlxtend/evaluate/__init__.py @@ -42,4 +42,4 @@ "ftest", "combined_ftest_5x2cv", "proportion_difference", "bias_variance_decomp", "accuracy_score", "create_counterfactual", - "time_series"] + "GroupTimeSeriesSplit"] From 0b23a1c0ccb3f11a422ef5bbabf7627d630a6d18 Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Thu, 28 Apr 2022 16:02:44 +0300 Subject: [PATCH 08/30] Add test for cross_val_score 
--- mlxtend/evaluate/tests/test_time_series.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/mlxtend/evaluate/tests/test_time_series.py b/mlxtend/evaluate/tests/test_time_series.py index c3cd24c79..e4c528d07 100644 --- a/mlxtend/evaluate/tests/test_time_series.py +++ b/mlxtend/evaluate/tests/test_time_series.py @@ -8,6 +8,8 @@ import numpy as np import pytest from mlxtend.evaluate import GroupTimeSeriesSplit +from sklearn.dummy import DummyClassifier +from sklearn.model_selection import cross_val_score @pytest.fixture @@ -258,3 +260,20 @@ def test_too_large_gap_size(X, y, group_numbers): with pytest.raises(ValueError, match=error_message): check_splits(X, y, group_numbers, cv_args, expected_results) + + +def test_cross_val_score(X, y, group_numbers): + cv_args = {'test_size': 1, 'train_size': 3} + cv = GroupTimeSeriesSplit(**cv_args) + + expected_scores = np.array([0, 0.5, 0.25]) + clf = DummyClassifier(strategy='most_frequent') + scoring = 'accuracy' + cv_scores = cross_val_score(clf, + X, + y, + groups=group_numbers, + scoring=scoring, + cv=cv) + + assert np.array_equal(cv_scores, expected_scores) From afe15ffbc8f26b76d467e8ad5255d24617c360a6 Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sun, 1 May 2022 08:24:31 +0300 Subject: [PATCH 09/30] Lessen the group restrictions to be only consecutive --- mlxtend/evaluate/time_series.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/mlxtend/evaluate/time_series.py b/mlxtend/evaluate/time_series.py index c564050f9..d9d766b9c 100644 --- a/mlxtend/evaluate/time_series.py +++ b/mlxtend/evaluate/time_series.py @@ -5,7 +5,7 @@ # # License: BSD 3 clause -from itertools import groupby +from itertools import accumulate, groupby import numpy as np from sklearn.utils import indexable @@ -89,15 +89,17 @@ def split(self, X, y=None, groups=None): if groups is None: raise ValueError('The groups should be specified') - group_seqs = [group[0] for group in 
groupby(groups)] - unique_groups, group_starts_idx = np.unique(groups, return_index=True) + group_names, group_lengths = zip( + *[(group_name, len(list(group_seq))) + for group_name, group_seq in groupby(groups)]) + n_groups = len(group_names) - if group_seqs != sorted(unique_groups): - raise ValueError('The groups should be sorted in increasing order') + if n_groups != len(set(group_names)): + raise ValueError('The groups should be consecutive') - n_groups = len(unique_groups) + group_starts_idx = [0] + list(accumulate(group_lengths))[:-1] self._n_groups = n_groups - groups_dict = dict(zip(unique_groups, group_starts_idx)) + groups_dict = dict(zip(group_names, group_starts_idx)) n_samples = len(X) self._calculate_split_params() @@ -109,15 +111,16 @@ def split(self, X, y=None, groups=None): test_end_idx = test_start_idx + test_size for _ in range(n_splits): - train_idx = np.r_[slice(groups_dict[group_seqs[train_start_idx]], - groups_dict[group_seqs[train_end_idx]])] + train_idx = np.r_[slice(groups_dict[group_names[train_start_idx]], + groups_dict[group_names[train_end_idx]])] if test_end_idx < n_groups: - test_idx = np.r_[slice(groups_dict[group_seqs[test_start_idx]], - groups_dict[group_seqs[test_end_idx]])] + test_idx = np.r_[slice( + groups_dict[group_names[test_start_idx]], + groups_dict[group_names[test_end_idx]])] else: - test_idx = np.r_[slice(groups_dict[group_seqs[test_start_idx]], - n_samples)] + test_idx = np.r_[slice( + groups_dict[group_names[test_start_idx]], n_samples)] yield train_idx, test_idx From a8ed4775ec1ffeeebdeca649d417a53d2f89ed35 Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sun, 1 May 2022 08:25:31 +0300 Subject: [PATCH 10/30] Add more tests for group names --- mlxtend/evaluate/tests/test_time_series.py | 53 +++++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/mlxtend/evaluate/tests/test_time_series.py b/mlxtend/evaluate/tests/test_time_series.py index e4c528d07..5ad9b2548 100644 --- 
a/mlxtend/evaluate/tests/test_time_series.py +++ b/mlxtend/evaluate/tests/test_time_series.py @@ -47,6 +47,24 @@ def group_names(): ]) +@pytest.fixture +def not_sorted_group_names(): + return np.array([ + '2021-06', '2021-06', '2021-06', '2021-06', '2021-02', '2021-02', + '2021-02', '2021-02', '2021-04', '2021-04', '2021-03', '2021-03', + '2021-03', '2021-05', '2021-05', '2021-01' + ]) + + +@pytest.fixture +def not_consecutive_group_names(): + return np.array([ + '2021-01', '2021-02', '2021-02', '2021-02', '2021-02', '2021-03', + '2021-03', '2021-03', '2021-04', '2021-04', '2021-05', '2021-05', + '2021-03', '2021-03', '2021-03', '2021-03' + ]) + + def check_splits(X, y, groups, cv_args, expected_results): cv = GroupTimeSeriesSplit(**cv_args) results = list(cv.split(X, y, groups)) @@ -148,6 +166,28 @@ def test_partial_usage_of_data(X, y, group_numbers): check_splits(X, y, group_numbers, cv_args, expected_results) +def test_not_sorted_group_numbers(X, y, not_sorted_group_numbers): + cv_args = {'test_size': 1, 'train_size': 3} + expected_results = [(np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, + 9]), np.array([10, 11, 12])), + (np.array([4, 5, 6, 7, 8, 9, 10, 11, + 12]), np.array([13, 14])), + (np.array([8, 9, 10, 11, 12, 13, 14]), np.array([15]))] + + check_splits(X, y, not_sorted_group_numbers, cv_args, expected_results) + + +def test_not_sorted_group_names(X, y, not_sorted_group_names): + cv_args = {'test_size': 1, 'train_size': 3} + expected_results = [(np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, + 9]), np.array([10, 11, 12])), + (np.array([4, 5, 6, 7, 8, 9, 10, 11, + 12]), np.array([13, 14])), + (np.array([8, 9, 10, 11, 12, 13, 14]), np.array([15]))] + + check_splits(X, y, not_sorted_group_names, cv_args, expected_results) + + def test_not_specified_train_size_n_splits(X, y, group_numbers): cv_args = {'test_size': 1} expected_results = None @@ -188,22 +228,23 @@ def test_not_specified_groups(X, y): check_splits(X, y, None, cv_args, expected_results) -def 
test_not_sorted_groups(X, y, not_sorted_group_numbers): +def test_not_consecutive_group_numbers(X, y, not_consecutive_group_numbers): cv_args = {'test_size': 1, 'train_size': 3} expected_results = None - error_message = 'The groups should be sorted in increasing order' + error_message = 'The groups should be consecutive' with pytest.raises(ValueError, match=error_message): - check_splits(X, y, not_sorted_group_numbers, cv_args, expected_results) + check_splits(X, y, not_consecutive_group_numbers, cv_args, + expected_results) -def test_not_consecutive_groups(X, y, not_consecutive_group_numbers): +def test_not_consecutive_group_names(X, y, not_consecutive_group_names): cv_args = {'test_size': 1, 'train_size': 3} expected_results = None - error_message = 'The groups should be sorted in increasing order' + error_message = 'The groups should be consecutive' with pytest.raises(ValueError, match=error_message): - check_splits(X, y, not_consecutive_group_numbers, cv_args, + check_splits(X, y, not_consecutive_group_names, cv_args, expected_results) From 04128a12489b2567b95148d3a9d59c9ad2a06dc1 Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sun, 1 May 2022 08:47:45 +0300 Subject: [PATCH 11/30] Reformat using black --- mlxtend/evaluate/__init__.py | 38 ++- mlxtend/evaluate/tests/test_time_series.py | 301 ++++++++++++--------- mlxtend/evaluate/time_series.py | 110 ++++---- 3 files changed, 261 insertions(+), 188 deletions(-) diff --git a/mlxtend/evaluate/__init__.py b/mlxtend/evaluate/__init__.py index 9f899c1b9..f470ea0eb 100644 --- a/mlxtend/evaluate/__init__.py +++ b/mlxtend/evaluate/__init__.py @@ -30,16 +30,28 @@ from .counterfactual import create_counterfactual from .time_series import GroupTimeSeriesSplit -__all__ = ["scoring", "confusion_matrix", - "mcnemar_table", "mcnemar_tables", - "mcnemar", "lift_score", - "bootstrap", "permutation_test", - "BootstrapOutOfBag", "bootstrap_point632_score", - "cochrans_q", "paired_ttest_resampled", - "paired_ttest_kfold_cv", 
"paired_ttest_5x2cv", - "feature_importance_permutation", - "RandomHoldoutSplit", "PredefinedHoldoutSplit", - "ftest", "combined_ftest_5x2cv", - "proportion_difference", "bias_variance_decomp", - "accuracy_score", "create_counterfactual", - "GroupTimeSeriesSplit"] +__all__ = [ + "scoring", + "confusion_matrix", + "mcnemar_table", + "mcnemar_tables", + "mcnemar", + "lift_score", + "bootstrap", + "permutation_test", + "BootstrapOutOfBag", + "bootstrap_point632_score", + "cochrans_q", + "paired_ttest_resampled", + "paired_ttest_kfold_cv", + "paired_ttest_5x2cv", + "feature_importance_permutation", + "RandomHoldoutSplit", + "PredefinedHoldoutSplit", + "ftest", + "combined_ftest_5x2cv", + "proportion_difference", + "bias_variance_decomp", + "accuracy_score", + "create_counterfactual", +] diff --git a/mlxtend/evaluate/tests/test_time_series.py b/mlxtend/evaluate/tests/test_time_series.py index 5ad9b2548..d41b2ec34 100644 --- a/mlxtend/evaluate/tests/test_time_series.py +++ b/mlxtend/evaluate/tests/test_time_series.py @@ -14,8 +14,9 @@ @pytest.fixture def X(): - return np.array([[0], [7], [6], [4], [4], [8], [0], [6], [2], [0], [5], - [9], [7], [7], [7], [7]]) + return np.array( + [[0], [7], [6], [4], [4], [8], [0], [6], [2], [0], [5], [9], [7], [7], [7], [7]] + ) @pytest.fixture @@ -40,29 +41,74 @@ def not_consecutive_group_numbers(): @pytest.fixture def group_names(): - return np.array([ - '2021-01', '2021-02', '2021-02', '2021-02', '2021-02', '2021-03', - '2021-03', '2021-03', '2021-04', '2021-04', '2021-05', '2021-05', - '2021-06', '2021-06', '2021-06', '2021-06' - ]) + return np.array( + [ + "2021-01", + "2021-02", + "2021-02", + "2021-02", + "2021-02", + "2021-03", + "2021-03", + "2021-03", + "2021-04", + "2021-04", + "2021-05", + "2021-05", + "2021-06", + "2021-06", + "2021-06", + "2021-06", + ] + ) @pytest.fixture def not_sorted_group_names(): - return np.array([ - '2021-06', '2021-06', '2021-06', '2021-06', '2021-02', '2021-02', - '2021-02', '2021-02', '2021-04', 
'2021-04', '2021-03', '2021-03', - '2021-03', '2021-05', '2021-05', '2021-01' - ]) + return np.array( + [ + "2021-06", + "2021-06", + "2021-06", + "2021-06", + "2021-02", + "2021-02", + "2021-02", + "2021-02", + "2021-04", + "2021-04", + "2021-03", + "2021-03", + "2021-03", + "2021-05", + "2021-05", + "2021-01", + ] + ) @pytest.fixture def not_consecutive_group_names(): - return np.array([ - '2021-01', '2021-02', '2021-02', '2021-02', '2021-02', '2021-03', - '2021-03', '2021-03', '2021-04', '2021-04', '2021-05', '2021-05', - '2021-03', '2021-03', '2021-03', '2021-03' - ]) + return np.array( + [ + "2021-01", + "2021-02", + "2021-02", + "2021-02", + "2021-02", + "2021-03", + "2021-03", + "2021-03", + "2021-04", + "2021-04", + "2021-05", + "2021-05", + "2021-03", + "2021-03", + "2021-03", + "2021-03", + ] + ) def check_splits(X, y, groups, cv_args, expected_results): @@ -79,130 +125,128 @@ def check_splits(X, y, groups, cv_args, expected_results): def test_get_n_splits(X, y, group_numbers): - cv_args = {'test_size': 1, 'train_size': 3} - expected_results = [(np.array([0, 1, 2, 3, 4, 5, 6, 7]), np.array([8, 9])), - (np.array([1, 2, 3, 4, 5, 6, 7, 8, - 9]), np.array([10, 11])), - (np.array([5, 6, 7, 8, 9, 10, - 11]), np.array([12, 13, 14, 15]))] + cv_args = {"test_size": 1, "train_size": 3} + expected_results = [ + (np.array([0, 1, 2, 3, 4, 5, 6, 7]), np.array([8, 9])), + (np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]), np.array([10, 11])), + (np.array([5, 6, 7, 8, 9, 10, 11]), np.array([12, 13, 14, 15])), + ] cv = check_splits(X, y, group_numbers, cv_args, expected_results) assert cv.get_n_splits() == len(expected_results) def test_train_size(X, y, group_numbers): - cv_args = {'test_size': 1, 'train_size': 3} - expected_results = [(np.array([0, 1, 2, 3, 4, 5, 6, 7]), np.array([8, 9])), - (np.array([1, 2, 3, 4, 5, 6, 7, 8, - 9]), np.array([10, 11])), - (np.array([5, 6, 7, 8, 9, 10, - 11]), np.array([12, 13, 14, 15]))] + cv_args = {"test_size": 1, "train_size": 3} + 
expected_results = [ + (np.array([0, 1, 2, 3, 4, 5, 6, 7]), np.array([8, 9])), + (np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]), np.array([10, 11])), + (np.array([5, 6, 7, 8, 9, 10, 11]), np.array([12, 13, 14, 15])), + ] check_splits(X, y, group_numbers, cv_args, expected_results) def test_train_size_group_names(X, y, group_names): - cv_args = {'test_size': 1, 'train_size': 3} - expected_results = [(np.array([0, 1, 2, 3, 4, 5, 6, 7]), np.array([8, 9])), - (np.array([1, 2, 3, 4, 5, 6, 7, 8, - 9]), np.array([10, 11])), - (np.array([5, 6, 7, 8, 9, 10, - 11]), np.array([12, 13, 14, 15]))] + cv_args = {"test_size": 1, "train_size": 3} + expected_results = [ + (np.array([0, 1, 2, 3, 4, 5, 6, 7]), np.array([8, 9])), + (np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]), np.array([10, 11])), + (np.array([5, 6, 7, 8, 9, 10, 11]), np.array([12, 13, 14, 15])), + ] check_splits(X, y, group_names, cv_args, expected_results) def test_n_splits(X, y, group_numbers): - cv_args = {'test_size': 2, 'n_splits': 3} - expected_results = [(np.array([0, 1, 2, 3, 4]), np.array([5, 6, 7, 8, 9])), - (np.array([1, 2, 3, 4, 5, 6, - 7]), np.array([8, 9, 10, 11])), - (np.array([5, 6, 7, 8, - 9]), np.array([10, 11, 12, 13, 14, 15]))] + cv_args = {"test_size": 2, "n_splits": 3} + expected_results = [ + (np.array([0, 1, 2, 3, 4]), np.array([5, 6, 7, 8, 9])), + (np.array([1, 2, 3, 4, 5, 6, 7]), np.array([8, 9, 10, 11])), + (np.array([5, 6, 7, 8, 9]), np.array([10, 11, 12, 13, 14, 15])), + ] check_splits(X, y, group_numbers, cv_args, expected_results) def test_n_splits_gap_size(X, y, group_numbers): - cv_args = {'test_size': 1, 'n_splits': 3, 'gap_size': 1} - expected_results = [(np.array([0, 1, 2, 3, 4]), np.array([8, 9])), - (np.array([1, 2, 3, 4, 5, 6, 7]), np.array([10, 11])), - (np.array([5, 6, 7, 8, 9]), np.array([12, 13, 14, - 15]))] + cv_args = {"test_size": 1, "n_splits": 3, "gap_size": 1} + expected_results = [ + (np.array([0, 1, 2, 3, 4]), np.array([8, 9])), + (np.array([1, 2, 3, 4, 5, 6, 7]), np.array([10, 
11])), + (np.array([5, 6, 7, 8, 9]), np.array([12, 13, 14, 15])), + ] check_splits(X, y, group_numbers, cv_args, expected_results) def test_n_splits_shift_size(X, y, group_numbers): - cv_args = {'test_size': 1, 'n_splits': 3, 'gap_size': 1} - expected_results = [(np.array([0, 1, 2, 3, 4]), np.array([8, 9])), - (np.array([1, 2, 3, 4, 5, 6, 7]), np.array([10, 11])), - (np.array([5, 6, 7, 8, 9]), np.array([12, 13, 14, - 15]))] + cv_args = {"test_size": 1, "n_splits": 3, "gap_size": 1} + expected_results = [ + (np.array([0, 1, 2, 3, 4]), np.array([8, 9])), + (np.array([1, 2, 3, 4, 5, 6, 7]), np.array([10, 11])), + (np.array([5, 6, 7, 8, 9]), np.array([12, 13, 14, 15])), + ] check_splits(X, y, group_numbers, cv_args, expected_results) def test_n_splits_expanding_window(X, y, group_numbers): - cv_args = {'test_size': 3, 'n_splits': 3, 'window_type': 'expanding'} - expected_results = [(np.array([0]), np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])), - (np.array([0, 1, 2, 3, - 4]), np.array([5, 6, 7, 8, 9, 10, 11])), - (np.array([0, 1, 2, 3, 4, 5, 6, 7]), - np.array([8, 9, 10, 11, 12, 13, 14, 15]))] + cv_args = {"test_size": 3, "n_splits": 3, "window_type": "expanding"} + expected_results = [ + (np.array([0]), np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])), + (np.array([0, 1, 2, 3, 4]), np.array([5, 6, 7, 8, 9, 10, 11])), + (np.array([0, 1, 2, 3, 4, 5, 6, 7]), np.array([8, 9, 10, 11, 12, 13, 14, 15])), + ] check_splits(X, y, group_numbers, cv_args, expected_results) def test_full_usage_of_data(X, y, group_numbers): - cv_args = {'test_size': 3, 'train_size': 2, 'n_splits': 2} - expected_results = [(np.array([0, 1, 2, 3, - 4]), np.array([5, 6, 7, 8, 9, 10, 11])), - (np.array([1, 2, 3, 4, 5, 6, 7]), - np.array([8, 9, 10, 11, 12, 13, 14, 15]))] + cv_args = {"test_size": 3, "train_size": 2, "n_splits": 2} + expected_results = [ + (np.array([0, 1, 2, 3, 4]), np.array([5, 6, 7, 8, 9, 10, 11])), + (np.array([1, 2, 3, 4, 5, 6, 7]), np.array([8, 9, 10, 11, 12, 13, 14, 15])), + ] check_splits(X, y, 
group_numbers, cv_args, expected_results) def test_partial_usage_of_data(X, y, group_numbers): - cv_args = {'test_size': 2, 'train_size': 2, 'n_splits': 2} - expected_results = [(np.array([1, 2, 3, 4, 5, 6, - 7]), np.array([8, 9, 10, 11])), - (np.array([5, 6, 7, 8, - 9]), np.array([10, 11, 12, 13, 14, 15]))] + cv_args = {"test_size": 2, "train_size": 2, "n_splits": 2} + expected_results = [ + (np.array([1, 2, 3, 4, 5, 6, 7]), np.array([8, 9, 10, 11])), + (np.array([5, 6, 7, 8, 9]), np.array([10, 11, 12, 13, 14, 15])), + ] check_splits(X, y, group_numbers, cv_args, expected_results) def test_not_sorted_group_numbers(X, y, not_sorted_group_numbers): - cv_args = {'test_size': 1, 'train_size': 3} - expected_results = [(np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, - 9]), np.array([10, 11, 12])), - (np.array([4, 5, 6, 7, 8, 9, 10, 11, - 12]), np.array([13, 14])), - (np.array([8, 9, 10, 11, 12, 13, 14]), np.array([15]))] + cv_args = {"test_size": 1, "train_size": 3} + expected_results = [ + (np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), np.array([10, 11, 12])), + (np.array([4, 5, 6, 7, 8, 9, 10, 11, 12]), np.array([13, 14])), + (np.array([8, 9, 10, 11, 12, 13, 14]), np.array([15])), + ] check_splits(X, y, not_sorted_group_numbers, cv_args, expected_results) def test_not_sorted_group_names(X, y, not_sorted_group_names): - cv_args = {'test_size': 1, 'train_size': 3} - expected_results = [(np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, - 9]), np.array([10, 11, 12])), - (np.array([4, 5, 6, 7, 8, 9, 10, 11, - 12]), np.array([13, 14])), - (np.array([8, 9, 10, 11, 12, 13, 14]), np.array([15]))] + cv_args = {"test_size": 1, "train_size": 3} + expected_results = [ + (np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), np.array([10, 11, 12])), + (np.array([4, 5, 6, 7, 8, 9, 10, 11, 12]), np.array([13, 14])), + (np.array([8, 9, 10, 11, 12, 13, 14]), np.array([15])), + ] check_splits(X, y, not_sorted_group_names, cv_args, expected_results) def test_not_specified_train_size_n_splits(X, y, group_numbers): - cv_args = 
{'test_size': 1} + cv_args = {"test_size": 1} expected_results = None - error_message = 'Either train_size or n_splits have to be defined' + error_message = "Either train_size or n_splits have to be defined" with pytest.raises(ValueError, match=error_message): check_splits(X, y, group_numbers, cv_args, expected_results) def test_bad_window_type(X, y, group_numbers): - cv_args = { - 'test_size': 1, - 'train_size': 3, - 'window_type': 'incorrect_window_type' - } + cv_args = {"test_size": 1, "train_size": 3, "window_type": "incorrect_window_type"} expected_results = None error_message = 'Window type can be either "rolling" or "expanding"' @@ -211,110 +255,113 @@ def test_bad_window_type(X, y, group_numbers): def test_train_size_with_expanding_window(X, y, group_numbers): - cv_args = {'test_size': 1, 'train_size': 3, 'window_type': 'expanding'} + cv_args = {"test_size": 1, "train_size": 3, "window_type": "expanding"} expected_results = None - error_message = 'Train size can be specified only with rolling window' + error_message = "Train size can be specified only with rolling window" with pytest.raises(ValueError, match=error_message): check_splits(X, y, group_numbers, cv_args, expected_results) def test_not_specified_groups(X, y): - cv_args = {'test_size': 1, 'train_size': 3} + cv_args = {"test_size": 1, "train_size": 3} expected_results = None - error_message = 'The groups should be specified' + error_message = "The groups should be specified" with pytest.raises(ValueError, match=error_message): check_splits(X, y, None, cv_args, expected_results) def test_not_consecutive_group_numbers(X, y, not_consecutive_group_numbers): - cv_args = {'test_size': 1, 'train_size': 3} + cv_args = {"test_size": 1, "train_size": 3} expected_results = None - error_message = 'The groups should be consecutive' + error_message = "The groups should be consecutive" with pytest.raises(ValueError, match=error_message): - check_splits(X, y, not_consecutive_group_numbers, cv_args, - 
expected_results) + check_splits(X, y, not_consecutive_group_numbers, cv_args, expected_results) def test_not_consecutive_group_names(X, y, not_consecutive_group_names): - cv_args = {'test_size': 1, 'train_size': 3} + cv_args = {"test_size": 1, "train_size": 3} expected_results = None - error_message = 'The groups should be consecutive' + error_message = "The groups should be consecutive" with pytest.raises(ValueError, match=error_message): - check_splits(X, y, not_consecutive_group_names, cv_args, - expected_results) + check_splits(X, y, not_consecutive_group_names, cv_args, expected_results) def test_too_large_train_size_(X, y, group_numbers): - cv_args = {'test_size': 1, 'train_size': 10} + cv_args = {"test_size": 1, "train_size": 10} expected_results = None - error_message = (r'Not enough data to split number of groups \(6\)' - r' for number splits \(-4\) with train size \(10\),' - r' test size \(1\), gap size \(0\), shift size \(1\)') + error_message = ( + r"Not enough data to split number of groups \(6\)" + r" for number splits \(-4\) with train size \(10\)," + r" test size \(1\), gap size \(0\), shift size \(1\)" + ) with pytest.raises(ValueError, match=error_message): check_splits(X, y, group_numbers, cv_args, expected_results) def test_too_large_n_splits(X, y, group_numbers): - cv_args = {'test_size': 1, 'n_splits': 10} + cv_args = {"test_size": 1, "n_splits": 10} expected_results = None - error_message = (r'Not enough data to split number of groups \(6\)' - r' for number splits \(10\) with train size \(-4\),' - r' test size \(1\), gap size \(0\), shift size \(1\)') + error_message = ( + r"Not enough data to split number of groups \(6\)" + r" for number splits \(10\) with train size \(-4\)," + r" test size \(1\), gap size \(0\), shift size \(1\)" + ) with pytest.raises(ValueError, match=error_message): check_splits(X, y, group_numbers, cv_args, expected_results) def test_too_large_train_size_n_splits(X, y, group_numbers): - cv_args = {'test_size': 1, 
'train_size': 10, 'n_splits': 10} + cv_args = {"test_size": 1, "train_size": 10, "n_splits": 10} expected_results = None - error_message = (r'Not enough data to split number of groups \(6\)' - r' for number splits \(10\) with train size \(10\),' - r' test size \(1\), gap size \(0\), shift size \(1\)') + error_message = ( + r"Not enough data to split number of groups \(6\)" + r" for number splits \(10\) with train size \(10\)," + r" test size \(1\), gap size \(0\), shift size \(1\)" + ) with pytest.raises(ValueError, match=error_message): check_splits(X, y, group_numbers, cv_args, expected_results) def test_too_large_shift_size(X, y, group_numbers): - cv_args = {'test_size': 1, 'n_splits': 3, 'shift_size': 10} + cv_args = {"test_size": 1, "n_splits": 3, "shift_size": 10} expected_results = None - error_message = (r'Not enough data to split number of groups \(6\)' - r' for number splits \(3\) with train size \(-15\),' - r' test size \(1\), gap size \(0\), shift size \(10\)') + error_message = ( + r"Not enough data to split number of groups \(6\)" + r" for number splits \(3\) with train size \(-15\)," + r" test size \(1\), gap size \(0\), shift size \(10\)" + ) with pytest.raises(ValueError, match=error_message): check_splits(X, y, group_numbers, cv_args, expected_results) def test_too_large_gap_size(X, y, group_numbers): - cv_args = {'test_size': 1, 'n_splits': 3, 'gap_size': 10} + cv_args = {"test_size": 1, "n_splits": 3, "gap_size": 10} expected_results = None - error_message = (r'Not enough data to split number of groups \(6\)' - r' for number splits \(3\) with train size \(-7\),' - r' test size \(1\), gap size \(10\), shift size \(1\)') + error_message = ( + r"Not enough data to split number of groups \(6\)" + r" for number splits \(3\) with train size \(-7\)," + r" test size \(1\), gap size \(10\), shift size \(1\)" + ) with pytest.raises(ValueError, match=error_message): check_splits(X, y, group_numbers, cv_args, expected_results) def test_cross_val_score(X, y, 
group_numbers): - cv_args = {'test_size': 1, 'train_size': 3} + cv_args = {"test_size": 1, "train_size": 3} cv = GroupTimeSeriesSplit(**cv_args) expected_scores = np.array([0, 0.5, 0.25]) - clf = DummyClassifier(strategy='most_frequent') - scoring = 'accuracy' - cv_scores = cross_val_score(clf, - X, - y, - groups=group_numbers, - scoring=scoring, - cv=cv) + clf = DummyClassifier(strategy="most_frequent") + scoring = "accuracy" + cv_scores = cross_val_score(clf, X, y, groups=group_numbers, scoring=scoring, cv=cv) assert np.array_equal(cv_scores, expected_scores) diff --git a/mlxtend/evaluate/time_series.py b/mlxtend/evaluate/time_series.py index d9d766b9c..982d468f1 100644 --- a/mlxtend/evaluate/time_series.py +++ b/mlxtend/evaluate/time_series.py @@ -35,25 +35,24 @@ class GroupTimeSeriesSplit: http://rasbt.github.io/mlxtend/user_guide/evaluate/GroupTimeSeriesSplit/ """ - def __init__(self, - test_size, - train_size=None, - n_splits=None, - gap_size=0, - shift_size=1, - window_type='rolling'): + def __init__( + self, + test_size, + train_size=None, + n_splits=None, + gap_size=0, + shift_size=1, + window_type="rolling", + ): if (train_size is None) and (n_splits is None): - raise ValueError( - 'Either train_size or n_splits have to be defined') + raise ValueError("Either train_size or n_splits have to be defined") - if window_type not in ['rolling', 'expanding']: - raise ValueError( - 'Window type can be either "rolling" or "expanding"') + if window_type not in ["rolling", "expanding"]: + raise ValueError('Window type can be either "rolling" or "expanding"') - if (train_size is not None) and (window_type == 'expanding'): - raise ValueError( - 'Train size can be specified only with rolling window') + if (train_size is not None) and (window_type == "expanding"): + raise ValueError("Train size can be specified only with rolling window") self.test_size = test_size self.train_size = train_size @@ -87,15 +86,18 @@ def split(self, X, y=None, groups=None): X, y, groups = 
indexable(X, y, groups) if groups is None: - raise ValueError('The groups should be specified') + raise ValueError("The groups should be specified") group_names, group_lengths = zip( - *[(group_name, len(list(group_seq))) - for group_name, group_seq in groupby(groups)]) + *[ + (group_name, len(list(group_seq))) + for group_name, group_seq in groupby(groups) + ] + ) n_groups = len(group_names) if n_groups != len(set(group_names)): - raise ValueError('The groups should be consecutive') + raise ValueError("The groups should be consecutive") group_starts_idx = [0] + list(accumulate(group_lengths))[:-1] self._n_groups = n_groups @@ -111,20 +113,28 @@ def split(self, X, y=None, groups=None): test_end_idx = test_start_idx + test_size for _ in range(n_splits): - train_idx = np.r_[slice(groups_dict[group_names[train_start_idx]], - groups_dict[group_names[train_end_idx]])] + train_idx = np.r_[ + slice( + groups_dict[group_names[train_start_idx]], + groups_dict[group_names[train_end_idx]], + ) + ] if test_end_idx < n_groups: - test_idx = np.r_[slice( - groups_dict[group_names[test_start_idx]], - groups_dict[group_names[test_end_idx]])] + test_idx = np.r_[ + slice( + groups_dict[group_names[test_start_idx]], + groups_dict[group_names[test_end_idx]], + ) + ] else: - test_idx = np.r_[slice( - groups_dict[group_names[test_start_idx]], n_samples)] + test_idx = np.r_[ + slice(groups_dict[group_names[test_start_idx]], n_samples) + ] yield train_idx, test_idx - if self.window_type == 'rolling': + if self.window_type == "rolling": train_start_idx = train_start_idx + shift_size train_end_idx = train_end_idx + shift_size @@ -159,42 +169,46 @@ def _calculate_split_params(self): n_groups = self._n_groups not_enough_data_error = ( - 'Not enough data to split number of groups ({0})' - ' for number splits ({1})' - ' with train size ({2}), test size ({3}),' - ' gap size ({4}), shift size ({5})') + "Not enough data to split number of groups ({0})" + " for number splits ({1})" + " with train 
size ({2}), test size ({3})," + " gap size ({4}), shift size ({5})" + ) if (train_size is None) and (n_splits is not None): - train_size = n_groups - gap - test_size - (n_splits - - 1) * shift_size + train_size = n_groups - gap - test_size - (n_splits - 1) * shift_size self.train_size = train_size if train_size <= 0: raise ValueError( - not_enough_data_error.format(n_groups, n_splits, - train_size, test_size, gap, - shift_size)) + not_enough_data_error.format( + n_groups, n_splits, train_size, test_size, gap, shift_size + ) + ) train_start_idx = 0 elif (n_splits is None) and (train_size is not None): - n_splits = (n_groups - train_size - gap - - test_size) // shift_size + 1 + n_splits = (n_groups - train_size - gap - test_size) // shift_size + 1 self.n_splits = n_splits if self.n_splits <= 0: raise ValueError( - not_enough_data_error.format(n_groups, n_splits, - train_size, test_size, gap, - shift_size)) - train_start_idx = n_groups - train_size - gap - test_size - ( - n_splits - 1) * shift_size + not_enough_data_error.format( + n_groups, n_splits, train_size, test_size, gap, shift_size + ) + ) + train_start_idx = ( + n_groups - train_size - gap - test_size - (n_splits - 1) * shift_size + ) else: - train_start_idx = n_groups - train_size - gap - test_size - ( - n_splits - 1) * shift_size + train_start_idx = ( + n_groups - train_size - gap - test_size - (n_splits - 1) * shift_size + ) if train_start_idx < 0: raise ValueError( - not_enough_data_error.format(n_groups, n_splits, - train_size, test_size, gap, - shift_size)) + not_enough_data_error.format( + n_groups, n_splits, train_size, test_size, gap, shift_size + ) + ) self._train_start_idx = train_start_idx From 715814557002429604f989f50642e5186f7490bd Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sun, 1 May 2022 08:52:13 +0300 Subject: [PATCH 12/30] Reformat using black with line length = 79 --- mlxtend/evaluate/tests/test_time_series.py | 47 ++++++++++++++++--- mlxtend/evaluate/time_series.py | 53 
++++++++++++++++++---- 2 files changed, 83 insertions(+), 17 deletions(-) diff --git a/mlxtend/evaluate/tests/test_time_series.py b/mlxtend/evaluate/tests/test_time_series.py index d41b2ec34..b17fb55d4 100644 --- a/mlxtend/evaluate/tests/test_time_series.py +++ b/mlxtend/evaluate/tests/test_time_series.py @@ -15,7 +15,24 @@ @pytest.fixture def X(): return np.array( - [[0], [7], [6], [4], [4], [8], [0], [6], [2], [0], [5], [9], [7], [7], [7], [7]] + [ + [0], + [7], + [6], + [4], + [4], + [8], + [0], + [6], + [2], + [0], + [5], + [9], + [7], + [7], + [7], + [7], + ] ) @@ -191,7 +208,10 @@ def test_n_splits_expanding_window(X, y, group_numbers): expected_results = [ (np.array([0]), np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])), (np.array([0, 1, 2, 3, 4]), np.array([5, 6, 7, 8, 9, 10, 11])), - (np.array([0, 1, 2, 3, 4, 5, 6, 7]), np.array([8, 9, 10, 11, 12, 13, 14, 15])), + ( + np.array([0, 1, 2, 3, 4, 5, 6, 7]), + np.array([8, 9, 10, 11, 12, 13, 14, 15]), + ), ] check_splits(X, y, group_numbers, cv_args, expected_results) @@ -200,7 +220,10 @@ def test_full_usage_of_data(X, y, group_numbers): cv_args = {"test_size": 3, "train_size": 2, "n_splits": 2} expected_results = [ (np.array([0, 1, 2, 3, 4]), np.array([5, 6, 7, 8, 9, 10, 11])), - (np.array([1, 2, 3, 4, 5, 6, 7]), np.array([8, 9, 10, 11, 12, 13, 14, 15])), + ( + np.array([1, 2, 3, 4, 5, 6, 7]), + np.array([8, 9, 10, 11, 12, 13, 14, 15]), + ), ] check_splits(X, y, group_numbers, cv_args, expected_results) @@ -246,7 +269,11 @@ def test_not_specified_train_size_n_splits(X, y, group_numbers): def test_bad_window_type(X, y, group_numbers): - cv_args = {"test_size": 1, "train_size": 3, "window_type": "incorrect_window_type"} + cv_args = { + "test_size": 1, + "train_size": 3, + "window_type": "incorrect_window_type", + } expected_results = None error_message = 'Window type can be either "rolling" or "expanding"' @@ -278,7 +305,9 @@ def test_not_consecutive_group_numbers(X, y, not_consecutive_group_numbers): error_message = "The 
groups should be consecutive" with pytest.raises(ValueError, match=error_message): - check_splits(X, y, not_consecutive_group_numbers, cv_args, expected_results) + check_splits( + X, y, not_consecutive_group_numbers, cv_args, expected_results + ) def test_not_consecutive_group_names(X, y, not_consecutive_group_names): @@ -287,7 +316,9 @@ def test_not_consecutive_group_names(X, y, not_consecutive_group_names): error_message = "The groups should be consecutive" with pytest.raises(ValueError, match=error_message): - check_splits(X, y, not_consecutive_group_names, cv_args, expected_results) + check_splits( + X, y, not_consecutive_group_names, cv_args, expected_results + ) def test_too_large_train_size_(X, y, group_numbers): @@ -362,6 +393,8 @@ def test_cross_val_score(X, y, group_numbers): expected_scores = np.array([0, 0.5, 0.25]) clf = DummyClassifier(strategy="most_frequent") scoring = "accuracy" - cv_scores = cross_val_score(clf, X, y, groups=group_numbers, scoring=scoring, cv=cv) + cv_scores = cross_val_score( + clf, X, y, groups=group_numbers, scoring=scoring, cv=cv + ) assert np.array_equal(cv_scores, expected_scores) diff --git a/mlxtend/evaluate/time_series.py b/mlxtend/evaluate/time_series.py index 982d468f1..33ade28d1 100644 --- a/mlxtend/evaluate/time_series.py +++ b/mlxtend/evaluate/time_series.py @@ -46,13 +46,19 @@ def __init__( ): if (train_size is None) and (n_splits is None): - raise ValueError("Either train_size or n_splits have to be defined") + raise ValueError( + "Either train_size or n_splits have to be defined" + ) if window_type not in ["rolling", "expanding"]: - raise ValueError('Window type can be either "rolling" or "expanding"') + raise ValueError( + 'Window type can be either "rolling" or "expanding"' + ) if (train_size is not None) and (window_type == "expanding"): - raise ValueError("Train size can be specified only with rolling window") + raise ValueError( + "Train size can be specified only with rolling window" + ) self.test_size = 
test_size self.train_size = train_size @@ -176,38 +182,65 @@ def _calculate_split_params(self): ) if (train_size is None) and (n_splits is not None): - train_size = n_groups - gap - test_size - (n_splits - 1) * shift_size + train_size = ( + n_groups - gap - test_size - (n_splits - 1) * shift_size + ) self.train_size = train_size if train_size <= 0: raise ValueError( not_enough_data_error.format( - n_groups, n_splits, train_size, test_size, gap, shift_size + n_groups, + n_splits, + train_size, + test_size, + gap, + shift_size, ) ) train_start_idx = 0 elif (n_splits is None) and (train_size is not None): - n_splits = (n_groups - train_size - gap - test_size) // shift_size + 1 + n_splits = ( + n_groups - train_size - gap - test_size + ) // shift_size + 1 self.n_splits = n_splits if self.n_splits <= 0: raise ValueError( not_enough_data_error.format( - n_groups, n_splits, train_size, test_size, gap, shift_size + n_groups, + n_splits, + train_size, + test_size, + gap, + shift_size, ) ) train_start_idx = ( - n_groups - train_size - gap - test_size - (n_splits - 1) * shift_size + n_groups + - train_size + - gap + - test_size + - (n_splits - 1) * shift_size ) else: train_start_idx = ( - n_groups - train_size - gap - test_size - (n_splits - 1) * shift_size + n_groups + - train_size + - gap + - test_size + - (n_splits - 1) * shift_size ) if train_start_idx < 0: raise ValueError( not_enough_data_error.format( - n_groups, n_splits, train_size, test_size, gap, shift_size + n_groups, + n_splits, + train_size, + test_size, + gap, + shift_size, ) ) From 267ee40da3bcf6f5b045effe10cf9b514c649ebd Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sun, 1 May 2022 16:12:34 +0300 Subject: [PATCH 13/30] Add GroupTimeSeriesSplit to __all__ list --- mlxtend/evaluate/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mlxtend/evaluate/__init__.py b/mlxtend/evaluate/__init__.py index f470ea0eb..6671cf863 100644 --- a/mlxtend/evaluate/__init__.py +++ b/mlxtend/evaluate/__init__.py 
@@ -54,4 +54,5 @@ "bias_variance_decomp", "accuracy_score", "create_counterfactual", + "GroupTimeSeriesSplit" ] From 74fdbd04b2acd8938c06e66b57ff19e17e54d225 Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sun, 1 May 2022 16:18:07 +0300 Subject: [PATCH 14/30] Add trailing comma --- mlxtend/evaluate/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlxtend/evaluate/__init__.py b/mlxtend/evaluate/__init__.py index 6671cf863..3571832da 100644 --- a/mlxtend/evaluate/__init__.py +++ b/mlxtend/evaluate/__init__.py @@ -54,5 +54,5 @@ "bias_variance_decomp", "accuracy_score", "create_counterfactual", - "GroupTimeSeriesSplit" + "GroupTimeSeriesSplit", ] From 672b4364e72aff1593e5646877f9ca592721ca6b Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Mon, 2 May 2022 13:28:32 +0300 Subject: [PATCH 15/30] Implement group_starts_idx using itertools functions --- mlxtend/evaluate/time_series.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mlxtend/evaluate/time_series.py b/mlxtend/evaluate/time_series.py index 33ade28d1..e78e3f6e9 100644 --- a/mlxtend/evaluate/time_series.py +++ b/mlxtend/evaluate/time_series.py @@ -5,7 +5,7 @@ # # License: BSD 3 clause -from itertools import accumulate, groupby +from itertools import accumulate, chain, groupby, islice import numpy as np from sklearn.utils import indexable @@ -105,8 +105,11 @@ def split(self, X, y=None, groups=None): if n_groups != len(set(group_names)): raise ValueError("The groups should be consecutive") - group_starts_idx = [0] + list(accumulate(group_lengths))[:-1] self._n_groups = n_groups + group_starts_idx = chain( + [0], + islice(accumulate(group_lengths), len(group_lengths) - 1), + ) groups_dict = dict(zip(group_names, group_starts_idx)) n_samples = len(X) From 8501a3727931e59d090e6155e845f1f8188851da Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Tue, 3 May 2022 08:32:21 +0300 Subject: [PATCH 16/30] Reformat using black --- mlxtend/evaluate/__init__.py | 
2 +- mlxtend/evaluate/tests/test_time_series.py | 12 ++------ mlxtend/evaluate/time_series.py | 32 +++++----------------- 3 files changed, 11 insertions(+), 35 deletions(-) diff --git a/mlxtend/evaluate/__init__.py b/mlxtend/evaluate/__init__.py index 6671cf863..3571832da 100644 --- a/mlxtend/evaluate/__init__.py +++ b/mlxtend/evaluate/__init__.py @@ -54,5 +54,5 @@ "bias_variance_decomp", "accuracy_score", "create_counterfactual", - "GroupTimeSeriesSplit" + "GroupTimeSeriesSplit", ] diff --git a/mlxtend/evaluate/tests/test_time_series.py b/mlxtend/evaluate/tests/test_time_series.py index b17fb55d4..d88f01dd7 100644 --- a/mlxtend/evaluate/tests/test_time_series.py +++ b/mlxtend/evaluate/tests/test_time_series.py @@ -305,9 +305,7 @@ def test_not_consecutive_group_numbers(X, y, not_consecutive_group_numbers): error_message = "The groups should be consecutive" with pytest.raises(ValueError, match=error_message): - check_splits( - X, y, not_consecutive_group_numbers, cv_args, expected_results - ) + check_splits(X, y, not_consecutive_group_numbers, cv_args, expected_results) def test_not_consecutive_group_names(X, y, not_consecutive_group_names): @@ -316,9 +314,7 @@ def test_not_consecutive_group_names(X, y, not_consecutive_group_names): error_message = "The groups should be consecutive" with pytest.raises(ValueError, match=error_message): - check_splits( - X, y, not_consecutive_group_names, cv_args, expected_results - ) + check_splits(X, y, not_consecutive_group_names, cv_args, expected_results) def test_too_large_train_size_(X, y, group_numbers): @@ -393,8 +389,6 @@ def test_cross_val_score(X, y, group_numbers): expected_scores = np.array([0, 0.5, 0.25]) clf = DummyClassifier(strategy="most_frequent") scoring = "accuracy" - cv_scores = cross_val_score( - clf, X, y, groups=group_numbers, scoring=scoring, cv=cv - ) + cv_scores = cross_val_score(clf, X, y, groups=group_numbers, scoring=scoring, cv=cv) assert np.array_equal(cv_scores, expected_scores) diff --git 
a/mlxtend/evaluate/time_series.py b/mlxtend/evaluate/time_series.py index e78e3f6e9..89d8e2d6c 100644 --- a/mlxtend/evaluate/time_series.py +++ b/mlxtend/evaluate/time_series.py @@ -46,19 +46,13 @@ def __init__( ): if (train_size is None) and (n_splits is None): - raise ValueError( - "Either train_size or n_splits have to be defined" - ) + raise ValueError("Either train_size or n_splits have to be defined") if window_type not in ["rolling", "expanding"]: - raise ValueError( - 'Window type can be either "rolling" or "expanding"' - ) + raise ValueError('Window type can be either "rolling" or "expanding"') if (train_size is not None) and (window_type == "expanding"): - raise ValueError( - "Train size can be specified only with rolling window" - ) + raise ValueError("Train size can be specified only with rolling window") self.test_size = test_size self.train_size = train_size @@ -185,9 +179,7 @@ def _calculate_split_params(self): ) if (train_size is None) and (n_splits is not None): - train_size = ( - n_groups - gap - test_size - (n_splits - 1) * shift_size - ) + train_size = n_groups - gap - test_size - (n_splits - 1) * shift_size self.train_size = train_size if train_size <= 0: @@ -203,9 +195,7 @@ def _calculate_split_params(self): ) train_start_idx = 0 elif (n_splits is None) and (train_size is not None): - n_splits = ( - n_groups - train_size - gap - test_size - ) // shift_size + 1 + n_splits = (n_groups - train_size - gap - test_size) // shift_size + 1 self.n_splits = n_splits if self.n_splits <= 0: @@ -220,19 +210,11 @@ def _calculate_split_params(self): ) ) train_start_idx = ( - n_groups - - train_size - - gap - - test_size - - (n_splits - 1) * shift_size + n_groups - train_size - gap - test_size - (n_splits - 1) * shift_size ) else: train_start_idx = ( - n_groups - - train_size - - gap - - test_size - - (n_splits - 1) * shift_size + n_groups - train_size - gap - test_size - (n_splits - 1) * shift_size ) if train_start_idx < 0: From 
be1d3cd3a10e8be4d72d906877bfad404c1ea9ff Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Tue, 3 May 2022 12:03:25 +0300 Subject: [PATCH 17/30] Fix formatting without trailing comma --- mlxtend/evaluate/tests/test_time_series.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/mlxtend/evaluate/tests/test_time_series.py b/mlxtend/evaluate/tests/test_time_series.py index d88f01dd7..c378fab10 100644 --- a/mlxtend/evaluate/tests/test_time_series.py +++ b/mlxtend/evaluate/tests/test_time_series.py @@ -15,24 +15,7 @@ @pytest.fixture def X(): return np.array( - [ - [0], - [7], - [6], - [4], - [4], - [8], - [0], - [6], - [2], - [0], - [5], - [9], - [7], - [7], - [7], - [7], - ] + [[0], [7], [6], [4], [4], [8], [0], [6], [2], [0], [5], [9], [7], [7], [7], [7]] ) From 718aafb9df7f31d800e5122090469e5aed63717e Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Tue, 3 May 2022 12:04:18 +0300 Subject: [PATCH 18/30] Fix messages text --- mlxtend/evaluate/tests/test_time_series.py | 2 +- mlxtend/evaluate/time_series.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mlxtend/evaluate/tests/test_time_series.py b/mlxtend/evaluate/tests/test_time_series.py index c378fab10..1ec260b71 100644 --- a/mlxtend/evaluate/tests/test_time_series.py +++ b/mlxtend/evaluate/tests/test_time_series.py @@ -245,7 +245,7 @@ def test_not_sorted_group_names(X, y, not_sorted_group_names): def test_not_specified_train_size_n_splits(X, y, group_numbers): cv_args = {"test_size": 1} expected_results = None - error_message = "Either train_size or n_splits have to be defined" + error_message = "Either train_size or n_splits should be defined" with pytest.raises(ValueError, match=error_message): check_splits(X, y, group_numbers, cv_args, expected_results) diff --git a/mlxtend/evaluate/time_series.py b/mlxtend/evaluate/time_series.py index 89d8e2d6c..efa7bde35 100644 --- a/mlxtend/evaluate/time_series.py +++ b/mlxtend/evaluate/time_series.py @@ 
-26,8 +26,8 @@ class GroupTimeSeriesSplit: Gap size between train and test datasets. shift_size : int (default=1) Step to shift for the next fold. - window_type : str (default='rolling') - Type of the window. Possible values: 'rolling', 'expanding'. + window_type : str (default="rolling") + Type of the window. Possible values: "rolling", "expanding". Examples ----------- @@ -46,7 +46,7 @@ def __init__( ): if (train_size is None) and (n_splits is None): - raise ValueError("Either train_size or n_splits have to be defined") + raise ValueError("Either train_size or n_splits should be defined") if window_type not in ["rolling", "expanding"]: raise ValueError('Window type can be either "rolling" or "expanding"') From 972b0d5b828221f3361c13c3e2376a3b0f84df8b Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Tue, 3 May 2022 13:49:00 +0300 Subject: [PATCH 19/30] Add documentation draft with navigation section --- docs/mkdocs.yml | 1 + docs/sources/USER_GUIDE_INDEX.md | 1 + .../evaluate/GroupTimeSeriesSplit.ipynb | 1741 +++++++++++++++++ 3 files changed, 1743 insertions(+) create mode 100644 docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index f7263974b..49d0ead04 100755 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -70,6 +70,7 @@ nav: - user_guide/evaluate/create_counterfactual.md - user_guide/evaluate/feature_importance_permutation.md - user_guide/evaluate/ftest.md + - user_guide/evaluate/GroupTimeSeriesSplit.md - user_guide/evaluate/lift_score.md - user_guide/evaluate/mcnemar_table.md - user_guide/evaluate/mcnemar_tables.md diff --git a/docs/sources/USER_GUIDE_INDEX.md b/docs/sources/USER_GUIDE_INDEX.md index 28b17cca6..ef264ffa9 100755 --- a/docs/sources/USER_GUIDE_INDEX.md +++ b/docs/sources/USER_GUIDE_INDEX.md @@ -36,6 +36,7 @@ - [create_counterfactual](user_guide/evaluate/create_counterfactual.md) - [feature_importance_permutation](user_guide/evaluate/feature_importance_permutation.md) - 
[ftest](user_guide/evaluate/ftest.md) +- [GroupTimeSeriesSplit](user_guide/evaluate/GroupTimeSeriesSplit.md) - [lift_score](user_guide/evaluate/lift_score.md) - [mcnemar_table](user_guide/evaluate/mcnemar_table.md) - [mcnemar_tables](user_guide/evaluate/mcnemar_tables.md) diff --git a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb new file mode 100644 index 000000000..f66b430cc --- /dev/null +++ b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb @@ -0,0 +1,1741 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GroupTimeSeriesSplit: A scikit-learn compatible version of the time series validation with groups" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "from IPython.display import display\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.dummy import DummyClassifier\n", + "from sklearn.model_selection import cross_val_score\n", + "\n", + "from matplotlib.patches import Patch\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib.ticker import MaxNLocator\n", + "%matplotlib inline\n", + "\n", + "from mlxtend.evaluate.time_series import GroupTimeSeriesSplit" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Prepare sample data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'2021-01': 0,\n", + " '2021-02': 1,\n", + " '2021-03': 2,\n", + " '2021-04': 3,\n", + " '2021-05': 4,\n", + " '2021-06': 5}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "months_map = 
{f'2021-0{i+1}': i for i in range(6)}\n", + "months_map" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['2021-01',\n", + " '2021-02',\n", + " '2021-02',\n", + " '2021-02',\n", + " '2021-02',\n", + " '2021-03',\n", + " '2021-03',\n", + " '2021-03',\n", + " '2021-04',\n", + " '2021-04',\n", + " '2021-05',\n", + " '2021-05',\n", + " '2021-06',\n", + " '2021-06',\n", + " '2021-06',\n", + " '2021-06']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "months = ['2021-01', '2021-02', '2021-02', '2021-02', '2021-02', '2021-03',\n", + " '2021-03', '2021-03', '2021-04', '2021-04', '2021-05', '2021-05',\n", + " '2021-06', '2021-06', '2021-06', '2021-06']\n", + "months" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "groups = np.fromiter(map(months_map.get, months), dtype='int')\n", + "groups" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Features and targets" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
num_feature
2021-01 0
2021-02 7
2021-02 6
2021-02 4
2021-02 4
2021-03 8
2021-03 0
2021-03 6
2021-04 2
2021-04 0
2021-05 5
2021-05 9
2021-06 7
2021-06 7
2021-06 7
2021-06 7
\n", + "
" + ], + "text/plain": [ + " num_feature\n", + "2021-01 0\n", + "2021-02 7\n", + "2021-02 6\n", + "2021-02 4\n", + "2021-02 4\n", + "2021-03 8\n", + "2021-03 0\n", + "2021-03 6\n", + "2021-04 2\n", + "2021-04 0\n", + "2021-05 5\n", + "2021-05 9\n", + "2021-06 7\n", + "2021-06 7\n", + "2021-06 7\n", + "2021-06 7" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 1\n", + "1 0\n", + "2 1\n", + "3 0\n", + "4 1\n", + "5 0\n", + "6 0\n", + "7 1\n", + "8 1\n", + "9 1\n", + "10 0\n", + "11 1\n", + "12 1\n", + "13 0\n", + "14 0\n", + "15 0\n", + "Name: target, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data = [[0], [7], [6], [4], [4], [8], [0], [6], [2], [0], [5], [9], [7], [7], [7], [7]]\n", + "target = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0]\n", + "\n", + "X = pd.DataFrame(data, index=months, columns=['num_feature'])\n", + "y = pd.Series(target, name='target')\n", + "\n", + "display(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define helper function" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def test_cv(X, y, groups, **cv_args):\n", + " cv = GroupTimeSeriesSplit(**cv_args)\n", + " groups = np.array(groups)\n", + " \n", + " for train_idx, test_idx in cv.split(X, groups=groups):\n", + " print('TRAIN INDICES:', train_idx)\n", + " print('TEST INDICES:', test_idx)\n", + " print('TRAIN LENGTH:', len(train_idx))\n", + " print('TEST LENGTH:', len(test_idx))\n", + " print('TRAIN GROUPS:', groups[train_idx]) \n", + " print('TEST GROUPS:', groups[test_idx])\n", + " print('TRAIN GROUP SIZE:', len(set(groups[train_idx]))) \n", + " print('TEST GROUP SIZE:', len(set(groups[test_idx])))\n", + " print('TRAIN GROUP MONTHS:', X.index[train_idx].values) \n", + " print('TEST GROUP MONTHS:', X.index[test_idx].values)\n", + " print()\n", + "\n", + "def 
plot_cv_indices(cv, X, y, groups, n_splits,\n", + " image_file_path=None):\n", + " \"\"\"Create a sample plot for indices of a cross-validation object.\"\"\"\n", + " \n", + " fig, ax = plt.subplots(figsize=(12, 4))\n", + " cmap_data = plt.cm.tab20\n", + " cmap_cv = plt.cm.coolwarm\n", + " lw = 10\n", + " marker_size = 200\n", + " \n", + " for split_idx, (train_idx, test_idx) in enumerate(cv.split(X=X, y=y, groups=groups)):\n", + " indices = np.array([np.nan] * len(X))\n", + " indices[test_idx] = 1\n", + " indices[train_idx] = 0\n", + " \n", + " ax.scatter(\n", + " range(len(X)),\n", + " [split_idx + 0.5] * len(X),\n", + " c=indices,\n", + " marker=\"_\",\n", + " lw=lw,\n", + " cmap=cmap_cv,\n", + " vmin=-0.4,\n", + " vmax=1.4,\n", + " s=marker_size\n", + " )\n", + " ax.scatter(\n", + " range(len(X)), [split_idx + 1.5] * len(X), \n", + " c=groups, marker=\"_\", lw=lw, \n", + " cmap=cmap_data, s=marker_size\n", + " )\n", + "\n", + " yticklabels = list(range(n_splits)) + ['group']\n", + " ax.set(\n", + " yticks=np.arange(n_splits + 1) + 0.5,\n", + " yticklabels=yticklabels,\n", + " ylabel='CV iteration',\n", + " ylim=[n_splits + 1.2, -0.2],\n", + " xlim=[-0.5, len(indices) - 0.5]\n", + " )\n", + " ax.legend(\n", + " [Patch(color=cmap_cv(0.2)), Patch(color=cmap_cv(0.8))],\n", + " [\"Training set\", \"Testing set\"],\n", + " loc=(1.02, 0.8),\n", + " fontsize=13\n", + " )\n", + " \n", + " ax.set_title('{}\\n{}'.format(type(cv).__name__, cv_args), fontsize=15)\n", + " ax.xaxis.set_major_locator(MaxNLocator(min_n_ticks=len(X), integer=True))\n", + " ax.set_xlabel(xlabel='Sample index', fontsize=13)\n", + " ax.set_ylabel(ylabel='CV iteration', fontsize=13)\n", + " ax.tick_params(axis='both', which='major', labelsize=13)\n", + " ax.tick_params(axis='both', which='minor', labelsize=13)\n", + " \n", + " plt.tight_layout()\n", + " \n", + " if image_file_path:\n", + " plt.savefig(image_file_path, bbox_inches='tight')\n", + " \n", + " plt.show()\n", + " \n", + "def plot_cv(X, y, 
groups, image_file_path=None, **cv_args):\n", + " cv = GroupTimeSeriesSplit(**cv_args)\n", + " cv._n_groups = len(np.unique(groups))\n", + " cv._calculate_split_params()\n", + " n_splits = cv.n_splits\n", + " \n", + " plot_cv_indices(cv, X, y, groups, n_splits,\n", + " image_file_path=image_file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Check train/test split" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "cv_args = {\n", + " 'test_size': 3, \n", + " 'n_splits': 1\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TRAIN INDICES: [0 1 2 3 4 5 6 7]\n", + "TEST INDICES: [ 8 9 10 11 12 13 14 15]\n", + "TRAIN LENGTH: 8\n", + "TEST LENGTH: 8\n", + "TRAIN GROUPS: [0 1 1 1 1 2 2 2]\n", + "TEST GROUPS: [3 3 4 4 5 5 5 5]\n", + "TRAIN GROUP SIZE: 3\n", + "TEST GROUP SIZE: 3\n", + "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03'\n", + " '2021-03']\n", + "TEST GROUP MONTHS: ['2021-04' '2021-04' '2021-05' '2021-05' '2021-06' '2021-06' '2021-06'\n", + " '2021-06']\n", + "\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "test_cv(X, y, groups, **cv_args)\n", + "plot_cv(X, y, groups, **cv_args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check cross-validation split" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### Success cases" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- test_size + train_size \n", + "- test_size + n_splits\n", + "- test_size + n_splits + gap_size\n", + "- test_size + n_splits + shift_size\n", + "- test_size + n_splits + expanding window\n", + "- test_size + train_size + n_splits (full usage of data)\n", + "- test_size + train_size + n_splits (partial usage of data)\n", + "- Groups are not sorted\n", + "- Group names\n", + "- Usage in CV" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**test_size + train_size**" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TRAIN INDICES: [0 1 2 3 4 5 6 7]\n", + "TEST INDICES: [8 9]\n", + "TRAIN LENGTH: 8\n", + "TEST LENGTH: 2\n", + "TRAIN GROUPS: [0 1 1 1 1 2 2 2]\n", + "TEST GROUPS: [3 3]\n", + "TRAIN GROUP SIZE: 3\n", + "TEST GROUP SIZE: 1\n", + "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03'\n", + " '2021-03']\n", + "TEST GROUP MONTHS: ['2021-04' '2021-04']\n", + "\n", + "TRAIN INDICES: [1 2 3 4 5 6 7 8 9]\n", + "TEST INDICES: [10 11]\n", + "TRAIN LENGTH: 9\n", + "TEST LENGTH: 2\n", + "TRAIN GROUPS: [1 1 1 1 2 2 2 3 3]\n", + "TEST GROUPS: [4 4]\n", + "TRAIN GROUP SIZE: 3\n", + "TEST GROUP SIZE: 1\n", + "TRAIN GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03'\n", + " '2021-04' '2021-04']\n", + "TEST GROUP MONTHS: ['2021-05' '2021-05']\n", + "\n", + "TRAIN INDICES: [ 5 6 7 8 9 10 
11]\n", + "TEST INDICES: [12 13 14 15]\n", + "TRAIN LENGTH: 7\n", + "TEST LENGTH: 4\n", + "TRAIN GROUPS: [2 2 2 3 3 4 4]\n", + "TEST GROUPS: [5 5 5 5]\n", + "TRAIN GROUP SIZE: 3\n", + "TEST GROUP SIZE: 1\n", + "TRAIN GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04' '2021-05' '2021-05']\n", + "TEST GROUP MONTHS: ['2021-06' '2021-06' '2021-06' '2021-06']\n", + "\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 1,\n", + " 'train_size': 3\n", + "}\n", + "\n", + "test_cv(X, y, groups, **cv_args)\n", + "plot_cv(X, y, groups, **cv_args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**test_size + n_splits**" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TRAIN INDICES: [0 1 2 3 4]\n", + "TEST INDICES: [5 6 7 8 9]\n", + "TRAIN LENGTH: 5\n", + "TEST LENGTH: 5\n", + "TRAIN GROUPS: [0 1 1 1 1]\n", + "TEST GROUPS: [2 2 2 3 3]\n", + "TRAIN GROUP SIZE: 2\n", + "TEST GROUP SIZE: 2\n", + "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02']\n", + "TEST GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04']\n", + "\n", + "TRAIN INDICES: [1 2 3 4 5 6 7]\n", + "TEST INDICES: [ 8 9 10 11]\n", + "TRAIN LENGTH: 7\n", + "TEST LENGTH: 4\n", + "TRAIN GROUPS: [1 1 1 1 2 2 2]\n", + "TEST GROUPS: [3 3 4 4]\n", + "TRAIN GROUP SIZE: 2\n", + "TEST GROUP SIZE: 2\n", + "TRAIN GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03']\n", + "TEST GROUP MONTHS: ['2021-04' '2021-04' '2021-05' '2021-05']\n", + "\n", + "TRAIN INDICES: [5 6 7 8 9]\n", + "TEST INDICES: [10 11 12 13 14 15]\n", + "TRAIN LENGTH: 5\n", + "TEST LENGTH: 6\n", + "TRAIN GROUPS: [2 2 2 3 3]\n", + "TEST GROUPS: [4 4 5 5 5 5]\n", + "TRAIN GROUP SIZE: 2\n", + "TEST GROUP SIZE: 2\n", + "TRAIN GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04']\n", + "TEST GROUP MONTHS: ['2021-05' '2021-05' '2021-06' '2021-06' '2021-06' '2021-06']\n", + "\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 2,\n", + " 'n_splits': 3\n", + "}\n", + "\n", + "test_cv(X, y, groups, **cv_args)\n", + "plot_cv(X, y, groups, **cv_args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**test_size + n_splits + gap_size**" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TRAIN INDICES: [0 1 2 3 4]\n", + "TEST INDICES: [8 9]\n", + "TRAIN LENGTH: 5\n", + "TEST LENGTH: 2\n", + "TRAIN GROUPS: [0 1 1 1 1]\n", + "TEST GROUPS: [3 3]\n", + "TRAIN GROUP SIZE: 2\n", + "TEST GROUP SIZE: 1\n", + "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02']\n", + "TEST GROUP MONTHS: ['2021-04' '2021-04']\n", + "\n", + "TRAIN INDICES: [1 2 3 4 5 6 7]\n", + "TEST INDICES: [10 11]\n", + "TRAIN LENGTH: 7\n", + "TEST LENGTH: 2\n", + "TRAIN GROUPS: [1 1 1 1 2 2 2]\n", + "TEST GROUPS: [4 4]\n", + "TRAIN GROUP SIZE: 2\n", + "TEST GROUP SIZE: 1\n", + "TRAIN GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03']\n", + "TEST GROUP MONTHS: ['2021-05' '2021-05']\n", + "\n", + "TRAIN INDICES: [5 6 7 8 9]\n", + "TEST INDICES: [12 13 14 15]\n", + "TRAIN LENGTH: 5\n", + "TEST LENGTH: 4\n", + "TRAIN GROUPS: [2 2 2 3 3]\n", + "TEST GROUPS: [5 5 5 5]\n", + "TRAIN GROUP SIZE: 2\n", + "TEST GROUP SIZE: 1\n", + "TRAIN GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04']\n", + "TEST GROUP MONTHS: ['2021-06' '2021-06' '2021-06' '2021-06']\n", + "\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA1gAAAEYCAYAAABBWFftAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAABAP0lEQVR4nO3dd5ycVd3+8c+VSsqmkEIKJKErBJJA8BGQ34NgA6kixggqiFQpgkQQUakKwoMIggoIiIDSREAUpAsiSEJICCVATEJJI5CyyYYkm/3+/jj3wDDM9tmd3c31fr3mtTt3Ofc1Mzu7891z7nMrIjAzMzMzM7Pm61TuAGZmZmZmZh2FCywzMzMzM7MScYFlZmZmZmZWIi6wzMzMzMzMSsQFlpmZmZmZWYm4wDIzMzMzMysRF1hm1uFJOkDSPyS9I2mNpLck3S7pC+XOliNplKRowC233fGtmK2XpHMlzZS0StJCSY9JOqKEx5gj6eJStZfX7r6S/iVpqaTlkl6Q9BtJvRvZzmHZ8947u597HfbJ2+b7knYv7SMwM7P2pku5A5iZtSRJvwBOBG4Afg28A4wEvgr8XdIWETGrjBFz5gM7593fDLgJ+A7wbJHtZrdeNO4AxgHnATOAwcD/A/YGfleiYxxIem1KRtJE4Gbgt6TsAWwPfBPoB6xoRvO51+HlvGXfB34FPNqMds3MrJ2TLzRsZh2VpP2BvwCHR8T1RdbvC0yJiHlF1vWIiFUtHrIWkkYDzwOfjohHy5hjS+AV4CsRcVvBOkUz/4i05PMs6V/A0oj4YpF1jcou6TDgOqAiIooWZpIWA7+KiLOaltjMzDoCDxE0s47su8AzxYorgIi4J1dcZcO9TpF0qaS3ScUNkgZK+n02vLBK0qOSxue3U2zInqSzsg/cufu5IWY7SXo8G2r3iqQDG/ugCo+XZbpd0uGSZktaIekPkrpL+oSk/2TLHpU0oqCtDST9XNIbklZLmiZp77xN+mVfFxR5/j5UoEgaIelPkt7Nnqv7JW2dtz43rO4QSTdIWgrck637yBBBSbtlQxGrsuf/akkVeev7SbpG0jxJ70l6XdLVBdk/krswe95r/8ss+1JJl0vqVmzfgseyTy4/MAD4Sd5wzt1r29/MzDouF1hm1iFJ6kIawvWPRuw2CRgKfJ00rBBSD9jngVOBCaTfm49I2qKJ0W4B7gK+RCribpM0polt5fskaejbCaShal8BLgeuBn4JHEoadnhVwX63A4cBPwX2BZ4B7pY0Nls/E1gJXCrpc5I2KHZwSRsCTwBbA8dkx+8FPCipR8HmFwOVwMHZcYu1tyvwIKlA+jKpWN6b1IuUcwnwKeBk0mt0BmkYYM6zwERJx0saVuw4eb4HbAwcQhpOeBRwfj375DsQWEYaMrlzdnu2zj3MzKxD8jlYZtZRDQC6A2/kL5QkoHPeonV5vRnzI2JC3rZfAHYFdo+Ix7JlDwNzSMXY0U3IdU1EXJy1dT/wIvAD0jlhzdEb2D8ilmVt7w4cCfxvRPwzWzYMuEJSz4iokrQn8EXyHh/wD0lbAT8EDo6I5ZKOJBVq9wNrJT0F/CF7LLnn7mRSQTU2It7Njvcv0nP1LeCKvKxPRcR36nk8FwBPFrwebwEPSRodETOATwBXRMQtefvdmPf9GcB2pELzckmzSQXzzyOisGerMnu8NaRz87oDP5T0s9zjqUtETJVUDbwZEU/Vt72ZmXVc7sEys46u8Dyb7wFr8275H/T/VrDtJ4BFecUHEbES+Cup56Qp7sxrq4bUm/WJJraVb3KuuMq8Bqwh9SrlLwPI9eZ8htRD9C9JXXI34CHg/WGQEfFH0sQg3wL+BGxF6gm7Oa/tzwAPAMvz2qkEpuS3lbm3rgciqSepB+jWglxPkF6zHbNNnwMmSTouKwo/JCLeyLb9DPB/wLukQnC6pI0LNr8rez1y/gz0AEbXldXMzKyQCywz66jeAVaThn3l+wOwU3YrtLDg/lBgUS3bbdj
EXIXtLcqO01xLC+6vASoLioY12dfcML+BwBA+XHCuBc4CNslvLCLeiYjrIuIb2brrgK/mDW8cSBpCWdjWpwvb4qPPc6H+pF7GKwvaWg10zWvveFKP1I+BmZJelfShnsCIWBcRD0XEqRExnjSUcENSoZ2v2OsCpXltzMxsPeIhgmbWIUVEtaR/A58jfQDPLV9I9gE/jRb88G4F9+eTpiQvtBGpNyRnNVA4IUL/WqIN5sPTkQ/OjlMO7wJvAQc0ZqeIWKs0/f3hwMeAaVlbdwPnFtmlsrCJeg6xNNvmLD7aqwgwL8uxlHSu3ImStiede3aTpOkR8WIt2f8haVqWO1/h65y7X67XxszM2in3YJlZR3Yp8D+Svt7E/Z8GBkv6f7kF2fC1L/LhoXdvAh/P26YTsGctbR5YsN3+wH+amK+5HiL1YK2IiMmFtyxjRZFJKgC2zL4uzGtrW+CFIm3NbEyobBjmU8DWxXIVm1Y/IqaTzovrRFY8SfpIcZxN0rExH+1F2z97PXK+BKwiXferodbwQe+gmZmtp9yDZWYdVkTcJelS4HpJnyZNCb6YNAHG57LNar3YbETcL+lJ4BZJp5N6nk4lnZtzUd6mdwLfkTQV+C/wbaBPLc1+W9Ia0gf3bwNbABOb9gib7QHSxBUPSLoQeIGUeyywQUT8gDQr4N2SrgWeBKqy9T8knQOVKzQvIc1U+LCky0k9YxsB/ws8kZ3H1RjfJ01oUUOa6bASGEEqbn8YEa9IeoL03M8g9XgdSZrxMFew3i/pZdLr/gapmDye1Lv424LjVZBmdLyaVCj+iDSBRr0TXOR5GfiipPtIP1czI6Kw987MzDo4F1hm1qFFxMmS/gkcR5pCuwJ4G/g3sHdE/L2eJg4gTZBwKal34j/AHhHxWt42Z5OGlJ1H6sX4FalYKTZT3leBX2TbvgFMiIipTXlszRURIelLpNn2vksqYN4lFU6XZ5vNAq4hnbt0LKm4fB24FrgwIqqzthZL+iRpavNfkK5BNZ9UgE1vQrYnsp7Ds0nnzXUG5gL38UHv079JU8yPAtYBU4G9IuLNbP3PSc/3haTX523S1OmfiojCXsP/I01j/0dSL9jvsuelMSaRZku8F+hJOv/s0Ua2YWZm7ZwacSF7MzNrIkmHkSaGqIiIWnvNrPVJCuCEiPhVubOYmVn753OwzMzMzMzMSsQFlpmZmZmZWYl4iKCZmZmZmVmJuAfLzMzMzMysRFxgmZmZmZmZlYgLLLN2TtJYSU9JWinpuoJ1IWn37Ptuks6SNLYFMrRI25IelXR7idvcPZs1rrH7DZJ0maT/SFojaU4zMhzWnP1bUuFznr2ui/Pub5Ut69fIdpv0mCV9WdKTkt6R9J6kmZLOlNStCW2dJenRxu7XVki6XtLkErc5Kvs9MaqR+3WTdJGkxyWtqu09Jem72fvllWwmTTOzDs8Flln7dx4wHDgk+7423YCfkC4SW2ot1fZxwA9K3GZTDQcmAAtI14laX+SugZWzFem17tdKxx8APEy6KPNepOtv/ZB0YeP1zbmk6361BT1Jr0kV6QLUtfkT6XWbDVwpaYNWyGZmVla+0LBZ+7cV8NeI+Eu5g5RaRLxY7gx5pkfERgCSLga+XOY8rSK7aO+b9W7Ycsf/bcGiRyT1Ab4j6YRYj2ZqiohZ5c6QExFLJW2YXaz6eGCPWrZbACyQ1AP4HOli1q+0YlQzs1bnHiyz9q8bsLIB21VmX6/LhgS9PyxI0gaSfi7pDUmrJU2TtHf+zpL2kzQlG4q4RNLTkv63vrbrImlbSfdJejdr9yVJ38lbXzhcLWq5HZa3zW6SHpNUlQ0ru1pSRQOenzpFRE1z26hNNnwuJG0n6YHsuXhZ0pca0UZXSRdLej17DedJujM3lC7vGDvlDet6RdKB9bT7/hDBbLjpPdmq2Vl7c7J1/SRdkx33vSzH1U14OhriHdLPfbNJOljSq9nz8YikcUV+pr4h6Yns53RJtt34gnaulzR
Z0gHZa/dets82jciysaRbJS3K8sySdG7hMfLuz6nl/XBW3jajJd0rqTK73SZpSFOfr3yNLG5zF9cuyetmZtaWucAy68AiQhHxaHY39x/m84Cds9v8bNntpKFHPwX2BZ4B7lZ2TpWkzbNtHs7WHwL8FdiwAW3X5R5gHXAosB9wOVBXMbRzwe1yIIDXspy7Ag+ShvF9GfgusDfw/rlpEfFoRCi/0eyD6vUNyFsSEXF9RIwqsupm4G7gQOBV4E+SNm5gsz8gvS4/Aj5LeuzLgM4F290C3AV8CXgeuE3SmAYe41ng1Oz7L5Feg1yBdgnwKeBk0pDCM0ivDVD8MRcWA3WR1FlST0mfAk4Eft3Y3quIOCsids9rczxpCNuz2eO4m/T8FBoF3AAcDHwNeAN4XNJmBduNJD0P52bb9QXuV8OHxd0AbAIcRRpWdz7QvY7tD+TD74czsuWvZI9vC+BfwAak99hhwLbAPZIEEBFzst8Tc3KNZv/YeLSBmc3MrICHCJq1f/1oWA/WM9nXWRHxVG6hpD2BLwK7R8Rj2eJ/SNqKdK7LwcA4oDIiJuW197f62q6LpIHApsD+EfF8tvihuvYpyL0jcCRwdkQ8kS2+AHgyIibkbfcW8JCk0RExo5amq0mFXrn9IiKuBZA0BVgI7AP8pgH7fgK4OSJ+n7fs1iLbXRMRF2fHuB94kVScfbW+A0TEckkzs7tT8z+UZ8e/IiLyC5Qb62lyHdDQnsGVfFBs3ABMqmPbhjoNeAn4alas3SepK3Bh/kYRcU7ue0mdgAdIj/dQ4Jy8TQeSfp6fzLadAswiFTYNfQ0nRkSul/DRujaOiKl5uUYCpwDXR8TN2eKfkP7ZsFdErMm2mw68TPrHw721NN0S74Wq7GvfFmjbzKxNcQ+WWTslqZekb5E+sExpRlOfIX0I+5ekLrkbqdjJDYN6Hugr6feSPiepV7PCJ++SegJ+I2mCpMEN3VHSIODPpN6qc7JlPUn/xb+14HE8AawFdqytvYjYIiKOaPpDKZl/5L6JiHeARUBDe7CeAw6T9H1J2+d6KIq4M+8YNaTerE80Le5Hjj9J0nFZcV6viOiSX7zUYxdgN+B7wP7Ar5qU8sN2Au4p6Am7u3AjSR/PhlsuJBUfa4GtSec/5luUK64AImIu6b3Z0Of3OeBn2XDOEQ19EErnN90JzAWOzVv1mWx5Td77YTYwhw/e2x8REXtGxJ4NPX4DvUQqko+WNKCOn08zs3bPBZZZO6R0ftMK4HfA5RHxkQ+FjTAQGEL60Jh/O4s0XImImEn6ULsZqedqsaSbs0KnSbIP958jFXfXkk6Ef1zSuLr2yz4k3gqsAQ7N+3DcnzQc7sqCx7Ea6Jp7LG3c0oL7a0jDuxriPOAK0syL04A3JJ1UZLtFRe4PbUTG2hwP/AX4MTAzO6+p3l6xhoqIZyPiiYi4hDRE8Nhs6GpzDAHeLlj2oftK5+/9g/TzcwqpyNuJ9BwXvjaFz21uWUOf3wnAZOAXwFxJz2U9zPW5Kst3UES8l7d8IKmXrvC9vRmt/H6IiErSsMkvA4uBb7bm8c3MWpOHCJq1T/NIH/S+Ahwn6aaIeLqJbb0LvAUcUNdGEXEvcK+kvqQhhZeSzoFq8ofoiHgZOCgblrUbaWjWvZI2rmNSiYtJH3A/GRHL8pYvJZ3zcxYfHr6YM6+pOduD7IP1j4EfS9oSOAa4VNLMiLgvb9PBpEki8u835Hy5+o6/lFT4nChpe+D7wE2SprfAbJDPZl83JQ3Ba6oFQOE/CQrv70zqRfxs9vMKQPY+KFSsF3Yw8EJDwkTEW6ReyE6kXq+zSOdCjsh6ND9C0neBicAXsh6zfO+SerCuKbLr4iLLWkz2Hv8VqYA8B5ha9x5mZu2Xe7DM2qGIWJOdd3QKqYfmcw3YbU32tfC/7g+R/pO/IiImF96KHHtZdo7HnUBuhrTa2m6QiFgbEQ+TJggYSi3XWJL0deAk4IjC86kiYiXwFLB1sccRER2
6wMoXEa+SJqNYzQevUc77swZmH+T3B/7TiObrfa0jYjrpHKlOwMca0XZD7Zp9nd3Mdp4B9i0YrrZfwTY9sq+rcwsk7UKa+KLQ4GxdbrsRwA407vklImqy8w3PJl1vamSx7SR9GrgIOCMiHiyyyUOkSS2mFHk/zGlMphIYTeo1OzsiHqytYDQz6wjcg2XWjkVEtdIU2n0asO0aSbOBr0iaAbwHTCedsH8/8ICkC0n/be9DumjwBhHxA0lHk/6Tfx+pJ2hL0uQXN9TVdu7E+mKyXo6LSbO2/Zc0xO80YFpEvFtk+81JQ6H+Tho+9cm81bMi4m1Sr8lDkmpIsx5Wkq6780XghxFR9Po7kl4DHqvvPCxJuWtfbQX0zLv/WHZ8stkId69llsAWI+lO0vk+U4FVpKFYXYB/Fmz6bUlrgBmkC8VuQeoBaajcJBdHS/oTUBURz0t6glR0zyD1JB5JOuem1uJCUjVwTl3nYUm6j3Su3Quk8592JZ2HdUvkXRcqN+td/iyBDXAh8DRptsbrgI9nueGDyTeeIg3HvVrSz0m9WWeRen0LLQZulHQm6TU4mzRE8Pr6gmQ9YveT3lOvkCb0+B6pl+2lWra/lfR8/7Pg/fBmpOuXnUV6/u+VdG2Wbzhplsnr82YYLWz7IUjnYtWTeS+gF9kFxvPeD88U6U3L/Y4qNozSzKxDcYFl1v4F0NATxo8hFTUPkj7AbRoRc5Sut3QGaWrvEaShRc+RhgBCKsT2I/UwbUgaUnY1aUharW2TTqavzQLSLHk/BIaRhvg9QiqyitmE1GuyV3bLdzjpA+MTkv4f6YPtH0jnZM0lFYYL68jShY9OZ17MbbXc/zQfzPjWk/J8iHySdA5PrufoRdI5OYW9kF8lneNzHmmSkQn5s9HVJyLmSjqVNBzwBNJFiEcB/ybNljeKVAhNJc1eV9dFijtT/0iKZ/LarSYV4z/go7Py9aTun7ePiIjJkiaSLk+wP2n42rGkfzosz7ZZKOlg0s/2XaTp848hFfOF5mZtXUDqdZoMfK3gvKjavEeaTOYk0s96Fam4+1xErCqyfX/SOVYDSc99vrOBsyLilazwOo/0z4kepMLwIbJLG9SiIe8FgF/z4d613PvhcD5aVOZ+R603F4Y2s/WXonGXETGzNkbSLOBvEXFCubMYSJoL/LhguvSyU7pw7nVARUSsqGfzdkVSd1JB9Lm8Sw00ta1DScX5ZhHR4CGIWc/l6IiodXa+9Zmkz5P+0fHx/HPZzMw6IvdgmbV/bwKfkjQMWFDH5BDWwiQNJ81Y+MdyZ1nPjAeeb0pxJenXpB6rJaTzpc4E7m1McWW1y85vqyD1gNfQwSebMZsyZUq3Tp06Hdu5c+fDI6IvDR9hYu1DSFq2bt2662pqan694447Fj0VwgWWWft3CXATaejP70nDqcoum0Ch1uFfEVHdinFaRTYL3LBSt5t9SK1r2FbN+lxYR8S/qOO6TvUYQJrafwBpdsVbKD78r1nqez8A66JjDik5iTQktQb4ZUQsL3MesxbVpUuXq/v06bPrsGHDVnbr1u0dX/KtY4kI1qxZ03XevHknLF++fAdqueSEhwiadQCSNgA2B1aWYXaworIhU3Vd62bTtpK1rZO0O+n8tNqcHRFntUoYaxJJZwE/qWOTT9c26UR7ll1AfAjwRkQsKXces5Y2bdq0Wdttt93STp06+QN2B1ZTU6Pnn3++35gxY4pej9E9WGYdQHYSfYOutdOKziJd96Y2HirUcFNI1/6qjZ/Ltu8q4K91rJ9Zx7p2KyIW4ZkDbf3S2cVVx5e9xrWOLHEPVhEDBw6MUaNGlTuGmZmZmZXRlClTFkdE4QXIazVt2rQ5Y8aMadULeVt5TJs2beCYMWNGFVvnHqwiRo0axeTJH7m+qpmZmZmtR7KZYc0axQWWmZmZmVkL+ukf14ypWt3yn7t7dqf6jIndprX0cQr9+te/3vDSSy8dMnPmzBdbYvv2pr4LPJqZmZmZWTO
0RnHVmOP07NlzXO7WpUuXHbp06bJD/rLGHvfYY499tzHFUmO3b00zZ87sJmnHWbNmdW1qG+7BMjMzMzNbj1RVVU3NfT9hwoSR1dXVuuOOO+YU23b16tXq3r27J21oBPdgmZmZmZkZAJ/4xCe2/ta3vrXJZz7zmc179+497uyzz95o1qxZXXfbbbct+/fvP6aiomLsjjvuuPXjjz/eM7fPZZddNmDEiBGj89s48sgjN/785z+/ea9evcZtsskmo2+88cZ+Td2+pqaG008/fchGG220fd++fcceccQRm+y8885bnXLKKUWvPblq1SpNnDhx5IYbbjimd+/e40aOHDn62muv7Z9bf9999/Xecccdt+7bt+/YTTbZZPRPfvKTjWpq0uUkx48fvw3A6NGjR/fs2XPcpEmThjb2OXSBZWZmZmZm77v11lsHnnjiiYuWL18+9Qc/+MGimpoaHXPMMYtef/315+fPnz9t++23r5owYcLmq1evrvVKyrfffvuAU089dcHy5cunfvvb31507LHHjqqsrKy19qhr+yuvvHLA1VdfvdGf//znVxctWjRt6NCha5555pnetbV1xRVXDHjuued6vfjiizNWrFgx9aGHHpo5ZsyYVQBTpkzZ4KCDDtry5JNPXvjOO+88d/fdd792zTXXDL7yyisHAEyePPlFgBkzZsyoqqqaetFFF81v7PPnAsvMzMzMzN631157Ldlvv/0qO3XqREVFRc2WW2655pBDDllWUVFR07t377jkkkvemj9/frcZM2Z0r62NffbZZ8lnP/vZlZ07d+bkk09evGLFis5N3f7mm28e8PWvf/3tXXfddVX37t3jnHPOWTh48OC1tbXVrVu3qKqq6vTcc8/1WLt2LVtsscXaHXfc8T2AX/7yl4P33nvvJYceeujSLl26MG7cuPe+/e1vL7r55psHNOc5y+dzsMzMzMzM7H0jR45cnX9//vz5XY477riNn3rqqYrKysoukgJgwYIFtdYSQ4cOfb8A6tOnTw3A8uXLa704b13bL1iwoOvIkSPX5NZ36tSJoUOHrvloK8mxxx77zsKFC7ueeuqpm8ydO7f7zjvvXHnJJZe8OXr06NWvv/56t6eeeqpPRUVFv9z2EaEhQ4bU2l5juQfLzMzMzMze16nTh0uEk08+efjChQu7PvXUUy+vWLFi6uuvvz4dUmHSGnmGDBmydu7cud1y92tqapg/f3632rbv2rUr559//oIZM2a8NHv27Od79OhRc9hhh40C2HjjjdccfPDBiysrK5/L3VasWDH1tddeewE++tibwgWWmZmZmZnVqrKysnOPHj1qBg0aVL1s2bJOJ5xwwsatefyJEye+c+ONNw568skne6xevVpnn332RosWLap1GvW777674vHHH++5evVq9erVq6ZXr141nTt3DoCTTjpp0T333LPhzTff3Hf16tVau3YtU6ZM2eDee+/tDaknrVOnTrz44osbNDWvCywzMzMzsxbUszvV7fk4P/3pT9965513ug4YMGDstttuu+0uu+yyonPnWkf7ldx3vvOddw4//PBFBxxwwJaDBw8e8+abb3YbM2bMyu7du9cU237+/PldDzvssE379+8/dsiQIWPeeOONbtdcc81cgJ122um922+//dXLL798oyFDhmw/cODAsYcddtimuYKtd+/eMWnSpLcOP/zwzSoqKsaedtppQxqbVxGe1r7Q+PHjY/LkyeWOYWZmZmZlJGlKRIxv6PbTpk2bM2bMmMUtmclg3bp1DB06dPtzzjnnzWOOOebdcmSYNm3awDFjxowqts49WGZmZmZm1qZdddVV/auqqlRZWdnplFNOGfbee+91Ouigg5aVO1cxLrDMzMzMzKxN++1vfzt4o402GjNs2LDtH3/88T5//vOfXx00aNC6cucqxtO0m5mZmZlZmzZlypSZ5c7QUO7BMjMzMzMzKxEXWGZmZmZmZiXiAsvMzMzMzKxEXGCZmZmZmZmViAs
sMzMzMzOzEvEsgmZmZmZmLWjpJaeOiVUrW/xzt3r0qu53ysXTWvo4jbHFFltse9ppp8078sgjl5Q7S2txgWVmZmZm1oJao7hqzHF69uw5Lvf9mjVrBNCtW7fILauqqpra2GPPnDmz28c+9rHtXnvttembb7752tzy11577YXGttVaDjrooFFdunSJW265ZW4p23WBZWZmZma2HskvoCZMmDCyurpad9xxx5wyRupQfA6WmZmZmZkB8Oqrr3b7whe+sNnAgQPHDBo0aPuJEyeOXLJkSSeAmpoaTjjhhOGDBw/evlevXuOGDx++3fnnnz8YYPz48dsAjB49enTPnj3HTZo0aSjA8OHDt7vyyis3BPjrX/9a0aVLlx2vvvrq/ptsssnoioqKsXvvvfdmufYBpk+f3n2nnXbaunfv3uO23nrrbc4999zBknasLe9f/vKXio9//OPb9O7de1z//v3H7LLLLlvl1lVWVnY66qijNh4+fPh2ffv2HbvbbrttOWPGjO4AZ5555kZ33XXXhnfccceAnj17juvZs+e46urqkjyHLrDMzMzMzIyqqirtueeeW33sYx97b86cOdNffPHFF+bNm9f1qKOOGgHwl7/8pc+tt9464Mknn3xp5cqVU//zn/+8tPvuu1cCTJ48+UWAGTNmzKiqqpp60UUXzS92jHXr1vHAAw/0mTFjxosvvvjijBdeeKHnBRdcMBhg7dq17Lfffltuu+22VQsWLHjuzjvvfO2GG24YVFfmo446atNjjjlm4fLly6fOmzdv+hlnnPH+cQ855JCRr7766gZPPfXUSwsXLpw2fvz4lfvuu+8Wq1ev1nnnnbdw//33f/eggw56p6qqampVVdXULl1KM7jPQwRL5Nw/Nq3i/dHE1nkJmpoP2n5G50vaej7r+JZf8t0m79vnlEtLlqMuTc3YWvnMzMrplltu6RcRXHrppfMAevfuve68886bt+eee36surp6Trdu3WLNmjV67rnnegwbNqx6+PDh1cOHD2/0B5BLL730rb59+9b07du35gtf+MLSZ599thfAww8/3HvevHndfvWrX73Zu3fv2GabbdYcd9xxC0855ZSRtbXVtWvXmDVrVvc333yzy4gRI6r32WefSoD58+d3ueeeezZ85ZVXnt9kk02qAS6++OJ511xzzeBHH3201+c///kVTXuW6tfhe7AkdZZ0kaS3JVVKukPSwHLnMjMzMzNrS/773/92mz9/freKioqxudvee++9lSTeeOONrvvss0/lD3/4w7d+9rOfDR08ePDYXXfddct//vOfPRtzjM6dOzNs2LD3i7JevXrVrFy5sjPA66+/3nXDDTes7t279/sTbmy66aar62rv9ttvf23WrFkbjBkzZtvNN99823POOWcwwCuvvNINYIcddtgm91j69es3trq6WnPmzOnWmMyNtT786/p0YH/gf4B3gGuBPwB7lTOUmZmZmVlbMnLkyDWjRo1aXdfMf6eeeuriU089dXFlZWWnSZMmDZswYcLm8+fPf75Tp+b324wYMWLtkiVLuqxYsUK5Iqu+YmjnnXdede+99/63pqaGf/zjH70POOCArcaOHbtqxx13XAUwc+bMGfkFXb5SZC7abou02rYcBVwYEf+NiGXA94EvSKq1q9HMzMzMbH0zYcKEZWvXrtXpp58+ZMmSJZ1qamqYPXt21xtuuKEfwCOPPNLzvvvu671q1Sr16NGjpqKiYl2uSBk6dOjaTp068eKLL27Q1OPvscceK4YOHbrmxBNP3Liqqkovv/xytyuvvHKj2rZ/7733dPnllw+YP39+l06dOjFgwIBqSdG5c+cYPnx49b777vvuEUccMWL27NldARYvXtz5hhtu6Lds2bJOABtttNHauXPndl+3bl1TIxfVoQssSf2AEcCU3LKImAUsB8aUKZaZmZmZrUfUo1dppqdr4eNUVFTUPPTQQzNfeumlHltvvfXoPn36jNtjjz22mjp1ak+Aysr
KzqeccsomAwcOHNu/f/+xDz/8cJ8bb7xxFkDv3r1j0qRJbx1++OGbVVRUjD3ttNOGNPb4Xbt25c4773xt+vTpPQcNGjR2//3332LChAnvdO3aNWrb5/bbb++/zTbbbNuzZ89xBx544JaTJk2a98UvfnEFwE033TR3yy23fG/33XffulevXuNGjx697W233dZfEgDHHXfc21VVVZ369+8/tqKiYmypZhHs6EMEK7KvywqWLwX65C+QdBSpt4sRI0a0eDAzMzMzWz/0O+XiaeXOUJvCi+xuscUWa++6667Zxbbdb7/9Kvfbb7+XamvrggsuWHDBBRcsyF/21ltvPZ/7fp999qmsrq6ekr/+kksumZd/f9y4ce9Nnjx5Zu7+RRddNHDYsGFrih1vgw02iMcee+y12vJUVFTUXHbZZfMuu+yyecXWb7PNNmumT5/+cm37N1WH7sECKrOvfQuW9yP1Yr0vIq6KiPERMX7QoDpngzQzMzMzsxZw//33937hhRe619TU8PTTT/e47LLLhhx44IHvljtXY3ToHqyIWCrpdWAH4DkASZuReq+mlzGamZmZmZkVmDNnTrfDDjts06VLl3bp379/9T777LPk/PPPL3pNrbaqQxdYmauA0yQ9QppF8ELg/oiYU9ZUZmZmZmb2IUcfffS7Rx99dLvqsSq0PhRYFwD9gWeA7sADwKFlTWRmZmZmZh2SImqdlGO9NX78+Jg8eXK5Y5iZmZlZGUmaEhHjG7r9tGnT/rvddtst6dSpkz9gd2A1NTV6/vnn+48ZM2azYus7+iQXZmZmZmatQtKCVatWNfk6UNY+rFq1agNJC2pb7wLLzMzMzKwEqqurz54zZ063lStX9qipqVG581hp1dTUaOXKlT3mzJnTrbq6+uzatlsfzsEyMzMzM2txO+yww/3PPvvs8bNmzfpJRAzBnRkdTY2kBdXV1WfvsMMO99e2kQssMzMzM7MSyT541/rh2zo+V9VmZmZmZmYl4gLLzMzMzMysRFxgmZmZmZmZlYgLLDMzMzMzsxJxgWVmZmZmZlYiLrDMzMzMzMxKxAWWmZmZmZlZibjAMjMzMzMzKxEXWGZmZmZmZiXiAsvMzMzMzKxEXGCZmZmZmZmViAssMzMzMzOzEnGBZWZmZmZmViIusMzMzMzMzErEBZaZmZmZmVmJuMAyMzMzMzMrERdYZmZmZmZmJeICy8zMzMzMrERcYJmZmZmZmZVIl4ZsJEnAV4HxQEX+uog4qgVymZmZmZmZtTsN7cH6NfArYBOga8HNzMzMzMzMaGAPFnAw8ImImNWSYczMzMzMzNqzhvZgVQGvt2QQMzMzMzOz9q6hBdbPgR9n52KZmZmZmZlZEQ0dIngiMBI4QdKi/BURsVXJU5mZmZmZmbVDDS2wzmvRFGZmZmZmZh1AgwqsiPh9SwcxMzMzMzNr7xrag4WknYBvkaZqfwO4NiKeaalgZmZmZmZm7U2DJrmQdADwT6AvMBXoAzwm6cCWi2ZmZmZmZta+NLQH6yfAQRHxt9wCSXsBFwB3tkQwK61z/1jd5H1/NLHBHZ3N0tSMzpe09XzQ9jM6n5Xb8ku+26T9+pxyaUlz1Kat5zMzawsaOk37KOC+gmX3k2YWNDMzMzMzMxpeYM0FPlOwbE/a+MWHJX1V0uOSlktqeheOmZmZmZlZAzR03Mi5wF2Sbgdmk3q0DgK+2UK5SmUJcCXQA7iqzFnMzMzMzKyDa+g07XdImk8qqMaTZhH8bEQ82ZLhmisi7geQtHt5k5iZmZmZ2fqgwWc+Z8VUmy6omkPSUcBRACNGjChzGjMzMzMza49qLbAk/U9EPJ19v0tt27X1XqyGioiryIYRjh8/Psocx8zMzMzM2qG6erAeBCqy75+oZZsAOpc0kZmZmZmZWTtVa4EVERV53zd0tkEzMzMzM7P1VoMKJ0lfr2X5IaWNU1qSOkvaAOiW3d8gu6nM0czMzMzMrANSRP2nG0laHhF9iix/NyI
2bJFkJSDpMOC6Iqs2jYg5te03fvz4mDx5ckvFMjMzM7N2QNKUiBhf7hzWvjR06N9HenwkjQLa9MV7I+L6iFCR25xyZzMzMzMzs46nzmnaJa0lm8hC0pqC1Z2BX7dUMDMzMzMzs/amvutgfYbUe/U3YK+85TXAgoh4taWCmZmZmZmZtTd1FlgR8RiApM0jYn7rRDIzMzMzM2uf6uvBAiAi5ksaAOwEDCLvnKyIuKGFspmZmZmZmbUrDSqwJH0GuANYA/QDlmZfZwMusMzMzMzMzGj4LIIXAOdExCBgRfb1XOA3LZbMzMzMzMysnWlogbUlcGn2fW544IXAd0ucx8zMzMzMrN1qaIFVBXTPvn9H0gigG9C/RVKZmZmZmZm1Qw0tsJ4EDsi+/ztwN/Ag8O8WyGRmZmZmZtYuNWiSC+BQPijGTgW+B1QAl7REKDMzMzMzs/ao3gJLUhfgYlJRRUS8B5zfwrnMzMzMzMzanXqHCEZENfBVYHXLxzEzMzMzM2u/GnoO1t3AQS0ZxMzMzMzMrL1r6DlYXYEbJR0DzAFqcisi4qgWyGVmZmZmZtbuNLTAWgv8Mfu+c3YzMzMzMzOzPA0qsCLi8JYOYmZmZmZm1t419BwsJPWV9DVJ38/uD5E0rOWimZmZmZmZtS8NKrAk7Qi8BpwO/ChbvD1weQvlMjMzMzMza3ca2oN1KfD9iNgeqM6WPQl8siVCmZmZmZmZtUcNLbC2Ba7Pvg+AiFgB9GqBTGZmZmZmZu1SQwust4ER+QskbQG8VfJEZmZmZmZm7VRDC6zfA3+S9ClA2TlZ1wBXt1gyMzMzMzOzdqah18G6kDQc8G9Ab+AR4Jd4kgszMzMzM7P3NfQ6WOuAHwI/lDQwIha3bCwzMzMzM7P2p6HTtN+X+z6/uJJ0b0uEMjMzMzMza48aeg7WLrUs9zTtZmZmZmZmmTqHCEr6Wm47SRMB5a3eEljSUsHMzMzMzMzam/rOwTo/+9od+Gne8hpgAXBCS4QyMzMzMzNrj+ossCJiUwBJd0fEfq0TyczMzMzMrH1q0DlYLq7MzMzMzMzqV2sPlqTLIuLE7PuratsuIo5qiWBmZmZmZmbtTV1DBLvW8r2ZmZmZmZkVUWuBFRHH5n1/eOvEMbOO7Nw/Vjdpvx9NbNA10ZvN+Zqnqfmg7WdsrXzWPMsv+W6T9+1zyqUly1GXpmZ0vqSt5zODhl8Hy8zMzMzMzOrRoQssSRdKekHScknzJF0tacNy5zIzMzMzs46pQxdYwDrgUGAAMAbYGLi+nIHMzMzMzKzj6tCDyiPijLy7b0v6JXBrufKYmZmZmVnHVmcPlqS7Je0tSa0VqIXtCUwrtkLSUZImS5r89ttvt3IsMzMzMzPrCOobIvgOqcdnjqQfSRrWCplahKSDgGOAk4qtj4irImJ8RIwfNGhQ64YzMzMzM7MOoc4CK5uefRjwc+BLpELrLkl7tadeLUkHA1cD+0XEs+XOY2ZmZmZmHVO9k1xExPKIuCIixgG7AYtJvVqzJZ3Z0gGbS9LhwG+BfSPikXLnMTMzMzOzjqtRswhGxNMRcQSp0FoHnN0iqUpE0onAxcDnI+Jf5c5jZmZmZmYdmyKiYRtKXYADgaOATwP/An4bETe3XLzmkRRANbA6f3lE9K5rv/Hjx8fkyZNbMpqZmZmZtXGSpkTE+HLnsPal3mnaJW0FHAl8I9v+98AJEfFyC2drtohoN+eJmZmZmZlZ+1dngSXpMeBTpN6q7wG3RcTquvYxMzMzMzNbX9XXgzUVOCYiXmqNMGZmZmZmZu1ZfZNcXABsV2yFpK9IGlz6SGZmZmZmZu1TfQXW94Eta1m3GXBaaeOYmZmZmZm1X/UVWHsD19Sy7jpgn9LGMTMzMzMza7/qK7CGRMTCYiuy5UNKH8nMzMzMzKx9qq/AWiNpaLEV2fK1pY9kZmZmZmbWPtVXYP0LOKGWdd8BHi9tHDMzMzMzs/arvmnazwcelzQI+CPwFjAcmAg
cQrpGlpmZmZmZmVFPgRURkyXtB1wBHAEEIOA1YL+IeLblI5qZmZmZmbUP9fVgEREPAFtJ2hIYBLwdEa+2eDIzMzMzM7N2pt4CKycrqlxYmZmZmZmZ1aK+SS7MzMzMzMysgVxgmZmZmZmZlYgLLDMzMzMzsxJxgWVmZmZmZlYiLrDMzMzMzMxKxAWWmZmZmZlZibjAMjMzMzMzKxEXWGZmZmZmZiXiAsvMzMzMzKxEXGCZmZmZmZmViAssMzMzMzOzEnGBZWZmZmZmViIusMzMzMzMzErEBZaZmZmZmVmJuMAyMzMzMzMrERdYZmZmZmZmJeICy8zMzMzMrERcYJmZmZmZmZWICywzMzMzM7MScYFlZmZmZmZWIl3KHaCjGHX6vU3ab84FXyxxkuK2+/12Td73+W8+X8IktWtqRudL2no+gCuOebhJ+33nN3uUOElxztc8b57+eJP33fiC3UqYpHZNzdha+R56ePMm7bfnHrNKnKS4tp7vulfnNXnfw7ccVsIktWtqRudL2no+M3APlpmZmZmZWcm4wDIzMzMzMyuRNllgSepa7gxmZmZmZmaN1WoFlqQhku6RtEzSK5KOkBSSRkm6XtJN2dd3gcuyfY6VNDPb5ylJu+W1d5akBwuO8aikM7Pvd5dULembkuZKejdrv3drPWYzMzMzM1u/tGYP1k3AGmAT4FPA1wvWHwz8HRgEfE/SROBc4BvAAOBq4D5JIxtxzM7AvsD2wMeBrYBLim0o6ShJkyVNfvvttxtxCDMzMzMzs6RVCixJGwN7AJMiYnlELCIVT/meiIhbImJdRFQBhwO/jYinI6I6In4HTAe+1sjDnxYRyyJiIfBj4BuSPvK4I+KqiBgfEeMHDRrU6MdoZmZmZmbWWj1Yw7Ovr+ctm1uwzZyC+5sAswuWzcqWN0b+ceYA3YGBjWzDzMzMzMysXq1VYL2VfR2Rt2xEwTY1BfffAEYVLNssWw5QCfQqWF/sIgf5QwpHAauBxbVHNTMzMzMza5pWKbAi4k3gUeACSRWSBgFn1rPb9cDRkj4hqYukw4GxwM3Z+inADpJ2zNYfD2xapJ2fSeojaTBwFvCHiCgs5szMzMzMzJpNEdE6B5KGkiaq2A1YCPwCuJLU6/QzoDoivl2wz/HAicBGwEzg+xHxaN76nwG5fX6Ttf1gRJwnaXfgQeAI4BygD3AP8J2IqKwr6/jx42Py5MnNeLRmZmZm1t5JmhIR48udw9qXViuwPnJg6fPAXUCPaIEQuQIrIro0dl8XWGZmZmbmAsuaotHFR1NJGks6z+p50lC+84BbWqK4MjMzMzMzK4fWvA5Wf+DPwArgCdKU6ye14vHNzMzMzMxaVKv1YEXEI8AWrXi8R2nFx2dmZmZmZtaaPVhmZmZmZmYdmgssMzMzMzOzEnGBZWZmZmZmViIusMzMzMzMzErEBZaZmZmZmVmJuMAyMzMzMzMrERdYZmZmZmZmJeICy8zMzMzMrERcYJmZmZmZmZWICywzMzMzM7MScYFlZmZmZmZWIi6wzMzMzMzMSsQFlpmZmZmZWYm4wDIzMzMzMysRF1hmZmZmZmYl4gLLzMzMzMysRFxgmZmZmZmZlYgLLDMzMzMzsxJxgWVmZmZmZlYiLrDMzMzMzMxKxAWWmZmZmZlZiSgiyp2hzZH0NjC3hE0OBBaXsL1Sa+v5oO1ndL7ma+sZna952no+aPsZna/52npG52uelsg3MiIGlbhN6+BcYLUCSZMjYny5c9SmreeDtp/R+ZqvrWd0vuZp6/mg7Wd0vuZr6xmdr3naej5bf3iIoJmZmZmZWYm4wDIzMzMzMysRF1it46pyB6hHW88HbT+j8zVfW8/ofM3T1vNB28/ofM3X1jM6X/O09Xy2nvA5WGZmZmZmZiXiHiwzMzMzM7MScYFlZmZmZmZWIi6wzMzMzMzMSsQFVguS1FnSRZLellQp6Q5JA8udK0fSVyU9Lmm5pOpy5ykk6UJJL2T55km6WtK
G5c6VT9L5kmZnGRdJul3SiHLnKiSpk6QnJYWkjcudB0DS9ZLWSlqRdzuu3LmKkfQZSU9lGRdLurLcmQCy90f+87cqe413KHe2HElDJN2S/R5cIulhSWPKnStH0gBJv5e0QNIySTdL6l/GPHX+Xpb0hex1XyVphqTPtZV8koZLukvS3Ozn8NDWzNbAjHtnP4OLs5/HxyXt1oby7SbpWUnvZj+Pz0r6UlvJV7DdsdnrfGZr5suOXddzuHuWK/9345OtndHWby6wWtbpwP7A/wC5D7V/KF+cj1gCXAl8t8w5arMOOBQYAIwhPYfXlzNQEX8AxkZEH2AU8Drwp7ImKu5koKrcIYr4fUT0zru1icIln6TdgduBi0k/ixsD15Qx0vsiYtv85w+4BHgxIp4td7Y8VwIbAlsBGwGTgb9KUllTfeAGoDewJbAp6TUu5+/pWn8vS9oM+DPwM6Bv9vVOSaPaQj6gBvgH8DXgzVbMVKiujP2By4EtgEHAzcDfJW3SaunqzjcTOJD0c9gv2+ZGSR9vpWzQgM8GkkYC3wOeb6VMherLuK7gb8surRfNDLqUO0AHdxRwTkT8F0DS94HXJI2MiLnljQYRcT+8/wGyzYmIM/Luvi3pl8Ct5cpTTES8nHdXpA8YW5cpTlGStgKOAw4CppY5Tnv0M+A3EXF73rK2VMAAIKkL8C1S3rZkC+BXEbEEQNLvgEmkD5CLyxlMUi9gL2BcRFRmy34KPCppRES83tqZ6vm9/E1gSkTcmN2/SdIx2fKzy50vIuYDV2Tr17VGnmLqyXhTwaJfS/oJsBPwRouHo958i3LfS+pE+pvSifQ+eqnc+fL8DvghcGwrRPqItv75xcw9WC1EUj9gBDAltywiZgHLSb0x1nh7AtPKHaKQpK9JWgasAE4Czipvog9kf6CvBU4FlpY3TVEHZUNhXlEaTtu73IHyZR/APwF0yYbqLJb0qKTx5c5WxAGkXo0bypyj0EWk13mQpA1I/3h6IiLKWlxllHfLyf1dHNvqaeo3hry/KZln8d+UJpO0HTCQ8vXEFCVpKbAaeBx4mtQz2CZIOhpYGRG3lDtLHTpLeiMb+ntvWxqWbOsHF1gtpyL7uqxg+VKgT+tGaf8kHQQcQypg2pSIuDki+gJDScVVW/pDfRKwICLuLHeQIi4HPkb6cHMg8L/A1WVN9FH9Sb8nJwKHAcNIH3T+lv0TpS05GrglIpaWO0iBfwGdgUWkf0J8CTiyrIkyEbECeBQ4S1I/SYOAXM95W/w9XYH/ppSMpMHAHcDFEfFqufPki4h+pKGrBwJ/A9rEedJK5xifSRoV0Va9TPoHyaakvzHTgYclDStnKFu/uMBqOZXZ174Fy/uRerGsgSQdTPrgvV8bO7fkQyJiASnnX9UGJuOQtAVpjPzx5c5STERMiYiFEVETES+QzhP7sqTu5c6WJ/c+vi4ipkfEGtIQvK5AmxnTL2lzUg/vb8qdJV/Wg/og8Arpd2FP4HzgcUkblTNbnkNJPQUvAf8B7sqWt4UetkKV+G9KSWQfth8h/cPkB2WOU1RErI6Iv5D++fTtMsfJuQY4LyLeKneQ2kTEgoiYFhHVEbE0In4AvEsaDmzWKlxgtZDsv8ivA+/P5pWdoNyH9N8UawBJhwO/BfaNiEfKnacBugC9SD0d5fYp0kncMyQt5oPzhqarbc7WV5N9bSuTHxARy4A5QBSuKrKsnI4GpkXE0+UOUmBD0n+RL4+I5RGxJiKuIf3t2bm80ZKIeCsiJkTE0IjYFJgNvAc8VeZoxUwj729KZhxtcOh0W5ZNCvI48PeIOD4i2tJ7uZgupElY2oLPAj/NhksvBnYFfiDp8TLnqk8Nbehvi3V8LrBa1lXAaZI2ldQHuBC4PyLmlDdWojSN/AZAt+z+BtmtTfwSknQiaea2z0fEv8qdp5DS1OfHZ8NMUJr+/ArSB/KX69q3ldwKbE4aKjEW2Dtb/jnawHk62TS7/bLvtwT+D7g7It4ra7C
PuhI4XNI22UQSk0g9Hm1i2l9J3UjDF9tU7xVAdp7VK8BxknpJ6iLpW6Shbm3iH02Stpa0YfZ+3gm4FLigXEMt6/m9fAMwXtJESV0lTQR2BH7fRvK9f5/0YbZrdr9VJ9SqK6OkjwFPAH+MiFNbM1cD8x0kabvsvbKBpCOBPYD720I+YBPSOX9js9tk0t+9L7dWvvoyStpD0hbZe7q3pLNIM5i22nNoRkT41kI30nkHF5OGmlSSptcdWO5cefkO44P/xOffRpU7W5YvgLWk8zbev5U7V16+TqSx8YuAlcBbwE3A5uXOVkveUdlzunG5s2R5HiUN21hJ6jW4BOhT7lxFcgo4B1hAOt/lEdLU/GXPluX7KmmIWO9yZ6kl38eBv2a/B5eRJmnYv9y58vIdCcwnXcbgVeCkMuep8/cy8AXgBWBV9vVzbSxfsXVntZWMwHXZ9ysKboe0kXzHZz+HK7Lfj/8GDm4rz1+RbR8FzmzNfA14Dk8G5mZ/WxYB9wE7tXZG39bvmyLaes+4mZmZmZlZ++AhgmZmZmZmZiXiAsvMzMzMzKxEXGCZmZmZmZmViAssMzMzMzOzEnGBZWZmZmZmViIusMzMzMzMzErEBZaZ2XpM0lmSHmzG/mdIuqeZGV6TdFhz2jAzM2srXGCZmbUSSZtJuk3SAkkrJL0h6U5J3cqdraki4qcRsW+5c5iZmbUVLrDMzFrP34D5wNZABbAzcD+gcoYyMzOz0nGBZWbWCiQNIBVWv4mIZZG8GRG/iYjV2TZjJD0mabGkJZL+LmnzvDaul/QHSddKWirpLUkTJY2V9IykSkmPSBqWt88cST+W9ETWazZZ0k515Owp6WJJsyW9K+k+SVvUsf2HhhhmxztD0kPZ8WZI2iVvfVdJl0halPXknVakzd2yvO9KmiXpe5KUrfuDpH9I6pTd313ScknbNfS1MDMza0kusMzMWkFEvAO8AFwj6RuStskVDfmbAWcBw4FRwArgxoJtvgzcAWwInAtcDZwDHAhslLVxdsE+xwAnZfvcDvxNUp9aol4NfAz4JDAEeBr4q6SujXi43wJOBPoCDwC/z1t3OrAPsAuwafY4R+ZWStqG1NN3ETAI+CJwPPD1vMeyMXCmpI2APwInRcTzjchnZmbWYlxgmZm1nt2BR4HvAs8BCyX9KFdoRcT0iHgkIlZHxDJSofRJST3z2ng4Iu6NiBrgBqAX8IesN6yKVECNLzju7yJiSkSsAS4EVpGKnA+RNBD4GnBcRCzMtj8bGAr8TyMe528j4oWIWAdcA2whqW+27hvAhRHxWkSsAk4lFYU5xwG3RcRdEbEuIl4GfpXtR0SsBA7O9nsAuC8irmtENjMzsxbVpdwBzMzWFxGxGDgDOCMrmr5C6jF6C7g2Gw54EamYqeCDwmMQMDf7fn5ee1VZbfb+MqAq2zffnLx9QtLrpF6gQptmX6cXdK51BTZp0IMsyAiszL5WAMuy4+bnWSlpUUGGPSR9KW9ZJ+CNvH1ekPQIsC9wQCNymZmZtTj3YJmZlUFEVEXE9cB0YGy2+DdAJbB9RPQBds2WN3cSjFG5b7LeshHAm0W2yxVxW0ZEv7xbz4j4YzMz5LxVkKcXqYDMz3BtwfH7RMS2efscShrCeDtpyKX/lpmZWZvhP0pmZq1AUn9JP5M0OpvooYukg4DRwOPZZn1IPT5Ls+F655To8N+StEN2HtUkoCdwb+FGEbEIuBm4UtLwLHc/SQdK6l2iLH8AJknaXFIP4Od8+G/RlcBXJe2b9zxtI+l/szwfB64ADgG+CQwknbdmZmbWJrjAMjNrHWuAwcCfgXeBt4EzgRMj4rZsm5OB3YDlpKLrryU69lXAZcASYALwxewcr2KOBGYCj0qqBJ4nnfMUtWzfWD8jTU3/FDAbeJ0Pes6IiBmk88O+SxpquAi4HhiUDau8DfhFRDyYncP1FeAkSZ8tUT4zM7NmUUSp/maamVlbI2kOcGZEFM5GaGZmZi3APVhmZmZ
mZmYl4gLLzMzMzMysRDxE0MzMzMzMrETcg2VmZmZmZlYiLrDMzMzMzMxKxAWWmZmZmZlZibjAMjMzMzMzKxEXWGZmZmZmZiXy/wE95yDwdcnyJAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 1,\n", + " 'n_splits': 3,\n", + " 'gap_size': 1\n", + "}\n", + "\n", + "test_cv(X, y, groups, **cv_args)\n", + "plot_cv(X, y, groups, **cv_args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**test_size + n_splits + shift_size**" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TRAIN INDICES: [0 1 2 3 4 5 6 7]\n", + "TEST INDICES: [8 9]\n", + "TRAIN LENGTH: 8\n", + "TEST LENGTH: 2\n", + "TRAIN GROUPS: [0 1 1 1 1 2 2 2]\n", + "TEST GROUPS: [3 3]\n", + "TRAIN GROUP SIZE: 3\n", + "TEST GROUP SIZE: 1\n", + "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03'\n", + " '2021-03']\n", + "TEST GROUP MONTHS: ['2021-04' '2021-04']\n", + "\n", + "TRAIN INDICES: [ 5 6 7 8 9 10 11]\n", + "TEST INDICES: [12 13 14 15]\n", + "TRAIN LENGTH: 7\n", + "TEST LENGTH: 4\n", + "TRAIN GROUPS: [2 2 2 3 3 4 4]\n", + "TEST GROUPS: [5 5 5 5]\n", + "TRAIN GROUP SIZE: 3\n", + "TEST GROUP SIZE: 1\n", + "TRAIN GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04' '2021-05' '2021-05']\n", + "TEST GROUP MONTHS: ['2021-06' '2021-06' '2021-06' '2021-06']\n", + "\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 1,\n", + " 'n_splits': 2,\n", + " 'shift_size': 2\n", + "}\n", + "\n", + "test_cv(X, y, groups, **cv_args)\n", + "plot_cv(X, y, groups, **cv_args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**test_size + n_splits + expanding window**" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TRAIN INDICES: [0]\n", + "TEST INDICES: [1 2 3 4 5 6 7 8 9]\n", + "TRAIN LENGTH: 1\n", + "TEST LENGTH: 9\n", + "TRAIN GROUPS: [0]\n", + "TEST GROUPS: [1 1 1 1 2 2 2 3 3]\n", + "TRAIN GROUP SIZE: 1\n", + "TEST GROUP SIZE: 3\n", + "TRAIN GROUP MONTHS: ['2021-01']\n", + "TEST GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03'\n", + " '2021-04' '2021-04']\n", + "\n", + "TRAIN INDICES: [0 1 2 3 4]\n", + "TEST INDICES: [ 5 6 7 8 9 10 11]\n", + "TRAIN LENGTH: 5\n", + "TEST LENGTH: 7\n", + "TRAIN GROUPS: [0 1 1 1 1]\n", + "TEST GROUPS: [2 2 2 3 3 4 4]\n", + "TRAIN GROUP SIZE: 2\n", + "TEST GROUP SIZE: 3\n", + "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02']\n", + "TEST GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04' '2021-05' '2021-05']\n", + "\n", + "TRAIN INDICES: [0 1 2 3 4 5 6 7]\n", + "TEST INDICES: [ 8 9 10 11 12 13 14 15]\n", + "TRAIN LENGTH: 8\n", + "TEST LENGTH: 8\n", + "TRAIN GROUPS: [0 1 1 1 1 2 2 2]\n", + "TEST GROUPS: [3 3 4 4 5 5 5 5]\n", + "TRAIN GROUP SIZE: 3\n", + "TEST GROUP SIZE: 3\n", + "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03'\n", + " '2021-03']\n", + "TEST GROUP MONTHS: ['2021-04' '2021-04' '2021-05' '2021-05' '2021-06' '2021-06' '2021-06'\n", + " '2021-06']\n", + "\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 3,\n", + " 'n_splits': 3,\n", + " 'window_type': 'expanding'\n", + "}\n", + "\n", + "test_cv(X, y, groups, **cv_args)\n", + "plot_cv(X, y, groups, **cv_args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**test_size + train_size + n_splits (full usage of data)**" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TRAIN INDICES: [0 1 2 3 4]\n", + "TEST INDICES: [ 5 6 7 8 9 10 11]\n", + "TRAIN LENGTH: 5\n", + "TEST LENGTH: 7\n", + "TRAIN GROUPS: [0 1 1 1 1]\n", + "TEST GROUPS: [2 2 2 3 3 4 4]\n", + "TRAIN GROUP SIZE: 2\n", + "TEST GROUP SIZE: 3\n", + "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02']\n", + "TEST GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04' '2021-05' '2021-05']\n", + "\n", + "TRAIN INDICES: [1 2 3 4 5 6 7]\n", + "TEST INDICES: [ 8 9 10 11 12 13 14 15]\n", + "TRAIN LENGTH: 7\n", + "TEST LENGTH: 8\n", + "TRAIN GROUPS: [1 1 1 1 2 2 2]\n", + "TEST GROUPS: [3 3 4 4 5 5 5 5]\n", + "TRAIN GROUP SIZE: 2\n", + "TEST GROUP SIZE: 3\n", + "TRAIN GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03']\n", + "TEST GROUP MONTHS: ['2021-04' '2021-04' '2021-05' '2021-05' '2021-06' '2021-06' '2021-06'\n", + " '2021-06']\n", + "\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 3,\n", + " 'train_size': 2,\n", + " 'n_splits': 2\n", + "}\n", + "\n", + "test_cv(X, y, groups, **cv_args)\n", + "plot_cv(X, y, groups, **cv_args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**test_size + train_size + n_splits (partial usage of data)**" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TRAIN INDICES: [1 2 3 4 5 6 7]\n", + "TEST INDICES: [ 8 9 10 11]\n", + "TRAIN LENGTH: 7\n", + "TEST LENGTH: 4\n", + "TRAIN GROUPS: [1 1 1 1 2 2 2]\n", + "TEST GROUPS: [3 3 4 4]\n", + "TRAIN GROUP SIZE: 2\n", + "TEST GROUP SIZE: 2\n", + "TRAIN GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03']\n", + "TEST GROUP MONTHS: ['2021-04' '2021-04' '2021-05' '2021-05']\n", + "\n", + "TRAIN INDICES: [5 6 7 8 9]\n", + "TEST INDICES: [10 11 12 13 14 15]\n", + "TRAIN LENGTH: 5\n", + "TEST LENGTH: 6\n", + "TRAIN GROUPS: [2 2 2 3 3]\n", + "TEST GROUPS: [4 4 5 5 5 5]\n", + "TRAIN GROUP SIZE: 2\n", + "TEST GROUP SIZE: 2\n", + "TRAIN GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04']\n", + "TEST GROUP MONTHS: ['2021-05' '2021-05' '2021-06' '2021-06' '2021-06' '2021-06']\n", + "\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA1gAAAEYCAYAAABBWFftAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAA+oklEQVR4nO3dd5xU5d3+8c9FlbIUKYJKsaCJooCgiRqfGDXGXuJjkJgCdo0lGolGTexRoz9jYjRGjb08GkussUuiMZiACCKKQgRFmojAwiKw7Pf3xzmj47hldpnZ2V2u9+s1r5k59ZozW+Y7933uo4jAzMzMzMzM1l2rUgcwMzMzMzNrKVxgmZmZmZmZFYgLLDMzMzMzswJxgWVmZmZmZlYgLrDMzMzMzMwKxAWWmZmZmZlZgbjAMrMWT9Ihkp6R9LGk1ZI+lPSApH1KnS1D0kBJkccts9zJjZitk6SLJU2XtFLSAkl/l3R0AfcxS9JVhdpe1nYPlPRPSUskLZP0pqQbJHWu53ZGp8e9c/o88z4ckLXMzyXtXthXYGZmzU2bUgcwMysmSb8FTgXuAP4IfAwMAI4A/iZpy4iYWcKIGfOAnbOebw7cDfwEeK2a5d5rvGg8CAwDLgGmAr2B/wH2A/5coH0cSvLeFIykUcA9wJ9IsgewPfBjoBuwfB02n3kf3s6a9nPgD8C4ddiumZk1c/KFhs2spZJ0MPBXYExE3FbN/AOBiRExt5p5HSJiZdFD1kDSYOAN4FsRMa6EOQYB7wDfi4i/5MxTrOM/kWIeZ0n/BJZExP7VzKtXdkmjgVuBsoiotjCTtAj4Q0Rc0LDEZmbWEriLoJm1ZD8F/lNdcQUQEY9liqu0u9cZkq6R9BFJcYOknpJuT7sXVkgaJ2lE9naq67In6YL0A3fmeaaL2Y6SXkq72r0j6dD6vqjc/aWZHpA0RtJ7kpZLulNSe0k7Sfp3Om2cpP4529pA0m8kfSBplaTJkvbLWqRbej+/muP3hQJFUn9J/ydpcXqsnpa0ddb8TLe6IyXdIWkJ8Fg670tdBCXtlnZFrEiP/02SyrLmd5N0s6S5kj6V9L6km3Kyfyl3bvas9/53afYlkq6V1K66dXNeywGZ/EAP4Pys7py717S+mZm1XC6wzKxFktSGpAvXM/VYbSzQF/ghSbdCSFrAvgOcCYwk+bv5oqQtGxjtPuAR4LskRdxfJA1p4LayfZ2k69spJF3VvgdcC9wE/A74AUm3wxtz1nsAGA38GjgQ+A/wqKSh6fzpwArgGkl7S9qgup1L2hB4GdgaOCHdfyfgOUkdcha/CigHDk/3W932dgWeIymQ/pekWN6PpBUp42rgG8DpJO/ROSTdADNeA0ZJOlnSxtXtJ8vPgE2BI0m6Ex4HXFrHOtkOBZaSdJncOb29VusaZmbWIvkcLDNrqXoA7YEPsidKEtA6a9LarNaMeRExMmvZfYBdgd0j4u/ptBeAWSTF2PENyHVzRFyVbutpYBrwC5JzwtZFZ+DgiFiabnt34FjgmxHxj3TaxsB1kjpGRIWkPYH9yXp9wDOStgLOBQ6PiGWSjiUp1J4G1kgaD9yZvpbMsTudpKAaGhGL0/39k+RYHQVcl5V1fET8pI7XcznwSs778SHwvKTBETEV2Am4LiLuy1rvrqzH5wDbkRSa10p6j6Rg/k1E5LZslaevt4rk3Lz2wLmSLsu8ntpExCRJlcCciBhf1/JmZtZyuQXLzFq63PNsfgasybplf9B/MmfZnYCFWcUHEbECeJyk5aQhHs7aVhVJa9ZODdxWtgmZ4io1A1hN0qqUPQ0g05qzF0kL0T8ltcncgOeBz7pBRsS9JAODHAX8H7AVSUvYPVnb3gt4FliWtZ1yYGL2tlJP1PZCJHUkaQG6PyfXyyTv2fB00deBsZJOSovCL4iID9Jl9wL+H7CYpBCcImnTnMUfSd+PjIeADsDg2rKamZnlcoFlZi3Vx8Aqkm5f2e4EdkxvuRbkPO8LLKxhuQ0
bmCt3ewvT/ayrJTnPVwPlOUXD6vQ+082vJ9CHLxaca4ALgH7ZG4uIjyPi1oj4UTrvVuCIrO6NPUm6UOZu61u52+LLxzlXd5JWxutztrUKaJu1vZNJWqR+BUyX9K6kL7QERsTaiHg+Is6MiBEkXQk3JCm0s1X3vkBh3hszM1uPuIugmbVIEVEp6V/A3iQfwDPTF5B+wE96C35xtZzn80iGJM+1EUlrSMYqIHdAhO41ROvNF4cj753upxQWAx8Ch9RnpYhYo2T4+zHAV4DJ6bYeBS6uZpXy3E3UsYsl6TIX8OVWRYC5aY4lJOfKnSppe5Jzz+6WNCUiptWQ/RlJk9Pc2XLf58zzUr03ZmbWTLkFy8xasmuAr0n6YQPXfxXoLel/MhPS7mv788Wud3OAr2Yt0wrYs4ZtHpqz3MHAvxuYb109T9KCtTwiJuTe0oxl1QxSATAovV+Qta1tgTer2db0+oRKu2GOB7auLld1w+pHxBSS8+JakRZPkr5UHKeDdGzKl1vRDk7fj4zvAitJrvuVr9V83jpoZmbrKbdgmVmLFRGPSLoGuE3St0iGBF9EMgDG3uliNV5sNiKelvQKcJ+ks0lans4kOTfnyqxFHwZ+ImkS8F/gGKBLDZs9RtJqkg/uxwBbAqMa9grX2bMkA1c8K+kK4E2S3EOBDSLiFySjAj4q6RbgFaAinX8uyTlQmULzapKRCl+QdC1Jy9hGwDeBl9PzuOrj5yQDWlSRjHRYDvQnKW7PjYh3JL1McuynkrR4HUsy4mGmYH1a0tsk7/sHJMXkySSti3/K2V8ZyYiON5EUir8kGUCjzgEusrwN7C/pKZKfq+kRkdt6Z2ZmLZwLLDNr0SLidEn/AE4iGUK7DPgI+BewX0T8rY5NHEIyQMI1JK0T/wb2iIgZWctcSNKl7BKSVow/kBQr1Y2UdwTw23TZD4CRETGpIa9tXUVESPouyWh7PyUpYBaTFE7XpovNBG4mOXfpRJLi8n3gFuCKiKhMt7VI0tdJhjb/Lck1qOaRFGBTGpDt5bTl8EKS8+ZaA7OBp/i89elfJEPMDwTWApOAfSNiTjr/NyTH+wqS9+cjkqHTvxERua2G/49kGPt7SVrB/pwel/oYSzJa4hNAR5Lzz8bVcxtmZtbMqR4XsjczswaSNJpkYIiyiKix1cwan6QATomIP5Q6i5mZNX8+B8vMzMzMzKxAXGCZmZmZmZkViLsImpmZmZmZFYhbsMzMzMzMzArEBZaZmZmZmVmBuMAya4YkDZU0XtIKSbfmzAtJu6eP20m6QNLQImQoyrYljZP0QIG3uXs6Ulx919tL0n2SZkuqkDRV0smSWq9rBklbpcevW323Vcd+Zkm6qpDbzGOft0maUOBtDkx/lgfWc70dJd0qaUb6nk2XdH56geFGydAY0p+dRVnPd0+zDk6fN+j3s9jHXdIhktakv1Nn12cfZmbNha+DZdY8XQJsAhwJvFHLcu2A84FZJNc2KqRibfskYE0Bt7cujiO5ntF5JNes+gbJ9ZI2A362jtveiuT43QYsWcdtZTuU5ILIjelikutjNQUjgS1Irn31LrA9Sb7tgcNKmKvYXgN2JrluGRT3d786+R73vwPfBo4GLpN0b0TMboR8ZmaNxgWWWfO0FfB4RPy11EEKLSKmlTpDlpMiYlHW83GSOgKnSzonIlY1RghJG0TEp/ksW4qLFkfEzLqXajSXV/OefQr8SdKAlvphPiKWAeNLGCGv4x4Rn6TzFgE/ALYmuYC0mVmL4S6CZs1TO2BFHsuVp/e3pt1+Puv6I2kDSb+R9IGkVZImS9ove2VJB0mamHZF/ETSq5K+Wde2ayNpW0lPSVqcbvctST/Jmv+FLoJZ2869jc5aZjdJf0+7Jn0s6SZJZXkcn1rlfGDMmARsAGzY0O2mXTgfS5++l76eWem80enzndJjsRIYm867XNIbkpZLmiPpbkl9crb9hS6Cme57kr4taUp6zF+WtG098m4q6X5JCyW
tlDRT0sW5+8jJUN17dkHWMoMlPSGpPL39Jfe1NEQt7xnAxuuy7azuc9+T9CdJS9P34UJJef8/lXS0pGnpsVyU/uxum7OP70u6Mz02CyWdX8c2v9BFkNp/93+hpCvfp5IWpL+P63TsG3DcMxfbbrcu+zUza4rcgmXWwkSEsp7uAbxA0qXwiXTavPT+AWAnkm5EM4HvAY9KGhERr0vaIl3mdyQf8DcAhvN5YVHbtmvzGPAWybfXq0i+we5Sy/I75zz/PnAyMANA0q7Ac8Bfgf8FegCXA93T50TEOCD7uJAWNOMiYnQemXPzLAEW1melnAyvAWcCVwHfJTluua1h9wLXAxfyeRfC3sCvgblAL5Juii9IGhwRVbXsvj9wJXApsDLd732Stov8rtVxB0kXwOPSLJsDX6ll+UOB9lnPv5XmfgdA0pbAP4EJJD8HbUi6kz0maadIzOLL79k4gIjYPY/M2XYGqvi8+1xeqsuQ+g3wIMnP157Ar4A3gfvr2qak/wFuSNf5F8nP/s5A15xFrwQeT/fxP8D5khZFxHV5xq/291PSj4BzgLPSzD3SZTtB9a+5sY+7mVlz5wLLrHnqRn4tWP9J72dGxGfdhyTtCewP7B4Rf08nPyNpK+Bc4HBgGFAeEWOztvdkXduujaSeJOcvHRwRmXPHnq9tnZzcw4FjgQsj4uV08uXAKxExMmu5D4Hn08Jjag2brgTW5pM7a7vbACcCV0REvdbNFhHLJE1Pn05KP9Tm+n1E/C5nvaOysrQm+YA+h+TcsH/UsssNgV0j4t103VbAwyTF7dt5RN4JGBURmVa3cbUtnN1NUdIA4Azgtoi4J518PjAf2DciVqfLTUmz7MfnBUGueh/ztGXmPODOiKhXUVyLf0RE5hy8ZyXtQ1Io11lgkRzLKRFxWda0R6tZ7s2IOD59/LSk3sA5kv5YRzGdUdPv/k7AMxFxfdayD9WxrWIc94r0PrewNDNr9txF0KwZkdRJ0lEkH0omrsOm9iL5gPtPSW0yN5JiZ0S6zBtAV0m3S9pbUqd1Cp9YTDJYxA2SRqYfGvMiqRfJB8HngIvSaR1JviW/P+d1vEwyUMbwmrYXEVtGxNH12H93klaLKSStMcX2pSJD0r6SXpG0lKRAnJPO2qqObc3KFFepzHlum+aZ5XWSAQlGS+qf5zpI6kBSyM0mKUwz9kqnV2W9Z++RDMgwInc7GRGxZ0TsWY/9tyMpepYDp+e7Xh6eyXk+jfody2GSfivpf9KM1Xk45/lDJF3t8t1PbfvfL+3WuJPyGBGzSMf9I+B9YLSkjevTxdLMrKnzHzSzZiI9f2I58Gfg2oio7lvvfPUE+pAUIdm3C4B+ABExHTiYpDvYk8AiSfekhU6DpN+8701S3N0CzJf0kqRhta2XfgC/H1gN/CCrW1t3oDVJV7rs17EKaJt5LetKyVDTj5B0ezso0+pSZAtyMuxI0tIxB/ghSWH59XR2XUOQL8l5nsmf79DlI0m68/0WmC3p9bQVtC43krwHh+UM0tGTpIta7s/f5hTuPRNJ18Ztgf3SwRUKZUnO89XkeSwj4jlgDEm3v3Ekv1fXVfMFRm6rT+Z533ol/bJbSLoIfg94FVgg6ZJ8Cq185Hvc09/h7wE7AB+SdJk0M2sR3EXQrPmYC+xG8qHkJEl3R8SrDdzWYpIPNYfUtlBEPAE8IakrSZfCa4BrgSMauF8i4m3gMEltSV7PFek+Nq2l69NVwI7A1yNiadb0JUCQFIZPfnk15jY0Z0b6wfMeYBuSbnYL6lilUHLPjTqU5Fv/kZkCM+1+V/wgER+StDS0IunidgHJ+Xr9I6LaIeEl/RQYBexTzch9i0laaG6uZtXqBktoiGtIviD4dvoz12RExO3A7emXFd8lKVzLgezrQuW27mae53OeY237rkr391tJ/Ugu9XApSeF+w7psO3UN+R/335F82fIjPh8Qw8ys2XOBZdZMpK0mL0saT3INmb1JvoGuTU0tFc+TDJC
wPJ8Pn2lRc4+SEQQzg07UtxUkd5trSAZouJqkgOlG8sH7CyT9EDgNOCL3fKqIWJEej60j4qKG5MjD9cA+wF5pq16h1Pf4dQDW5AxKcWQB89Qp/XA+XtKFwCvAAKq55pakb5EM0nBO2mKT63mSFo6JeQ6yUS+SfkEyEMr3ss7Va3Ii4iOSYcy/S1LAZzsU+GPW88xgKHPIT50/XxHxAXC5pDHV7L/e6nPc0y63XwOOSb/IMTNrMVxgmTUzEVGp5BoytY28l1l2taT3gO9Jmgp8SnIO0bPA0yQn6F9BMppYF2AosEFE/ELS8STF1FMkLUGDSAa/uKO2bdfWfU7S9qQj2AH/JenidxYwOSKqK662IOlm9jeSrmlfz5o9M/2A+nOSAS2qSEY9LCcZNW9/4NyIeKeGLDOAv9d2Hpakc0hGzruM5Hyh7P1PS689hJLhx8/PGcGxLpli7XhJ/wdUZA38UZ1ngZ9KuoZkJMZdSEbgK6q09fJpkvf9HZJukj8jaXl4q4bl7wemAv/IOWZzImIOSQvYv0laLm8habXahOQCtLelIy5Wl+V5SM4JqiXv90nOkbsN+LCGnxmUDPN/K7BZDYOMFEVanG5I2j2QZDCZb/LF1iuAbSX9ieS8v/8h+VLltDwHuKjtd/9aki8yxgNLSUZ4HETye1hT5oId9yyZv1+FGnjEzKzJcIFl1jwF1Q8fXZ0TSIqa50g+HG8WEbPSb83PAX5KUpAsJjkB/tp0vSnAQcDVJB8I5wE38cVzJb60bZKBCmoyn+TconNJTthfArxIzR/u+pF8A79vess2huTD+MtKhr6+ELiT5Jys2SSFYW3d+dqky9Zm7/T+F+kt27f4fDS9jiTd9/IWEbMlnQmcCpxC0jIxsJbln5R0VrrssSQjCB5AOvR5EX1KMuDJaSTvRwXJh/O9I2JlNct3JznHqmeaMduFwAUR8U76AfwSkgK6A0mX1edJh9+vQT7nCWXes9HpLdsYkgIAkvdsNV8+n6rY/kMy8MMRQBnJz+oFJN3lsv2c5P19kOQ9uBj4Qz33Vd3v579Ifn6OJ/ndmgEcG7VftLyQxz0j8/er4C2YZmalpiL0zjCzIpM0E3gyIk4pdRYDSX8HXoiIC0udxfIj6XagKiLGlDpLtnQwm/eAAyPi8RLHKRpJmUsE7BsRT5U6j5lZIbkFy6x5mgN8Q9LGwPx8uw1Z4aUjHA4mvaixNRs7A4eVOsT6Jh1lsCPJOWaQ/zllZs3CxIkT27Vq1erE1q1bj4mIruTf28Sah5C0dO3atbdWVVX9cfjw4dWeFuECy6x5uhq4m6Rb1e18uUtOSaSjzNV4+YeIqGzEOI0ifU09Sp2joep6z4C1xRiIotQioq5rhzVIWnDXstuGX6C6hTiYz6/x9WAtFwI3a5batGlzU5cuXXbdeOONV7Rr1+7j5DsFaykigtWrV7edO3fuKcuWLdsB+HF1y/k6WGbNUEQ8QnKOy2CS8zeailv48rWNPrul3Z+safkVtbxnJAMwWP5qO5bP17VyRMyKCLXg7oHjSAb22Cgi3OprLdE3BgwYsLR9+/ZrXFy1PJJo3779mgEDBiwFvlHTcm7BMmum0gu3vlnqHDkuoPYT8df5ulRWcDcCtX2YL+TQ9OuDHWuZV95oKZqoiFhCMpiOWUvVulWrVi2u1d++KH2PaxwAyINcVKNnz54xcODAUscwMzMzsxKaOHHioojole/ykydPnjVkyJBCXTDdmrDJkyf3HDJkyMDq5rkFqxoDBw5kwoQJpY5hZmZmZiUkaXapM1jz4wLLzMzMzKyIfn3v6iEVq4r/ubtjeyrPGdVucrH3k+uPf/zjhtdcc02f6dOnTyvG8s2NB7kwMzMzMyuixiiu6rOfjh07Dsvc2rRps0ObNm12yJ5W3/2eeOKJi+tTLNV3+cY0ffr0dpKGz5w5s21Dt+EWLDMzMzOz9UhFRcWkzOORI0cOqKys1IMPPjirumVXrVql9u3be9CGenA
LlpmZmZmZAbDTTjttfdRRR/Xba6+9tujcufOwCy+8cKOZM2e23W233QZ17959SFlZ2dDhw4dv/dJLL3XMrPP73/++R//+/Qdnb+PYY4/d9Dvf+c4WnTp1GtavX7/Bd911V7eGLl9VVcXZZ5/dZ6ONNtq+a9euQ48++uh+O++881ZnnHHGxtW9hpUrV2rUqFEDNtxwwyGdO3ceNmDAgMG33HJL98z8p556qvPw4cO37tq169B+/foNPv/88zeqqqoCYMSIEdsADB48eHDHjh2HjR07tm99j6ELLDMzMzMz+8z999/f89RTT124bNmySb/4xS8WVlVV6YQTTlj4/vvvvzFv3rzJ22+/fcXIkSO3WLVqVY0X+3rggQd6nHnmmfOXLVs26Zhjjll44oknDiwvL6+x9qht+euvv77HTTfdtNFDDz307sKFCyf37dt39X/+85/ONW3ruuuu6/H66693mjZt2tTly5dPev7556cPGTJkJcDEiRM3OOywwwadfvrpCz7++OPXH3300Rk333xz7+uvv74HwIQJE6YBTJ06dWpFRcWkK6+8cl59j58LLDMzMzMz+8y+++77yUEHHVTeqlUrysrKqgYNGrT6yCOPXFpWVlbVuXPnuPrqqz+cN29eu6lTp7avaRsHHHDAJ9/+9rdXtG7dmtNPP33R8uXLWzd0+XvuuafHD3/4w4923XXXle3bt4+LLrpoQe/evdfUtK127dpFRUVFq9dff73DmjVr2HLLLdcMHz78U4Df/e53vffbb79PfvCDHyxp06YNw4YN+/SYY45ZeM899/RYl2OWzedgmZmZmZnZZwYMGLAq+/m8efPanHTSSZuOHz++rLy8vI2kAJg/f36NtUTfvn0/K4C6dOlSBbBs2bIaL85b2/Lz589vO2DAgNWZ+a1ataJv376rv7yVxIknnvjxggUL2p555pn9Zs+e3X7nnXcuv/rqq+cMHjx41fvvv99u/PjxXcrKyrpllo8I9enTp8bt1ZdbsMzMzMzM7DOtWn2xRDj99NM3WbBgQdvx48e/vXz58knvv//+FEgKk8bI06dPnzWzZ89ul3leVVXFvHnz2tW0fNu2bbn00kvnT5069a333nvvjQ4dOlSNHj16IMCmm266+vDDD19UXl7+eua2fPnySTNmzHgTvvzaG8IFlpmZmZmZ1ai8vLx1hw4dqnr16lW5dOnSVqeccsqmjbn/UaNGfXzXXXf1euWVVzqsWrVKF1544UYLFy6scRj1Rx99tOyll17quGrVKnXq1KmqU6dOVa1btw6A0047beFjjz224T333NN11apVWrNmDRMnTtzgiSee6AxJS1qrVq2YNm3aBg3N6wLLzMzMzKyIOransjnv59e//vWHH3/8cdsePXoM3XbbbbfdZZddlrduXWNvv4L7yU9+8vGYMWMWHnLIIYN69+49ZM6cOe2GDBmyon379lXVLT9v3ry2o0eP3qx79+5D+/TpM+SDDz5od/PNN88G2HHHHT994IEH3r322ms36tOnz/Y9e/YcOnr06M0yBVvnzp1j7NixH44ZM2bzsrKyoWeddVaf+uZVhIe1zzVixIiYMGFCqWOYmZmZWQlJmhgRI/JdfvLkybOGDBmyqJiZDNauXUvfvn23v+iii+accMIJi0uRYfLkyT2HDBkysLp5bsEyMzMzM7Mm7cYbb+xeUVGh8vLyVmecccbGn376aavDDjtsaalzVccFlpmZmZmZNWl/+tOfem+00UZDNt544+1feumlLg899NC7vXr1WlvqXNXxMO1mZmZmZtakTZw4cXqpM+TLLVhmZmZmZmYF4gLLzMzMzMysQFxgmZmZmZmZFYgLLDMzMzMzswJxgWVmZmZmZlYgHkXQzMzMzKyIllx95pBYuaLon7vVoVNltzOumlzs/dTHlltuue1ZZ50199hjj/2k1FkaiwssMzMzM7Miaoziqj776dix47DM49WrVwugXbt2kZlWUVExqb77nj59eru
vfOUr282YMWPKFltssSYzfcaMGW/Wd1uN5bDDDhvYpk2buO+++2YXcrsusMzMzMzM1iPZBdTIkSMHVFZW6sEHH5xVwkgtis/BMjMzMzMzAN599912++yzz+Y9e/Yc0qtXr+1HjRo14JNPPmkFUFVVxSmnnLJJ7969t+/UqdOwTTbZZLtLL720N8CIESO2ARg8ePDgjh07Dhs7dmxfgE022WS766+/fkOAxx9/vKxNmzbDb7rppu79+vUbXFZWNnS//fbbPLN9gClTprTfcccdt+7cufOwrbfeepuLL764t6ThNeX961//WvbVr351m86dOw/r3r37kF122WWrzLzy8vJWxx133KabbLLJdl27dh262267DZo6dWp7gPPOO2+jRx55ZMMHH3ywR8eOHYd17NhxWGVlZUGOoQssMzMzMzOjoqJCe+6551Zf+cpXPp01a9aUadOmvTl37ty2xx13XH+Av/71r13uv//+Hq+88spbK1asmPTvf//7rd13370cYMKECdMApk6dOrWiomLSlVdeOa+6faxdu5Znn322y9SpU6dNmzZt6ptvvtnx8ssv7w2wZs0aDjrooEHbbrttxfz5819/+OGHZ9xxxx29ast83HHHbXbCCScsWLZs2aS5c+dOOeeccz7b75FHHjng3Xff3WD8+PFvLViwYPKIESNWHHjggVuuWrVKl1xyyYKDDz548WGHHfZxRUXFpIqKiklt2hSmc58LLDMzMzMz47777usWEVxzzTVzO3fuHL169Vp7ySWXzH3kkUc2rKyspF27drF69Wq9/vrrHSoqKrTJJptU7rrrrivru59rrrnmw65du1b169evcp999lny2muvdQJ44YUXOs+dO7fdH/7whzmdO3eObbbZZvVJJ520oLZttW3bNmbOnNl+zpw5bTp06BAHHHBAOcC8efPaPPbYYxveeOON7/fr169ygw02iKuuumruokWL2o4bN65Tw45QflxgmZmZmZkZ//3vf9vNmzevXVlZ2dDMbb/99ttKEh988EHbAw44oPzcc8/98LLLLuvbu3fvobvuuuugf/zjHx3rs4/WrVuz8cYbf9YXr1OnTlUrVqxoDfD++++33XDDDSs7d+782YAbm2222aratvfAAw/MmDlz5gZDhgzZdostttj2oosu6g3wzjvvtAPYYYcdtsm8lm7dug2trKzUrFmz2tUnc315kAszMzMzM2PAgAGrBw4cuKq2kf/OPPPMRWeeeeai8vLyVmPHjt145MiRW8ybN++NVq3Wvd2mf//+az755JM2y5cvV6bIqqsY2nnnnVc+8cQT/62qquKZZ57pfMghh2w1dOjQlcOHD18JMH369KnZBV22QmSudrtF2aqZmZmZmTUrI0eOXLpmzRqdffbZfT755JNWVVVVvPfee23vuOOObgAvvvhix6eeeqrzypUr1aFDh6qysrK1mSKlb9++a1q1asW0adM2aOj+99hjj+V9+/Zdfeqpp25aUVGht99+u93111+/UU3Lf/rpp7r22mt7zJs3r02rVq3o0aNHpaRo3bp1bLLJJpUHHnjg4qOPPrr/e++91xZg0aJFre+4445uS5cubQWw0UYbrZk9e3b7tWvXNjRytVxgmZmZmZkVkTp0KszwdEXeT1lZWdXzzz8//a233uqw9dZbD+7SpcuwPfbYY6tJkyZ1BCgvL299xhln9OvZs+fQ7t27D33hhRe63HXXXTMBOnfuHGPHjv1wzJgxm5eVlQ0966yz+tR3/23btuXhhx+eMWXKlI69evUaevDBB285cuTIj9u2bRs1rfPAAw9032abbbbt2LHjsEMPPXTQ2LFj5+6///7LAe6+++7ZgwYN+nT33XffulOnTsMGDx687V/+8pfukgA46aSTPqqoqGjVvXv3oWVlZUMLNYqgImrMu94aMWJETJgwodQxzMzMzKyEJE2MiBH5Lj958uRZQ4YMWVTMTOubK6+8sud1113XZ9asWVNLnSXb5Mm
Tew4ZMmRgdfPcgmVmZmZmZk3C008/3fnNN99sX1VVxauvvtrh97//fZ9DDz10calz1YcHuTAzMzMzsyZh1qxZ7UaPHr3ZkiVL2nTv3r3ygAMO+OTSSy+t9ppaTZULLDMzMzMzaxKOP/74xccff3yzarHK5QJrPXHxvQ0/ae+Xoxrnx6ShGZ0v0dTzQdPP6HzrZtnVP23wul3OuKZgOWrT0IzOl2jq+czMmgKfg2VmZmZmVhhVVVVVKnUIK670Pa6qaX6LL7AktZZ0paSPJJVLelBSz1LnMjMzM7OWRdL8lStXNvg6UNY8rFy5cgNJ82ua3+ILLOBs4GDga8Cm6bQ7SxfHzMzMzFqiysrKC2fNmtVuxYoVHdyS1fJUVVVpxYoVHWbNmtWusrLywpqWWx/OwToOuCgi/gsg6efADEkDImJ2aaOZmZmZWUuxww47PP3aa6+dPHPmzPMjog/rR2PG+qRK0vzKysoLd9hhh6drWqhFF1iSugH9gYmZaRExU9IyYAgwO2vZ40iKMfr379+4Qc3MzMysRUg/eNf44dtavpZeVZel90tzpi8BumRPiIgbI2JERIzo1atXY2QzMzMzM7MWpqUXWOXpfdec6d2AZY0bxczMzMzMWroWXWBFxBLgfWCHzDRJm5O0Xk0pUSwzMzMzM2uhWnSBlboROEvSZpK6AFcAT0fErNLGMjMzMzOzlkYRUeoMRSWpNUlRNRpoDzwLHBcRi2paZ8SIETFhwoTGCWhmZmZmTZKkiRExotQ5rHlp0aMIAkTEWuDM9GZmZmZmZlY060MXQTMzMzMzs0bhAsvMzMzMzKxAXGCZmZmZmZkViAssMzMzMzOzAnGBZWZmZmZmViAusMzMzMzMzArEBZaZmZmZmVmBuMAyMzMzMzMrEBdYZmZmZmZmBeICy8zMzMzMrEBcYJmZmZmZmRWICywzMzMzM7MCcYFlZmZmZmZWIG3yWUiSgCOAEUBZ9ryIOK4IuczMzMzMzJqdfFuw/gj8AegHtM25mZmZmZmZGXm2YAGHAztFxMxihjEzMzMzM2vO8m3BqgDeL2YQMzMzMzOz5i7fAus3wK/Sc7HMzMzMzMysGvl2ETwVGACcImlh9oyI2KrgqczMzMzMzJqhfAusS4qawszMzMzMrAXIq8CKiNuLHcTMzMzMzKy5y7cFC0k7AkeRDNX+AXBLRPynWMHMzMzMzMyam7wGuZB0CPAPoCswCegC/F3SocWLZmZmZmZm1rzk24J1PnBYRDyZmSBpX+By4OFiBDMzMzMzM2tu8h2mfSDwVM60p0lGFjQzMzMzMzPyL7BmA3vlTNsTX3zYzMzMzMzsM/l2EbwYeETSA8B7JC1ahwE/LlIuMzMzMzOzZievFqyIeJCkxaoCGAGsBL4dEQ8UMZuZmZmZmVmzkvcw7RHxCvBKEbOYmZmZmZk1azUWWJK+FhGvpo93qWm5tPAyMzMzMzNb79XWgvUcUJY+frmGZQJoXdBEZmZmZmZmzVSNBVZElGU9zne0QTMzMzMzs/VWXoWTpB/WMP3IwsYxMzMzMzNrvvJtmbquhunXFiqImZmZmZlZc5dvgaUvTZAGApUFTWNmZmZmZtaM1TpMu6Q1pANZSFqdM7s18MdiBTMzMzMzM2tu6roO1l4krVdPAvtmTa8C5kfEu8UKZmZmZmZm1tzUWmBFxN8BJG0REfMaJ5KZmZmZmVnzVFcLFgARMU9SD2BHoBdZ52RFxB1FymZmZmZmZtas5FVgSdoLeBBYDXQDlqT37wEusMzMzMzMzMh/FMHLgYsiohewPL2/GLihaMnMzMzMzMyamXwLrEHANenjTPfAK4CfFjiPmZmZmZlZs5VvgVUBtE8ffyypP9AO6F6UVGZmZmZmZs1QvgXWK8Ah6eO/AY8CzwH/KkImMzMzMzOzZimvQS6AH/B5MXYm8DOgDLi6GKHMzMzMzMyaozoLLEltgKtIiioi4lPg0iLnMrMW6OJ7Kxu03i9H5ftd0Lp
xvnXT0HzQ9DM2Vr5lV/+0Qet1OeOaguaoSUvNB00/o/Mlmno+M8iji2BEVAJHAKuKH8fMzMzMzKz5yvccrEeBw4oZpBgkHSHpJUnLJDX8q1UzMzMzM7M85NvnoS1wl6QTgFlAVWZGRBxXhFyF8glwPdABuLHEWczMzMzMrIXLt8BaA9ybPm6d3pq8iHgaQNLupU1iZmZmZmbrg7wKrIgYU+wgpSbpOOA4gP79+5c4jZmZmZmZNUf5noOFpK6Svi/p5+nzPpI2Ll60xhURN0bEiIgY0atXr1LHMTMzMzOzZiivAkvScGAGcDbwy3Ty9sC1RcplZmZmZmbW7OTbgnUN8POI2B7IjMb3CvD1YoQyMzMzMzNrjvId5GJb4Lb0cQBExHJJnYoRqlAktSYZAbFd+nyDdNaqiIiSBTMzMzMzsxZJ+dQZkqYDe0fEbEmLI2JDSVsCj0TEtkVP2UCSRgO3VjNrs4iYVdN6I0aMiAkTJhQrlpmZmZk1A5ImRsSIUuew5iXfLoK3A/8n6RuA0nOybgZuKlqyAoiI2yJC1dxmlTqbmZmZmZm1PPl2EbwC6AQ8CXQGXgR+hwe5MDMzMzMz+0y+18FaC5wLnCupZ0QsKm4sMzMzMzOz5iffYdqfyjzOLq4kPVGMUGZmZmZmZs1Rvudg7VLDdA/TbmZmZmZmlqq1i6Ck72eWkzQKUNbsQcAnxQpmZmZmZmbW3NR1Dtal6X174NdZ06uA+cApxQhlZmZmZmbWHNVaYEXEZgCSHo2IgxonkpmZmZmZWfOU1zlYLq7MzMzMzMzqVmMLlqTfR8Sp6eMba1ouIo4rRjAzMzMzM7PmprYugm1reGxmZmZmZmbVqLHAiogTsx6PaZw4ZmZmZmZmzVe+18EyMzMzMzOzOrjAMjMzMzMzKxAXWGZmZmZmZgXiAsvMzMzMzKxAai2wJD0qaT9JaqxAZmZmZmZmzVVdLVgfA/cDsyT9UtLGjZDJzMzMzMysWaq1wEqHZ98Y+A3wXZJC6xFJ+7pVy8zMzMzM7IvqPAcrIpZFxHURMQzYDVhE0qr1nqTzih3QzMzMzMysuajXIBcR8WpEHE1SaK0FLixKKjMzMzMzs2Yo7wJLUhtJh0t6FpgAzAF+WLRkZmZmZmZmzUybuhaQtBVwLPCjdPnbgVMi4u0iZzMzMzMzM2tWai2wJP0d+AbwT+BnwF8iYlVjBDMzMzMzM2tu6mrBmgScEBFvNUYYMzMzMzOz5qyuc7AuB7arboak70nqXfhIZmZmZmZmzVNdBdbPgUE1zNscOKuwcczMzMzMzJqvugqs/YCba5h3K3BAYeOYmZmZmZk1X3UVWH0iYkF1M9LpfQofyczMzMzMrHmqq8BaLalvdTPS6WsKH8nMzMzMzKx5qqvA+idwSg3zfgK8VNg4ZmZmZmZmzVddw7RfCrwkqRdwL/AhsAkwCjiS5BpZZmZmZmZmRh0FVkRMkHQQcB1wNBCAgBnAQRHxWvEjmpmZmZmZNQ91tWAREc8CW0kaBPQCPoqId4uezMzMzMzMrJmps8DKSIsqF1ZmZmZmZmY1qGuQCzMzMzMzM8uTCywzMzMzM7MCcYFlZmZmZmZWIC6wzMzMzMzMCsQFlpmZmZmZWYG4wDIzMzMzMysQF1hmZmZmZmYF4gLLzMzMzMysQFxgmZmZmZmZFYgLLDMzMzMzswJxgWVmZmZmZlYgbUodoKUYePYTDVpv1uX7FzhJ9ba7fbsGr/vGj98oYJKaNTSj8yWaej6A6054oUHr/eSGPQqcpHrOt27mnP1Sg9fd9PLdCpikZg3N2Fj5nn9hiwatt+ceMwucpHpNPd+t785t8LpjBm1cwCQ1a2hG50s09Xxm4BYsMzMzMzOzgnGBZWZmZmZmViBNssCS1LbUGczMzMzMzOqr0QosSX0kPSZpqaR3JB0tKSQNlHSbpLvT+8XA79N1TpQ0PV1nvKTdsrZ3gaTncvYxTtJ56ePdJVVK+rGk2ZIWp9vv3Fi
v2czMzMzM1i+N2YJ1N7Aa6Ad8A/hhzvzDgb8BvYCfSRoFXAz8COgB3AQ8JWlAPfbZGjgQ2B74KrAVcHV1C0o6TtIESRM++uijeuzCzMzMzMws0SgFlqRNgT2AsRGxLCIWkhRP2V6OiPsiYm1EVABjgD9FxKsRURkRfwamAN+v5+7PioilEbEA+BXwI0lfet0RcWNEjIiIEb169ar3azQzMzMzM2usFqxN0vv3s6bNzllmVs7zfsB7OdNmptPrI3s/s4D2QM96bsPMzMzMzKxOjVVgfZje98+a1j9nmaqc5x8AA3OmbZ5OBygHOuXMr+4iB9ldCgcCq4BFNUc1MzMzMzNrmEYpsCJiDjAOuFxSmaRewHl1rHYbcLyknSS1kTQGGArck86fCOwgaXg6/2Rgs2q2c5mkLpJ6AxcAd0ZEbjFnZmZmZma2zhQRjbMjqS/JQBW7AQuA3wLXk7Q6XQZURsQxOeucDJwKbARMB34eEeOy5l8GZNa5Id32cxFxiaTdgeeAo4GLgC7AY8BPIqK8tqwjRoyICRMmrMOrNTMzM7PmTtLEiBhR6hzWvDRagfWlHUvfAR4BOkQRQmQKrIhoU991XWCZmZmZmQssa4h6Fx8NJWkoyXlWb5B05bsEuK8YxZWZmZmZmVkpNOZ1sLoDDwHLgZdJhlw/rRH3b2ZmZmZmVlSN1oIVES8CWzbi/sbRiK/PzMzMzMysMVuwzMzMzMzMWjQXWGZmZmZmZgXiAsvMzMzMzKxAXGCZmZmZmZkViAssMzMzMzOzAnGBZWZmZmZmViAusMzMzMzMzArEBZaZmZmZmVmBuMAyMzMzMzMrEBdYZmZmZmZmBeICy8zMzMzMrEBcYJmZmZmZmRWICywzMzMzM7MCcYFlZmZmZmZWIC6wzMzMzMzMCsQFlpmZmZmZWYG4wDIzMzMzMysQF1hmZmZmZmYF4gLLzMzMzMysQFxgmZmZmZmZFYgLLDMzMzMzswJxgWVmZmZmZlYgLrDMzMzMzMwKxAWWmZmZmZlZgbjAMjMzMzMzKxAXWGZmZmZmZgXiAsvMzMzMzKxAXGCZmZmZmZkViAssMzMzMzOzAnGBZWZmZmZmViCKiFJnaHIkfQTMLuAmewKLCri9Qmvq+aDpZ3S+ddfUMzrfumnq+aDpZ3S+ddfUMzrfuilGvgER0avA27QWzgVWI5A0ISJGlDpHTZp6Pmj6GZ1v3TX1jM63bpp6Pmj6GZ1v3TX1jM63bpp6Plt/uIugmZmZmZlZgbjAMjMzMzMzKxAXWI3jxlIHqENTzwdNP6PzrbumntH51k1TzwdNP6PzrbumntH51k1Tz2frCZ+DZWZmZmZmViBuwTIzMzMzMysQF1hmZmZmZmYF4gLLzMzMzMysQFxgFZGk1pKulPSRpHJJD0rqWepcGZKOkPSSpGWSKkudJ5ekKyS9meabK+kmSRuWOlc2SZdKei/NuFDSA5L6lzpXLkmtJL0iKSRtWuo8AJJuk7RG0vKs20mlzlUdSXtJGp9mXCTp+lJnAkh/P7KP38r0Pd6h1NkyJPWRdF/6d/ATSS9IGlLqXBmSeki6XdJ8SUsl3SOpewnz1Pp3WdI+6fu+UtJUSXs3lXySNpH0iKTZ6c/hDxozW54Z90t/BhelP48vSdqtCeXbTdJrkhanP4+vSfpuU8mXs9yJ6ft8XmPmS/dd2zHcPc2V/bfxlcbOaOs3F1jFdTZwMPA1IPOh9s7SxfmST4DrgZ+WOEdN1gI/AHoAQ0iO4W2lDFSNO4GhEdEFGAi8D/xfSRNV73SgotQhqnF7RHTOujWJwiWbpN2BB4CrSH4WNwVuLmGkz0TEttnHD7gamBYRr5U6W5brgQ2BrYCNgAnA45JU0lSfuwPoDAwCNiN5j0v5d7rGv8uSNgceAi4Duqb3D0sa2BTyAVXAM8D3gTmNmClXbRm7A9cCWwK9gHuAv0nq12jpas83HTiU5OewW7rMXZK+2kjZII/PBpIGAD8D3mikTLn
qyrg253/LLo0XzQzalDpAC3cccFFE/BdA0s+BGZIGRMTs0kaDiHgaPvsA2eRExDlZTz+S9Dvg/lLlqU5EvJ31VCQfMLYuUZxqSdoKOAk4DJhU4jjN0WXADRHxQNa0plTAACCpDXAUSd6mZEvgDxHxCYCkPwNjST5ALiplMEmdgH2BYRFRnk77NTBOUv+IeL+xM9Xxd/nHwMSIuCt9frekE9LpF5Y6X0TMA65L569tjDzVqSPj3TmT/ijpfGBH4IOih6POfAszjyW1Ivmf0ork9+itUufL8mfgXODERoj0JU3984uZW7CKRFI3oD8wMTMtImYCy0haY6z+9gQmlzpELknfl7QUWA6cBlxQ2kSfS/9B3wKcCSwpbZpqHZZ2hXlHSXfazqUOlC39AL4T0CbtqrNI0jhJI0qdrRqHkLRq3FHiHLmuJHmfe0nagOSLp5cjoqTFVUpZt4zM/8WhjZ6mbkPI+p+Seg3/T2kwSdsBPSldS0y1JC0BVgEvAa+StAw2CZKOB1ZExH2lzlKL1pI+SLv+PtGUuiXb+sEFVvGUpfdLc6YvAbo0bpTmT9JhwAkkBUyTEhH3RERXoC9JcdWU/lGfBsyPiIdLHaQa1wJfIflwcyjwTeCmkib6su4kfydHAaOBjUk+6DyZfonSlBwP3BcRS0odJMc/gdbAQpIvIb4LHFvSRKmIWA6MAy6Q1E1SLyDTct4U/06X4f8pBSOpN/AgcFVEvFvqPNkiohtJ19VDgSeBJnGetJJzjM8j6RXRVL1N8gXJZiT/Y6YAL0jauJShbP3iAqt4ytP7rjnTu5G0YlmeJB1O8sH7oCZ2bskXRMR8kpyPqwkMxiFpS5I+8ieXOkt1ImJiRCyIiKqIeJPkPLH/ldS+1NmyZH6Pb42IKRGxmqQLXlugyfTpl7QFSQvvDaXOki1tQX0OeIfkb2FH4FLgJUkblTJblh+QtBS8BfwbeCSd3hRa2HKV4/8pBZF+2H6R5AuTX5Q4TrUiYlVE/JXky6djShwn42bgkoj4sNRBahIR8yNickRURsSSiPgFsJikO7BZo3CBVSTpt8jvA5+N5pWeoNyF5NsUy4OkMcCfgAMj4sVS58lDG6ATSUtHqX2D5CTuqZIW8fl5Q1PUNEfrq0rvm8rgB0TEUmAWELmzqplWSscDkyPi1VIHybEhybfI10bEsohYHRE3k/zv2bm00RIR8WFEjIyIvhGxGfAe8CkwvsTRqjOZrP8pqWE0wa7TTVk6KMhLwN8i4uSIaEq/y9VpQzIIS1PwbeDXaXfpRcCuwC8kvVTiXHWpogn9b7GWzwVWcd0InCVpM0ldgCuApyNiVmljJZQMI78B0C59vkF6axJ/hCSdSjJy23ci4p+lzpNLydDnJ6fdTFAy/Pl1JB/I365t3UZyP7AFSVeJocB+6fS9aQLn6aTD7HZLHw8C/h/waER8WtJgX3Y9MEbSNulAEmNJWjyaxLC/ktqRdF9sUq1XAOl5Vu8AJ0nqJKmNpKNIuro1iS+aJG0tacP093lH4Brg8lJ1tazj7/IdwAhJoyS1lTQKGA7c3kTyffac5MNs2/R5ow6oVVtGSV8BXgbujYgzGzNXnvkOk7Rd+ruygaRjgT2Ap5tCPqAfyTl/Q9PbBJL/e//bWPnqyihpD0lbpr/TnSVdQDKCaaMdQzMiwrci3UjOO7iKpKtJOcnwuj1LnSsr32g+/yY++zaw1NnSfAGsITlv47NbqXNl5WtF0jd+IbAC+BC4G9ii1NlqyDswPaabljpLmmccSbeNFSStBlcDXUqdq5qcAi4C5pOc7/IiydD8Jc+W5juCpItY51JnqSHfV4HH07+DS0kGaTi41Lmy8h0LzCO5jMG7wGklzlPr32VgH+BNYGV6v3cTy1fdvAuaSkbg1vTx8pzbkU0k38npz+Hy9O/jv4DDm8rxq2bZccB5jZkvj2N4OjA7/d+yEHgK2LGxM/q2ft8U0dRbxs3MzMzMzJoHdxE0MzMzMzMrEBd
YZmZmZmZmBeICy8zMzMzMrEBcYJmZmZmZmRWICywzMzMzM7MCcYFlZmZmZmZWIC6wzMzWY5IukPTcOqx/jqTH1jHDDEmj12UbZmZmTYULLDOzRiJpc0l/kTRf0nJJH0h6WFK7UmdrqIj4dUQcWOocZmZmTYULLDOzxvMkMA/YGigDdgaeBlTKUGZmZlY4LrDMzBqBpB4khdUNEbE0EnMi4oaIWJUuM0TS3yUtkvSJpL9J2iJrG7dJulPSLZKWSPpQ0ihJQyX9R1K5pBclbZy1zixJv5L0ctpqNkHSjrXk7CjpKknvSVos6SlJW9ay/Be6GKb7O0fS8+n+pkraJWt+W0lXS1qYtuSdVc02d0vzLpY0U9LPJCmdd6ekZyS1Sp/vLmmZpO3yfS/MzMyKyQWWmVkjiIiPgTeBmyX9SNI2maIhezHgAmATYCCwHLgrZ5n/BR4ENgQuBm4CLgIOBTZKt3FhzjonAKel6zwAPCmpSw1RbwK+Anwd6AO8CjwuqW09Xu5RwKlAV+BZ4PaseWcDBwC7AJulr3NAZqakbUha+q4EegH7AycDP8x6LZsC50naCLgXOC0i3qhHPjMzs6JxgWVm1nh2B8YBPwVeBxZI+mWm0IqIKRHxYkSsioilJIXS1yV1zNrGCxHxRERUAXcAnYA709awCpICakTOfv8cERMjYjVwBbCSpMj5Akk9ge8DJ0XEgnT5C4G+wNfq8Tr/FBFvRsRa4GZgS0ld03k/Aq6IiBkRsRI4k6QozDgJ+EtEPBIRayPibeAP6XpExArg8HS9Z4GnIuLWemQzMzMrqjalDmBmtr6IiEXAOcA5adH0PZIWow+BW9LugFeSFDNlfF549AJmp4/nZW2vIq3NPpsGVKTrZpuVtU5Iep+kFSjXZun9lJzGtbZAv7xeZE5GYEV6XwYsTfebnWeFpIU5GfaQ9N2saa2AD7LWeVPSi8CBwCH1yGVmZlZ0bsEyMyuBiKiIiNuAKcDQdPINQDmwfUR0AXZNp6/rIBgDMw/S1rL+wJxqlssUcYMiolvWrWNE3LuOGTI+zMnTiaSAzM5wS87+u0TEtlnr/ICkC+MDJF0u/b/MzMyaDP9TMjNrBJK6S7pM0uB0oIc2kg4DBgMvpYt1IWnxWZJ217uoQLs/StIO6XlUY4GOwBO5C0XEQuAe4HpJm6S5u0k6VFLnAmW5ExgraQtJHYDf8MX/RdcDR0g6MOs4bSPpm2merwLXAUcCPwZ6kpy3ZmZm1iS4wDIzaxyrgd7AQ8Bi4CPgPODUiPhLuszpwG7AMpKi6/EC7ftG4PfAJ8BIYP/0HK/qHAtMB8ZJKgfeIDnnKWpYvr4uIxmafjzwHvA+n7ecERFTSc4P+ylJV8OFwG1Ar7Rb5V+A30bEc+k5XN8DTpP07QLlMzMzWyeKKNT/TDMza2okzQLOi4jc0QjNzMysCNyCZWZmZmZmViAusMzMzMzMzArEXQTNzMzMzMwKxC1YZmZmZmZmBeICy8zMzMzMrEBcYJmZmZmZmRWICywzMzMzM7MCcYFlZmZmZmZWIP8f/zK+xPjueGcAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 2,\n", + " 'train_size': 2,\n", + " 'n_splits': 2\n", + "}\n", + "\n", + "test_cv(X, y, groups, **cv_args)\n", + "plot_cv(X, y, groups, **cv_args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Groups are not sorted**" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['2021-06',\n", + " '2021-06',\n", + " '2021-06',\n", + " '2021-06',\n", + " '2021-02',\n", + " '2021-02',\n", + " '2021-02',\n", + " '2021-02',\n", + " '2021-04',\n", + " '2021-04',\n", + " '2021-03',\n", + " '2021-03',\n", + " '2021-03',\n", + " '2021-05',\n", + " '2021-05',\n", + " '2021-01']" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_sorted_months = ['2021-06', '2021-06', '2021-06', '2021-06', '2021-02', '2021-02', \n", + " '2021-02', '2021-02', '2021-04', '2021-04', '2021-03', '2021-03', \n", + " '2021-03', '2021-05', '2021-05', '2021-01']\n", + "not_sorted_months" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([5, 5, 5, 5, 1, 1, 1, 1, 3, 3, 2, 2, 2, 4, 4, 0])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_sorted_groups = np.fromiter(map(months_map.get, not_sorted_months), dtype='int')\n", + "not_sorted_groups" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
num_feature
2021-060
2021-067
2021-066
2021-064
2021-024
2021-028
2021-020
2021-026
2021-042
2021-040
2021-035
2021-039
2021-037
2021-057
2021-057
2021-017
\n", + "
" + ], + "text/plain": [ + " num_feature\n", + "2021-06 0\n", + "2021-06 7\n", + "2021-06 6\n", + "2021-06 4\n", + "2021-02 4\n", + "2021-02 8\n", + "2021-02 0\n", + "2021-02 6\n", + "2021-04 2\n", + "2021-04 0\n", + "2021-03 5\n", + "2021-03 9\n", + "2021-03 7\n", + "2021-05 7\n", + "2021-05 7\n", + "2021-01 7" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_not_sorted = X.copy()\n", + "X_not_sorted.index = not_sorted_months\n", + "X_not_sorted" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TRAIN INDICES: [0 1 2 3 4 5 6 7 8 9]\n", + "TEST INDICES: [10 11 12]\n", + "TRAIN LENGTH: 10\n", + "TEST LENGTH: 3\n", + "TRAIN GROUPS: [5 5 5 5 1 1 1 1 3 3]\n", + "TEST GROUPS: [2 2 2]\n", + "TRAIN GROUP SIZE: 3\n", + "TEST GROUP SIZE: 1\n", + "TRAIN GROUP MONTHS: ['2021-06' '2021-06' '2021-06' '2021-06' '2021-02' '2021-02' '2021-02'\n", + " '2021-02' '2021-04' '2021-04']\n", + "TEST GROUP MONTHS: ['2021-03' '2021-03' '2021-03']\n", + "\n", + "TRAIN INDICES: [ 4 5 6 7 8 9 10 11 12]\n", + "TEST INDICES: [13 14]\n", + "TRAIN LENGTH: 9\n", + "TEST LENGTH: 2\n", + "TRAIN GROUPS: [1 1 1 1 3 3 2 2 2]\n", + "TEST GROUPS: [4 4]\n", + "TRAIN GROUP SIZE: 3\n", + "TEST GROUP SIZE: 1\n", + "TRAIN GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-04' '2021-04' '2021-03'\n", + " '2021-03' '2021-03']\n", + "TEST GROUP MONTHS: ['2021-05' '2021-05']\n", + "\n", + "TRAIN INDICES: [ 8 9 10 11 12 13 14]\n", + "TEST INDICES: [15]\n", + "TRAIN LENGTH: 7\n", + "TEST LENGTH: 1\n", + "TRAIN GROUPS: [3 3 2 2 2 4 4]\n", + "TEST GROUPS: [0]\n", + "TRAIN GROUP SIZE: 3\n", + "TEST GROUP SIZE: 1\n", + "TRAIN GROUP MONTHS: ['2021-04' '2021-04' '2021-03' '2021-03' '2021-03' '2021-05' '2021-05']\n", + "TEST GROUP MONTHS: ['2021-01']\n", + "\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 1,\n", + " 'train_size': 3\n", + "}\n", + "\n", + "test_cv(X_not_sorted, y, not_sorted_groups, **cv_args)\n", + "plot_cv(X_not_sorted, y, not_sorted_groups, **cv_args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Group names**" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TRAIN INDICES: [0 1 2 3 4 5 6 7]\n", + "TEST INDICES: [8 9]\n", + "TRAIN LENGTH: 8\n", + "TEST LENGTH: 2\n", + "TRAIN GROUPS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03'\n", + " '2021-03']\n", + "TEST GROUPS: ['2021-04' '2021-04']\n", + "TRAIN GROUP SIZE: 3\n", + "TEST GROUP SIZE: 1\n", + "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03'\n", + " '2021-03']\n", + "TEST GROUP MONTHS: ['2021-04' '2021-04']\n", + "\n", + "TRAIN INDICES: [1 2 3 4 5 6 7 8 9]\n", + "TEST INDICES: [10 11]\n", + "TRAIN LENGTH: 9\n", + "TEST LENGTH: 2\n", + "TRAIN GROUPS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03'\n", + " '2021-04' '2021-04']\n", + "TEST GROUPS: ['2021-05' '2021-05']\n", + "TRAIN GROUP SIZE: 3\n", + "TEST GROUP SIZE: 1\n", + "TRAIN GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03'\n", + " '2021-04' '2021-04']\n", + "TEST GROUP MONTHS: ['2021-05' '2021-05']\n", + "\n", + "TRAIN INDICES: [ 5 6 7 8 9 10 11]\n", + "TEST INDICES: [12 13 14 15]\n", + "TRAIN LENGTH: 7\n", + "TEST LENGTH: 4\n", + "TRAIN GROUPS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04' '2021-05' '2021-05']\n", + "TEST GROUPS: ['2021-06' '2021-06' '2021-06' '2021-06']\n", + "TRAIN GROUP SIZE: 3\n", + "TEST GROUP SIZE: 1\n", + "TRAIN GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04' '2021-05' 
'2021-05']\n", + "TEST GROUP MONTHS: ['2021-06' '2021-06' '2021-06' '2021-06']\n", + "\n" + ] + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 1,\n", + " 'train_size': 3\n", + "}\n", + "\n", + "test_cv(X, y, months, **cv_args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usage in CV" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0. , 0.5 , 0.25])" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 1,\n", + " 'train_size': 3\n", + "}\n", + "cv = GroupTimeSeriesSplit(**cv_args)\n", + "clf = DummyClassifier(strategy='most_frequent')\n", + "\n", + "scores = cross_val_score(clf, X, y, groups=groups, scoring='accuracy', cv=cv)\n", + "scores" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1 0 1 0 1 0 0 1] [0 0 0 0 0 0 0 0] [1 1] [0 0]\n", + "[0 1 0 1 0 0 1 1 1] [1 1 1 1 1 1 1 1 1] [0 1] [1 1]\n", + "[0 0 1 1 1 0 1] [1 1 1 1 1 1 1] [1 0 0 0] [1 1 1 1]\n" + ] + } + ], + "source": [ + "clf = DummyClassifier(strategy='most_frequent')\n", + "\n", + "for train_idx, test_idx in cv.split(X, y, groups):\n", + " clf.fit(X.iloc[train_idx], y.iloc[train_idx])\n", + " y_train_pred = clf.predict(X.iloc[train_idx])\n", + " y_test_pred = clf.predict(X.iloc[test_idx])\n", + " print(y.iloc[train_idx].values, y_train_pred,\n", + " y.iloc[test_idx].values, y_test_pred)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Failed cases" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Groups are not consecutive\n", + "- Both train_size and n_splits are specified\n", + "- Incorrect window name\n", + "- Specify train_size with expanding window\n", + "- Not enough data (too many number of splits)\n", + "- 
Not enough data (too large value of shift size)\n", + "- Not enough data (too large value of gap size)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Groups are not consecutive**" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['2021-01',\n", + " '2021-02',\n", + " '2021-02',\n", + " '2021-02',\n", + " '2021-02',\n", + " '2021-03',\n", + " '2021-03',\n", + " '2021-03',\n", + " '2021-04',\n", + " '2021-04',\n", + " '2021-05',\n", + " '2021-05',\n", + " '2021-03',\n", + " '2021-03',\n", + " '2021-03',\n", + " '2021-03']" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_consecutive_months = ['2021-01', '2021-02', '2021-02', '2021-02', '2021-02', '2021-03',\n", + " '2021-03', '2021-03', '2021-04', '2021-04', '2021-05', '2021-05',\n", + " '2021-03', '2021-03', '2021-03', '2021-03']\n", + "not_consecutive_months" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 2, 2, 2, 2])" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_consecutive_groups = np.fromiter(map(months_map.get, not_consecutive_months), dtype='int')\n", + "not_consecutive_groups" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The groups should be consecutive\n" + ] + } + ], + "source": [ + "try:\n", + " test_cv(X, y, not_consecutive_groups, **cv_args)\n", + "except ValueError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Both train_size and n_splits are not specified**" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "Either train_size or n_splits should be defined\n" + ] + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 3\n", + "}\n", + "\n", + "try:\n", + " test_cv(X, y, groups, **cv_args)\n", + "except ValueError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Incorrect window name**" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Window type can be either \"rolling\" or \"expanding\"\n" + ] + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 3,\n", + " 'n_splits': 3,\n", + " 'window_type': 'bad_name'\n", + "}\n", + "\n", + "try:\n", + " test_cv(X, y, groups, **cv_args)\n", + "except ValueError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Specify train_size with expanding window**" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train size can be specified only with rolling window\n" + ] + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 3,\n", + " 'train_size': 3,\n", + " 'window_type': 'expanding'\n", + "}\n", + "\n", + "try:\n", + " test_cv(X, y, groups, **cv_args)\n", + "except ValueError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Not enough data (too large train_size and/or n_splits)**" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Not enough data to split number of groups (6) for number splits (10) with train size (10), test size (1), gap size (0), shift size (1)\n" + ] + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 1,\n", + " 'train_size': 10, \n", + " 'n_splits': 10\n", + "}\n", + 
"\n", + "try:\n", + " test_cv(X, y, groups, **cv_args)\n", + "except ValueError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Not enough data (too many number of splits)**" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Not enough data to split number of groups (6) for number splits (10) with train size (-4), test size (1), gap size (0), shift size (1)\n" + ] + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 1,\n", + " 'n_splits': 10\n", + "}\n", + "\n", + "try:\n", + " test_cv(X, y, groups, **cv_args)\n", + "except ValueError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Not enough data (too large values of shift size)**" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Not enough data to split number of groups (6) for number splits (3) with train size (-15), test size (1), gap size (0), shift size (10)\n" + ] + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 1,\n", + " 'n_splits': 3,\n", + " 'shift_size': 10\n", + "}\n", + "\n", + "try:\n", + " test_cv(X, y, groups, **cv_args)\n", + "except ValueError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Not enough data (too large values of gap size)**" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Not enough data to split number of groups (6) for number splits (3) with train size (-7), test size (1), gap size (10), shift size (1)\n" + ] + } + ], + "source": [ + "cv_args = {\n", + " 'test_size': 1,\n", + " 'n_splits': 3,\n", + " 'gap_size': 10\n", + "}\n", + "\n", + "try:\n", + " test_cv(X, y, groups, 
**cv_args)\n", + "except ValueError as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlxtend", + "language": "python", + "name": "mlxtend" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 2bd1ee150d26cd0b2df35841ad2ecd98c6aac2fd Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Tue, 3 May 2022 14:17:30 +0300 Subject: [PATCH 20/30] Reformat notebook using black --- .../evaluate/GroupTimeSeriesSplit.ipynb | 270 +++++++++--------- 1 file changed, 130 insertions(+), 140 deletions(-) diff --git a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb index f66b430cc..c5fc5fde1 100644 --- a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb +++ b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb @@ -36,6 +36,7 @@ "from matplotlib.patches import Patch\n", "import matplotlib.pyplot as plt\n", "from matplotlib.ticker import MaxNLocator\n", + "\n", "%matplotlib inline\n", "\n", "from mlxtend.evaluate.time_series import GroupTimeSeriesSplit" @@ -72,7 +73,7 @@ } ], "source": [ - "months_map = {f'2021-0{i+1}': i for i in range(6)}\n", + "months_map = {f\"2021-0{i+1}\": i for i in range(6)}\n", "months_map" ] }, @@ -108,9 +109,24 @@ } ], "source": [ - "months = ['2021-01', '2021-02', '2021-02', '2021-02', '2021-02', '2021-03',\n", - " '2021-03', '2021-03', '2021-04', '2021-04', '2021-05', '2021-05',\n", - " '2021-06', '2021-06', '2021-06', '2021-06']\n", + "months = [\n", + " \"2021-01\",\n", + " \"2021-02\",\n", + " \"2021-02\",\n", + " \"2021-02\",\n", + " \"2021-02\",\n", + " \"2021-03\",\n", + " \"2021-03\",\n", + " \"2021-03\",\n", + " \"2021-04\",\n", + " \"2021-04\",\n", + " 
\"2021-05\",\n", + " \"2021-05\",\n", + " \"2021-06\",\n", + " \"2021-06\",\n", + " \"2021-06\",\n", + " \"2021-06\",\n", + "]\n", "months" ] }, @@ -131,7 +147,7 @@ } ], "source": [ - "groups = np.fromiter(map(months_map.get, months), dtype='int')\n", + "groups = np.fromiter(map(months_map.get, months), dtype=\"int\")\n", "groups" ] }, @@ -293,8 +309,8 @@ "data = [[0], [7], [6], [4], [4], [8], [0], [6], [2], [0], [5], [9], [7], [7], [7], [7]]\n", "target = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0]\n", "\n", - "X = pd.DataFrame(data, index=months, columns=['num_feature'])\n", - "y = pd.Series(target, name='target')\n", + "X = pd.DataFrame(data, index=months, columns=[\"num_feature\"])\n", + "y = pd.Series(target, name=\"target\")\n", "\n", "display(X, y)" ] @@ -315,35 +331,37 @@ "def test_cv(X, y, groups, **cv_args):\n", " cv = GroupTimeSeriesSplit(**cv_args)\n", " groups = np.array(groups)\n", - " \n", + "\n", " for train_idx, test_idx in cv.split(X, groups=groups):\n", - " print('TRAIN INDICES:', train_idx)\n", - " print('TEST INDICES:', test_idx)\n", - " print('TRAIN LENGTH:', len(train_idx))\n", - " print('TEST LENGTH:', len(test_idx))\n", - " print('TRAIN GROUPS:', groups[train_idx]) \n", - " print('TEST GROUPS:', groups[test_idx])\n", - " print('TRAIN GROUP SIZE:', len(set(groups[train_idx]))) \n", - " print('TEST GROUP SIZE:', len(set(groups[test_idx])))\n", - " print('TRAIN GROUP MONTHS:', X.index[train_idx].values) \n", - " print('TEST GROUP MONTHS:', X.index[test_idx].values)\n", + " print(\"TRAIN INDICES:\", train_idx)\n", + " print(\"TEST INDICES:\", test_idx)\n", + " print(\"TRAIN LENGTH:\", len(train_idx))\n", + " print(\"TEST LENGTH:\", len(test_idx))\n", + " print(\"TRAIN GROUPS:\", groups[train_idx])\n", + " print(\"TEST GROUPS:\", groups[test_idx])\n", + " print(\"TRAIN GROUP SIZE:\", len(set(groups[train_idx])))\n", + " print(\"TEST GROUP SIZE:\", len(set(groups[test_idx])))\n", + " print(\"TRAIN GROUP MONTHS:\", 
X.index[train_idx].values)\n", + " print(\"TEST GROUP MONTHS:\", X.index[test_idx].values)\n", " print()\n", "\n", - "def plot_cv_indices(cv, X, y, groups, n_splits,\n", - " image_file_path=None):\n", + "\n", + "def plot_cv_indices(cv, X, y, groups, n_splits, image_file_path=None):\n", " \"\"\"Create a sample plot for indices of a cross-validation object.\"\"\"\n", - " \n", + "\n", " fig, ax = plt.subplots(figsize=(12, 4))\n", " cmap_data = plt.cm.tab20\n", " cmap_cv = plt.cm.coolwarm\n", " lw = 10\n", " marker_size = 200\n", - " \n", - " for split_idx, (train_idx, test_idx) in enumerate(cv.split(X=X, y=y, groups=groups)):\n", + "\n", + " for split_idx, (train_idx, test_idx) in enumerate(\n", + " cv.split(X=X, y=y, groups=groups)\n", + " ):\n", " indices = np.array([np.nan] * len(X))\n", " indices[test_idx] = 1\n", " indices[train_idx] = 0\n", - " \n", + "\n", " ax.scatter(\n", " range(len(X)),\n", " [split_idx + 0.5] * len(X),\n", @@ -353,51 +371,55 @@ " cmap=cmap_cv,\n", " vmin=-0.4,\n", " vmax=1.4,\n", - " s=marker_size\n", + " s=marker_size,\n", " )\n", " ax.scatter(\n", - " range(len(X)), [split_idx + 1.5] * len(X), \n", - " c=groups, marker=\"_\", lw=lw, \n", - " cmap=cmap_data, s=marker_size\n", + " range(len(X)),\n", + " [split_idx + 1.5] * len(X),\n", + " c=groups,\n", + " marker=\"_\",\n", + " lw=lw,\n", + " cmap=cmap_data,\n", + " s=marker_size,\n", " )\n", "\n", - " yticklabels = list(range(n_splits)) + ['group']\n", + " yticklabels = list(range(n_splits)) + [\"group\"]\n", " ax.set(\n", " yticks=np.arange(n_splits + 1) + 0.5,\n", " yticklabels=yticklabels,\n", - " ylabel='CV iteration',\n", + " ylabel=\"CV iteration\",\n", " ylim=[n_splits + 1.2, -0.2],\n", - " xlim=[-0.5, len(indices) - 0.5]\n", + " xlim=[-0.5, len(indices) - 0.5],\n", " )\n", " ax.legend(\n", " [Patch(color=cmap_cv(0.2)), Patch(color=cmap_cv(0.8))],\n", " [\"Training set\", \"Testing set\"],\n", " loc=(1.02, 0.8),\n", - " fontsize=13\n", + " fontsize=13,\n", " )\n", - " \n", - " 
ax.set_title('{}\\n{}'.format(type(cv).__name__, cv_args), fontsize=15)\n", + "\n", + " ax.set_title(\"{}\\n{}\".format(type(cv).__name__, cv_args), fontsize=15)\n", " ax.xaxis.set_major_locator(MaxNLocator(min_n_ticks=len(X), integer=True))\n", - " ax.set_xlabel(xlabel='Sample index', fontsize=13)\n", - " ax.set_ylabel(ylabel='CV iteration', fontsize=13)\n", - " ax.tick_params(axis='both', which='major', labelsize=13)\n", - " ax.tick_params(axis='both', which='minor', labelsize=13)\n", - " \n", + " ax.set_xlabel(xlabel=\"Sample index\", fontsize=13)\n", + " ax.set_ylabel(ylabel=\"CV iteration\", fontsize=13)\n", + " ax.tick_params(axis=\"both\", which=\"major\", labelsize=13)\n", + " ax.tick_params(axis=\"both\", which=\"minor\", labelsize=13)\n", + "\n", " plt.tight_layout()\n", - " \n", + "\n", " if image_file_path:\n", - " plt.savefig(image_file_path, bbox_inches='tight')\n", - " \n", + " plt.savefig(image_file_path, bbox_inches=\"tight\")\n", + "\n", " plt.show()\n", - " \n", + "\n", + "\n", "def plot_cv(X, y, groups, image_file_path=None, **cv_args):\n", " cv = GroupTimeSeriesSplit(**cv_args)\n", " cv._n_groups = len(np.unique(groups))\n", " cv._calculate_split_params()\n", " n_splits = cv.n_splits\n", - " \n", - " plot_cv_indices(cv, X, y, groups, n_splits,\n", - " image_file_path=image_file_path)" + "\n", + " plot_cv_indices(cv, X, y, groups, n_splits, image_file_path=image_file_path)" ] }, { @@ -415,10 +437,7 @@ "metadata": {}, "outputs": [], "source": [ - "cv_args = {\n", - " 'test_size': 3, \n", - " 'n_splits': 1\n", - "}" + "cv_args = {\"test_size\": 3, \"n_splits\": 1}" ] }, { @@ -562,10 +581,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 1,\n", - " 'train_size': 3\n", - "}\n", + "cv_args = {\"test_size\": 1, \"train_size\": 3}\n", "\n", "test_cv(X, y, groups, **cv_args)\n", "plot_cv(X, y, groups, **cv_args)" @@ -636,10 +652,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 2,\n", - " 'n_splits': 3\n", - "}\n", + "cv_args = 
{\"test_size\": 2, \"n_splits\": 3}\n", "\n", "test_cv(X, y, groups, **cv_args)\n", "plot_cv(X, y, groups, **cv_args)" @@ -710,11 +723,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 1,\n", - " 'n_splits': 3,\n", - " 'gap_size': 1\n", - "}\n", + "cv_args = {\"test_size\": 1, \"n_splits\": 3, \"gap_size\": 1}\n", "\n", "test_cv(X, y, groups, **cv_args)\n", "plot_cv(X, y, groups, **cv_args)" @@ -775,11 +784,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 1,\n", - " 'n_splits': 2,\n", - " 'shift_size': 2\n", - "}\n", + "cv_args = {\"test_size\": 1, \"n_splits\": 2, \"shift_size\": 2}\n", "\n", "test_cv(X, y, groups, **cv_args)\n", "plot_cv(X, y, groups, **cv_args)" @@ -853,11 +858,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 3,\n", - " 'n_splits': 3,\n", - " 'window_type': 'expanding'\n", - "}\n", + "cv_args = {\"test_size\": 3, \"n_splits\": 3, \"window_type\": \"expanding\"}\n", "\n", "test_cv(X, y, groups, **cv_args)\n", "plot_cv(X, y, groups, **cv_args)" @@ -918,11 +919,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 3,\n", - " 'train_size': 2,\n", - " 'n_splits': 2\n", - "}\n", + "cv_args = {\"test_size\": 3, \"train_size\": 2, \"n_splits\": 2}\n", "\n", "test_cv(X, y, groups, **cv_args)\n", "plot_cv(X, y, groups, **cv_args)" @@ -982,11 +979,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 2,\n", - " 'train_size': 2,\n", - " 'n_splits': 2\n", - "}\n", + "cv_args = {\"test_size\": 2, \"train_size\": 2, \"n_splits\": 2}\n", "\n", "test_cv(X, y, groups, **cv_args)\n", "plot_cv(X, y, groups, **cv_args)" @@ -1031,9 +1024,24 @@ } ], "source": [ - "not_sorted_months = ['2021-06', '2021-06', '2021-06', '2021-06', '2021-02', '2021-02', \n", - " '2021-02', '2021-02', '2021-04', '2021-04', '2021-03', '2021-03', \n", - " '2021-03', '2021-05', '2021-05', '2021-01']\n", + "not_sorted_months = [\n", + " \"2021-06\",\n", + " \"2021-06\",\n", + " \"2021-06\",\n", + " \"2021-06\",\n", + " \"2021-02\",\n", + " \"2021-02\",\n", + " 
\"2021-02\",\n", + " \"2021-02\",\n", + " \"2021-04\",\n", + " \"2021-04\",\n", + " \"2021-03\",\n", + " \"2021-03\",\n", + " \"2021-03\",\n", + " \"2021-05\",\n", + " \"2021-05\",\n", + " \"2021-01\",\n", + "]\n", "not_sorted_months" ] }, @@ -1054,7 +1062,7 @@ } ], "source": [ - "not_sorted_groups = np.fromiter(map(months_map.get, not_sorted_months), dtype='int')\n", + "not_sorted_groups = np.fromiter(map(months_map.get, not_sorted_months), dtype=\"int\")\n", "not_sorted_groups" ] }, @@ -1247,10 +1255,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 1,\n", - " 'train_size': 3\n", - "}\n", + "cv_args = {\"test_size\": 1, \"train_size\": 3}\n", "\n", "test_cv(X_not_sorted, y, not_sorted_groups, **cv_args)\n", "plot_cv(X_not_sorted, y, not_sorted_groups, **cv_args)" @@ -1313,10 +1318,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 1,\n", - " 'train_size': 3\n", - "}\n", + "cv_args = {\"test_size\": 1, \"train_size\": 3}\n", "\n", "test_cv(X, y, months, **cv_args)" ] @@ -1345,14 +1347,11 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 1,\n", - " 'train_size': 3\n", - "}\n", + "cv_args = {\"test_size\": 1, \"train_size\": 3}\n", "cv = GroupTimeSeriesSplit(**cv_args)\n", - "clf = DummyClassifier(strategy='most_frequent')\n", + "clf = DummyClassifier(strategy=\"most_frequent\")\n", "\n", - "scores = cross_val_score(clf, X, y, groups=groups, scoring='accuracy', cv=cv)\n", + "scores = cross_val_score(clf, X, y, groups=groups, scoring=\"accuracy\", cv=cv)\n", "scores" ] }, @@ -1372,14 +1371,13 @@ } ], "source": [ - "clf = DummyClassifier(strategy='most_frequent')\n", + "clf = DummyClassifier(strategy=\"most_frequent\")\n", "\n", "for train_idx, test_idx in cv.split(X, y, groups):\n", " clf.fit(X.iloc[train_idx], y.iloc[train_idx])\n", " y_train_pred = clf.predict(X.iloc[train_idx])\n", " y_test_pred = clf.predict(X.iloc[test_idx])\n", - " print(y.iloc[train_idx].values, y_train_pred,\n", - " y.iloc[test_idx].values, y_test_pred)" + " 
print(y.iloc[train_idx].values, y_train_pred, y.iloc[test_idx].values, y_test_pred)" ] }, { @@ -1441,9 +1439,24 @@ } ], "source": [ - "not_consecutive_months = ['2021-01', '2021-02', '2021-02', '2021-02', '2021-02', '2021-03',\n", - " '2021-03', '2021-03', '2021-04', '2021-04', '2021-05', '2021-05',\n", - " '2021-03', '2021-03', '2021-03', '2021-03']\n", + "not_consecutive_months = [\n", + " \"2021-01\",\n", + " \"2021-02\",\n", + " \"2021-02\",\n", + " \"2021-02\",\n", + " \"2021-02\",\n", + " \"2021-03\",\n", + " \"2021-03\",\n", + " \"2021-03\",\n", + " \"2021-04\",\n", + " \"2021-04\",\n", + " \"2021-05\",\n", + " \"2021-05\",\n", + " \"2021-03\",\n", + " \"2021-03\",\n", + " \"2021-03\",\n", + " \"2021-03\",\n", + "]\n", "not_consecutive_months" ] }, @@ -1464,7 +1477,9 @@ } ], "source": [ - "not_consecutive_groups = np.fromiter(map(months_map.get, not_consecutive_months), dtype='int')\n", + "not_consecutive_groups = np.fromiter(\n", + " map(months_map.get, not_consecutive_months), dtype=\"int\"\n", + ")\n", "not_consecutive_groups" ] }, @@ -1509,9 +1524,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 3\n", - "}\n", + "cv_args = {\"test_size\": 3}\n", "\n", "try:\n", " test_cv(X, y, groups, **cv_args)\n", @@ -1540,11 +1553,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 3,\n", - " 'n_splits': 3,\n", - " 'window_type': 'bad_name'\n", - "}\n", + "cv_args = {\"test_size\": 3, \"n_splits\": 3, \"window_type\": \"bad_name\"}\n", "\n", "try:\n", " test_cv(X, y, groups, **cv_args)\n", @@ -1573,11 +1582,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 3,\n", - " 'train_size': 3,\n", - " 'window_type': 'expanding'\n", - "}\n", + "cv_args = {\"test_size\": 3, \"train_size\": 3, \"window_type\": \"expanding\"}\n", "\n", "try:\n", " test_cv(X, y, groups, **cv_args)\n", @@ -1606,11 +1611,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 1,\n", - " 'train_size': 10, \n", - " 'n_splits': 10\n", - "}\n", + "cv_args = {\"test_size\": 1, 
\"train_size\": 10, \"n_splits\": 10}\n", "\n", "try:\n", " test_cv(X, y, groups, **cv_args)\n", @@ -1639,10 +1640,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 1,\n", - " 'n_splits': 10\n", - "}\n", + "cv_args = {\"test_size\": 1, \"n_splits\": 10}\n", "\n", "try:\n", " test_cv(X, y, groups, **cv_args)\n", @@ -1671,11 +1669,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 1,\n", - " 'n_splits': 3,\n", - " 'shift_size': 10\n", - "}\n", + "cv_args = {\"test_size\": 1, \"n_splits\": 3, \"shift_size\": 10}\n", "\n", "try:\n", " test_cv(X, y, groups, **cv_args)\n", @@ -1704,11 +1698,7 @@ } ], "source": [ - "cv_args = {\n", - " 'test_size': 1,\n", - " 'n_splits': 3,\n", - " 'gap_size': 10\n", - "}\n", + "cv_args = {\"test_size\": 1, \"n_splits\": 3, \"gap_size\": 10}\n", "\n", "try:\n", " test_cv(X, y, groups, **cv_args)\n", From 7cadc4cfeba3c7a8debefae5bd07f555ffe25d2d Mon Sep 17 00:00:00 2001 From: rasbt Date: Wed, 11 May 2022 16:59:13 -0500 Subject: [PATCH 21/30] isort --- mlxtend/evaluate/__init__.py | 7 ++----- mlxtend/evaluate/tests/test_time_series.py | 3 ++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/mlxtend/evaluate/__init__.py b/mlxtend/evaluate/__init__.py index 014e27c22..01e0eb2ea 100644 --- a/mlxtend/evaluate/__init__.py +++ b/mlxtend/evaluate/__init__.py @@ -20,13 +20,10 @@ from .mcnemar import mcnemar, mcnemar_table, mcnemar_tables from .permutation import permutation_test from .proportion_difference import proportion_difference -from .bias_variance_decomp import bias_variance_decomp -from .accuracy import accuracy_score -from .counterfactual import create_counterfactual from .scoring import scoring from .time_series import GroupTimeSeriesSplit -from .ttest import paired_ttest_5x2cv, paired_ttest_kfold_cv, paired_ttest_resampled - +from .ttest import (paired_ttest_5x2cv, paired_ttest_kfold_cv, + paired_ttest_resampled) __all__ = [ "scoring", diff --git a/mlxtend/evaluate/tests/test_time_series.py 
b/mlxtend/evaluate/tests/test_time_series.py index 1ec260b71..8233e0ba8 100644 --- a/mlxtend/evaluate/tests/test_time_series.py +++ b/mlxtend/evaluate/tests/test_time_series.py @@ -7,10 +7,11 @@ import numpy as np import pytest -from mlxtend.evaluate import GroupTimeSeriesSplit from sklearn.dummy import DummyClassifier from sklearn.model_selection import cross_val_score +from mlxtend.evaluate import GroupTimeSeriesSplit + @pytest.fixture def X(): From 96919e59248754ed27fe14f1f8bb06b2bdf5779d Mon Sep 17 00:00:00 2001 From: rasbt Date: Wed, 11 May 2022 17:47:02 -0500 Subject: [PATCH 22/30] isort with black profile --- mlxtend/evaluate/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlxtend/evaluate/__init__.py b/mlxtend/evaluate/__init__.py index 01e0eb2ea..7c76fec88 100644 --- a/mlxtend/evaluate/__init__.py +++ b/mlxtend/evaluate/__init__.py @@ -22,8 +22,7 @@ from .proportion_difference import proportion_difference from .scoring import scoring from .time_series import GroupTimeSeriesSplit -from .ttest import (paired_ttest_5x2cv, paired_ttest_kfold_cv, - paired_ttest_resampled) +from .ttest import paired_ttest_5x2cv, paired_ttest_kfold_cv, paired_ttest_resampled __all__ = [ "scoring", From 0010304baaddf1fc826ba4822aed3d168d548908 Mon Sep 17 00:00:00 2001 From: rasbt Date: Sat, 14 May 2022 09:58:23 -0500 Subject: [PATCH 23/30] add description at the top --- .../evaluate/GroupTimeSeriesSplit.ipynb | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb index c5fc5fde1..ff971b8e4 100644 --- a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb +++ b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb @@ -7,6 +7,20 @@ "# GroupTimeSeriesSplit: A scikit-learn compatible version of the time series validation with groups" ] }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "A scikit-learn-compatible time series cross-validator that supports non-overlapping groups." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> `from mlxtend.evaluate import GroupTimeSeriesSplit` " + ] + }, { "cell_type": "code", "execution_count": 1, @@ -1709,9 +1723,9 @@ ], "metadata": { "kernelspec": { - "display_name": "mlxtend", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "mlxtend" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1723,7 +1737,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.7" } }, "nbformat": 4, From 82eb1246fca5a2483997b8b056de2b37044ffe63 Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sun, 15 May 2022 13:45:51 +0300 Subject: [PATCH 24/30] Prepare description and code for documentation --- .../evaluate/GroupTimeSeriesSplit.ipynb | 1873 ++++------------- mlxtend/evaluate/time_series.py | 117 + 2 files changed, 566 insertions(+), 1424 deletions(-) diff --git a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb index ff971b8e4..1761a1b07 100644 --- a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb +++ b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb @@ -22,15 +22,37 @@ ] }, { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [ - "%load_ext autoreload\n", - "%autoreload 2" + "Time series tasks in machine learning require special type of validation, because the time order of the objects is important if we want to get more fair evaluation of the ML model’s quality. \n", + "Also there can be different units for splitting the data for different tasks - hours, days, months etc. 
\n", + "\n", + "Here, we use time series validation with support of the groups which can be flexibly configured along with other parameters:\n", + "\n", + "- Test size\n", + "- Train size\n", + "- Number of splits\n", + "- Gap size\n", + "- Shift size \n", + "- Window type \n", + "\n", + "There are several features that need to be taken into account:\n", + "\n", + "- Implementation is compatible with sklearn-learn API\n", + "- It can be used both for holdout split and cross-validation\n", + "- Numbers or custom non-numeric values can be used as groups\n", + "- Groups should be consecutive\n", + "- Test size with either train size or number of splits are required parameters for splitting\n", + "- If full data can’t be used with specific parameters, the most recent data is considered to split\n", + "- If split is impossible (e.g., the data is not enough to split) using specified parameters, an exception will be raised " ] }, { @@ -39,21 +61,17 @@ "metadata": {}, "outputs": [], "source": [ - "from pathlib import Path\n", - "\n", - "from IPython.display import display\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.dummy import DummyClassifier\n", "from sklearn.model_selection import cross_val_score\n", "\n", - "from matplotlib.patches import Patch\n", - "import matplotlib.pyplot as plt\n", - "from matplotlib.ticker import MaxNLocator\n", - "\n", - "%matplotlib inline\n", - "\n", - "from mlxtend.evaluate.time_series import GroupTimeSeriesSplit" + "from mlxtend.evaluate.time_series import (\n", + " GroupTimeSeriesSplit,\n", + " plot_splits,\n", + " print_cv_info,\n", + " print_split_info,\n", + ")" ] }, { @@ -65,6 +83,13 @@ "## Prepare sample data" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Group numbers" + ] + }, { "cell_type": "code", "execution_count": 3, @@ -73,12 +98,7 @@ { "data": { "text/plain": [ - "{'2021-01': 0,\n", - " '2021-02': 1,\n", - " '2021-03': 2,\n", - " '2021-04': 3,\n", - " '2021-05': 4,\n", - " 
'2021-06': 5}" + "array([0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5])" ] }, "execution_count": 3, @@ -87,8 +107,15 @@ } ], "source": [ - "months_map = {f\"2021-0{i+1}\": i for i in range(6)}\n", - "months_map" + "groups = np.array([0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5])\n", + "groups" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Group names (months)" ] }, { @@ -99,22 +126,9 @@ { "data": { "text/plain": [ - "['2021-01',\n", - " '2021-02',\n", - " '2021-02',\n", - " '2021-02',\n", - " '2021-02',\n", - " '2021-03',\n", - " '2021-03',\n", - " '2021-03',\n", - " '2021-04',\n", - " '2021-04',\n", - " '2021-05',\n", - " '2021-05',\n", - " '2021-06',\n", - " '2021-06',\n", - " '2021-06',\n", - " '2021-06']" + "array(['2021-01', '2021-02', '2021-02', '2021-02', '2021-02', '2021-03',\n", + " '2021-03', '2021-03', '2021-04', '2021-04', '2021-05', '2021-05',\n", + " '2021-06', '2021-06', '2021-06', '2021-06'], dtype='\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
num_feature
2021-010
2021-027
2021-026
2021-024
2021-024
2021-038
2021-030
2021-036
2021-042
2021-040
2021-055
2021-059
2021-067
2021-067
2021-067
2021-067
\n", - "" - ], + "image/png": "\n", "text/plain": [ - " num_feature\n", - "2021-01 0\n", - "2021-02 7\n", - "2021-02 6\n", - "2021-02 4\n", - "2021-02 4\n", - "2021-03 8\n", - "2021-03 0\n", - "2021-03 6\n", - "2021-04 2\n", - "2021-04 0\n", - "2021-05 5\n", - "2021-05 9\n", - "2021-06 7\n", - "2021-06 7\n", - "2021-06 7\n", - "2021-06 7" + "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" }, { - "data": { - "text/plain": [ - "0 1\n", - "1 0\n", - "2 1\n", - "3 0\n", - "4 1\n", - "5 0\n", - "6 0\n", - "7 1\n", - "8 1\n", - "9 1\n", - "10 0\n", - "11 1\n", - "12 1\n", - "13 0\n", - "14 0\n", - "15 0\n", - "Name: target, dtype: int64" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stdout", + "output_type": "stream", + "text": [ + "Train indices: [0 1 2 3 4 5 6 7]\n", + "Test indices: [8 9]\n", + "Train length: 8\n", + "Test length: 2\n", + "Train groups: [0 1 1 1 1 2 2 2]\n", + "Test groups: [3 3]\n", + "Train group size: 3\n", + "Test group size: 1\n", + "Train group months: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03'\n", + " '2021-03']\n", + "Test group months: ['2021-04' '2021-04']\n", + "\n", + "Train indices: [1 2 3 4 5 6 7 8 9]\n", + "Test indices: [10 11]\n", + "Train length: 9\n", + "Test length: 2\n", + "Train groups: [1 1 1 1 2 2 2 3 3]\n", + "Test groups: [4 4]\n", + "Train group size: 3\n", + "Test group size: 1\n", + "Train group months: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03'\n", + " '2021-04' '2021-04']\n", + "Test group months: ['2021-05' '2021-05']\n", + "\n", + "Train indices: [ 5 6 7 8 9 10 11]\n", + "Test indices: [12 13 14 15]\n", + "Train length: 7\n", + "Test length: 4\n", + "Train groups: [2 2 2 3 3 4 4]\n", + "Test groups: [5 5 5 5]\n", + "Train group size: 3\n", + "Test group size: 1\n", + "Train group months: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04' '2021-05' '2021-05']\n", + "Test group months: ['2021-06' '2021-06' '2021-06' '2021-06']\n", + "\n" + ] } ], "source": [ - "data = [[0], [7], [6], [4], [4], [8], [0], [6], [2], [0], [5], [9], [7], [7], [7], [7]]\n", - "target = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0]\n", - "\n", - "X = pd.DataFrame(data, index=months, columns=[\"num_feature\"])\n", - "y = pd.Series(target, 
name=\"target\")\n", + "cv_args = {\"test_size\": 1, \"train_size\": 3}\n", "\n", - "display(X, y)" + "plot_splits(X, y, groups, **cv_args)\n", + "print_split_info(X, y, groups, **cv_args)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Define helper function" + "### Usage in CV" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split number: 1\n", + "Train true target: [1 0 1 0 1 0 0 1]\n", + "Train predicted target: [0 0 0 0 0 0 0 0]\n", + "Test true target: [1 1]\n", + "Test predicted target: [0 0]\n", + "Accuracy: 0.0\n", + "\n", + "Split number: 2\n", + "Train true target: [0 1 0 1 0 0 1 1 1]\n", + "Train predicted target: [1 1 1 1 1 1 1 1 1]\n", + "Test true target: [0 1]\n", + "Test predicted target: [1 1]\n", + "Accuracy: 0.5\n", + "\n", + "Split number: 3\n", + "Train true target: [0 0 1 1 1 0 1]\n", + "Train predicted target: [1 1 1 1 1 1 1]\n", + "Test true target: [1 0 0 0]\n", + "Test predicted target: [1 1 1 1]\n", + "Accuracy: 0.25\n", + "\n" + ] + } + ], "source": [ - "def test_cv(X, y, groups, **cv_args):\n", - " cv = GroupTimeSeriesSplit(**cv_args)\n", - " groups = np.array(groups)\n", - "\n", - " for train_idx, test_idx in cv.split(X, groups=groups):\n", - " print(\"TRAIN INDICES:\", train_idx)\n", - " print(\"TEST INDICES:\", test_idx)\n", - " print(\"TRAIN LENGTH:\", len(train_idx))\n", - " print(\"TEST LENGTH:\", len(test_idx))\n", - " print(\"TRAIN GROUPS:\", groups[train_idx])\n", - " print(\"TEST GROUPS:\", groups[test_idx])\n", - " print(\"TRAIN GROUP SIZE:\", len(set(groups[train_idx])))\n", - " print(\"TEST GROUP SIZE:\", len(set(groups[test_idx])))\n", - " print(\"TRAIN GROUP MONTHS:\", X.index[train_idx].values)\n", - " print(\"TEST GROUP MONTHS:\", X.index[test_idx].values)\n", - " print()\n", - "\n", - "\n", - "def plot_cv_indices(cv, X, y, groups, n_splits, image_file_path=None):\n", - " \"\"\"Create a 
sample plot for indices of a cross-validation object.\"\"\"\n", - "\n", - " fig, ax = plt.subplots(figsize=(12, 4))\n", - " cmap_data = plt.cm.tab20\n", - " cmap_cv = plt.cm.coolwarm\n", - " lw = 10\n", - " marker_size = 200\n", - "\n", - " for split_idx, (train_idx, test_idx) in enumerate(\n", - " cv.split(X=X, y=y, groups=groups)\n", - " ):\n", - " indices = np.array([np.nan] * len(X))\n", - " indices[test_idx] = 1\n", - " indices[train_idx] = 0\n", - "\n", - " ax.scatter(\n", - " range(len(X)),\n", - " [split_idx + 0.5] * len(X),\n", - " c=indices,\n", - " marker=\"_\",\n", - " lw=lw,\n", - " cmap=cmap_cv,\n", - " vmin=-0.4,\n", - " vmax=1.4,\n", - " s=marker_size,\n", - " )\n", - " ax.scatter(\n", - " range(len(X)),\n", - " [split_idx + 1.5] * len(X),\n", - " c=groups,\n", - " marker=\"_\",\n", - " lw=lw,\n", - " cmap=cmap_data,\n", - " s=marker_size,\n", - " )\n", - "\n", - " yticklabels = list(range(n_splits)) + [\"group\"]\n", - " ax.set(\n", - " yticks=np.arange(n_splits + 1) + 0.5,\n", - " yticklabels=yticklabels,\n", - " ylabel=\"CV iteration\",\n", - " ylim=[n_splits + 1.2, -0.2],\n", - " xlim=[-0.5, len(indices) - 0.5],\n", - " )\n", - " ax.legend(\n", - " [Patch(color=cmap_cv(0.2)), Patch(color=cmap_cv(0.8))],\n", - " [\"Training set\", \"Testing set\"],\n", - " loc=(1.02, 0.8),\n", - " fontsize=13,\n", - " )\n", - "\n", - " ax.set_title(\"{}\\n{}\".format(type(cv).__name__, cv_args), fontsize=15)\n", - " ax.xaxis.set_major_locator(MaxNLocator(min_n_ticks=len(X), integer=True))\n", - " ax.set_xlabel(xlabel=\"Sample index\", fontsize=13)\n", - " ax.set_ylabel(ylabel=\"CV iteration\", fontsize=13)\n", - " ax.tick_params(axis=\"both\", which=\"major\", labelsize=13)\n", - " ax.tick_params(axis=\"both\", which=\"minor\", labelsize=13)\n", - "\n", - " plt.tight_layout()\n", - "\n", - " if image_file_path:\n", - " plt.savefig(image_file_path, bbox_inches=\"tight\")\n", - "\n", - " plt.show()\n", - "\n", - "\n", - "def plot_cv(X, y, groups, 
image_file_path=None, **cv_args):\n", - " cv = GroupTimeSeriesSplit(**cv_args)\n", - " cv._n_groups = len(np.unique(groups))\n", - " cv._calculate_split_params()\n", - " n_splits = cv.n_splits\n", + "cv = GroupTimeSeriesSplit(**cv_args)\n", + "clf = DummyClassifier(strategy=\"most_frequent\")\n", "\n", - " plot_cv_indices(cv, X, y, groups, n_splits, image_file_path=image_file_path)" + "scores = cross_val_score(clf, X, y, groups=groups, scoring=\"accuracy\", cv=cv)\n", + "print_cv_info(cv, X, y, groups, clf, scores)" ] }, { "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "## Check train/test split" - ] - }, - { - "cell_type": "code", - "execution_count": 8, "metadata": {}, - "outputs": [], "source": [ - "cv_args = {\"test_size\": 3, \"n_splits\": 1}" + "## Example 2 -- Multiple training groups (with number of splits specified)" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TRAIN INDICES: [0 1 2 3 4 5 6 7]\n", - "TEST INDICES: [ 8 9 10 11 12 13 14 15]\n", - "TRAIN LENGTH: 8\n", - "TEST LENGTH: 8\n", - "TRAIN GROUPS: [0 1 1 1 1 2 2 2]\n", - "TEST GROUPS: [3 3 4 4 5 5 5 5]\n", - "TRAIN GROUP SIZE: 3\n", - "TEST GROUP SIZE: 3\n", - "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03'\n", - " '2021-03']\n", - "TEST GROUP MONTHS: ['2021-04' '2021-04' '2021-05' '2021-05' '2021-06' '2021-06' '2021-06'\n", - " '2021-06']\n", - "\n" - ] - }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -489,50 +306,107 @@ "needs_background": "light" }, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train indices: [0 1 2 3 4]\n", + "Test indices: [5 6 7 8 9]\n", + "Train length: 5\n", + "Test length: 5\n", + "Train groups: [0 1 1 1 1]\n", + "Test groups: [2 2 2 3 3]\n", + "Train group size: 2\n", + "Test group size: 2\n", + "Train group months: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02']\n", + "Test group months: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04']\n", + "\n", + "Train indices: [1 2 3 4 5 6 7]\n", + "Test indices: [ 8 9 10 11]\n", + "Train length: 7\n", + "Test length: 4\n", + "Train groups: [1 1 1 1 2 2 2]\n", + "Test groups: [3 3 4 4]\n", + "Train group size: 2\n", + "Test group size: 2\n", + "Train group months: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03']\n", + "Test group months: ['2021-04' '2021-04' '2021-05' '2021-05']\n", + "\n", + "Train indices: [5 6 7 8 9]\n", + "Test indices: [10 11 12 13 14 15]\n", + "Train length: 5\n", + "Test length: 6\n", + "Train groups: [2 2 2 3 3]\n", + "Test groups: [4 4 5 5 5 5]\n", + "Train group size: 2\n", + "Test group size: 2\n", + "Train group months: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04']\n", + "Test group months: ['2021-05' '2021-05' '2021-06' '2021-06' '2021-06' '2021-06']\n", + "\n" + ] } ], "source": [ - "test_cv(X, y, groups, **cv_args)\n", - "plot_cv(X, y, groups, **cv_args)" + "cv_args = {\"test_size\": 2, \"n_splits\": 3}\n", + "\n", + "plot_splits(X, y, groups, **cv_args)\n", + "print_split_info(X, y, groups, **cv_args)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Check cross-validation split" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "### Success cases" + "### Usage in CV" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 9, "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Split number: 1\n", + "Train true target: [1 0 1 0 1]\n", + "Train predicted target: [1 1 1 1 1]\n", + "Test true target: [0 0 1 1 1]\n", + "Test predicted target: [1 1 1 1 1]\n", + "Accuracy: 0.6\n", + "\n", + "Split number: 2\n", + "Train true target: [0 1 0 1 0 0 1]\n", + "Train predicted target: [0 0 0 0 0 0 0]\n", + "Test true target: [1 1 0 1]\n", + "Test predicted target: [0 0 0 0]\n", + "Accuracy: 0.25\n", + "\n", + "Split number: 3\n", + "Train true target: [0 0 1 1 1]\n", + "Train predicted target: [1 1 1 1 1]\n", + "Test true target: [0 1 1 0 0 0]\n", + "Test predicted target: [1 1 1 1 1 1]\n", + "Accuracy: 0.33\n", + "\n" + ] + } + ], "source": [ - "- test_size + train_size \n", - "- test_size + n_splits\n", - "- test_size + n_splits + gap_size\n", - "- test_size + n_splits + shift_size\n", - "- test_size + n_splits + expanding window\n", - "- test_size + train_size + n_splits (full usage of data)\n", - "- test_size + train_size + n_splits (partial usage of data)\n", - "- Groups are not sorted\n", - "- Group names\n", - "- Usage in CV" + "cv = GroupTimeSeriesSplit(**cv_args)\n", + "clf = DummyClassifier(strategy=\"most_frequent\")\n", + "\n", + "scores = cross_val_score(clf, X, y, groups=groups, scoring=\"accuracy\", cv=cv)\n", + "print_cv_info(cv, X, y, groups, clf, scores)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**test_size + train_size**" + "## Example 3 -- Defining the gap size between training and test folds" ] }, { @@ -540,50 +414,9 @@ "execution_count": 10, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TRAIN INDICES: [0 1 2 3 4 5 6 7]\n", - "TEST INDICES: [8 9]\n", - "TRAIN LENGTH: 8\n", - "TEST LENGTH: 2\n", - "TRAIN GROUPS: [0 1 1 1 1 2 2 2]\n", - "TEST GROUPS: [3 3]\n", - "TRAIN GROUP SIZE: 3\n", - "TEST GROUP SIZE: 1\n", - "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03'\n", - " 
'2021-03']\n", - "TEST GROUP MONTHS: ['2021-04' '2021-04']\n", - "\n", - "TRAIN INDICES: [1 2 3 4 5 6 7 8 9]\n", - "TEST INDICES: [10 11]\n", - "TRAIN LENGTH: 9\n", - "TEST LENGTH: 2\n", - "TRAIN GROUPS: [1 1 1 1 2 2 2 3 3]\n", - "TEST GROUPS: [4 4]\n", - "TRAIN GROUP SIZE: 3\n", - "TEST GROUP SIZE: 1\n", - "TRAIN GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03'\n", - " '2021-04' '2021-04']\n", - "TEST GROUP MONTHS: ['2021-05' '2021-05']\n", - "\n", - "TRAIN INDICES: [ 5 6 7 8 9 10 11]\n", - "TEST INDICES: [12 13 14 15]\n", - "TRAIN LENGTH: 7\n", - "TEST LENGTH: 4\n", - "TRAIN GROUPS: [2 2 2 3 3 4 4]\n", - "TEST GROUPS: [5 5 5 5]\n", - "TRAIN GROUP SIZE: 3\n", - "TEST GROUP SIZE: 1\n", - "TRAIN GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04' '2021-05' '2021-05']\n", - "TEST GROUP MONTHS: ['2021-06' '2021-06' '2021-06' '2021-06']\n", - "\n" - ] - }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -592,20 +425,59 @@ "needs_background": "light" }, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train indices: [0 1 2 3 4]\n", + "Test indices: [8 9]\n", + "Train length: 5\n", + "Test length: 2\n", + "Train groups: [0 1 1 1 1]\n", + "Test groups: [3 3]\n", + "Train group size: 2\n", + "Test group size: 1\n", + "Train group months: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02']\n", + "Test group months: ['2021-04' '2021-04']\n", + "\n", + "Train indices: [1 2 3 4 5 6 7]\n", + "Test indices: [10 11]\n", + "Train length: 7\n", + "Test length: 2\n", + "Train groups: [1 1 1 1 2 2 2]\n", + "Test groups: [4 4]\n", + "Train group size: 2\n", + "Test group size: 1\n", + "Train group months: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03']\n", + "Test group months: ['2021-05' '2021-05']\n", + "\n", + "Train indices: [5 6 7 8 9]\n", + "Test indices: [12 13 14 15]\n", + "Train length: 5\n", + "Test length: 4\n", + "Train groups: [2 2 2 3 3]\n", + "Test groups: [5 5 5 5]\n", + "Train group size: 2\n", + "Test group size: 1\n", + "Train group months: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04']\n", + "Test group months: ['2021-06' '2021-06' '2021-06' '2021-06']\n", + "\n" + ] } ], "source": [ - "cv_args = {\"test_size\": 1, \"train_size\": 3}\n", + "cv_args = {\"test_size\": 1, \"n_splits\": 3, \"gap_size\": 1}\n", "\n", - "test_cv(X, y, groups, **cv_args)\n", - "plot_cv(X, y, groups, **cv_args)" + "plot_splits(X, y, groups, **cv_args)\n", + "print_split_info(X, y, groups, **cv_args)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**test_size + n_splits**" + "### Usage in CV" ] }, { @@ -617,66 +489,43 @@ "name": "stdout", "output_type": "stream", "text": [ - "TRAIN INDICES: [0 1 2 3 4]\n", - "TEST INDICES: [5 6 7 8 9]\n", - "TRAIN LENGTH: 5\n", - "TEST LENGTH: 5\n", - "TRAIN GROUPS: [0 1 1 1 1]\n", - "TEST GROUPS: [2 2 2 3 3]\n", - "TRAIN GROUP SIZE: 2\n", - "TEST 
GROUP SIZE: 2\n", - "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02']\n", - "TEST GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04']\n", + "Split number: 1\n", + "Train true target: [1 0 1 0 1]\n", + "Train predicted target: [1 1 1 1 1]\n", + "Test true target: [1 1]\n", + "Test predicted target: [1 1]\n", + "Accuracy: 1.0\n", "\n", - "TRAIN INDICES: [1 2 3 4 5 6 7]\n", - "TEST INDICES: [ 8 9 10 11]\n", - "TRAIN LENGTH: 7\n", - "TEST LENGTH: 4\n", - "TRAIN GROUPS: [1 1 1 1 2 2 2]\n", - "TEST GROUPS: [3 3 4 4]\n", - "TRAIN GROUP SIZE: 2\n", - "TEST GROUP SIZE: 2\n", - "TRAIN GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03']\n", - "TEST GROUP MONTHS: ['2021-04' '2021-04' '2021-05' '2021-05']\n", + "Split number: 2\n", + "Train true target: [0 1 0 1 0 0 1]\n", + "Train predicted target: [0 0 0 0 0 0 0]\n", + "Test true target: [0 1]\n", + "Test predicted target: [0 0]\n", + "Accuracy: 0.5\n", "\n", - "TRAIN INDICES: [5 6 7 8 9]\n", - "TEST INDICES: [10 11 12 13 14 15]\n", - "TRAIN LENGTH: 5\n", - "TEST LENGTH: 6\n", - "TRAIN GROUPS: [2 2 2 3 3]\n", - "TEST GROUPS: [4 4 5 5 5 5]\n", - "TRAIN GROUP SIZE: 2\n", - "TEST GROUP SIZE: 2\n", - "TRAIN GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04']\n", - "TEST GROUP MONTHS: ['2021-05' '2021-05' '2021-06' '2021-06' '2021-06' '2021-06']\n", + "Split number: 3\n", + "Train true target: [0 0 1 1 1]\n", + "Train predicted target: [1 1 1 1 1]\n", + "Test true target: [1 0 0 0]\n", + "Test predicted target: [1 1 1 1]\n", + "Accuracy: 0.25\n", "\n" ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" } ], "source": [ - "cv_args = {\"test_size\": 2, \"n_splits\": 3}\n", + "cv = GroupTimeSeriesSplit(**cv_args)\n", + "clf = DummyClassifier(strategy=\"most_frequent\")\n", "\n", - "test_cv(X, y, groups, **cv_args)\n", - "plot_cv(X, y, groups, **cv_args)" + "scores = cross_val_score(clf, X, y, groups=groups, scoring=\"accuracy\", cv=cv)\n", + "print_cv_info(cv, X, y, groups, clf, scores)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**test_size + n_splits + gap_size**" + "## Example 4 -- Expanding the window size" ] }, { @@ -684,48 +533,9 @@ "execution_count": 12, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TRAIN INDICES: [0 1 2 3 4]\n", - "TEST INDICES: [8 9]\n", - "TRAIN LENGTH: 5\n", - "TEST LENGTH: 2\n", - "TRAIN GROUPS: [0 1 1 1 1]\n", - "TEST GROUPS: [3 3]\n", - "TRAIN GROUP SIZE: 2\n", - "TEST GROUP SIZE: 1\n", - "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02']\n", - "TEST GROUP MONTHS: ['2021-04' '2021-04']\n", - "\n", - "TRAIN INDICES: [1 2 3 4 5 6 7]\n", - "TEST INDICES: [10 11]\n", - "TRAIN LENGTH: 7\n", - "TEST LENGTH: 2\n", - "TRAIN GROUPS: [1 1 1 1 2 2 2]\n", - "TEST GROUPS: [4 4]\n", - "TRAIN GROUP SIZE: 2\n", - "TEST GROUP SIZE: 1\n", - "TRAIN GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03']\n", - "TEST GROUP MONTHS: ['2021-05' '2021-05']\n", - "\n", - "TRAIN INDICES: [5 6 7 8 9]\n", - "TEST INDICES: [12 13 14 15]\n", - "TRAIN LENGTH: 5\n", - "TEST LENGTH: 4\n", - "TRAIN GROUPS: [2 2 2 3 3]\n", - "TEST GROUPS: [5 5 5 5]\n", - "TRAIN GROUP SIZE: 2\n", - "TEST GROUP SIZE: 1\n", - "TRAIN GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04']\n", - "TEST GROUP MONTHS: ['2021-06' '2021-06' '2021-06' '2021-06']\n", - "\n" - ] - }, { "data": { - "image/png": "\n", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA1gAAAEYCAYAAABBWFftAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAA6GklEQVR4nO3de5yUdd3/8fcbFhDY5XxGDqZoKQnIapn5y9vs5Nm8jcws1Dyf0iRN7U5NU9PbLA+VmmftVjFTszRPlGZaIIKIopCAyknktLAILPv5/XFdq+O4553ZmV1ez8djHjNzHb7Xe+aC3fns93t9xxEhAAAAAEDLdSh0AAAAAABoLyiwAAAAACBHKLAAAAAAIEcosAAAAAAgRyiwAAAAACBHKLAAAAAAIEcosAC0e7YPtv1X2+/Z3mj7HduTbX+10Nlq2B5pOxpxq9nulFbM1t32T23Psb3e9lLbf7N9TA6PMd/2lblqL6PdA2z/w/Yq22tsv2L7N7ZLm9jOxPR9L02f15yH/TO2+aHtvXL7CgAAbU1JoQMAQD7Z/oWk0yTdLunXkt6TNELSNyX9xfZ2ETGvgBFrLJa0e8bzT0i6S9LJkl6sZbs3Wy+a7pc0TtLFkmZJGiDp/0naV9LvcnSMQ5Scm5yxfbikuyX9Vkn2kLSzpO9K6iVpbQuarzkPr2Us+6GkayVNaUG7AIA2znzRMID2yvZBkv4o6aiIuLWW9QdImhYRi2pZ1zUi1uc9ZB1sj5b0sqT/iogpBcwxStLrkr4REfdlrXO08JdIPt9n2/+QtCoi9qtlXZOy254o6RZJZRFRa2Fme7mkayPiguYlBgC0BwwRBNCefV/Sv2srriQpIh6uKa7S4V5n2r7a9rtKihvZ7mf7tnR4YaXtKbbLM9upbcie7QvSD9w1z2uGmO1q+5l0qN3rtg9p6ovKPl6aabLto2y/aXut7Ttsd7G9m+1/pcum2B6e1dZWtn9u+y3bG2zPsL1vxia90vsltbx/HylQbA+3/X+2V6Tv1WO2d8hYXzOs7gjbt9teJenhdN3Hhgja3jMdiliZvv832i7LWN/L9k22F9l+3/ZC2zdmZf9Y7uzsGef+l2n2Vbavsd25tn2zXsv+Nfkl9ZX0k4zhnHvVtT8AoP2iwALQLtkuUTKE669N2G2SpMGSjlQyrFBKesC+IuksSROU/Nx82vZ2zYx2j6QHJX1dSRF3n+0xzWwr02eVDH07VclQtW9IukbSjZJ+KenbSoYd3pC132RJEyX9TNIBkv4t6SHbY9P1cyStk3S17S/b3qq2g9vuI+lZSTtIOiE9fndJT9jumrX5lZIqJB2WHre29vaQ9ISSAum/lRTL+yrpRapxlaTPSzpDyTk6V8kwwBovSjrc9im2h9R2nAw/kLS1pCOUDCc8TtIlDeyT6RBJq5UMmdw9vb1Y7x4AgHaJa7AAtFd9JXWR9FbmQtuW1DFj0eaM3ozFETEhY9uvStpD0l4R8bd02VOS5ispxo5vRq6bIuLKtK3HJM2W9CMl14S1RKmkgyJiddr2XpKOlfSFiPh7umyIpOtsd4uISttflLSfMl6fpL/a3l7SeZIOi4g1to9VUqg9JmmT7ecl3ZG+lpr37gwlBdXYiFiRHu8fSt6royVdl5H1+Yg4uYHXc5mk57LOxzuSnrQ9OiJmSdpN0nURcU/GfndmPD5X0qeVFJrX2H5TScH884jI7tmqSF9vtZJr87pIOs/2pTWvpz4RMd12laS3I+L5hrYHALRf9GABaO+yr7P5gaRNGbfMD/p/ztp2N0nLMooPRcQ6SX9S0nPSHA9ktFWtpDdrt2a2lWlqTXGVmitpo5JepcxlklTTm7OPkh6if9guqblJelLSB8MgI+L3SiYGOVrS/0naXklP2N0Zbe8j6XFJazLaqZA0LbOt1CP1vRDb3ZT0AN2bletZJedsfLrpS5Im2T4pLQo/IiLeSrfdR9L/SlqhpBCcaXvrrM0fTM9HjT9I6ippdH1
ZAQDIRoEFoL16T9IGJcO+Mt0hadf0lm1p1vPBkpbVsV2fZubKbm9ZepyWWpX1fKOkiqyiYWN6XzPMr5+kQfpowblJ0gWShmU2FhHvRcQtEfGddN0tkr6ZMbyxn5IhlNlt/Vd2W/r4+5ytt5Jexuuz2togqVNGe6co6ZH6H0lzbL9h+yM9gRGxOSKejIizIqJcyVDCPkoK7Uy1nRcpN+cGALAFYYgggHYpIqps/1PSl5V8AK9ZvlTpB/xktOBHd8t6vljJlOTZBirpDamxQVL2hAi964g2QB+djnxAepxCWCHpHUkHN2WniNjkZPr7oyR9UtKMtK2HJP20ll0qspto4BCr0m0u0Md7FSVpUZpjlZJr5U6zvbOSa8/usj0zImbXkf2vtmekuTNln+ea54U6NwCANooeLADt2dWSPmP7yGbu/4KkAbb/X82CdPjafvro0Lu3JX0qY5sOkr5YR5uHZG13kKR/NTNfSz2ppAdrbURMzb6lGctqmaRCkkal90sz2tpJ0iu1tDWnKaHSYZjPS9qhtly1TasfETOVXBfXQWnxZPtjxXE6ScfW+ngv2kHp+ajxdUnrlXzvV2Nt1Ie9gwCALRQ9WADarYh40PbVkm61/V9KpgRfrmQCjC+nm9X5ZbMR8Zjt5yTdY/scJT1PZym5NueKjE0fkHSy7emS/iPpe5J61NHs92xvVPLB/XuStpN0ePNeYYs9rmTiisdtXy7pFSW5x0raKiJ+pGRWwIds3yzpOUmV6frzlFwDVVNoXqVkpsKnbF+jpGdsoKQvSHo2vY6rKX6oZEKLaiUzHVZIGq6kuD0vIl63/ayS936Wkh6vY5XMeFhTsD5m+zUl5/0tJcXkKUp6F3+bdbwyJTM63qikUPyxkgk0GpzgIsNrkvaz/aiSf1dzIiK79w4A0M5RYAFo1yLiDNt/l3SSkim0yyS9K+mfkvaNiL800MTBSiZIuFpJ78S/JO0dEXMztrlQyZCyi5X0YlyrpFipbaa8b0r6RbrtW5ImRMT05ry2loqIsP11JbPtfV9JAbNCSeF0TbrZPEk3Kbl26UQlxeVCSTdLujwiqtK2ltv+rJKpzX+h5DuoFispwGY2I9uzac/hhUqum+soaYGkR/Vh79M/lUwxP1LSZknTJX0tIt5O1/9cyft9uZLz866SqdM/HxHZvYb/q2Qa+98r6QX7Xfq+NMUkJbMlPiKpm5Lrz6Y0sQ0AQBvnJnyRPQCgmWxPVDIxRFlE1NlrhtZnOySdGhHXFjoLAKDt4xosAAAAAMgRCiwAAAAAyBGGCAIAAABAjtCDBQAAAAA5QoEFAAAAADlCgQWgVdkea/t52+ts35K1LmzvlT7ubPsC22PzkCEvbdueYntyjtvcK53lrqn79bf9K9v/sr3R9vwWZJiYub/t3Wxf0Nz26jlO2D4l1+02cMyiOWdZbXS3/Vb6nozOWvcL2+ttz7S9b8vSAgByjQILQGu7WNJQSUekj+vSWdJPlHypba7lq+2TJP0ox20211BJEyQtUfK9Vrm0m5L3L9d2l3RfHtqtTzGds0znSepUx7qrJR0iaZOS71wDABQRvmgYQGvbXtKfIuKPhQ6SaxExu9AZMsyMiIGSZPtKSf/d2gFsW1KXiHi/MdtHxPN5jlTbMYvpnEmSbG8n6TRJZ0n6dfb6iFggaUG63TW2O0fExlaOCQCoAz1YAFpbZ0nrGrFdRXp/SzpMKmyPlCTbW9n+eTqEaoPtGdlDpWwfaHtaOhRxpe0XbH+hobbrY3sn24/aXpG2+6rtkzPWf2S4WUbb2beJGdvsaftvtittv2f7RttljXh/6hUR1S1tozZp9mvSxzWvZ0r6/ALby21/3va/Jb0v6bB0uNu1tuekr/NN29fZ7pHV9keGCNa8n7a/ZXuu7TW2/2J76ybkbTPnLMPVkm6S9FoD29V8YXXnHB4bANBC9GABKBoR4Yyne0t6SskwwkfSZYvT+8n6cJjaPEnfkPSQ7fKIeMn2tuk
2v5Q0SdJWksZL6tOItuvzsKRXJX1b0gZJO0jqUc/2u2c9/5akUyTNlSTbe0h6QtIflfQw9ZV0maTe6XNFxBRJme+L0uuhpkTExEZkbrGIuFXSrenTRyT9r6Qf6MPXtyZj826SbpP0c0mvS1qULuuoZNjbu5KGpY/vk/SVBg7/GUlD0uN1VXJOb5DU2GuP2tQ5s72fpM9KOlLSmHpfGQCgKFFgAWhtvdS4Hqx/p/fzMoeO2f6ipP0k7RURf0sX/9X29ko+tB8maZykioiYlNHenxtquz62+0naRtJBEfFyuvjJ+vbJyj1e0rGSLoyIZ9PFl0l6LiImZGz3jqQnbY+OiFl1NF0laXNjcudaRLxbM+FFHe9dV0lnRsSDWctPrHlgu0TSm5KetT08IhbWc8gekvaLiJXpvoMk/cJ214hYX1/WtnbObHdW0nv1PxGxMhlhWa/K9L6nPuzNAgAUGEMEAbSKdJjY0Uo+DE5rQVP7KJm44R+2S2puSj44l6fbvCypp+3bbH/ZdvcWhU+skPSWpN/YnmB7QGN3tN1f0h+U9HxclC7rpqS35N6s1/GskskLxtfVXkRsFxHHNP+l5FVI+kv2QttH2p5ue62S11dTsGzfQHv/rimuUjXXTA1tRJa2ds7OVDKs8reNjDldUrWkk233cCMqMgBA/lFgAci79PqmtZJ+J+maiHioBc31kzRIyQfazNsFSoaeKSLmSDpI0ieU9Fwtt313+qG5WdJrmr6spLi7WdIS28/YHlfffukH8HslbZT07Yiomb67t5Jhc9dnvY4NSmaPG9bcrAW2MnvCBduHSLpd0j+V9DB+VskseFIyfLM+q7Ke17Td0H5t6pyl/zbPU/LvuMx2L0ml6eqy2v5IEBFvSDpZ0g8lrZb0hextAACtjyGCAFrDIkl7KrlW6iTbd0XEC81sa4WkdyQdXN9GEfGIpEds91QypPBqJZMzfLOZx1VEvCbpUNudlLyey9NjbF3PpBJXStpV0mcjYnXG8lVKensu0EeHL9ZY1NycBVbb9z8dJumFiDipZoE/nHAkv2HazjkbqqSgqu07uZ5T0kO7T+bCtEfuckkPSvqVpBdbcHwAQI5QYAHIu7RH41nbz0s6RkmvQkMFVl09FU8qmfBgbfrhuaFjr5Z0d/qBvmYCg0b3gtTR5iZJT9m+StLdSq4rW5G9ne0jJZ0u6ZvZ1+ZExLr0/dghIi5qTo4C2iglszk2cgr2rkp6eTIdkfNU9WgD52yupP/KWjZW0i8kHa3ai6c9lFyj9sOImJfjPACAZqLAAtBqIqLK9nLVP4tbzbYbbb8p6Ru2Zym5NmWmpMclPSbpcduXS3olbW+spK0i4ke2j1dSTD2qpFdhlJJelNvra7u+7xKyvbOSno17JP1HyXCxsyXNiIjaPqhvq2S2u78o+c6iz2asnhcR7yoZ2vWk7WolPRcVkoYr6XE7LyJeryPLXEl/a+iaHts13321vaRuGc//lh5ftm9VMmHIyPraylJT2J5u+ylJa9JhmXV5XNJ1ts9TUljvK+mLTThes7SlcxYRayVNydqn5uG/65g8o+b/0bLa2gQAFAYFFoDWFsqawroeJyj5gPyEpC6StomI+ba/LulcSd9X8uF2haSXlH4/k5JC7EBJVymZmn2xpBsl/U99bUuaX0+WJZKWKrlOZoiS4WJPK/nAXpthSnrIvpbeMh0l6daIeNb2/5N0oaQ7lFzfs0BJYbi0niwl6bYNua+O5/+lDz/Md1PTP6A/I+kKJT09l0r6u6S96tn+t0quhztdyXvyuJLpz/P9xcJt8Zw1Rc3/o9qGZQIACsQfXrsLAPlne56kP0fEqYXOAsn2AiXTgt9W6CxomrSn9jeSujZyqCYAoBXQgwWgtb0t6fO2h0haUs9EA8gz20OVzH73+0JnQePZ7qBkuONXJS2nuAKKx7Rp0zp36NDhxI4dOx4VET3V+BEbaBvC9urNmzffUl1d/evx48fXemkBBRaA1na
VpLuUzAR4m6SJBU2TSj+01vnVFRFR1YpxWkVEvKNk6FybtCWes9RVSoZbblLdwx0BFEBJScmNPXr02GPIkCHrOnfu/B5fT9e+RIQ2btzYadGiRaeuWbNmF0nfrW07vgcLQKuKiAeVfJfVaCXTXReLm/Xx79b64JZ+lxeKy5Z6zv5X0s6S+kTELwodBsBHfH7EiBGru3Tpsoniqv2xrS5dumwaMWLEakmfr2s7erAAtLp0SNMrhc6R5QJJ19azvq1+L1V7doG2wHMWEW9JeqvQOQDUqmOHDh2Y4KCdS89xnRMXUWDVol+/fjFy5MhCxwDQisaPH9/gNuXl5a2QBI3FOQOQb9OmTVseEf0LnQNtCwVWLUaOHKmpU6cWOgYAAAAKKJ1pFWgSCiwAAAAgj372+41jKjfk/3N3ty6qOvfwzjPyfZxsv/71r/tcffXVg+bMmTM7H9u3NUxyAQAAAORRaxRXTTlOt27dxtXcSkpKdikpKdklc1lTj3viiSeuaEqx1NTtW9OcOXM62x4/b968Ts1tgx4sAAAAYAtSWVk5vebxhAkTRlRVVfn++++fX9u2GzZscJcuXZi4ownowQIAAAAgSdptt912OProo4fts88+25aWlo678MILB86bN6/TnnvuOap3795jysrKxo4fP36HZ555plvNPr/61a/6Dh8+fHRmG8cee+zWX/nKV7bt3r37uGHDho2+8847ezV3++rqap1zzjmDBg4cuHPPnj3HHnPMMcN233337c8888xav8tx/fr1Pvzww0f06dNnTGlp6bgRI0aMvvnmm3vXrH/00UdLx48fv0PPnj3HDhs2bPRPfvKTgdXV1ZKk8vLyHSVp9OjRo7t16zZu0qRJg5v6HlJgAQAAAPjAvffe2++0005btmbNmuk/+tGPllVXV/uEE05YtnDhwpcXL148Y+edd66cMGHCths2bKjzy74mT57c96yzzlqyZs2a6d/73veWnXjiiSMrKirqrD3q2/7666/ve+ONNw78wx/+8MayZctmDB48eOO///3v0rrauu666/q+9NJL3WfPnj1r7dq105988sk5Y8aMWS9J06ZN2+rQQw8ddcYZZyx97733XnrooYfm3nTTTQOuv/76vpI0derU2ZI0a9asWZWVldOvuOKKxU19/yiwAAAAAHzga1/72soDDzywokOHDiorK6seNWrUxiOOOGJ1WVlZdWlpaVx11VXvLF68uPOsWbO61NXG/vvvv/JLX/rSuo4dO+qMM85Yvnbt2o7N3f7uu+/ue+SRR767xx57rO/SpUtcdNFFSwcMGLCprrY6d+4clZWVHV566aWumzZt0nbbbbdp/Pjx70vSL3/5ywH77rvvym9/+9urSkpKNG7cuPe/973vLbv77rv7tuQ9y8Q1WAAAAAA+MGLEiA2ZzxcvXlxy0kknbf3888+XVVRUlNgOSVqyZEmdtcTgwYM/KIB69OhRLUlr1qyp88t569t+yZIlnUaMGLGxZn2HDh00ePDgjR9vJXHiiSe+t3Tp0k5nnXXWsAULFnTZfffdK6666qq3R48evWHhwoWdn3/++R5lZWW9araPCA8aNKjO9pqKHiwAAAAAH+jQ4aMlwhlnnDF06dKlnZ5//vnX1q5dO33hwoUzpaQwaY08gwYN2rRgwYLONc+rq6u1ePHiznVt36lTJ11yySVLZs2a9eqbb775cteuXasnTpw4UpK23nrrjYcddtjyioqKl2pua9eunT537txXpI+/9uagwAIAAABQp4qKio5du3at7t+/f9Xq1as7nHrqqVu35vEPP/zw9+68887+zz33XNcNGzb4wgsvHLhs2bI6p1F/6KGHyp555pluGzZscPfu3au7d+9e3bFjx5Ck008/fdnDDz/c5+677+65YcMGb9q0SdOmTdvqkUceKZWSnrQOHTpo9uzZWzU3LwUWAAAAkEfduqiqLR/nZz/72Tvvvfdep759+47daaeddvrc5z63tmPHOkf
75dzJJ5/83lFHHbXs4IMPHjVgwIAxb7/9ducxY8as69KlS3Vt2y9evLjTxIkTt+ndu/fYQYMGjXnrrbc633TTTQskadddd31/8uTJb1xzzTUDBw0atHO/fv3GTpw4cZuagq20tDQmTZr0zlFHHfWJsrKysWefffagpuZ1BNPaZysvL4+pU6cWOgYAAAAKyPa0iChv7PYzZsyYP2bMmOX5zARp8+bNGjx48M4XXXTR2yeccMKKQmSYMWNGvzFjxoysbR09WAAAAACK2g033NC7srLSFRUVHc4888wh77//fodDDz10daFz1YYCCwAAAEBR++1vfztg4MCBY4YMGbLzM8880+MPf/jDG/37999c6Fy1YZp2AAAAAEVt2rRpcwqdobHowQIAAACAHKHAAgAAAIAcocACAAAAgByhwAIAAACAHKHAAgAAAIAcYRZBAAAAII9WXXXWmFi/Lu+fu921e1WvM6+cke/jNMV2222309lnn73o2GOPXVnoLK2FAgsAAADIo9YorppynG7duo2rebxx40ZLUufOnaNmWWVl5fSmHnvOnDmdP/nJT3567ty5M7fddttNNcvnzp37SlPbai2HHnroyJKSkrjnnnsW5LJdCiwAAABgC5JZQE2YMGFEVVWV77///vkFjNSucA0WAAAAAEnSG2+80fmrX/3qJ/r16zemf//+Ox9++OEjVq5c2UGSqqurdeqppw4dMGDAzt27dx83dOjQT19yySUDJKm8vHxHSRo9evTobt26jZs0adJgSRo6dOinr7/++j6S9Kc//amspKRk/I033th72LBho8vKysbuu+++n6hpX5JmzpzZZdddd92htLR03A477LDjT3/60wG2x9eV949//GPZpz71qR1LS0vH9e7de8znPve57WvWVVRUdDjuuOO2Hjp06Kd79uw5ds899xw1a9asLpJ0/vnnD3zwwQf73H///X27des2rlu3buOqqqpy8h5SYAEAAABQZWWlv/jFL27/yU9+8v358+fPnD179iuLFi3qdNxxxw2XpD/+8Y897r333r7PPffcq+vWrZv+r3/969W99tqrQpKmTp06W5JmzZo1q7KycvoVV1yxuLZjbN68WY8//niPWbNmzZ49e/asV155pdtll102QJI2bdqkAw88cNROO+1UuWTJkpceeOCBubfffnv/+jIfd9xx25xwwglL16xZM33RokUzzz333A+Oe8QRR4x44403tnr++edfXbp06Yzy8vJ1BxxwwHYbNmzwxRdfvPSggw5aceihh75XWVk5vbKycnpJSW4G91FgAQAAANA999zTKyJ09dVXLyotLY3+/ftvvvjiixc9+OCDfaqqqtS5c+fYuHGjX3rppa6VlZUeOnRo1R577LG+qce5+uqr3+nZs2f1sGHDqr761a+uevHFF7tL0lNPPVW6aNGiztdee+3bpaWlseOOO2486aSTltbXVqdOnWLevHld3n777ZKuXbvG/vvvXyFJixcvLnn44Yf73HDDDQuHDRtWtdVWW8WVV165aPny5Z2mTJnSvXnvUONQYAEAAADQf/7zn86LFy/uXFZWNrbmtu+++25vW2+99Van/fffv+K8885759JLLx08YMCAsXvssceov//9792acoyOHTtqyJAhH4zF6969e/W6des6StLChQs79enTp6q0tPSDCTe22WabDfW1N3ny5Lnz5s3basyYMTttu+22O1100UUDJOn111/vLEm77LLLjjWvpVevXmOrqqo8f/78zk3J3FRMcgEAAABAI0aM2Dhy5MgN9c38d9ZZZy0/66yzlldUVHSYNGnSkAkTJmy7ePHilzt0aHm/zfDhwzetXLmyZO3ata4pshoqhnbffff1jzzyyH+qq6v117/+tfTggw/efuzYsevHjx+/XpLmzJkzK7Ogy5SLzLW2m5dWAQAAALQpEyZMWL1p0yafc845g1auXNmhurpab775Zqfbb7+9lyQ9/fTT3R599NHS9ev
Xu2vXrtVlZWWba4qUwYMHb+rQoYNmz569VXOPv/fee68dPHjwxtNOO23ryspKv/baa52vv/76gXVt//777/uaa67pu3jx4pIOHTqob9++VbajY8eOMXTo0KoDDjhgxTHHHDP8zTff7CRJy5cv73j77bf3Wr16dQdJGjhw4KYFCxZ02bx5c3Mj14oCCwAAAMgjd+2em+np8nycsrKy6ieffHLOq6++2nWHHXYY3aNHj3F777339tOnT+8mSRUVFR3PPPPMYf369Rvbu3fvsU899VSPO++8c54klZaWxqRJk9456qijPlFWVjb27LPPHtTU43fq1EkPPPDA3JkzZ3br37//2IMOOmi7CRMmvNepU6eoa5/Jkyf33nHHHXfq1q3buEMOOWTUpEmTFu23335rJemuu+5aMGrUqPf32muvHbp37z5u9OjRO9133329bUuSTjrppHcrKys79O7de2xZWdnYXM0i6Ig6826xysvLY+rUqYWOAQAAgAKyPS0iyhu7/YwZM+aPGTNmeT4zbWmuuOKKftddd92g+fPnzyp0lkwzZszoN2bMmJG1raMHCwAAAEBReOyxx0pfeeWVLtXV1XrhhRe6/upXvxp0yCGHrCh0rqZgkgsAAAAARWH+/PmdJ06cuM2qVatKevfuXbX//vuvvOSSS2r9Tq1iRYEFAAAAoCgcf/zxK44//vg21WOVjQIrR376++ZdFPfjw1vnFDQ3n1T8GcmXKPZ8UvFnJF/L8HOm5dZc9f1m7dfjzKtzmqMuxZ4PAIoB12ABAAAAuVFdXV3tQodAfqXnuLqu9e2+wLLd0fYVtt+1XWH7ftv9Cp0LAAAA7YvtJevXr2/290ChbVi/fv1WtpfUtb7dF1iSzpF0kKTPSNo6XXZH4eIAAACgPaqqqrpw/vz5ndetW9eVnqz2p7q62uvWres6f/78zlVVVRfWtd2WcA3WcZIuioj/SJLtH0qaa3tERCwobDQAAAC0F7vssstjL7744inz5s37SUQM0pbRmbElqba9pKqq6sJddtnlsbo2atcFlu1ekoZLmlazLCLm2V4jaYykBRnbHqekGNPw4cNbNygAAADahfSDd50fvtH+tfequiy9X521fJWkHpkLIuKGiCiPiPL+/fu3RjYAAAAA7Ux7L7Aq0vueWct7SVrTulEAAAAAtHftusCKiFWSFkrapWaZ7U8o6b2aWaBYAAAAANqpdl1gpW6QdLbtbWz3kHS5pMciYn5hYwEAAABobxwRhc6QV7Y7KimqJkrqIulxScdFxPK69ikvL4+pU6e2TkAAAAAUJdvTIqK80DnQtrTrWQQlKSI2SzorvQEAAABA3mwJQwQBAAAAoFVQYAEAAABAjlBgAQAAAECOUGABAAAAQI5QYAEAAABAjlBgAQAAAECOUGABAAAAQI5QYAEAAABAjlBgAQAAAECOUGABAAAAQI5QYAEAAABAjlBgAQAAAECOUGABAAAAQI6UNGYj25b0TUnlksoy10XEcXnIBQAAAABtTmN7sH4t6VpJwyR1yroBAAAAANTIHixJh0naLSLm5TMMAAAAALRlje3BqpS0MJ9BAAAAAKCta2yB9XNJ/5NeiwUAAAAAqEVjhwieJmmEpFNtL8tcERHb5zwVAAAAALRBjS2wLs5rCgAAAABoBxpVYEXEbfkOAgAAAABtXWN7sGR7V0lHK5mq/S1JN0fEv/MVDAAAAADamkZNcmH7YEl/l9RT0nRJPST9zfYh+YsGAAAAAG1LY3uwfiLp0Ij4c80C21+TdJmkB/IRDAAAAADamsZO0z5S0qNZyx5TMrMgAAAAAECNL7AWSNona9kXxZcPAwAAAMAHGjtE8KeSHrQ9WdKbSnq0DpX03TzlAgAAAIA2p1E9WBFxv5Ieq0pJ5ZLWS/pSREzOYzYAAAAAaFMaPU17RDwn6bk8ZgEAAACANq3OAsv2ZyLihfTx5+raLi28AAAAAGCLV18P1hOSytLHz9axTUjqmNNEAAA
AANBG1VlgRURZxuPGzjYIAAAAAFusRhVOto+sY/kRuY0DAAAAAG1XY3umrqtj+TW5CgIAAAAAbV1jCyx/bIE9UlJVTtMAAAAAQBtW7zTttjcpncjC9sas1R0l/TpfwQAAAACgrWnoe7D2UdJ79WdJX8tYXi1pSUS8ka9gAAAAANDW1FtgRcTfJMn2thGxuHUiAQAAAEDb1FAPliQpIhbb7itpV0n9lXFNVkTcnqdsAAAAANCmNKrAsr2PpPslbZTUS9Kq9P5NSRRYAAAAAKDGzyJ4maSLIqK/pLXp/U8l/SZvyQAAAACgjWlsgTVK0tXp45rhgZdL+n6O8wAAAABAm9XYAqtSUpf08Xu2h0vqLKl3XlIBAAAAQBvU2ALrOUkHp4//IukhSU9I+mceMgEAAABAm9SoSS4kfVsfFmNnSfqBpDJJV+UjFAAAAAC0RQ0WWLZLJF2ppKhSRLwv6ZI850KO/fT3Vc3e98eHN7YOb5nmZiRfotjzScWfkXwtw8+ZliNfy6y56vvN3rfHmVfnLEd9mpuRfIlizwdIjRgiGBFVkr4paUP+4wAAAABA29XYa7AeknRoPoPkg+1v2n7G9hrbzf/TKgAAAAA0QmP77DtJutP2CZLmS6quWRERx+UhV66slHS9pK6SbihwFgAAAADtXGMLrE2Sfp8+7pjeil5EPCZJtvcqbBIAAAAAW4JGFVgRcVS+gxSa7eMkHSdJw4cPL3AaAAAAAG1RY6/Bku2etr9l+4fp80G2h+QvWuuKiBsiojwiyvv371/oOAAAAADaoEYVWLbHS5or6RxJP04X7yzpmjzlAgAAAIA2p7E9WFdL+mFE7CypZja+5yR9Nh+hAAAAAKAtauwkFztJujV9HJIUEWttd89HqFyx3VHJDIid0+dbpas2REQULBgAAACAdsmNqTNsz5H05YhYYHtFRPSxvZ2kByNip7ynbCbbEyXdUsuqbSJifl37lZeXx9SpU/MVCwAAAG2A7WkRUV7oHGhbGjtE8DZJ/2f785KcXpN1k6Qb85YsByLi1ohwLbf5hc4GAAAAoP1p7BDByyV1l/RnSaWSnpb0SzHJBQAAAAB8oLHfg7VZ0nmSzrPdLyKW5zcWAAAAALQ9jZ2m/dGax5nFle1H8hEKAAAAANqixl6D9bk6ljNNOwAAAACk6h0iaPtbNdvZPlySM1aPkrQyX8EAAAAAoK1p6BqsS9L7LpJ+lrG8WtISSafmIxQAAAAAtEX1FlgRsY0k2X4oIg5snUgAAAAA0DY16hosiisAAAAAaFidPVi2fxURp6WPb6hru4g4Lh/BAAAAAKCtqW+IYKc6HgMAAAAAalFngRURJ2Y8Pqp14gAAAABA29XY78ECAAAAADSAAgsAAAAAcoQCCwAAAAByhAILAAAAAHKk3gLL9kO297Xt1goEAAAAAG1VQz1Y70m6V9J82z+2PaQVMgEAAABAm1RvgZVOzz5E0s8lfV1JofWg7a/RqwUAAAAAH9XgNVgRsSYirouIcZL2lLRcSa/Wm7bPz3dAAAAAAGgrmjTJRUS8EBHHKCm0Nku6MC+pAAAAAKANanSBZbvE9mG2H5c0VdLbko7MWzIAAAAAaGNKGtrA9vaSjpX0nXT72ySdGhGv5TkbAAAAALQp9RZYtv8m6fOS/iHpB5Lui4gNrREMAAAAANqahnqwpks6ISJebY0wAAAAANCWNXQN1mWSPl3bCtvfsD0g95EAAAAAoG1qqMD6oaRRdaz7hKSzcxsHAAAAANquhgqsfSXdVMe6WyTtn9s4AAAAANB2NVRgDYqIpbWtSJcPyn0kAAAAAGibGiqwNtoeXNuKdPmm3EcCAAAAgLapoQLrH5JOrWPdyZKeyW0cAAAAAGi7Gpqm/RJJz9juL+n3kt6RNFTS4ZKOUPIdWQAAAAAANVBgRcRU2wdKuk7SMZJCkiXNlXRgRLyY/4gAAAAA0DY01IOliHhc0va2R0nqL+ndiHgj78kAAAAAoI1psMC
qkRZVFFYAAAAAUIeGJrkAAAAAADQSBRYAAAAA5AgFFgAAAADkCAUWAAAAAOQIBRYAAAAA5AgFFgAAAADkCAUWAAAAAOQIBRYAAAAA5AgFFgAAAADkCAUWAAAAAOQIBRYAAAAA5EhJoQO0FyPPeaRZ+82/bL8cJ6ndp2/7dLP3ffm7L+cwSd2am5F8iWLPJ0nXnfBUs/Y7+Td75zhJ7cjXMm+f80yz9936sj1zmKRuzc3YWvmefGrbZu33xb3n5ThJ7Yo93y1vLGr2vkeNGpLDJHVrbkbyJYo9HyDRgwUAAAAAOUOBBQAAAAA5UpQFlu1Ohc4AAAAAAE3VagWW7UG2H7a92vbrto+xHbZH2r7V9l3p/QpJv0r3OdH2nHSf523vmdHeBbafyDrGFNvnp4/3sl1l+7u2F9hekbZf2lqvGQAAAMCWpTV7sO6StFHSMEmfl3Rk1vrDJP1FUn9JP7B9uKSfSvqOpL6SbpT0qO0RTThmR0kHSNpZ0qckbS/pqto2tH2c7am2p7777rtNOAQAAAAAJFqlwLK9taS9JU2KiDURsUxJ8ZTp2Yi4JyI2R0SlpKMk/TYiXoiIqoj4naSZkr7VxMOfHRGrI2KppP+R9B3bH3vdEXFDRJRHRHn//v2b/BoBAAAAoLV6sIam9wszli3I2mZ+1vNhkt7MWjYvXd4UmceZL6mLpH5NbAMAAAAAGtRaBdY76f3wjGXDs7apznr+lqSRWcs+kS6XpApJ3bPW1/YlB5lDCkdK2iBped1RAQAAAKB5WqXAioi3JU2RdJntMtv9JZ3fwG63Sjre9m62S2wfJWmspLvT9dMk7WJ7fLr+FEnb1NLOpbZ72B4g6QJJd0REdjEHAAAAAC3miGidA9mDlUxUsaekpZJ+Iel6Jb1Ol0qqiojvZe1ziqTTJA2UNEfSDyNiSsb6SyXV7PObtO0nIuJi23tJekLSMZIuktRD0sOSTo6IivqylpeXx9SpU1vwagEAANDW2Z4WEeWFzoG2pdUKrI8d2P6KpAcldY08hKgpsCKipKn7UmABAACAAgvN0eTio7lsj1VyndXLSobyXSzpnnwUVwAAAABQCK35PVi9Jf1B0lpJzyqZcv30Vjw+AAAAAORVq/VgRcTTkrZrxeNNUSu+PgAAAABozR4sAAAAAGjXKLAAAAAAIEcosAAAAAAgRyiwAAAAACBHKLAAAAAAIEcosAAAAAAgRyiwAAAAACBHKLAAAAAAIEcosAAAAAAgRyiwAAAAACBHKLAAAAAAIEcosAAAAAAgRyiwAAAAACBHKLAAAAAAIEcosAAAAAAgRyiwAAAAACBHKLAAAAAAIEcosAAAAAAgRyiwAAAAACBHKLAAAAAAIEcosAAAAAAgRyiwAAAAACBHKLAAAAAAIEcosAAAAAAgRyiwAAAAACBHKLAAAAAAIEcosAAAAAAgRyiwAAAAACBHKLAAAAAAIEcosAAAAAAgRxwRhc5QdGy/K2lBDpvsJ2l5DtvLtWLPJxV/RvK1XLFnJF/LFHs+qfgzkq/lij0j+VomH/lGRET/HLeJdo4CqxXYnhoR5YXOUZdizycVf0bytVyxZyRfyxR7Pqn4M5Kv5Yo9I/laptjzYcvBEEEAAAAAyBEKLAAAAADIEQqs1nFDoQM0oNjzScWfkXwtV+wZydcyxZ5PKv6M5Gu5Ys9IvpYp9nzYQnANFgAAAADkCD1YAAAAAJAjFFgAAAAAkCMUWAAAAACQIxRYeWS7o+0rbL9ru8L2/bb7FTpXDdvftP2M7TW2qwqdJ5vty22/kuZbZPtG230KnSuT7Utsv5lmXGZ7su3hhc6VzXYH28/ZDttbFzqPJNm+1fYm22szbicVOldtbO9j+/k043Lb1xc6kySl/z8y37/16TnepdDZatgeZPue9OfgSttP2R5T6Fw1bPe1fZvtJbZX277bdu8C5qn357Ltr6bnfb3tWba/XCz5bA+1/aDtBem/w2+3ZrZGZtw3/Te4PP33+Iz
tPYso3562X7S9Iv33+KLtrxdLvqztTkzP8/mtmS89dn3v4V5prsyfjc+1dkZs2Siw8uscSQdJ+oykmg+1dxQuzseslHS9pO8XOEddNkv6tqS+ksYoeQ9vLWSgWtwhaWxE9JA0UtJCSf9X0ES1O0NSZaFD1OK2iCjNuBVF4ZLJ9l6SJku6Usm/xa0l3VTASB+IiJ0y3z9JV0maHREvFjpbhusl9ZG0vaSBkqZK+pNtFzTVh26XVCpplKRtlJzjQv6crvPnsu1PSPqDpEsl9UzvH7A9shjySaqW9FdJ35L0ditmylZfxt6SrpG0naT+ku6W9Bfbw1otXf355kg6RMm/w17pNnfa/lQrZZMa8dnA9ghJP5D0citlytZQxs1Zv1s+13rRAKmk0AHaueMkXRQR/5Ek2z+UNNf2iIhYUNhoUkQ8Jn3wAbLoRMS5GU/ftf1LSfcWKk9tIuK1jKdW8gFjhwLFqZXt7SWdJOlQSdMLHKctulTSbyJicsayYipgJEm2SyQdrSRvMdlO0rURsVKSbP9O0iQlHyCXFzKY7e6SviZpXERUpMt+JmmK7eERsbC1MzXwc/m7kqZFxJ3p87tsn5Auv7DQ+SJisaTr0vWbWyNPbRrIeFfWol/b/omkXSW9lfdwajDfsprHtjso+Z3SQcn/o1cLnS/D7ySdJ+nEVoj0McX++QWgBytPbPeSNFzStJplETFP0holvTFoui9KmlHoENlsf8v2aklrJZ0u6YLCJvpQ+gv6ZklnSVpV2DS1OjQdCvO6k+G0pYUOlCn9AL6bpJJ0qM5y21Nslxc6Wy0OVtKrcXuBc2S7Qsl57m97KyV/eHo2IgpaXKWccatR83txbKunadgYZfxOSb0ofqc0m+1PS+qnwvXE1Mr2KkkbJD0j6QUlPYNFwfbxktZFxD2FzlKPjrbfSof+PlJMw5KxZaDAyp+y9H511vJVknq0bpS2z/ahkk5QUsAUlYi4OyJ6ShqspLgqpl/Up0taEhEPFDpILa6R9EklH24OkfQFSTcWNNHH9Vbyc/JwSRMlDVHyQefP6R9Risnxku6JiFWFDpLlH5I6Slqm5I8QX5d0bEETpSJiraQpki6w3ct2f0k1PefF+HO6TPxOyRnbAyTdL+nKiHij0HkyRUQvJUNXD5H0Z0lFcZ20k2uMz1cyKqJYvabkDyTbKPkdM1PSU7aHFDIUtiwUWPlTkd73zFreS0kvFhrJ9mFKPngfWGTXlnxERCxRkvNPLoLJOGxvp2SM/CmFzlKbiJgWEUsjojoiXlFyndh/2+5S6GwZav4f3xIRMyNio5IheJ0kFc2YftvbKunh/U2hs2RKe1CfkPS6kp+F3SRdIukZ2wMLmS3Dt5X0FLwq6V+SHkyXF0MPW7YK8TslJ9IP208r+YPJjwocp1YRsSEi/qjkj0/fK3CcGjdJujgi3il0kLpExJKImBERVRGxKiJ+JGmFkuHAQKugwMqT9K/ICyV9MJtXeoFyDyV/TUEj2D5K0m8lHRARTxc6TyOUSOqupKej0D6v5CLuWbaX68Prhma6OGfrq07vi2XyA0XEaknzJUX2qlqWFdLxkmZExAuFDpKlj5K/Il8TEWsiYmNE3KTkd8/uhY2WiIh3ImJCRAyOiG0kvSnpfUnPFzhabWYo43dKapyKcOh0MUsnBXlG0l8i4pSIKKb/y7UpUTIJSzH4kqSfpcOll0vaQ9KPbD9T4FwNqVYR/W5B+0eBlV83SDrb9ja2e0i6XNJjETG/sLESTqaR30pS5/T5VumtKH4I2T5NycxtX4mIfxQ6TzYnU5+fkg4zkZPpz69T8oH8tfr2bSX3StpWyVCJsZL2TZd/WUVwnU46zW6v9PEoSf8r6aGIeL+gwT7ueklH2d4xnUhikpIej6KY9td2ZyXDF4uq90qS0uusXpd0ku3utktsH61kqFtR/KHJ9g62+6T/n3eVdLWkywo11LKBn8u3Syq3fbjtTrYPlzRe0m1Fku+D50o+zHZ
Kn7fqhFr1ZbT9SUnPSvp9RJzVmrkame9Q259O/69sZftYSXtLeqwY8kkapuSav7HpbaqS33v/3Vr5Gspoe2/b26X/p0ttX6BkBtNWew8BRQS3PN2UXHdwpZKhJhVKptftV+hcGfkm6sO/xGfeRhY6W5ovJG1Sct3GB7dC58rI10HJ2PhlktZJekfSXZK2LXS2OvKOTN/TrQudJc0zRcmwjXVKeg2uktSj0LlqyWlJF0laouR6l6eVTM1f8Gxpvm8qGSJWWugsdeT7lKQ/pT8HVyuZpOGgQufKyHespMVKvsbgDUmnFzhPvT+XJX1V0iuS1qf3Xy6yfLWtu6BYMkq6JX28Nut2RJHkOyX9d7g2/fn4T0mHFcv7V8u2UySd35r5GvEeniFpQfq7ZZmkRyXt2toZuW3ZN0cUe884AAAAALQNDBEEAAAAgByhwAIAAACAHKHAAgAAAIAcocACAAAAgByhwAIAAACAHKHAAgAAAIAcocACgC2Y7QtsP9GC/c+1/XALM8y1PbElbQAAUCwosACgldj+hO37bC+xvdb2W7YfsN250NmaKyJ+FhEHFDoHAADFggILAFrPnyUtlrSDpDJJu0t6TJILGQoAAOQOBRYAtALbfZUUVr+JiNWReDsifhMRG9Jtxtj+m+3ltlfa/ovtbTPauNX2HbZvtr3K9ju2D7c91va/bVfYftr2kIx95tv+H9vPpr1mU23vWk/ObravtP2m7RW2H7W9XT3bf2SIYXq8c20/mR5vlu3PZazvZPsq28vSnryza2lzzzTvCtvzbP/AttN1d9j+q+0O6fO9bK+x/enGngsAAPKJAgsAWkFEvCfpFUk32f6O7R1riobMzSRdIGmopJGS1kq6M2ub/5Z0v6Q+kn4q6UZJF0k6RNLAtI0Ls/Y5QdLp6T6TJf3Zdo86ot4o6ZOSPitpkKQXJP3JdqcmvNyjJZ0mqaekxyXdlrHuHEn7S/qcpG3S1zmiZqXtHZX09F0hqb+k/SSdIunIjNeytaTzbQ+U9HtJp0fEy03IBwBA3lBgAUDr2UvSFEnfl/SSpKW2f1xTaEXEzIh4OiI2RMRqJYXSZ213y2jjqYh4JCKqJd0uqbukO9LesEolBVR51nF/FxHTImKjpMslrVdS5HyE7X6SviXppIhYmm5/oaTBkj7ThNf524h4JSI2S7pJ0na2e6brviPp8oiYGxHrJZ2lpCiscZKk+yLiwYjYHBGvSbo23U8RsU7SYel+j0t6NCJuaUI2AADyqqTQAQBgSxERyyWdK+nctGj6hpIeo3ck3ZwOB7xCSTFTpg8Lj/6SFqSPF2e0V5nWZh8sk1SZ7ptpfsY+YXuhkl6gbNuk9zOzOtc6SRrWqBeZlVHSuvS+TNLq9LiZedbZXpaVYW/bX89Y1kHSWxn7vGL7aUkHSDq4CbkAAMg7erAAoAAiojIibpU0U9LYdPFvJFVI2jkiekjaI13e0kkwRtY8SHvLhkt6u5btaoq4URHRK+PWLSJ+38IMNd7JytNdSQGZmeHmrOP3iIidMvb5tpIhjJOVDLnkdxkAoGjwSwkAWoHt3rYvtT06neihxPahkkZLeibdrIeSHp9V6XC9i3J0+KNt75JeRzVJUjdJj2RvFBHLJN0t6XrbQ9PcvWwfYrs0R1nukDTJ9ra2u0r6uT76u+h6Sd+0fUDG+7Sj7S+keT4l6TpJR0j6rqR+Sq5bAwCgKFBgAUDr2ChpgKQ/SFoh6V1J50s6LSLuS7c5Q9KektYoKbr+lKNj3yDpV5JWSpogab/0Gq/aHCtpjqQptiskvazkmqeoY/umulTJ1PTPS3pT0kJ92HOmiJil5Pqw7ysZarhM0q2S+qfDKu+T9IuIeCK9husbkk63/aUc5QMAoEUckavfmQCAYmN7vqTzIyJ7NkIAAJAH9GABAAAAQI5QYAEAAABAjjBEEAAAAAByhB4sAAAAAMgRCiwAAAAAyBEKLAAAAADIEQosAAAAAMgRCiwAAAAAyJH
/D13iViTcQs3kAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] @@ -734,20 +544,50 @@ "needs_background": "light" }, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train indices: [0 1 2 3 4 5 6 7 8 9]\n", + "Test indices: [10 11]\n", + "Train length: 10\n", + "Test length: 2\n", + "Train groups: [0 1 1 1 1 2 2 2 3 3]\n", + "Test groups: [4 4]\n", + "Train group size: 4\n", + "Test group size: 1\n", + "Train group months: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03'\n", + " '2021-03' '2021-04' '2021-04']\n", + "Test group months: ['2021-05' '2021-05']\n", + "\n", + "Train indices: [ 1 2 3 4 5 6 7 8 9 10 11]\n", + "Test indices: [12 13 14 15]\n", + "Train length: 11\n", + "Test length: 4\n", + "Train groups: [1 1 1 1 2 2 2 3 3 4 4]\n", + "Test groups: [5 5 5 5]\n", + "Train group size: 4\n", + "Test group size: 1\n", + "Train group months: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03'\n", + " '2021-04' '2021-04' '2021-05' '2021-05']\n", + "Test group months: ['2021-06' '2021-06' '2021-06' '2021-06']\n", + "\n" + ] } ], "source": [ - "cv_args = {\"test_size\": 1, \"n_splits\": 3, \"gap_size\": 1}\n", + "cv_args = {\"test_size\": 1, \"train_size\": 4}\n", "\n", - "test_cv(X, y, groups, **cv_args)\n", - "plot_cv(X, y, groups, **cv_args)" + "plot_splits(X, y, groups, **cv_args)\n", + "print_split_info(X, y, groups, **cv_args)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**test_size + n_splits + shift_size**" + "### Usage in CV" ] }, { @@ -759,56 +599,36 @@ "name": "stdout", "output_type": "stream", "text": [ - "TRAIN INDICES: [0 1 2 3 4 5 6 7]\n", - "TEST INDICES: [8 9]\n", - "TRAIN LENGTH: 8\n", - "TEST LENGTH: 2\n", - "TRAIN GROUPS: [0 1 1 1 1 2 2 2]\n", - "TEST GROUPS: [3 3]\n", - "TRAIN GROUP SIZE: 3\n", - "TEST GROUP SIZE: 1\n", - "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03'\n", - " '2021-03']\n", - "TEST GROUP MONTHS: ['2021-04' '2021-04']\n", + "Split 
number: 1\n", + "Train true target: [1 0 1 0 1 0 0 1 1 1]\n", + "Train predicted target: [1 1 1 1 1 1 1 1 1 1]\n", + "Test true target: [0 1]\n", + "Test predicted target: [1 1]\n", + "Accuracy: 0.5\n", "\n", - "TRAIN INDICES: [ 5 6 7 8 9 10 11]\n", - "TEST INDICES: [12 13 14 15]\n", - "TRAIN LENGTH: 7\n", - "TEST LENGTH: 4\n", - "TRAIN GROUPS: [2 2 2 3 3 4 4]\n", - "TEST GROUPS: [5 5 5 5]\n", - "TRAIN GROUP SIZE: 3\n", - "TEST GROUP SIZE: 1\n", - "TRAIN GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04' '2021-05' '2021-05']\n", - "TEST GROUP MONTHS: ['2021-06' '2021-06' '2021-06' '2021-06']\n", + "Split number: 2\n", + "Train true target: [0 1 0 1 0 0 1 1 1 0 1]\n", + "Train predicted target: [1 1 1 1 1 1 1 1 1 1 1]\n", + "Test true target: [1 0 0 0]\n", + "Test predicted target: [1 1 1 1]\n", + "Accuracy: 0.25\n", "\n" ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" } ], "source": [ - "cv_args = {\"test_size\": 1, \"n_splits\": 2, \"shift_size\": 2}\n", + "cv = GroupTimeSeriesSplit(**cv_args)\n", + "clf = DummyClassifier(strategy=\"most_frequent\")\n", "\n", - "test_cv(X, y, groups, **cv_args)\n", - "plot_cv(X, y, groups, **cv_args)" + "scores = cross_val_score(clf, X, y, groups=groups, scoring=\"accuracy\", cv=cv)\n", + "print_cv_info(cv, X, y, groups, clf, scores)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**test_size + n_splits + expanding window**" + "## API" ] }, { @@ -820,912 +640,117 @@ "name": "stdout", "output_type": "stream", "text": [ - "TRAIN INDICES: [0]\n", - "TEST INDICES: [1 2 3 4 5 6 7 8 9]\n", - "TRAIN LENGTH: 1\n", - "TEST LENGTH: 9\n", - "TRAIN GROUPS: [0]\n", - "TEST GROUPS: [1 1 1 1 2 2 2 3 3]\n", - "TRAIN GROUP SIZE: 1\n", - "TEST GROUP SIZE: 3\n", - "TRAIN GROUP MONTHS: ['2021-01']\n", - "TEST GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03'\n", - " '2021-04' '2021-04']\n", + "## GroupTimeSeriesSplit\n", "\n", - "TRAIN INDICES: [0 1 2 3 4]\n", - "TEST INDICES: [ 5 6 7 8 9 10 11]\n", - "TRAIN LENGTH: 5\n", - "TEST LENGTH: 7\n", - "TRAIN GROUPS: [0 1 1 1 1]\n", - "TEST GROUPS: [2 2 2 3 3 4 4]\n", - "TRAIN GROUP SIZE: 2\n", - "TEST GROUP SIZE: 3\n", - "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02']\n", - "TEST GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04' '2021-05' '2021-05']\n", + "*GroupTimeSeriesSplit(test_size, train_size=None, n_splits=None, gap_size=0, shift_size=1, window_type='rolling')*\n", "\n", - "TRAIN INDICES: [0 1 2 3 4 5 6 7]\n", - "TEST INDICES: [ 8 9 10 11 12 13 14 15]\n", - "TRAIN LENGTH: 8\n", - "TEST LENGTH: 8\n", - "TRAIN GROUPS: [0 1 1 1 1 2 2 2]\n", - "TEST GROUPS: [3 3 4 4 5 5 5 5]\n", - "TRAIN GROUP SIZE: 3\n", - "TEST GROUP SIZE: 3\n", - "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' 
'2021-02' '2021-02' '2021-03' '2021-03'\n", - " '2021-03']\n", - "TEST GROUP MONTHS: ['2021-04' '2021-04' '2021-05' '2021-05' '2021-06' '2021-06' '2021-06'\n", - " '2021-06']\n", - "\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "cv_args = {\"test_size\": 3, \"n_splits\": 3, \"window_type\": \"expanding\"}\n", - "\n", - "test_cv(X, y, groups, **cv_args)\n", - "plot_cv(X, y, groups, **cv_args)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**test_size + train_size + n_splits (full usage of data)**" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TRAIN INDICES: [0 1 2 3 4]\n", - "TEST INDICES: [ 5 6 7 8 9 10 11]\n", - "TRAIN LENGTH: 5\n", - "TEST LENGTH: 7\n", - "TRAIN GROUPS: [0 1 1 1 1]\n", - "TEST GROUPS: [2 2 2 3 3 4 4]\n", - "TRAIN GROUP SIZE: 2\n", - "TEST GROUP SIZE: 3\n", - "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02']\n", - "TEST GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04' '2021-05' '2021-05']\n", + "Group time series cross-validator.\n", "\n", - "TRAIN INDICES: [1 2 3 4 5 6 7]\n", - "TEST INDICES: [ 8 9 10 11 12 13 14 15]\n", - "TRAIN LENGTH: 7\n", - "TEST LENGTH: 8\n", - "TRAIN GROUPS: [1 1 1 1 2 2 2]\n", - "TEST GROUPS: [3 3 4 4 5 5 5 5]\n", - "TRAIN GROUP SIZE: 2\n", - "TEST GROUP SIZE: 3\n", - "TRAIN GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03']\n", - "TEST GROUP MONTHS: ['2021-04' '2021-04' '2021-05' '2021-05' '2021-06' '2021-06' '2021-06'\n", - " '2021-06']\n", - "\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "cv_args = {\"test_size\": 3, \"train_size\": 2, \"n_splits\": 2}\n", - "\n", - "test_cv(X, y, groups, **cv_args)\n", - "plot_cv(X, y, groups, **cv_args)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**test_size + train_size + n_splits (partial usage of data)**" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TRAIN INDICES: [1 2 3 4 5 6 7]\n", - "TEST INDICES: [ 8 9 10 11]\n", - "TRAIN LENGTH: 7\n", - "TEST LENGTH: 4\n", - "TRAIN GROUPS: [1 1 1 1 2 2 2]\n", - "TEST GROUPS: [3 3 4 4]\n", - "TRAIN GROUP SIZE: 2\n", - "TEST GROUP SIZE: 2\n", - "TRAIN GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03']\n", - "TEST GROUP MONTHS: ['2021-04' '2021-04' '2021-05' '2021-05']\n", + "**Parameters**\n", "\n", - "TRAIN INDICES: [5 6 7 8 9]\n", - "TEST INDICES: [10 11 12 13 14 15]\n", - "TRAIN LENGTH: 5\n", - "TEST LENGTH: 6\n", - "TRAIN GROUPS: [2 2 2 3 3]\n", - "TEST GROUPS: [4 4 5 5 5 5]\n", - "TRAIN GROUP SIZE: 2\n", - "TEST GROUP SIZE: 2\n", - "TRAIN GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04']\n", - "TEST GROUP MONTHS: ['2021-05' '2021-05' '2021-06' '2021-06' '2021-06' '2021-06']\n", - "\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "cv_args = {\"test_size\": 2, \"train_size\": 2, \"n_splits\": 2}\n", - "\n", - "test_cv(X, y, groups, **cv_args)\n", - "plot_cv(X, y, groups, **cv_args)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Groups are not sorted**" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['2021-06',\n", - " '2021-06',\n", - " '2021-06',\n", - " '2021-06',\n", - " '2021-02',\n", - " '2021-02',\n", - " '2021-02',\n", - " '2021-02',\n", - " '2021-04',\n", - " '2021-04',\n", - " '2021-03',\n", - " '2021-03',\n", - " '2021-03',\n", - " '2021-05',\n", - " '2021-05',\n", - " '2021-01']" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "not_sorted_months = [\n", - " \"2021-06\",\n", - " \"2021-06\",\n", - " \"2021-06\",\n", - " \"2021-06\",\n", - " \"2021-02\",\n", - " \"2021-02\",\n", - " \"2021-02\",\n", - " \"2021-02\",\n", - " \"2021-04\",\n", - " \"2021-04\",\n", - " \"2021-03\",\n", - " \"2021-03\",\n", - " \"2021-03\",\n", - " \"2021-05\",\n", - " \"2021-05\",\n", - " \"2021-01\",\n", - "]\n", - "not_sorted_months" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([5, 5, 5, 5, 1, 1, 1, 1, 3, 3, 2, 2, 2, 4, 4, 0])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "not_sorted_groups = np.fromiter(map(months_map.get, not_sorted_months), dtype=\"int\")\n", - "not_sorted_groups" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
num_feature
2021-060
2021-067
2021-066
2021-064
2021-024
2021-028
2021-020
2021-026
2021-042
2021-040
2021-035
2021-039
2021-037
2021-057
2021-057
2021-017
\n", - "
" - ], - "text/plain": [ - " num_feature\n", - "2021-06 0\n", - "2021-06 7\n", - "2021-06 6\n", - "2021-06 4\n", - "2021-02 4\n", - "2021-02 8\n", - "2021-02 0\n", - "2021-02 6\n", - "2021-04 2\n", - "2021-04 0\n", - "2021-03 5\n", - "2021-03 9\n", - "2021-03 7\n", - "2021-05 7\n", - "2021-05 7\n", - "2021-01 7" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X_not_sorted = X.copy()\n", - "X_not_sorted.index = not_sorted_months\n", - "X_not_sorted" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TRAIN INDICES: [0 1 2 3 4 5 6 7 8 9]\n", - "TEST INDICES: [10 11 12]\n", - "TRAIN LENGTH: 10\n", - "TEST LENGTH: 3\n", - "TRAIN GROUPS: [5 5 5 5 1 1 1 1 3 3]\n", - "TEST GROUPS: [2 2 2]\n", - "TRAIN GROUP SIZE: 3\n", - "TEST GROUP SIZE: 1\n", - "TRAIN GROUP MONTHS: ['2021-06' '2021-06' '2021-06' '2021-06' '2021-02' '2021-02' '2021-02'\n", - " '2021-02' '2021-04' '2021-04']\n", - "TEST GROUP MONTHS: ['2021-03' '2021-03' '2021-03']\n", + "- `test_size` : int\n", "\n", - "TRAIN INDICES: [ 4 5 6 7 8 9 10 11 12]\n", - "TEST INDICES: [13 14]\n", - "TRAIN LENGTH: 9\n", - "TEST LENGTH: 2\n", - "TRAIN GROUPS: [1 1 1 1 3 3 2 2 2]\n", - "TEST GROUPS: [4 4]\n", - "TRAIN GROUP SIZE: 3\n", - "TEST GROUP SIZE: 1\n", - "TRAIN GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-04' '2021-04' '2021-03'\n", - " '2021-03' '2021-03']\n", - "TEST GROUP MONTHS: ['2021-05' '2021-05']\n", + " Size of test dataset.\n", "\n", - "TRAIN INDICES: [ 8 9 10 11 12 13 14]\n", - "TEST INDICES: [15]\n", - "TRAIN LENGTH: 7\n", - "TEST LENGTH: 1\n", - "TRAIN GROUPS: [3 3 2 2 2 4 4]\n", - "TEST GROUPS: [0]\n", - "TRAIN GROUP SIZE: 3\n", - "TEST GROUP SIZE: 1\n", - "TRAIN GROUP MONTHS: ['2021-04' '2021-04' '2021-03' '2021-03' '2021-03' '2021-05' '2021-05']\n", - "TEST GROUP MONTHS: ['2021-01']\n", - "\n" - ] - }, - { - "data": { - 
"image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "cv_args = {\"test_size\": 1, \"train_size\": 3}\n", - "\n", - "test_cv(X_not_sorted, y, not_sorted_groups, **cv_args)\n", - "plot_cv(X_not_sorted, y, not_sorted_groups, **cv_args)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Group names**" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TRAIN INDICES: [0 1 2 3 4 5 6 7]\n", - "TEST INDICES: [8 9]\n", - "TRAIN LENGTH: 8\n", - "TEST LENGTH: 2\n", - "TRAIN GROUPS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03'\n", - " '2021-03']\n", - "TEST GROUPS: ['2021-04' '2021-04']\n", - "TRAIN GROUP SIZE: 3\n", - "TEST GROUP SIZE: 1\n", - "TRAIN GROUP MONTHS: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03'\n", - " '2021-03']\n", - "TEST GROUP MONTHS: ['2021-04' '2021-04']\n", + "- `train_size` : int (default=None)\n", "\n", - "TRAIN INDICES: [1 2 3 4 5 6 7 8 9]\n", - "TEST INDICES: [10 11]\n", - "TRAIN LENGTH: 9\n", - "TEST LENGTH: 2\n", - "TRAIN GROUPS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03'\n", - " '2021-04' '2021-04']\n", - "TEST GROUPS: ['2021-05' '2021-05']\n", - "TRAIN GROUP SIZE: 3\n", - "TEST GROUP SIZE: 1\n", - "TRAIN GROUP MONTHS: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03'\n", - " '2021-04' '2021-04']\n", - "TEST GROUP MONTHS: ['2021-05' '2021-05']\n", + " Size of train dataset.\n", + "\n", + "- `n_splits` : int (default=None)\n", + "\n", + " Number of the splits.\n", + "\n", + "- `gap_size` : int (default=0)\n", + "\n", + " Gap size between train and test datasets.\n", + "\n", + "- `shift_size` : int (default=1)\n", + "\n", + " Step to shift for the next fold.\n", + "\n", + "- `window_type` : str (default=\"rolling\")\n", + "\n", + " Type of the window. 
Possible values: \"rolling\", \"expanding\".\n", + "\n", + "**Examples**\n", + "\n", + "For usage examples, please see\n", + " http://rasbt.github.io/mlxtend/user_guide/evaluate/GroupTimeSeriesSplit/\n", + "\n", + "### Methods\n", + "\n", + "
\n", + "\n", + "*get_n_splits(X=None, y=None, groups=None)*\n", + "\n", + "Returns the number of splitting iterations in the cross-validator.\n", + "\n", + "**Parameters**\n", + "\n", + "- `X` : object\n", + "\n", + " Always ignored, exists for compatibility.\n", + "\n", + "- `y` : object\n", + "\n", + " Always ignored, exists for compatibility.\n", + "\n", + "- `groups` : object\n", + "\n", + " Always ignored, exists for compatibility.\n", + "\n", + "**Returns**\n", + "\n", + "- `n_splits` : int\n", + "\n", + " Returns the number of splitting iterations in the cross-validator.\n", + "\n", + "
\n", + "\n", + "*split(X, y=None, groups=None)*\n", + "\n", + "Generate indices to split data into training and test set.\n", + "\n", + "**Parameters**\n", + "\n", + "- `X` : array-like\n", + "\n", + " Training data.\n", + "\n", + "- `y` : array-like (default=None)\n", + "\n", + " Always ignored, exists for compatibility.\n", + "\n", + "- `groups` : array-like (default=None)\n", + "\n", + " Array with group names or sequence numbers.\n", + "\n", + "**Yields**\n", + "\n", + "- `train` : ndarray\n", + "\n", + " The training set indices for that split.\n", + "\n", + "- `test` : ndarray\n", + "\n", + " The testing set indices for that split.\n", "\n", - "TRAIN INDICES: [ 5 6 7 8 9 10 11]\n", - "TEST INDICES: [12 13 14 15]\n", - "TRAIN LENGTH: 7\n", - "TEST LENGTH: 4\n", - "TRAIN GROUPS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04' '2021-05' '2021-05']\n", - "TEST GROUPS: ['2021-06' '2021-06' '2021-06' '2021-06']\n", - "TRAIN GROUP SIZE: 3\n", - "TEST GROUP SIZE: 1\n", - "TRAIN GROUP MONTHS: ['2021-03' '2021-03' '2021-03' '2021-04' '2021-04' '2021-05' '2021-05']\n", - "TEST GROUP MONTHS: ['2021-06' '2021-06' '2021-06' '2021-06']\n", "\n" ] } ], "source": [ - "cv_args = {\"test_size\": 1, \"train_size\": 3}\n", - "\n", - "test_cv(X, y, months, **cv_args)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Usage in CV" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0. 
, 0.5 , 0.25])" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cv_args = {\"test_size\": 1, \"train_size\": 3}\n", - "cv = GroupTimeSeriesSplit(**cv_args)\n", - "clf = DummyClassifier(strategy=\"most_frequent\")\n", - "\n", - "scores = cross_val_score(clf, X, y, groups=groups, scoring=\"accuracy\", cv=cv)\n", - "scores" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1 0 1 0 1 0 0 1] [0 0 0 0 0 0 0 0] [1 1] [0 0]\n", - "[0 1 0 1 0 0 1 1 1] [1 1 1 1 1 1 1 1 1] [0 1] [1 1]\n", - "[0 0 1 1 1 0 1] [1 1 1 1 1 1 1] [1 0 0 0] [1 1 1 1]\n" - ] - } - ], - "source": [ - "clf = DummyClassifier(strategy=\"most_frequent\")\n", - "\n", - "for train_idx, test_idx in cv.split(X, y, groups):\n", - " clf.fit(X.iloc[train_idx], y.iloc[train_idx])\n", - " y_train_pred = clf.predict(X.iloc[train_idx])\n", - " y_test_pred = clf.predict(X.iloc[test_idx])\n", - " print(y.iloc[train_idx].values, y_train_pred, y.iloc[test_idx].values, y_test_pred)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Failed cases" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Groups are not consecutive\n", - "- Both train_size and n_splits are specified\n", - "- Incorrect window name\n", - "- Specify train_size with expanding window\n", - "- Not enough data (too many number of splits)\n", - "- Not enough data (too large value of shift size)\n", - "- Not enough data (too large value of gap size)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Groups are not consecutive**" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['2021-01',\n", - " '2021-02',\n", - " '2021-02',\n", - " '2021-02',\n", - " '2021-02',\n", - " '2021-03',\n", - " '2021-03',\n", - " '2021-03',\n", - 
" '2021-04',\n", - " '2021-04',\n", - " '2021-05',\n", - " '2021-05',\n", - " '2021-03',\n", - " '2021-03',\n", - " '2021-03',\n", - " '2021-03']" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "not_consecutive_months = [\n", - " \"2021-01\",\n", - " \"2021-02\",\n", - " \"2021-02\",\n", - " \"2021-02\",\n", - " \"2021-02\",\n", - " \"2021-03\",\n", - " \"2021-03\",\n", - " \"2021-03\",\n", - " \"2021-04\",\n", - " \"2021-04\",\n", - " \"2021-05\",\n", - " \"2021-05\",\n", - " \"2021-03\",\n", - " \"2021-03\",\n", - " \"2021-03\",\n", - " \"2021-03\",\n", - "]\n", - "not_consecutive_months" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 2, 2, 2, 2])" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "not_consecutive_groups = np.fromiter(\n", - " map(months_map.get, not_consecutive_months), dtype=\"int\"\n", - ")\n", - "not_consecutive_groups" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The groups should be consecutive\n" - ] - } - ], - "source": [ - "try:\n", - " test_cv(X, y, not_consecutive_groups, **cv_args)\n", - "except ValueError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Both train_size and n_splits are not specified**" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Either train_size or n_splits should be defined\n" - ] - } - ], - "source": [ - "cv_args = {\"test_size\": 3}\n", - "\n", - "try:\n", - " test_cv(X, y, groups, **cv_args)\n", - "except ValueError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "**Incorrect window name**" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Window type can be either \"rolling\" or \"expanding\"\n" - ] - } - ], - "source": [ - "cv_args = {\"test_size\": 3, \"n_splits\": 3, \"window_type\": \"bad_name\"}\n", - "\n", - "try:\n", - " test_cv(X, y, groups, **cv_args)\n", - "except ValueError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Specify train_size with expanding window**" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train size can be specified only with rolling window\n" - ] - } - ], - "source": [ - "cv_args = {\"test_size\": 3, \"train_size\": 3, \"window_type\": \"expanding\"}\n", - "\n", - "try:\n", - " test_cv(X, y, groups, **cv_args)\n", - "except ValueError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Not enough data (too large train_size and/or n_splits)**" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Not enough data to split number of groups (6) for number splits (10) with train size (10), test size (1), gap size (0), shift size (1)\n" - ] - } - ], - "source": [ - "cv_args = {\"test_size\": 1, \"train_size\": 10, \"n_splits\": 10}\n", - "\n", - "try:\n", - " test_cv(X, y, groups, **cv_args)\n", - "except ValueError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Not enough data (too many number of splits)**" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Not enough data to 
split number of groups (6) for number splits (10) with train size (-4), test size (1), gap size (0), shift size (1)\n" - ] - } - ], - "source": [ - "cv_args = {\"test_size\": 1, \"n_splits\": 10}\n", - "\n", - "try:\n", - " test_cv(X, y, groups, **cv_args)\n", - "except ValueError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Not enough data (too large values of shift size)**" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Not enough data to split number of groups (6) for number splits (3) with train size (-15), test size (1), gap size (0), shift size (10)\n" - ] - } - ], - "source": [ - "cv_args = {\"test_size\": 1, \"n_splits\": 3, \"shift_size\": 10}\n", - "\n", - "try:\n", - " test_cv(X, y, groups, **cv_args)\n", - "except ValueError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Not enough data (too large values of gap size)**" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Not enough data to split number of groups (6) for number splits (3) with train size (-7), test size (1), gap size (10), shift size (1)\n" - ] - } - ], - "source": [ - "cv_args = {\"test_size\": 1, \"n_splits\": 3, \"gap_size\": 10}\n", - "\n", - "try:\n", - " test_cv(X, y, groups, **cv_args)\n", - "except ValueError as e:\n", - " print(e)" + "with open(\"../../api_modules/mlxtend.evaluate/GroupTimeSeriesSplit.md\", \"r\") as f:\n", + " s = f.read()\n", + "print(s)" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "mlxtend", "language": "python", - "name": "python3" + "name": "mlxtend" }, "language_info": { "codemirror_mode": { @@ -1737,7 +762,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": 
"ipython3", - "version": "3.9.7" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/mlxtend/evaluate/time_series.py b/mlxtend/evaluate/time_series.py index efa7bde35..242c83b3c 100644 --- a/mlxtend/evaluate/time_series.py +++ b/mlxtend/evaluate/time_series.py @@ -7,7 +7,10 @@ from itertools import accumulate, chain, groupby, islice +import matplotlib.pyplot as plt import numpy as np +from matplotlib.patches import Patch +from matplotlib.ticker import MaxNLocator from sklearn.utils import indexable @@ -230,3 +233,117 @@ def _calculate_split_params(self): ) self._train_start_idx = train_start_idx + + +def print_split_info(X, y, groups, **cv_args): + """Print information details about splits.""" + cv = GroupTimeSeriesSplit(**cv_args) + groups = np.array(groups) + + for train_idx, test_idx in cv.split(X, groups=groups): + print("Train indices:", train_idx) + print("Test indices:", test_idx) + print("Train length:", len(train_idx)) + print("Test length:", len(test_idx)) + print("Train groups:", groups[train_idx]) + print("Test groups:", groups[test_idx]) + print("Train group size:", len(set(groups[train_idx]))) + print("Test group size:", len(set(groups[test_idx]))) + print("Train group months:", X.index[train_idx].values) + print("Test group months:", X.index[test_idx].values) + print() + + +def plot_split_indices(cv, cv_args, X, y, groups, n_splits, image_file_path=None): + """Create a sample plot for indices of a cross-validation object.""" + fig, ax = plt.subplots(figsize=(12, 4)) + cmap_data = plt.cm.tab20 + cmap_cv = plt.cm.coolwarm + lw = 10 + marker_size = 200 + + for split_idx, (train_idx, test_idx) in enumerate( + cv.split(X=X, y=y, groups=groups) + ): + indices = np.array([np.nan] * len(X)) + indices[test_idx] = 1 + indices[train_idx] = 0 + + ax.scatter( + range(len(X)), + [split_idx + 0.5] * len(X), + c=indices, + marker="_", + lw=lw, + cmap=cmap_cv, + vmin=-0.4, + vmax=1.4, + s=marker_size, + ) + + ax.scatter( + range(len(X)), + [split_idx + 1.5] * 
len(X), + c=groups, + marker="_", + lw=lw, + cmap=cmap_data, + s=marker_size, + ) + + yticklabels = list(range(n_splits)) + ["group"] + ax.set( + yticks=np.arange(n_splits + 1) + 0.5, + yticklabels=yticklabels, + ylabel="CV iteration", + ylim=[n_splits + 1.2, -0.2], + xlim=[-0.5, len(indices) - 0.5], + ) + + ax.legend( + [Patch(color=cmap_cv(0.2)), Patch(color=cmap_cv(0.8))], + ["Training set", "Testing set"], + loc=(1.02, 0.8), + fontsize=13, + ) + + ax.set_title("{}\n{}".format(type(cv).__name__, cv_args), fontsize=15) + ax.xaxis.set_major_locator(MaxNLocator(min_n_ticks=len(X), integer=True)) + ax.set_xlabel(xlabel="Sample index", fontsize=13) + ax.set_ylabel(ylabel="CV iteration", fontsize=13) + ax.tick_params(axis="both", which="major", labelsize=13) + ax.tick_params(axis="both", which="minor", labelsize=13) + + plt.tight_layout() + + if image_file_path: + plt.savefig(image_file_path, bbox_inches="tight") + + plt.show() + + +def plot_splits(X, y, groups, image_file_path=None, **cv_args): + """Visualize splits by group.""" + cv = GroupTimeSeriesSplit(**cv_args) + cv._n_groups = len(np.unique(groups)) + cv._calculate_split_params() + n_splits = cv.n_splits + + plot_split_indices( + cv, cv_args, X, y, groups, n_splits, image_file_path=image_file_path + ) + + +def print_cv_info(cv, X, y, groups, clf, scores): + """Print information details about cross-validation usage with classifier.""" + for split_idx, (train_idx, test_idx) in enumerate(cv.split(X, y, groups)): + clf.fit(X.iloc[train_idx], y.iloc[train_idx]) + y_train_pred = clf.predict(X.iloc[train_idx]) + y_test_pred = clf.predict(X.iloc[test_idx]) + print(f"Split number: {split_idx + 1}") + print(f"Train true target: {y.iloc[train_idx].values}") + print(f"Train predicted target: {y_train_pred}") + print(f"Test true target: {y.iloc[test_idx].values}") + print(f"Test predicted target: {y_test_pred}") + print(f"Accuracy: {scores[split_idx].round(2)}") + print() From 8a69587390348778426a80809a7283914bcdcfa0 Mon 
Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sat, 21 May 2022 14:46:08 +0300 Subject: [PATCH 25/30] Reformat documentation --- .../evaluate/GroupTimeSeriesSplit.ipynb | 114 +++++++++++++++--- 1 file changed, 100 insertions(+), 14 deletions(-) diff --git a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb index 1761a1b07..0d29b1781 100644 --- a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb +++ b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb @@ -44,10 +44,15 @@ "- Shift size \n", "- Window type \n", "\n", + "This implementation has several advantages over scikit-learn's [TimeSeriesSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html):\n", + "- Here you can group data as you want before splitting, while TimeSeriesSplit considers only record level.\n", + "- It can be used for both cases: holdout validation (n_splits=1) and cross-validation (n_splits>=2), whereas TimeSeriesSplit can be used only for the latter case.\n", + "- TimeSeriesSplit uses only expanding window, while for this implementation you can choose between rolling and expanding window types.\n", + "- You have additional control for splitting using additional parameter: shift size.\n", + "\n", "There are several features that need to be taken into account:\n", "\n", "- Implementation is compatible with sklearn-learn API\n", - "- It can be used both for holdout split and cross-validation\n", "- Numbers or custom non-numeric values can be used as groups\n", "- Groups should be consecutive\n", "- Test size with either train size or number of splits are required parameters for splitting\n", @@ -57,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -83,6 +88,40 @@ "## Prepare sample data" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the following 
examples, we are creating sample dataset consisting of 16 training data points with corresponding targets." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Features and targets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's assume that we have one numeric feature and target for the binary classification task." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "data = [[0], [7], [6], [4], [4], [8], [0], [6], [2], [0], [5], [9], [7], [7], [7], [7]]\n", + "target = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0]\n", + "\n", + "X = pd.DataFrame(data, columns=[\"num_feature\"])\n", + "y = pd.Series(target, name=\"target\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -90,6 +129,24 @@ "### Group numbers" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We create 6 different groups so that the first training example belongs to group 0, the next 4 to group 1, and so forth. \n", + "These groups do not have to be in ascending order (as in this dataset), but they must be consecutive.\n", + "\n", + "Example of correct groups order (not sorted but consecutive):\n", + "```python\n", + "np.array([5, 5, 5, 5, 1, 1, 1, 1, 3, 3, 2, 2, 2, 4, 4, 0])\n", + "```\n", + "\n", + "Example of incorrect groups order (not consecutive):\n", + "```python\n", + "np.array([0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 2, 2, 2, 2])\n", + "```" + ] + }, { "cell_type": "code", "execution_count": 3, @@ -118,6 +175,13 @@ "### Group names (months)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will add months as the index according to the specified groups for a more illustrative example." 
+ ] + }, { "cell_type": "code", "execution_count": 4, @@ -143,30 +207,30 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 5, "metadata": {}, + "outputs": [], "source": [ - "### Features and targets" + "X = X.set_index(months)" ] }, { - "cell_type": "code", - "execution_count": 5, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "data = [[0], [7], [6], [4], [4], [8], [0], [6], [2], [0], [5], [9], [7], [7], [7], [7]]\n", - "target = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0]\n", - "\n", - "X = pd.DataFrame(data, index=months, columns=[\"num_feature\"])\n", - "y = pd.Series(target, name=\"target\")" + "## Example 1 -- Multiple training groups (with train size specified)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Example 1 -- Multiple training groups (with train size specified)" + "Let's create splitting with training dataset with size of 3 groups and test sizes of 1 group. \n", + "Number of splits will be calculated automatically. \n", + "\n", + "The first 3 rows in the visualization describe how each split is distributed among groups. \n", + "The groups are visualized in the last row where each color represents different group." ] }, { @@ -290,6 +354,14 @@ "## Example 2 -- Multiple training groups (with number of splits specified)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will split the dataset with test size (2 goups) and number of splits (3 groups) specified. \n", + "Train size will be calculated automatically. 
" + ] + }, { "cell_type": "code", "execution_count": 8, @@ -406,7 +478,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Example 3 -- Defining the gap size between training and test folds" + "## Example 3 -- Defining the gap size between training and test datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's make a splitting where we have a gap of the size of 1 group between training and test datasets." ] }, { @@ -528,6 +607,13 @@ "## Example 4 -- Expanding the window size" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we try to expand training dataset set to 4 groups (in comparison with example 1), then number of splits will be adjusted accordingly." + ] + }, { "cell_type": "code", "execution_count": 12, From 75af8f0a6d158e5dff7f2b89af79072cdfda778d Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Sat, 21 May 2022 20:56:01 +0300 Subject: [PATCH 26/30] Fix overview list formatting --- docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb index 0d29b1781..413ecdd19 100644 --- a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb +++ b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb @@ -45,6 +45,7 @@ "- Window type \n", "\n", "This implementation has several advantages over scikit-learn's [TimeSeriesSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html):\n", + "\n", "- Here you can group data as you want before splitting, while TimeSeriesSplit considers only record level.\n", "- It can be used for both cases: holdout validation (n_splits=1) and cross-validation (n_splits>=2), whereas TimeSeriesSplit can be used only for the latter case.\n", "- TimeSeriesSplit uses only expanding window, while for this implementation you can choose between 
rolling and expanding window types.\n", From 37e92b51dc4b6a40620268dfc9f943defd335388 Mon Sep 17 00:00:00 2001 From: rasbt Date: Tue, 24 May 2022 09:43:16 -0500 Subject: [PATCH 27/30] some doc updates --- .../evaluate/GroupTimeSeriesSplit.ipynb | 109 +++++++++++------- 1 file changed, 70 insertions(+), 39 deletions(-) diff --git a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb index 413ecdd19..da4dc3c98 100644 --- a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb +++ b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb @@ -32,8 +32,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Time series tasks in machine learning require special type of validation, because the time order of the objects is important if we want to get more fair evaluation of the ML model’s quality. \n", - "Also there can be different units for splitting the data for different tasks - hours, days, months etc. \n", + "Time series tasks in machine learning require special type of validation, because the time order of the objects is important for a fairer evaluation of an ML model’s quality. \n", + "Also there can be different time units for splitting the data for different tasks - hours, days, months etc. 
\n", "\n", "Here, we use time series validation with support of the groups which can be flexibly configured along with other parameters:\n", "\n", @@ -44,21 +44,28 @@ "- Shift size \n", "- Window type \n", "\n", - "This implementation has several advantages over scikit-learn's [TimeSeriesSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html):\n", + "This `GroupTimeSeriesSplit` implementation is inspired by scikit-learn's [TimeSeriesSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html) but it has several advantages:\n", "\n", - "- Here you can group data as you want before splitting, while TimeSeriesSplit considers only record level.\n", - "- It can be used for both cases: holdout validation (n_splits=1) and cross-validation (n_splits>=2), whereas TimeSeriesSplit can be used only for the latter case.\n", - "- TimeSeriesSplit uses only expanding window, while for this implementation you can choose between rolling and expanding window types.\n", - "- You have additional control for splitting using additional parameter: shift size.\n", + "- `GroupTimeSeriesSplit` lets you group data as you want before splitting, while `TimeSeriesSplit` only considers the record level.\n", + "- It can be used for both holdout validation (n_splits=1) and cross-validation (n_splits>=2), whereas `TimeSeriesSplit` can be used only for the latter case.\n", + "- `TimeSeriesSplit` uses only an expanding window, while for this implementation you can choose between both rolling and expanding window types.\n", + "- `GroupTimeSeriesSplit` offers additional control for splitting using an additional `shift size` parameter.\n", "\n", - "There are several features that need to be taken into account:\n", + "**There are several features that need to be taken into account:**\n", "\n", - "- Implementation is compatible with sklearn-learn API\n", + "- `GroupTimeSeriesSplit` is compatible with scikit-learn
API.\n", "- Numbers or custom non-numeric values can be used as groups\n", - "- Groups should be consecutive\n", - "- Test size with either train size or number of splits are required parameters for splitting\n", - "- If full data can’t be used with specific parameters, the most recent data is considered to split\n", - "- If split is impossible (e.g., the data is not enough to split) using specified parameters, an exception will be raised " + "- However, groups should be consecutive\n", + "- Specifying the test size with either a) the train size or b) the number of splits is required for splitting\n", + "- If full data can’t be used with specific parameters, the most recent data is considered for splitting\n", + "- If splitting is impossible (e.g., there is not enough data to split) using specified parameters, an exception will be raised " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before we illustrate the usage of `GroupTimeSeriesSplit` in the examples below, let's set up a `DummyClassifier` that we will reuse in the following sections. Also, let's import the libraries we will be using in the following examples:" + ] }, { @@ -135,17 +142,7 @@ "metadata": {}, "source": [ "We create 6 different groups so that the first training example belongs to group 0, the next 4 to group 1, and so forth.
\n", - "These groups do not have to be in ascending order (as in this dataset), but they must be consecutive.\n", - "\n", - "Example of correct groups order (not sorted but consecutive):\n", - "```python\n", - "np.array([5, 5, 5, 5, 1, 1, 1, 1, 3, 3, 2, 2, 2, 4, 4, 0])\n", - "```\n", - "\n", - "Example of incorrect groups order (not consecutive):\n", - "```python\n", - "np.array([0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 2, 2, 2, 2])\n", - "```" + "These groups do not have to be in ascending order (as in this dataset), **but they must be consecutive.**" ] }, { @@ -169,6 +166,21 @@ "groups" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the following is another example of a **correct** group ordering (not sorted but consecutive):\n", + "```python\n", + "np.array([5, 5, 5, 5, 1, 1, 1, 1, 3, 3, 2, 2, 2, 4, 4, 0])\n", + "```\n", + "\n", + "However, the example below shows an **incorrect** group ordering (not consecutive), which is not compatible with `GroupTimeSeriesSplit`:\n", + "```python\n", + "np.array([0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 2, 2, 2, 2])\n", + "```" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -227,8 +239,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's create splitting with training dataset with size of 3 groups and test sizes of 1 group. \n", - "Number of splits will be calculated automatically. \n", + "Let's create a time series splitting with a training dataset that consists of 3 groups. And we will use 1 group for testing. In this case, the number of splits will be calculated automatically since both the training set and the test set sizes are specified.\n", "\n", "The first 3 rows in the visualization describe how each split is distributed among groups. \n", "The groups are visualized in the last row where each color represents different group." @@ -241,7 +252,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -307,6 +318,13 @@ "### Usage in CV" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The example below illustrates how we can use the time series splitter with scikit-learn, i.e., using `cross_val_score`:" + ] + }, { "cell_type": "code", "execution_count": 7, @@ -359,8 +377,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we will split the dataset with test size (2 goups) and number of splits (3 groups) specified. \n", - "Train size will be calculated automatically. " + "Now let's take a look at an example where we don't specify the number of training groups. Here, we will split the dataset with test size (2 goups) and a specified number of splits (3 groups), which is sufficient for calculating the training size automatically." ] }, { @@ -370,7 +387,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -434,6 +451,13 @@ "### Usage in CV" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, let's take a look at how this looks in a scikit-learn context using `cross_val_score`:" + ] + }, { "cell_type": "code", "execution_count": 9, @@ -486,17 +510,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's make a splitting where we have a gap of the size of 1 group between training and test datasets." + "`GroupTimeSeriesSplit` lets you specify a gap size greater than 0 in order to skip a specified number of groups between training and test folds (the default gap size is 0). In the example below, we use a gap of 1 group to illustrate this." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -560,9 +584,16 @@ "### Usage in CV" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The example below shows what this looks like in a scikit-learn context using `cross_val_score`:" + ] + }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -617,12 +648,12 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -835,9 +866,9 @@ ], "metadata": { "kernelspec": { - "display_name": "mlxtend", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "mlxtend" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -849,7 +880,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.7" } }, "nbformat": 4, From 7c2deddeb2c05dc2ead9dae13c5f0a8e0db1d5db Mon Sep 17 00:00:00 2001 From: rasbt Date: Tue, 24 May 2022 09:59:43 -0500 Subject: [PATCH 28/30] merge ex 1 and 4 --- .../evaluate/GroupTimeSeriesSplit.ipynb | 141 ++++-------------- 1 file changed, 31 insertions(+), 110 deletions(-) diff --git a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb index da4dc3c98..c9662e71e 100644 --- a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb +++ b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb @@ -311,6 +311,37 @@ "print_split_info(X, y, groups, **cv_args)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Please note that if we specify the number of groups for both the training and the test set, the number of splits is determined automatically and naturally changes with the chosen training and test sizes. For example, increasing the number of training groups will naturally result in a lower number of splits as shown below." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "cv_args = {\"test_size\": 1, \"train_size\": 4}\n", + "\n", + "plot_splits(X, y, groups, **cv_args)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -632,116 +663,6 @@ "print_cv_info(cv, X, y, groups, clf, scores)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example 4 -- Expanding the window size" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If we try to expand training dataset set to 4 groups (in comparison with example 1), then number of splits will be adjusted accordingly." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train indices: [0 1 2 3 4 5 6 7 8 9]\n", - "Test indices: [10 11]\n", - "Train length: 10\n", - "Test length: 2\n", - "Train groups: [0 1 1 1 1 2 2 2 3 3]\n", - "Test groups: [4 4]\n", - "Train group size: 4\n", - "Test group size: 1\n", - "Train group months: ['2021-01' '2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03'\n", - " '2021-03' '2021-04' '2021-04']\n", - "Test group months: ['2021-05' '2021-05']\n", - "\n", - "Train indices: [ 1 2 3 4 5 6 7 8 9 10 11]\n", - "Test indices: [12 13 14 15]\n", - "Train length: 11\n", - "Test length: 4\n", - "Train groups: [1 1 1 1 2 2 2 3 3 4 4]\n", - "Test groups: [5 5 5 5]\n", - "Train group size: 4\n", - "Test group size: 1\n", - "Train group months: ['2021-02' '2021-02' '2021-02' '2021-02' '2021-03' '2021-03' '2021-03'\n", - " '2021-04' '2021-04' '2021-05' '2021-05']\n", - "Test group months: ['2021-06' '2021-06' '2021-06' '2021-06']\n", - "\n" - ] - } - ], - "source": [ - "cv_args = {\"test_size\": 1, \"train_size\": 4}\n", - "\n", - "plot_splits(X, y, groups, **cv_args)\n", - "print_split_info(X, y, groups, **cv_args)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Usage in CV" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Split number: 1\n", - "Train true target: [1 0 1 0 1 0 0 1 1 1]\n", - "Train predicted target: [1 1 1 1 1 1 1 1 1 1]\n", - "Test true target: [0 1]\n", - "Test predicted target: [1 1]\n", - "Accuracy: 0.5\n", - "\n", - "Split number: 2\n", - "Train true target: [0 1 0 1 0 0 1 1 1 0 1]\n", - "Train predicted target: [1 1 1 1 1 1 1 1 1 1 1]\n", - "Test true target: [1 0 0 0]\n", - "Test predicted target: [1 1 1 1]\n", - "Accuracy: 0.25\n", - "\n" - ] - } - ], - "source": [ - "cv = 
GroupTimeSeriesSplit(**cv_args)\n", - "clf = DummyClassifier(strategy=\"most_frequent\")\n", - "\n", - "scores = cross_val_score(clf, X, y, groups=groups, scoring=\"accuracy\", cv=cv)\n", - "print_cv_info(cv, X, y, groups, clf, scores)" - ] - }, { "cell_type": "markdown", "metadata": {}, From fb2a2f0550c836a6f44562a322d0e8fe7fda6469 Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Wed, 25 May 2022 09:27:08 +0300 Subject: [PATCH 29/30] Fix typo --- docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb index c9662e71e..291466add 100644 --- a/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb +++ b/docs/sources/user_guide/evaluate/GroupTimeSeriesSplit.ipynb @@ -408,7 +408,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's take a look at an example where we don't specify the number of training groups. Here, we will split the dataset with test size (2 goups) and a specified number of splits (3 groups), which is sufficient for calculating the training size automatically." + "Now let's take a look at an example where we don't specify the number of training groups. Here, we will split the dataset with test size (2 groups) and a specified number of splits (3 groups), which is sufficient for calculating the training size automatically." 
] }, { @@ -801,7 +801,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.8.5" } }, "nbformat": 4, From fc178568fc9a77c2f924466f5d91cb40fcc67545 Mon Sep 17 00:00:00 2001 From: Dmitry Labazkin Date: Wed, 25 May 2022 09:50:22 +0300 Subject: [PATCH 30/30] Add entry for GroupTimeSeriesSplit --- docs/sources/CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index c07ce73a5..a91b0d775 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -26,6 +26,7 @@ The CHANGELOG for the current development version is available at - The `mlxtend.evaluate.bootstrap_point632_score` now supports `fit_params`. ([#861](https://github.com/rasbt/mlxtend/pull/861)) - The `mlxtend/plotting/decision_regions.py` function now has a `contourf_kwargs` for matplotlib to change the look of the decision boundaries if desired. ([#881](https://github.com/rasbt/mlxtend/pull/881) via [[pbloem](https://github.com/pbloem)]) - Add a `norm_colormap` parameter to `mlxtend.plotting.plot_confusion_matrix`, to allow normalizing the colormap, e.g., using `matplotlib.colors.LogNorm()` ([#895](https://github.com/rasbt/mlxtend/pull/895)) +- Add new `GroupTimeSeriesSplit` class for evaluation in time series tasks with support of custom groups and additional parameters in comparison with scikit-learn's `TimeSeriesSplit`. ([#915](https://github.com/rasbt/mlxtend/pull/915) via [Dmitry Labazkin](https://github.com/labdmitriy)) ##### Changes