Skip to content

Commit

Permalink
Change default for incomplete chunks to keep (#367)
Browse files (browse the repository at this point in the history)
* Change default for incomplete chunks to `keep`

* Fix failing tests due to 'keep' default for Size based chunker

---------

Co-authored-by: Niels Nuyttens <[email protected]>
  • Loading branch information
michael-nml and nnansters authored Feb 25, 2024
1 parent 3246e76 commit e6cc9b6
Show file tree
Hide file tree
Showing 8 changed files with 56 additions and 57 deletions.
8 changes: 4 additions & 4 deletions nannyml/chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,14 +331,14 @@ class SizeBasedChunker(Chunker):
"""

def __init__(self, chunk_size: int, incomplete: str = 'append', timestamp_column_name: Optional[str] = None):
def __init__(self, chunk_size: int, incomplete: str = 'keep', timestamp_column_name: Optional[str] = None):
"""Create a new SizeBasedChunker.
Parameters
----------
chunk_size: int
The preferred size of the resulting Chunks, i.e. the number of observations in each Chunk.
incomplete: str, default='append'
incomplete: str, default='keep'
Choose how to handle any leftover observations that don't make up a full Chunk.
The following options are available:
Expand Down Expand Up @@ -429,7 +429,7 @@ class CountBasedChunker(Chunker):
"""

def __init__(self, chunk_number: int, incomplete: str = 'append', timestamp_column_name: Optional[str] = None):
def __init__(self, chunk_number: int, incomplete: str = 'keep', timestamp_column_name: Optional[str] = None):
"""Creates a new CountBasedChunker.
It will calculate the amount of observations per chunk based on the given chunk count.
Expand All @@ -450,7 +450,7 @@ def __init__(self, chunk_number: int, incomplete: str = 'append', timestamp_colu
- ``'append'``: append leftover observations to the last complete Chunk (overfilling it)
Defaults to ``'append'``.
Defaults to ``'keep'``.
Returns
-------
Expand Down
6 changes: 0 additions & 6 deletions nannyml/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,6 @@ class WriterConfig(BaseModel):
write_args: Optional[Dict[str, Any]]


class ChunkerConfig(BaseModel):
chunk_size: Optional[int]
chunk_period: Optional[str]
chunk_count: Optional[int]


class IntervalSchedulingConfig(BaseModel):
weeks: Optional[int]
days: Optional[int]
Expand Down
14 changes: 7 additions & 7 deletions nannyml/sampling_error/summary_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@
#
# License: Apache Software License 2.0

from logging import getLogger
from typing import Tuple

import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde, moment
from logging import getLogger

logger = getLogger(__name__)


def summary_stats_std_sampling_error_components(col: pd.Series) -> Tuple:
"""
Calculate sampling error components for Summary Stats Standard Deviation
Expand Down Expand Up @@ -54,12 +55,11 @@ def summary_stats_std_sampling_error(sampling_error_components, col) -> float:
_mu4 = sampling_error_components[1]
_size = col.shape[0]

err_var_parenthesis_part = (_mu4 - ((_size - 3) * (_std**4) / (_size - 1)))
if not (
np.isfinite(err_var_parenthesis_part) and
err_var_parenthesis_part >= 0
):
logger.debug("Summary Stats sampling error calculation imputed to nan because of non finite positive parenthesis factor.")
err_var_parenthesis_part = _mu4 - ((_size - 3) * (_std**4) / (_size - 1))
if not (np.isfinite(err_var_parenthesis_part) and err_var_parenthesis_part >= 0):
logger.debug(
"Summary Stats sampling error calculation imputed to nan because of non finite positive parenthesis factor."
)
return np.nan
err_var = np.sqrt((1 / _size) * err_var_parenthesis_part)
return (1 / (2 * _std)) * err_var
Expand Down
4 changes: 2 additions & 2 deletions tests/drift/test_drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,11 +453,11 @@ def test_statistical_drift_calculator_deals_with_missing_class_labels(sample_dri
[
(
{'chunk_size': 5000},
[0.004968, 0.004833, 0.01186, 0.242068],
[0.004968, 0.004833, 0.01186, 0.243595, 0.210516],
),
(
{'chunk_size': 5000, 'timestamp_column_name': 'timestamp'},
[0.004968, 0.004833, 0.01186, 0.242068],
[0.004968, 0.004833, 0.01186, 0.243595, 0.210516],
),
(
{'chunk_number': 5},
Expand Down
4 changes: 2 additions & 2 deletions tests/drift/test_multiv_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,11 +292,11 @@ def test_data_reconstruction_drift_calculator_numeric_results(sample_drift_data)
[
(
{'chunk_size': 5000},
[0.7998744001719177, 0.8020996183121666, 0.8043000024523013, 0.73631],
[0.79987, 0.80210, 0.80430, 0.73552, 0.76087],
),
(
{'chunk_size': 5000, 'timestamp_column_name': 'timestamp'},
[0.7998744001719177, 0.8020996183121666, 0.8043000024523013, 0.73631],
[0.79987, 0.80210, 0.80430, 0.73552, 0.76087],
),
(
{'chunk_number': 5},
Expand Down
34 changes: 23 additions & 11 deletions tests/performance_estimation/CBPE/test_cbpe_metrics.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import pytest

from nannyml.chunk import DefaultChunker
from nannyml.chunk import DefaultChunker, SizeBasedChunker
from nannyml.datasets import (
load_synthetic_binary_classification_dataset,
load_synthetic_multiclass_classification_dataset,
Expand All @@ -24,7 +24,7 @@
[
(
{
'chunk_size': 20000,
'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': None,
'business_value_matrix': [[2, -5], [-10, 10]],
'normalize_business_value': None,
Expand All @@ -48,7 +48,7 @@
),
(
{
'chunk_size': 20000,
'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': None,
'business_value_matrix': [[2, -5], [-10, 10]],
'normalize_business_value': 'per_prediction',
Expand All @@ -71,7 +71,11 @@
),
),
(
{'chunk_size': 20000, 'normalize_confusion_matrix': 'all', 'business_value_matrix': [[-1, 4], [8, -8]]},
{
'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'all',
'business_value_matrix': [[-1, 4], [8, -8]],
},
pd.DataFrame(
{
'key': ['[0:19999]', '[20000:49999]'],
Expand All @@ -90,7 +94,11 @@
),
),
(
{'chunk_size': 20000, 'normalize_confusion_matrix': 'true', 'business_value_matrix': [[-1, 4], [8, -8]]},
{
'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'true',
'business_value_matrix': [[-1, 4], [8, -8]],
},
pd.DataFrame(
{
'key': ['[0:19999]', '[20000:49999]'],
Expand All @@ -109,7 +117,11 @@
),
),
(
{'chunk_size': 20000, 'normalize_confusion_matrix': 'pred', 'business_value_matrix': [[-1, 4], [8, -8]]},
{
'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'pred',
'business_value_matrix': [[-1, 4], [8, -8]],
},
pd.DataFrame(
{
'key': ['[0:19999]', '[20000:49999]'],
Expand All @@ -129,7 +141,7 @@
),
(
{
'chunk_size': 20000,
'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': None,
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[-1, 4], [8, -8]],
Expand All @@ -153,7 +165,7 @@
),
(
{
'chunk_size': 20000,
'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'all',
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[-1, 4], [8, -8]],
Expand All @@ -177,7 +189,7 @@
),
(
{
'chunk_size': 20000,
'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'all',
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[2, -5], [-10, 10]],
Expand All @@ -202,7 +214,7 @@
),
(
{
'chunk_size': 20000,
'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'true',
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[-1, 4], [8, -8]],
Expand All @@ -226,7 +238,7 @@
),
(
{
'chunk_size': 20000,
'chunker': SizeBasedChunker(chunk_size=20000, incomplete='append'),
'normalize_confusion_matrix': 'pred',
'timestamp_column_name': 'timestamp',
'business_value_matrix': [[-1, 4], [8, -8]],
Expand Down
37 changes: 15 additions & 22 deletions tests/stats/test_std.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,13 @@

"""Tests for Drift package."""

import pytest
import pandas as pd
import numpy as np
import pandas as pd
import pytest


from nannyml.chunk import SizeBasedChunker
from nannyml.datasets import load_synthetic_car_loan_dataset
from nannyml.stats import SummaryStatsStdCalculator
from nannyml.chunk import SizeBasedChunker

# @pytest.fixture(scope="module")
# def status_sum_result() -> Result:
Expand Down Expand Up @@ -43,30 +42,24 @@ def test_stats_std_calculator_with_default_params_chunk_size_one(): # noqa: D10
reference, analysis, _ = load_synthetic_car_loan_dataset()

chunker = SizeBasedChunker(chunk_size=5_000, incomplete='keep')
calc = SummaryStatsStdCalculator(
column_names=['car_value'],
chunker=chunker
).fit(reference)
calc = SummaryStatsStdCalculator(column_names=['car_value'], chunker=chunker).fit(reference)
result = calc.calculate(data=analysis.head(5_001))
expected = pd.DataFrame(
{
('chunk', 'key'): ['[0:4999]', '[5000:5000]'],
('chunk', 'chunk_index'): [0,1],
('chunk', 'start_index'): [0,5000],
('chunk', 'end_index'): [4999,5000],
('chunk', 'start_date'): [None,None],
('chunk', 'end_date'): [None,None],
('chunk', 'period'): ['analysis','analysis'],
('car_value', 'value'): [20614.8926,np.nan],
('car_value', 'sampling_error'): [271.9917,np.nan],
('car_value', 'upper_confidence_boundary'): [21430.8679,np.nan],
('car_value', 'lower_confidence_boundary'): [19798.9174,np.nan],
('chunk', 'chunk_index'): [0, 1],
('chunk', 'start_index'): [0, 5000],
('chunk', 'end_index'): [4999, 5000],
('chunk', 'start_date'): [None, None],
('chunk', 'end_date'): [None, None],
('chunk', 'period'): ['analysis', 'analysis'],
('car_value', 'value'): [20614.8926, np.nan],
('car_value', 'sampling_error'): [271.9917, np.nan],
('car_value', 'upper_confidence_boundary'): [21430.8679, np.nan],
('car_value', 'lower_confidence_boundary'): [19798.9174, np.nan],
('car_value', 'upper_threshold'): [20978.5658, 20978.5658],
('car_value', 'lower_threshold'): [19816.9091, 19816.9091],
('car_value', 'alert'): [False, True],
}
)
pd.testing.assert_frame_equal(
expected,
result.filter(period='analysis').to_df().round(4)
)
pd.testing.assert_frame_equal(expected, result.filter(period='analysis').to_df().round(4))
6 changes: 3 additions & 3 deletions tests/test_chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,12 +241,12 @@ def test_size_based_chunker_returns_chunks_of_required_size(sample_chunk_data):
chunker = SizeBasedChunker(chunk_size=chunk_size)
sut = chunker.split(sample_chunk_data)
assert len(sut[0]) == chunk_size
assert len(sut) == math.ceil(sample_chunk_data.shape[0] / chunk_size) - 1
assert len(sut) == math.ceil(sample_chunk_data.shape[0] / chunk_size)


def test_size_based_chunker_returns_last_chunk_that_is_partially_filled(sample_chunk_data): # noqa: D103
chunk_size = 3333
expected_last_chunk_size = chunk_size + sample_chunk_data.shape[0] % chunk_size
expected_last_chunk_size = sample_chunk_data.shape[0] % chunk_size
chunker = SizeBasedChunker(chunk_size)
sut = chunker.split(sample_chunk_data)
assert len(sut[-1]) == expected_last_chunk_size
Expand Down Expand Up @@ -304,7 +304,7 @@ def test_size_based_chunker_uses_observations_to_set_chunk_date_boundaries(sampl

def test_size_based_chunker_assigns_observation_range_to_chunk_keys(sample_chunk_data): # noqa: D103
chunk_size = 1500
last_chunk_start = ((sample_chunk_data.shape[0] // chunk_size) - 1) * chunk_size
last_chunk_start = (sample_chunk_data.shape[0] // chunk_size) * chunk_size
last_chunk_end = sample_chunk_data.shape[0] - 1

chunker = SizeBasedChunker(chunk_size=chunk_size)
Expand Down

0 comments on commit e6cc9b6

Please sign in to comment.