diff --git a/src/dolomite_matrix/DelayedMask.py b/src/dolomite_matrix/DelayedMask.py index 3d34cef..9837a05 100644 --- a/src/dolomite_matrix/DelayedMask.py +++ b/src/dolomite_matrix/DelayedMask.py @@ -134,3 +134,8 @@ def is_sparse_DelayedMask(x: DelayedMask): """See :py:meth:`~delayedarray.is_sparse.is_sparse`.""" return delayedarray.is_sparse(x._seed) + +@delayedarray.is_masked.register +def is_masked_DelayedMask(x: DelayedMask): + """See :py:meth:`~delayedarray.is_masked.is_masked`.""" + return True diff --git a/src/dolomite_matrix/_optimize_storage.py b/src/dolomite_matrix/_optimize_storage.py index 9a6a60e..4226eb9 100644 --- a/src/dolomite_matrix/_optimize_storage.py +++ b/src/dolomite_matrix/_optimize_storage.py @@ -6,7 +6,7 @@ import dolomite_base as dl import h5py import numpy -from delayedarray import SparseNdarray, apply_over_blocks, is_sparse +from delayedarray import SparseNdarray, apply_over_blocks, choose_block_shape_for_iteration, is_sparse, is_masked has_scipy = False try: @@ -53,9 +53,9 @@ def _aggregate_sum(collated: list, name: str): return mval -def _blockwise_any(x: numpy.ndarray, condition: Callable): +def _blockwise_any(x: numpy.ndarray, condition: Callable, buffer_size: int) -> bool: y = x.ravel() - step = 100000 + step = max(1, int(buffer_size / x.dtype.itemsize)) limit = len(y) for i in range(0, limit, step): if condition(y[i : min(limit, i + step)]).any(): @@ -63,7 +63,6 @@ def _blockwise_any(x: numpy.ndarray, condition: Callable): return False - def _collect_from_Sparse2darray(contents, fun: Callable, dtype: Callable): if contents is None: attrs = fun(numpy.array([], dtype=dtype)) @@ -91,7 +90,7 @@ def _unique_values_from_ndarray(position: Tuple, contents: numpy.ndarray) -> set return set(contents) output = set() for y in contents: - if not numpy.ma.is_masked(y): + if not y is numpy.ma.masked: output.add(y) return output @@ -117,15 +116,6 @@ def _unique_values(x) -> set: ################################################### 
-@singledispatch -def collect_integer_attributes(x: Any): - if is_sparse(x): - collated = apply_over_blocks(x, lambda pos, block : _collect_integer_attributes_from_Sparse2darray(block), allow_sparse=True) - else: - collated = apply_over_blocks(x, lambda pos, block : _collect_integer_attributes_from_ndarray(block)) - return _combine_integer_attributes(collated) - - @dataclass class _IntegerAttributes: minimum: Optional[int] @@ -134,12 +124,31 @@ class _IntegerAttributes: non_zero: int = 0 -def _simple_integer_collector(x: numpy.ndarray) -> _IntegerAttributes: +@singledispatch +def collect_integer_attributes(x: Any, buffer_size: int) -> _IntegerAttributes: + block_shape = choose_block_shape_for_iteration(x, memory = buffer_size) + if is_sparse(x): + collated = apply_over_blocks( + x, + lambda pos, block : _collect_integer_attributes_from_Sparse2darray(block, buffer_size), + block_shape = block_shape, + allow_sparse=True + ) + else: + collated = apply_over_blocks( + x, + lambda pos, block : _collect_integer_attributes_from_ndarray(block, buffer_size), + block_shape = block_shape + ) + return _combine_integer_attributes(collated, check_missing = is_masked(x)) + + +def _simple_integer_collector(x: numpy.ndarray, check_missing: bool) -> _IntegerAttributes: if x.size == 0: return _IntegerAttributes(minimum = None, maximum = None, missing = False) missing = False - if numpy.ma.is_masked(x): + if check_missing: if x.mask.all(): return _IntegerAttributes(minimum = None, maximum = None, missing = True) if x.mask.any(): @@ -148,50 +157,65 @@ def _simple_integer_collector(x: numpy.ndarray) -> _IntegerAttributes: return _IntegerAttributes(minimum=x.min(), maximum=x.max(), missing=missing) -def _combine_integer_attributes(x: List[_IntegerAttributes]): +def _combine_integer_attributes(x: List[_IntegerAttributes], check_missing: bool): + if check_missing: + missing = _aggregate_any(x, "missing") + else: + missing = False + return _IntegerAttributes( - minimum=_aggregate_min(x, 
"minimum"), - maximum=_aggregate_max(x, "maximum"), - missing=_aggregate_any(x, "missing"), - non_zero=_aggregate_sum(x, "non_zero"), + minimum = _aggregate_min(x, "minimum"), + maximum = _aggregate_max(x, "maximum"), + missing = missing, + non_zero = _aggregate_sum(x, "non_zero"), ) @collect_integer_attributes.register -def _collect_integer_attributes_from_ndarray(x: numpy.ndarray) -> _IntegerAttributes: - return _simple_integer_collector(x) +def _collect_integer_attributes_from_ndarray(x: numpy.ndarray, buffer_size: int) -> _IntegerAttributes: + return _simple_integer_collector(x, check_missing = numpy.ma.isMaskedArray(x)) @collect_integer_attributes.register -def _collect_integer_attributes_from_Sparse2darray(x: SparseNdarray) -> _IntegerAttributes: - collected = _collect_from_Sparse2darray(x.contents, _simple_integer_collector, x.dtype) - return _combine_integer_attributes(collected) +def _collect_integer_attributes_from_Sparse2darray(x: SparseNdarray, buffer_size: int) -> _IntegerAttributes: + check_missing = is_masked(x) + collected = _collect_from_Sparse2darray(x.contents, lambda block : _simple_integer_collector(block, check_missing), x.dtype) + return _combine_integer_attributes(collected, check_missing) if has_scipy: + # Currently, it seems like scipy's sparse matrices are not intended + # to be masked, seeing as how any subsetting discards the masks, e.g., + # + # >>> y = (scipy.sparse.random(1000, 200, 0.1)).tocsr() + # >>> y.data = numpy.ma.MaskedArray(y.data, y.data > 0.5) + # >>> y[0:5,:].data # gives back a regulary NumPy array. + # + # So we won't bother capturing the mask state. 
+ @collect_integer_attributes.register - def _collect_integer_attributes_from_scipy_csc(x: scipy.sparse.csc_matrix): - output = _simple_integer_collector(x.data) + def _collect_integer_attributes_from_scipy_csc(x: scipy.sparse.csc_matrix, buffer_size: int): + output = _simple_integer_collector(x.data, check_missing = False) output.non_zero = int(x.data.shape[0]) return output @collect_integer_attributes.register - def _collect_integer_attributes_from_scipy_csr(x: scipy.sparse.csr_matrix): - output = _simple_integer_collector(x.data) + def _collect_integer_attributes_from_scipy_csr(x: scipy.sparse.csr_matrix, buffer_size: int): + output = _simple_integer_collector(x.data, check_missing = False) output.non_zero = int(x.data.shape[0]) return output @collect_integer_attributes.register - def _collect_integer_attributes_from_scipy_coo(x: scipy.sparse.coo_matrix): - output = _simple_integer_collector(x.data) + def _collect_integer_attributes_from_scipy_coo(x: scipy.sparse.coo_matrix, buffer_size: int): + output = _simple_integer_collector(x.data, check_missing = False) output.non_zero = int(x.data.shape[0]) return output -def optimize_integer_storage(x) -> _OptimizedStorageParameters: - attr = collect_integer_attributes(x) +def optimize_integer_storage(x, buffer_size: int = 1e8) -> _OptimizedStorageParameters: + attr = collect_integer_attributes(x, buffer_size) lower = attr.minimum upper = attr.maximum has_missing = attr.missing @@ -252,135 +276,207 @@ def optimize_integer_storage(x) -> _OptimizedStorageParameters: @dataclass class _FloatAttributes: - minimum: Optional[int] - maximum: Optional[int] - missing: bool + # Minimum and maximum are only set if non_integer = False. non_integer: bool - has_nan: bool - has_positive_inf: bool - has_negative_inf: bool + integer_minimum: Optional[int] + integer_maximum: Optional[int] + + # These flags are only set if check_missing = True. 
+ has_nan: Optional[bool] + has_positive_inf: Optional[bool] + has_negative_inf: Optional[bool] has_zero: Optional[bool] has_lowest: Optional[bool] has_highest: Optional[bool] + + missing: bool non_zero: int = 0 @singledispatch -def collect_float_attributes(x: Any, no_missing: bool) -> _FloatAttributes: +def collect_float_attributes(x: Any, buffer_size: int) -> _FloatAttributes: + block_shape = choose_block_shape_for_iteration(x, memory = buffer_size) if is_sparse(x): - collated = apply_over_blocks(x, lambda pos, block : _collect_float_attributes_from_Sparse2darray(block, no_missing), allow_sparse=True) + collated = apply_over_blocks( + x, + lambda pos, block : _collect_float_attributes_from_Sparse2darray(block, buffer_size), + block_shape = block_shape, + allow_sparse=True + ) else: - collated = apply_over_blocks(x, lambda pos, block : _collect_float_attributes_from_ndarray(block, no_missing)) - return _combine_float_attributes(collated) - - -def _simple_float_collector(x: numpy.ndarray, no_missing: bool) -> _FloatAttributes: - output = _FloatAttributes( - minimum = None, - maximum = None, - missing = False, - non_integer = False, - has_nan = False, - has_positive_inf = False, - has_negative_inf = False, - has_zero = None, - has_lowest = None, - has_highest = None - ) + collated = apply_over_blocks( + x, + lambda pos, block : _collect_float_attributes_from_ndarray(block, buffer_size), + block_shape = block_shape + ) + return _combine_float_attributes(collated, check_missing = is_masked(x)) + +def _simple_float_collector(x: numpy.ndarray, check_missing: bool, buffer_size: int) -> _FloatAttributes: + # Do NOT set default parameters in _FloatAttributes; it's too easy to + # forget to set one of these flags. Prefer to spell it all out explicitly + # to avoid errors, despite the verbosity. 
if x.size == 0: - return output + return _FloatAttributes( + non_integer = False, + integer_minimum = None, + integer_maximum = None, + missing = False, + has_nan = False, + has_positive_inf = False, + has_negative_inf = False, + has_zero = False, + has_lowest = False, + has_highest = False + ) - if not no_missing: + missing = False + if check_missing: if numpy.ma.is_masked(x): if x.mask.all(): - output.missing = True - return output + return _FloatAttributes( + non_integer = False, + integer_minimum = None, + integer_maximum = None, + missing = True, + has_nan = False, + has_positive_inf = False, + has_negative_inf = False, + has_zero = False, + has_lowest = False, + has_highest = False + ) if x.mask.any(): - output.missing = True - - # While these are technically only used if there are missing values, we - # still need them to obtain 'non_integer', so we just compute them. - output.has_nan = _blockwise_any(x, numpy.isnan) - output.has_positive_inf = numpy.inf in x - output.has_negative_inf = -numpy.inf in x - - if output.has_nan or output.has_positive_inf or output.has_negative_inf: - output.non_integer = True - else: - output.non_integer = _blockwise_any(x, lambda b : (b % 1 != 0)) + missing = True - # Minimum and maximum are only used if all floats contain integers. - if not output.non_integer: - output.minimum = x.min() - output.maximum = x.max() + has_nan = _blockwise_any(x, numpy.isnan, buffer_size = buffer_size) + has_positive_inf = numpy.inf in x + has_negative_inf = -numpy.inf in x + non_finite = (has_nan or has_positive_inf or has_negative_inf) - # Highest/lowest are only used when there might be missing values. 
- if not no_missing: fstats = numpy.finfo(x.dtype) - output.has_lowest = fstats.min in x - output.has_highest = fstats.max in x - output.has_zero = 0 in x + has_lowest = fstats.min in x + has_highest = fstats.max in x + has_zero = 0 in x + else: + non_finite = _blockwise_any(x, lambda b : numpy.logical_not(numpy.isfinite(b)), buffer_size = buffer_size) + has_nan = None + has_positive_inf = None + has_negative_inf = None + has_lowest = None + has_highest = None + has_zero = None + + integer_minimum = None + integer_maximum = None + if non_finite: + non_integer = True + else: + non_integer = _blockwise_any(x, lambda b : (b % 1 != 0), buffer_size = buffer_size) + if not non_integer: + integer_minimum = x.min() + integer_maximum = x.max() - return output + return _FloatAttributes( + non_integer = non_integer, + integer_minimum = integer_minimum, + integer_maximum = integer_maximum, + missing = missing, + has_nan = has_nan, + has_positive_inf = has_positive_inf, + has_negative_inf = has_negative_inf, + has_zero = has_zero, + has_lowest = has_lowest, + has_highest = has_highest, + ) @collect_float_attributes.register -def _collect_float_attributes_from_ndarray(x: numpy.ndarray, no_missing: bool) -> _FloatAttributes: - return _simple_float_collector(x, no_missing) +def _collect_float_attributes_from_ndarray(x: numpy.ndarray, buffer_size: int) -> _FloatAttributes: + return _simple_float_collector(x, check_missing = numpy.ma.isMaskedArray(x), buffer_size = buffer_size) @collect_float_attributes.register -def _collect_float_attributes_from_Sparse2darray(x: SparseNdarray, no_missing: bool) -> _FloatAttributes: - collected = _collect_from_Sparse2darray(x.contents, lambda block : _simple_float_collector(block, no_missing), x.dtype) - return _combine_float_attributes(collected) +def _collect_float_attributes_from_Sparse2darray(x: SparseNdarray, buffer_size: int) -> _FloatAttributes: + check_missing = is_masked(x) + collected = _collect_from_Sparse2darray( + x.contents, + lambda 
block : _simple_float_collector(block, check_missing, buffer_size), + x.dtype + ) + return _combine_float_attributes(collected, check_missing) if has_scipy: @collect_float_attributes.register - def _collect_float_attributes_from_scipy_csc(x: scipy.sparse.csc_matrix, no_missing: bool): - output = _simple_float_collector(x.data, no_missing) + def _collect_float_attributes_from_scipy_csc(x: scipy.sparse.csc_matrix, buffer_size: int) -> _FloatAttributes: + output = _simple_float_collector(x.data, check_missing = False, buffer_size = buffer_size) output.non_zero = int(x.data.shape[0]) return output @collect_float_attributes.register - def _collect_float_attributes_from_scipy_csr(x: scipy.sparse.csr_matrix, no_missing: bool): - output = _simple_float_collector(x.data, no_missing) + def _collect_float_attributes_from_scipy_csr(x: scipy.sparse.csr_matrix, buffer_size: int) -> _FloatAttributes: + output = _simple_float_collector(x.data, check_missing = False, buffer_size = buffer_size) output.non_zero = int(x.data.shape[0]) return output @collect_float_attributes.register - def _collect_float_attributes_from_scipy_coo(x: scipy.sparse.coo_matrix, no_missing: bool): - output = _simple_float_collector(x.data, no_missing) + def _collect_float_attributes_from_scipy_coo(x: scipy.sparse.coo_matrix, buffer_size: int) -> _FloatAttributes: + output = _simple_float_collector(x.data, check_missing = False, buffer_size = buffer_size) output.non_zero = int(x.data.shape[0]) return output -def _combine_float_attributes(x: List[_FloatAttributes]) -> _FloatAttributes: +def _combine_float_attributes(x: List[_FloatAttributes], check_missing: bool) -> _FloatAttributes: + non_integer = _aggregate_any(x, "non_integer") + if not non_integer: + integer_minimum = _aggregate_min(x, "integer_minimum") + integer_maximum = _aggregate_max(x, "integer_maximum") + else: + integer_minimum = None + integer_maximum = None + + if check_missing: + missing = _aggregate_any(x, "missing") + has_nan = 
_aggregate_any(x, "has_nan") + has_positive_inf = _aggregate_any(x, "has_positive_inf") + has_negative_inf = _aggregate_any(x, "has_negative_inf") + has_lowest = _aggregate_any(x, "has_lowest") + has_highest = _aggregate_any(x, "has_highest") + has_zero = _aggregate_any(x, "has_zero") + else: + missing = False + has_nan = None + has_positive_inf = None + has_negative_inf = None + has_lowest = None + has_highest = None + has_zero = None + return _FloatAttributes( - minimum=_aggregate_min(x, "minimum"), - maximum=_aggregate_max(x, "maximum"), - non_integer=_aggregate_any(x, "non_integer"), - missing=_aggregate_any(x, "missing"), - has_nan=_aggregate_any(x, "has_nan"), - has_positive_inf=_aggregate_any(x, "has_positive_inf"), - has_negative_inf=_aggregate_any(x, "has_negative_inf"), - has_lowest=_aggregate_any(x, "has_lowest"), - has_highest=_aggregate_any(x, "has_highest"), - has_zero=_aggregate_any(x, "has_zero"), - non_zero=_aggregate_sum(x, "non_zero"), + non_integer = non_integer, + integer_minimum = integer_minimum, + integer_maximum = integer_maximum, + missing = missing, + has_nan = has_nan, + has_positive_inf = has_positive_inf, + has_negative_inf = has_negative_inf, + has_lowest = has_lowest, + has_highest = has_highest, + has_zero = has_zero, + non_zero = _aggregate_sum(x, "non_zero"), ) -def optimize_float_storage(x) -> _OptimizedStorageParameters: - attr = collect_float_attributes(x, isinstance(x, numpy.ndarray) and not numpy.ma.is_masked(x)) +def optimize_float_storage(x, buffer_size: int = 1e8) -> _OptimizedStorageParameters: + attr = collect_float_attributes(x, buffer_size = buffer_size) if attr.missing: if not attr.non_integer: - lower = attr.minimum - upper = attr.maximum + lower = attr.integer_minimum + upper = attr.integer_maximum # See logic in optimize_integer_storage(). 
if lower is None: @@ -430,8 +526,8 @@ def optimize_float_storage(x) -> _OptimizedStorageParameters: else: if not attr.non_integer: - lower = attr.minimum - upper = attr.maximum + lower = attr.integer_minimum + upper = attr.integer_maximum # See logic in optimize_integer_storage(). if lower is None: @@ -464,29 +560,29 @@ def optimize_float_storage(x) -> _OptimizedStorageParameters: @dataclass class _StringAttributes: - has_na1: bool - has_na2: bool missing: bool + has_na1: Optional[bool] + has_na2: Optional[bool] max_len: int -def _simple_string_collector(x: numpy.ndarray) -> _FloatAttributes: +def _simple_string_collector(x: numpy.ndarray, check_missing: None) -> _StringAttributes: if x.size == 0: return _StringAttributes( + missing = False, has_na1 = False, has_na2 = False, - missing = False, max_len = 0, ) missing = False - if numpy.ma.is_masked(x): + if check_missing: if x.mask.all(): return _StringAttributes( - has_na1=False, - has_na2=False, - missing=True, - max_len=0, + missing = True, + has_na1 = False, + has_na2 = False, + max_len = 0, ) if x.mask.any(): missing = True @@ -501,36 +597,57 @@ def _simple_string_collector(x: numpy.ndarray) -> _FloatAttributes: else: max_len = max(len(y.encode("UTF8")) for y in x.ravel()) + if check_missing: + has_na1 = x.dtype.type("NA") in x + has_na2 = x.dtype.type("NA_") in x + else: + has_na1 = None + has_na2 = None + return _StringAttributes( - has_na1=x.dtype.type("NA") in x, - has_na2=x.dtype.type("NA_") in x, - missing=missing, - max_len=max_len, + missing = missing, + has_na1 = has_na1, + has_na2 = has_na2, + max_len = max_len, ) @singledispatch -def collect_string_attributes(x: Any) -> _StringAttributes: - collected = apply_over_blocks(x, lambda pos, block : _collect_string_attributes_from_ndarray(block)) - return _combine_string_attributes(collected) +def collect_string_attributes(x: Any, buffer_size: int) -> _StringAttributes: + block_shape = choose_block_shape_for_iteration(x, memory = buffer_size) + collected = 
apply_over_blocks( + x, + lambda pos, block : _collect_string_attributes_from_ndarray(block, buffer_size), + block_shape = block_shape + ) + return _combine_string_attributes(collected, check_missing = is_masked(x)) + +def _combine_string_attributes(x: List[_StringAttributes], check_missing: bool) -> _StringAttributes: + if check_missing: + missing = _aggregate_any(x, "missing") + has_na1 = _aggregate_any(x, "has_na1") + has_na2 = _aggregate_any(x, "has_na2") + else: + missing = False + has_na1 = None + has_na2 = None -def _combine_string_attributes(x: List[_StringAttributes]) -> _StringAttributes: return _StringAttributes( - has_na1 = _aggregate_any(x, "has_na1"), - has_na2 = _aggregate_any(x, "has_na2"), - missing = _aggregate_any(x, "missing"), + missing = missing, + has_na1 = has_na1, + has_na2 = has_na2, max_len = _aggregate_max(x, "max_len"), ) @collect_string_attributes.register -def _collect_string_attributes_from_ndarray(x: numpy.ndarray) -> _StringAttributes: - return _simple_string_collector(x) +def _collect_string_attributes_from_ndarray(x: numpy.ndarray, buffer_size: int) -> _StringAttributes: + return _simple_string_collector(x, check_missing = numpy.ma.isMaskedArray(x)) -def optimize_string_storage(x) -> _OptimizedStorageParameters: - attr = collect_string_attributes(x) +def optimize_string_storage(x, buffer_size: int = 1e8) -> _OptimizedStorageParameters: + attr = collect_string_attributes(x, buffer_size = buffer_size) attr.max_len = max(1, attr.max_len) placeholder = None @@ -559,65 +676,85 @@ class _BooleanAttributes: @singledispatch -def collect_boolean_attributes(x: Any) -> _BooleanAttributes: +def collect_boolean_attributes(x: Any, buffer_size: int) -> _BooleanAttributes: + block_shape = choose_block_shape_for_iteration(x, memory = buffer_size) if is_sparse(x): - collated = apply_over_blocks(x, lambda pos, block : _collect_boolean_attributes_from_Sparse2darray(block), allow_sparse=True) + collated = apply_over_blocks( + x, + lambda pos, block : 
_collect_boolean_attributes_from_Sparse2darray(block, buffer_size), + block_shape = block_shape, + allow_sparse=True + ) else: - collated = apply_over_blocks(x, lambda pos, block : _collect_boolean_attributes_from_ndarray(block)) - return _combine_boolean_attributes(collated) + collated = apply_over_blocks( + x, + lambda pos, block : _collect_boolean_attributes_from_ndarray(block, buffer_size), + block_shape = block_shape + ) + return _combine_boolean_attributes(collated, check_missing = is_masked(x)) @collect_boolean_attributes.register -def _collect_boolean_attributes_from_ndarray(x: numpy.ndarray) -> _BooleanAttributes: - return _simple_boolean_collector(x) +def _collect_boolean_attributes_from_ndarray(x: numpy.ndarray, buffer_size: int) -> _BooleanAttributes: + return _simple_boolean_collector(x, check_missing = numpy.ma.isMaskedArray(x)) @collect_boolean_attributes.register -def _collect_boolean_attributes_from_Sparse2darray(x: SparseNdarray) -> _BooleanAttributes: - collected = _collect_from_Sparse2darray(x.contents, _simple_boolean_collector, x.dtype) - return _combine_boolean_attributes(collected) +def _collect_boolean_attributes_from_Sparse2darray(x: SparseNdarray, buffer_size: int) -> _BooleanAttributes: + check_missing = is_masked(x) + collected = _collect_from_Sparse2darray( + x.contents, + lambda block : _simple_boolean_collector(block, check_missing), + x.dtype + ) + return _combine_boolean_attributes(collected, check_missing) -def _simple_boolean_collector(x: numpy.ndarray) -> _BooleanAttributes: +def _simple_boolean_collector(x: numpy.ndarray, check_missing: bool) -> _BooleanAttributes: missing = False if x.size: - if numpy.ma.is_masked(x): + if check_missing: if x.mask.any(): missing = True return _BooleanAttributes(non_zero = 0, missing = missing) -def _combine_boolean_attributes(x: List[_BooleanAttributes]) -> _BooleanAttributes: +def _combine_boolean_attributes(x: List[_BooleanAttributes], check_missing: bool) -> _BooleanAttributes: + if 
check_missing: + missing = _aggregate_any(x, "missing") + else: + missing = False + return _BooleanAttributes( - missing = _aggregate_any(x, "missing"), + missing = missing, non_zero = _aggregate_sum(x, "non_zero") ) if has_scipy: @collect_boolean_attributes.register - def _collect_boolean_attributes_from_scipy_csc(x: scipy.sparse.csc_matrix): - output = _simple_boolean_collector(x.data) + def _collect_boolean_attributes_from_scipy_csc(x: scipy.sparse.csc_matrix, buffer_size: int) -> _BooleanAttributes: + output = _simple_boolean_collector(x.data, check_missing = False) output.non_zero = int(x.data.shape[0]) return output @collect_boolean_attributes.register - def _collect_boolean_attributes_from_scipy_csr(x: scipy.sparse.csr_matrix): - output = _simple_boolean_collector(x.data) + def _collect_boolean_attributes_from_scipy_csr(x: scipy.sparse.csr_matrix, buffer_size: int) -> _BooleanAttributes: + output = _simple_boolean_collector(x.data, check_missing = False) output.non_zero = int(x.data.shape[0]) return output @collect_boolean_attributes.register - def _collect_boolean_attributes_from_scipy_coo(x: scipy.sparse.coo_matrix): - output = _simple_boolean_collector(x.data) + def _collect_boolean_attributes_from_scipy_coo(x: scipy.sparse.coo_matrix, buffer_size: int) -> _BooleanAttributes: + output = _simple_boolean_collector(x.data, check_missing = False) output.non_zero = int(x.data.shape[0]) return output -def optimize_boolean_storage(x) -> _OptimizedStorageParameters: - attr = collect_boolean_attributes(x) +def optimize_boolean_storage(x, buffer_size: int = 1e8) -> _OptimizedStorageParameters: + attr = collect_boolean_attributes(x, buffer_size) if attr.missing: return _OptimizedStorageParameters(type="i1", placeholder=-1, non_zero=attr.non_zero) else: diff --git a/src/dolomite_matrix/_utils.py b/src/dolomite_matrix/_utils.py index 859788c..0bf13ed 100644 --- a/src/dolomite_matrix/_utils.py +++ b/src/dolomite_matrix/_utils.py @@ -1,11 +1,11 @@ import numpy -def 
sanitize_for_writing(x, placeholder): - if not numpy.ma.is_masked(x): +def sanitize_for_writing(x, placeholder, output_dtype): + if not numpy.ma.isMaskedArray(x): return x if not x.mask.any(): return x.data - copy = x.data.copy() - copy[mask] = placeholder + copy = x.data.astype(output_dtype, copy=True) + copy[x.mask] = placeholder return copy diff --git a/src/dolomite_matrix/save_compressed_sparse_matrix.py b/src/dolomite_matrix/save_compressed_sparse_matrix.py index 2a0b01a..c854b2d 100644 --- a/src/dolomite_matrix/save_compressed_sparse_matrix.py +++ b/src/dolomite_matrix/save_compressed_sparse_matrix.py @@ -54,7 +54,7 @@ def _h5_write_sparse_matrix(x: Any, handle, details, compressed_sparse_matrix_bu compressed_sparse_matrix_chunk_size = min(compressed_sparse_matrix_chunk_size, details.non_zero) dhandle = handle.create_dataset("data", shape = details.non_zero, dtype = details.type, compression = "gzip", chunks = compressed_sparse_matrix_chunk_size) if details.placeholder is not None: - dhandle.create("missing-value-placeholder", data = details.placeholder, dtype = details.dtype) + dhandle.attrs.create("missing-value-placeholder", data = details.placeholder, dtype = details.type) itype = _choose_index_type(x.shape[secondary]) ihandle = handle.create_dataset("indices", shape = details.non_zero, dtype = itype, compression = "gzip", chunks = compressed_sparse_matrix_chunk_size) @@ -85,7 +85,7 @@ def _h5_write_sparse_matrix(x: Any, handle, details, compressed_sparse_matrix_bu if b is not None: counter += len(b[0]) icollected.append(b[0]) - dcollected.append(ut.sanitize_for_writing(b[1], details.placeholder)) + dcollected.append(ut.sanitize_for_writing(b[1], details.placeholder, output_dtype=dhandle.dtype)) indptrs[start + i + 1] = counter # Collecting everything in memory for a single write operation, avoid @@ -112,20 +112,9 @@ def _write_compressed_sparse_matrix(x: Any, handle, details, compressed_sparse_m handle.create_dataset("indices", data = x.indices, dtype = 
itype, compression = "gzip", chunks = compressed_sparse_matrix_chunk_size) handle.create_dataset("indptr", data = x.indptr, dtype = "u8", compression = "gzip", chunks = True) - if not numpy.ma.is_masked(x.data): - handle.create_dataset("data", data = x.data, dtype = details.type, compression = "gzip", chunks = compressed_sparse_matrix_chunk_size) - elif not x.mask.any(): - handle.create_dataset("data", data = x.data.data, dtype = details.type, compression = "gzip", chunks = compressed_sparse_matrix_chunk_size) - else: - dhandle = handle.create_dataset("data", shape = details.non_zero, dtype = details.type, compression="gzip", chunks = compressed_sparse_matrix_chunk_size) - if details.placeholder is not None: - dhandle.create("missing-value-placeholder", data = details.placeholder, dtype = details.dtype) - - step = max(1, int(compressed_sparse_matrix_buffer_size / compressed_sparse_matrix_chunk_size)) * compressed_sparse_matrix_chunk_size - for i in range(0, details.non_zero, step): - end = min(details.non_zero, i + step) - block = x.data[i : end] # might be a view, so sanitization (and possible copying) is necessary. - dhandle[i : end] = ut.sanitize_for_writing(block, details.placeholder) + # Currently, it seems like scipy's sparse matrices are not intended + # to be masked, so we'll just ignore it completely. 
+ handle.create_dataset("data", data = x.data, dtype = details.type, compression = "gzip", chunks = compressed_sparse_matrix_chunk_size) @_h5_write_sparse_matrix.register @@ -166,13 +155,13 @@ def _save_compressed_sparse_matrix(x: Any, path: str, compressed_sparse_matrix_c if numpy.issubdtype(x.dtype, numpy.integer): tt = "integer" - opts = optim.optimize_integer_storage(x) + opts = optim.optimize_integer_storage(x, buffer_size = compressed_sparse_matrix_buffer_size) elif numpy.issubdtype(x.dtype, numpy.floating): tt = "number" - opts = optim.optimize_float_storage(x) + opts = optim.optimize_float_storage(x, buffer_size = compressed_sparse_matrix_buffer_size) elif x.dtype == numpy.bool_: tt = "boolean" - opts = optim.optimize_boolean_storage(x) + opts = optim.optimize_boolean_storage(x, buffer_size = compressed_sparse_matrix_buffer_size) else: raise NotImplementedError("cannot save sparse matrix of type '" + x.dtype.name + "'") diff --git a/src/dolomite_matrix/save_dense_array.py b/src/dolomite_matrix/save_dense_array.py index 78e9648..a16beb8 100644 --- a/src/dolomite_matrix/save_dense_array.py +++ b/src/dolomite_matrix/save_dense_array.py @@ -1,4 +1,4 @@ -from typing import Tuple, Optional, Any, Dict +from typing import Tuple, Optional, Any, Dict, Union import numpy from dolomite_base import save_object, validate_saves import delayedarray @@ -30,14 +30,19 @@ def _chunk_shape_DenseArrayOutputMock(x: _DenseArrayOutputMock): return x.chunks -def _blockwise_write_to_hdf5(dhandle: h5py.Dataset, chunk_shape: Tuple, x: Any, placeholder: Any, is_string: bool, memory: int): +def _blockwise_write_to_hdf5(dhandle: h5py.Dataset, chunk_shape: Tuple, x: Any, placeholder: Any, memory: int): mock = _DenseArrayOutputMock(x.shape, x.dtype, chunk_shape) block_shape = delayedarray.choose_block_shape_for_iteration(mock, memory=memory) + + is_string = numpy.issubdtype(dhandle.dtype, numpy.bytes_) if placeholder is not None: - placeholder = x.dtype.type(placeholder) + if is_string: + 
placeholder = placeholder.encode("UTF8") + else: + placeholder = dhandle.dtype.type(placeholder) def _blockwise_dense_writer(pos: Tuple, block): - block = ut.sanitize_for_writing(block, placeholder) + block = ut.sanitize_for_writing(block, placeholder, output_dtype=dhandle.dtype) # h5py doesn't want to convert from numpy's Unicode type to bytes # automatically, and fails: so fine, we'll do it ourselves. @@ -84,16 +89,16 @@ def _save_dense_array( blockwise = False if numpy.issubdtype(x.dtype, numpy.integer): tt = "integer" - opts = optim.optimize_integer_storage(x) + opts = optim.optimize_integer_storage(x, buffer_size = dense_array_buffer_size) elif numpy.issubdtype(x.dtype, numpy.floating): tt = "number" - opts = optim.optimize_float_storage(x) + opts = optim.optimize_float_storage(x, buffer_size = dense_array_buffer_size) elif x.dtype == numpy.bool_: tt = "boolean" - opts = optim.optimize_boolean_storage(x) + opts = optim.optimize_boolean_storage(x, buffer_size = dense_array_buffer_size) elif numpy.issubdtype(x.dtype, numpy.str_): tt = "string" - opts = optim.optimize_string_storage(x) + opts = optim.optimize_string_storage(x, buffer_size = dense_array_buffer_size) blockwise = True else: raise NotImplementedError("cannot save dense array of type '" + x.dtype.name + "'") @@ -123,7 +128,7 @@ def _save_dense_array( # So, we save the blocks in transposed form for efficiency. 
ghandle.create_dataset("transposed", data=1, dtype="i1") dhandle = ghandle.create_dataset("data", shape=(*reversed(x.shape),), chunks=(*reversed(dense_array_chunk_dimensions),), dtype=opts.type, compression="gzip") - _blockwise_write_to_hdf5(dhandle, chunk_shape=dense_array_chunk_dimensions, x=x, placeholder=opts.placeholder, is_string=(tt == "string"), memory=dense_array_buffer_size) + _blockwise_write_to_hdf5(dhandle, chunk_shape=dense_array_chunk_dimensions, x=x, placeholder=opts.placeholder, memory=dense_array_buffer_size) if opts.placeholder is not None: dhandle.attrs.create("missing-value-placeholder", data=opts.placeholder, dtype=opts.type) diff --git a/tests/test_DelayedMask.py b/tests/test_DelayedMask.py index a275074..797010c 100644 --- a/tests/test_DelayedMask.py +++ b/tests/test_DelayedMask.py @@ -11,6 +11,7 @@ def test_DelayedMask_dense(): assert m.shape == y.shape assert m.placeholder == 1 assert not delayedarray.is_sparse(m) + assert delayedarray.is_masked(m) assert delayedarray.chunk_shape(m) == (1, 3) block = delayedarray.extract_dense_array(m) diff --git a/tests/test_compressed_sparse_matrix.py b/tests/test_compressed_sparse_matrix.py index 9155222..0d066e0 100644 --- a/tests/test_compressed_sparse_matrix.py +++ b/tests/test_compressed_sparse_matrix.py @@ -1,3 +1,4 @@ +from typing import Tuple import scipy.sparse import dolomite_base as dl import dolomite_matrix as dm @@ -6,6 +7,7 @@ import delayedarray import filebackedarray import os +import random def test_compressed_sparse_matrix_csc(): @@ -45,26 +47,6 @@ def test_compressed_sparse_matrix_coo(): assert (numpy.array(roundtrip) == y.toarray()).all() -def test_compressed_sparse_matrix_SparseNdarray(): - y = delayedarray.SparseNdarray( - (10, 5), - [ - None, - (numpy.array([0, 8]), numpy.array([1, 20])), - None, - (numpy.array([2, 9]), numpy.array([0, 5000])), - None - ] - ) - dir = os.path.join(mkdtemp(),"foobar") - dl.save_object(y, dir) - - roundtrip = dl.read_object(dir) - assert 
roundtrip.shape == y.shape - assert numpy.issubdtype(roundtrip.dtype, numpy.integer) - assert (numpy.array(roundtrip) == numpy.array(y)).all() - - def test_compressed_sparse_matrix_integer(): y = (scipy.sparse.random(1000, 200, 0.1) * 10).tocsc() y = y.astype(numpy.int32) @@ -86,3 +68,111 @@ def test_compressed_sparse_matrix_boolean(): assert roundtrip.shape == y.shape assert roundtrip.dtype == y.dtype assert (numpy.array(roundtrip) == y.toarray()).all() + + +############################################ +############################################ + + +def _simulate_SparseNdarray(shape: Tuple[int, int], dtype: numpy.dtype, density: float = 0.2, mask_rate: float = 0) -> delayedarray.SparseNdarray: + contents = [] + for i in range(shape[1]): + all_indices = [] + for j in range(shape[0]): + if random.random() < density: + all_indices.append(j) + + vals =(numpy.random.rand(len(all_indices)) * 10).astype(dtype) + if mask_rate: + vals = numpy.ma.MaskedArray(vals, mask=numpy.random.rand(len(vals)) < mask_rate) + contents.append((numpy.array(all_indices, dtype=numpy.dtype("int32")), vals)) + + return delayedarray.SparseNdarray(shape, contents=contents) + + +def test_compressed_sparse_matrix_SparseNdarray_integer(): + y = _simulate_SparseNdarray((40, 35), dtype=numpy.dtype("int32")) + dir = os.path.join(mkdtemp(),"foobar") + dl.save_object(y, dir) + + roundtrip = dl.read_object(dir) + assert roundtrip.shape == y.shape + assert numpy.issubdtype(roundtrip.dtype, numpy.integer) + assert (numpy.array(roundtrip) == numpy.array(y)).all() + + +def test_compressed_sparse_matrix_SparseNdarray_boolean(): + y = _simulate_SparseNdarray((25, 50), dtype=numpy.dtype("bool")) + dir = os.path.join(mkdtemp(),"foobar") + dl.save_object(y, dir) + + roundtrip = dl.read_object(dir) + assert roundtrip.shape == y.shape + assert roundtrip.dtype == numpy.bool_ + assert (numpy.array(roundtrip) == numpy.array(y)).all() + + +def test_compressed_sparse_matrix_SparseNdarray_float_chunks(): + y = 
_simulate_SparseNdarray((25, 100), dtype=numpy.dtype("float32")) + dir = os.path.join(mkdtemp(),"foobar") + dl.save_object(y, dir, compressed_sparse_matrix_buffer_size=y.dtype.itemsize * 60) + + roundtrip = dl.read_object(dir) + assert roundtrip.shape == y.shape + assert roundtrip.dtype == numpy.float32 + assert (numpy.array(roundtrip) == numpy.array(y)).all() + + +############################################ +############################################ + + +def test_compressed_sparse_matrix_integer_mask(): + y = _simulate_SparseNdarray((50, 30), dtype=numpy.dtype("int32"), mask_rate=0.3) + dir = os.path.join(mkdtemp(),"foobar") + dl.save_object(y, dir) + + roundtrip = dl.read_object(dir) + assert roundtrip.shape == y.shape + assert numpy.issubdtype(roundtrip.dtype, numpy.integer) + + densed = delayedarray.extract_dense_array(roundtrip) + ref = delayedarray.extract_dense_array(y) + assert (densed.mask == ref.mask).all() + assert numpy.logical_or(densed == ref, ref.mask).all() + + +def test_compressed_sparse_matrix_float_mask(): + y = _simulate_SparseNdarray((20, 100), dtype=numpy.dtype("float64"), mask_rate=0.3) + y.contents[0] = ( # injecting some special values. 
+ numpy.array([1,2,3]), + numpy.array([numpy.nan, numpy.inf, -numpy.inf], dtype=numpy.dtype("float64")) + ) + + dir = os.path.join(mkdtemp(),"foobar") + dl.save_object(y, dir, compressed_sparse_matrix_buffer_size=y.dtype.itemsize * 50) + + roundtrip = dl.read_object(dir) + assert roundtrip.shape == y.shape + assert numpy.issubdtype(roundtrip.dtype, numpy.floating) + + densed = delayedarray.extract_dense_array(roundtrip) + ref = delayedarray.extract_dense_array(y) + assert (densed.mask == ref.mask).all() + vals = numpy.logical_or(densed == ref, numpy.isnan(densed) == numpy.isnan(ref)) + assert numpy.logical_or(vals, ref.mask).all() + + +def test_compressed_sparse_matrix_bool_mask(): + y = _simulate_SparseNdarray((100, 20), dtype=numpy.dtype("bool"), mask_rate=0.3) + dir = os.path.join(mkdtemp(),"foobar") + dl.save_object(y, dir, compressed_sparse_matrix_buffer_size=y.dtype.itemsize * 500) + + roundtrip = dl.read_object(dir) + assert roundtrip.shape == y.shape + assert roundtrip.dtype == numpy.bool_ + + densed = delayedarray.extract_dense_array(roundtrip) + ref = delayedarray.extract_dense_array(y) + assert (densed.mask == ref.mask).all() + assert numpy.logical_or(densed == ref, ref.mask).all() diff --git a/tests/test_delayed_array.py b/tests/test_delayed_array.py index 80a6966..3f7b6c3 100644 --- a/tests/test_delayed_array.py +++ b/tests/test_delayed_array.py @@ -115,7 +115,7 @@ def test_delayed_array_low_block_size_F_contiguous(): ######################################################## -def test_delayed_array_sparse(): +def test_delayed_array_sparse_csc(): x = scipy.sparse.random(1000, 200, 0.1).tocsc() y = da.wrap(x) * 10 @@ -127,3 +127,17 @@ def test_delayed_array_sparse(): assert isinstance(roundtrip, dm.ReloadedArray) assert isinstance(roundtrip.seed.seed, filebackedarray.Hdf5CompressedSparseMatrixSeed) assert (numpy.array(roundtrip) == x.toarray() * 10).all() + + +def test_delayed_array_sparse_csr(): + x = scipy.sparse.random(1000, 200, 0.1).tocsr() + y = 
da.wrap(x) * 10 + + dir = os.path.join(mkdtemp(), "foobar") + dl.save_object(y, dir, compressed_sparse_matrix_buffer_size=8*10000) + roundtrip = dl.read_object(dir) + assert roundtrip.shape == y.shape + assert roundtrip.dtype == y.dtype + assert isinstance(roundtrip, dm.ReloadedArray) + assert isinstance(roundtrip.seed.seed, filebackedarray.Hdf5CompressedSparseMatrixSeed) + assert (numpy.array(roundtrip) == x.toarray() * 10).all() diff --git a/tests/test_dense_array.py b/tests/test_dense_array.py index e2e98f9..31c5b83 100644 --- a/tests/test_dense_array.py +++ b/tests/test_dense_array.py @@ -56,20 +56,82 @@ def test_dense_array_string(): ######################################################## -#def test_dense_array_number_mask(): -# y0 = numpy.random.rand(100, 200) -# mask = y0 > 0.9 -# y = numpy.ma.MaskedArray(y0, mask=mask) -# -# dir = os.path.join(mkdtemp(), "foobar") -# save_object(y, dir) -# roundtrip = read_object(dir) -# assert roundtrip.shape == y.shape -# assert numpy.issubdtype(roundtrip.dtype, numpy.floating) -# -# dump = delayedarray.extract_dense_array(roundtrip) -# assert (dump.mask == mask).all() -# assert (dump == y).all() +def test_dense_array_number_mask(): + y0 = numpy.random.rand(100, 200) + mask = y0 > 0.9 + y = numpy.ma.MaskedArray(y0, mask=mask) + + dir = os.path.join(mkdtemp(), "foobar") + save_object(y, dir) + roundtrip = read_object(dir) + assert roundtrip.shape == y.shape + assert numpy.issubdtype(roundtrip.dtype, numpy.floating) + + dump = delayedarray.extract_dense_array(roundtrip) + assert (dump.mask == mask).all() + assert numpy.logical_or(dump == y, mask).all() + + +def test_dense_array_number_mask_complex(): + y0 = numpy.random.rand(100, 200) + mask = y0 > 0.9 + y = numpy.ma.MaskedArray(y0, mask=mask) + y[0, 0] = numpy.nan + y[1, 1] = numpy.inf + y[2, 2] = -numpy.inf + + dir = os.path.join(mkdtemp(), "foobar") + save_object(y, dir) + roundtrip = read_object(dir) + assert roundtrip.shape == y.shape + assert 
numpy.issubdtype(roundtrip.dtype, numpy.floating) + + dump = delayedarray.extract_dense_array(roundtrip) + assert (dump.mask == mask).all() + vals = numpy.logical_or(dump == y, numpy.isnan(dump) == numpy.isnan(y)) + assert numpy.logical_or(vals, mask).all() + + +def test_dense_array_number_mask_integer(): + # This setup is chosen to populate all of the 8-bit space in the non-masked + # data, which subsequently forces a type promotion during blockwise writing + # so that we can correctly insert the placeholder. + y0 = (numpy.random.rand(100, 200) * 256).astype(numpy.uint8) + mask = numpy.random.rand(100, 200) < 0.5 + y = numpy.ma.MaskedArray(y0, mask=mask) + + dir = os.path.join(mkdtemp(), "foobar") + save_object(y, dir) + roundtrip = read_object(dir) + assert roundtrip.shape == y.shape + assert numpy.issubdtype(roundtrip.dtype, numpy.integer) + + dump = delayedarray.extract_dense_array(roundtrip) + assert (dump.mask == mask).all() + assert numpy.logical_or(dump == y, mask).all() + + +def test_dense_array_number_mask_string(): + # This setup is chosen to force promotion to a longer string length during + # blockwise writing so that we correctly insert the placeholder. + x = numpy.ndarray([100, 200], dtype="U1") + choices = "abcdefghijk" + for i in range(x.shape[0]): + for j in range(x.shape[1]): + x[i,j] = random.choice(choices) + + mask = numpy.random.rand(100, 200) < 0.5 + x = numpy.ma.MaskedArray(x, mask=mask) + + dir = os.path.join(mkdtemp(), "foobar") + save_object(x, dir) + roundtrip = read_object(dir) + assert roundtrip.shape == x.shape + assert numpy.issubdtype(roundtrip.dtype, numpy.str_) + + dump = delayedarray.extract_dense_array(roundtrip) + assert (dump.mask == mask).all() + assert numpy.logical_or(dump == x, mask).all() ######################################################## @@ -87,6 +149,7 @@ def test_dense_array_F_contiguous(): def test_dense_array_block_size(): + # Triggering blockwise processing by using strings. 
x = numpy.ndarray([100, 200], dtype="U1") choices = "ABCDE" for i in range(x.shape[0]): @@ -99,3 +162,15 @@ def test_dense_array_block_size(): assert roundtrip.shape == x.shape assert roundtrip.dtype == x.dtype assert (numpy.array(roundtrip) == x).all() + + # Triggering blockwise processing by using placeholders. + x0 = numpy.random.rand(100, 200) + mask = x0 > 0.9 + x = numpy.ma.MaskedArray(x0, mask=mask) + + dir = os.path.join(mkdtemp(), "foobar") + save_object(x, dir, dense_array_buffer_size=x.dtype.itemsize * 50) + roundtrip = read_object(dir) + assert roundtrip.shape == x.shape + assert roundtrip.dtype == x.dtype + assert (numpy.array(roundtrip) == x).all() diff --git a/tests/test_optimize_storage.py b/tests/test_optimize_storage.py index a2e978b..5e05cdc 100644 --- a/tests/test_optimize_storage.py +++ b/tests/test_optimize_storage.py @@ -1,6 +1,7 @@ import dolomite_matrix._optimize_storage as optim import numpy import delayedarray +import pytest ################################################### @@ -176,20 +177,21 @@ def test_optimize_integer_storage_scipy(): assert opt.placeholder is None assert opt.non_zero == 5 - opt = optim.optimize_integer_storage(y.tocsc()) + opt = optim.optimize_integer_storage(y.tocsc(), buffer_size = 10) assert opt.type == "i2" assert opt.placeholder is None assert opt.non_zero == 5 - opt = optim.optimize_integer_storage(y.tocsr()) + opt = optim.optimize_integer_storage(y.tocsr(), buffer_size = 20) assert opt.type == "i2" assert opt.placeholder is None assert opt.non_zero == 5 -def test_optimize_integer_storage_Any(): +@pytest.mark.parametrize("buffer_size", [1, 10, 100]) +def test_optimize_integer_storage_Any(buffer_size): y = delayedarray.DelayedArray(numpy.array([[1,2,3],[4,5,6]])) - opt = optim.optimize_integer_storage(y * 200000) + opt = optim.optimize_integer_storage(y * 200000, buffer_size = buffer_size) assert opt.type == "i4" assert opt.placeholder is None @@ -204,7 +206,7 @@ def test_optimize_integer_storage_Any(): ] ) y = 
delayedarray.DelayedArray(y) - opt = optim.optimize_integer_storage(y * 2) + opt = optim.optimize_integer_storage(y * 2, buffer_size = buffer_size) assert opt.type == "u2" assert opt.placeholder is None @@ -448,23 +450,24 @@ def test_optimize_float_storage_scipy(): assert opt.placeholder is None assert opt.non_zero == 5 - opt = optim.optimize_float_storage(y.tocsc()) + opt = optim.optimize_float_storage(y.tocsc(), buffer_size = 10) assert opt.type == "i2" assert opt.placeholder is None assert opt.non_zero == 5 - opt = optim.optimize_float_storage(y.tocsr()) + opt = optim.optimize_float_storage(y.tocsr(), buffer_size = 20) assert opt.type == "i2" assert opt.placeholder is None assert opt.non_zero == 5 -def test_optimize_float_storage_Any(): +@pytest.mark.parametrize("buffer_size", [1, 10, 100]) +def test_optimize_float_storage_Any(buffer_size): y = delayedarray.DelayedArray(numpy.array([[1,2,3],[4,5,6]])) y = y * 20000.000 assert y.dtype == numpy.float64 - opt = optim.optimize_float_storage(y) + opt = optim.optimize_float_storage(y, buffer_size = buffer_size) assert opt.type == "u4" assert opt.placeholder is None @@ -479,7 +482,7 @@ def test_optimize_float_storage_Any(): ] ) y = delayedarray.DelayedArray(y) - opt = optim.optimize_float_storage(y * 2) + opt = optim.optimize_float_storage(y * 2, buffer_size = buffer_size) assert opt.type == "u2" assert opt.placeholder is None @@ -524,9 +527,10 @@ def test_optimize_string_storage_dense_MaskedArray(): assert opt.placeholder == "NA" -def test_optimize_string_storage_Any(): +@pytest.mark.parametrize("buffer_size", [1, 10, 100]) +def test_optimize_string_storage_Any(buffer_size): y = delayedarray.DelayedArray(numpy.array([["A","BB","CCC"],["DDDD","EEEEE","FFFFFF"]])) - opt = optim.optimize_string_storage(y) + opt = optim.optimize_string_storage(y, buffer_size = buffer_size) assert opt.type == "S6" assert opt.placeholder is None @@ -625,20 +629,21 @@ def test_optimize_boolean_storage_scipy(): assert opt.placeholder is None 
assert opt.non_zero == 5 - opt = optim.optimize_boolean_storage(y.tocsc()) + opt = optim.optimize_boolean_storage(y.tocsc(), buffer_size = 2) assert opt.type == "i1" assert opt.placeholder is None assert opt.non_zero == 5 - opt = optim.optimize_boolean_storage(y.tocsr()) + opt = optim.optimize_boolean_storage(y.tocsr(), buffer_size = 5) assert opt.type == "i1" assert opt.placeholder is None assert opt.non_zero == 5 -def test_optimize_boolean_storage_Any(): +@pytest.mark.parametrize("buffer_size", [1, 10, 100]) +def test_optimize_boolean_storage_Any(buffer_size): y = delayedarray.DelayedArray(numpy.array([[True,False,True],[False,True,False]])) - opt = optim.optimize_boolean_storage(y) + opt = optim.optimize_boolean_storage(y, buffer_size) assert opt.type == "i1" assert opt.placeholder is None @@ -653,8 +658,6 @@ def test_optimize_boolean_storage_Any(): ] ) y = delayedarray.DelayedArray(y) - opt = optim.optimize_boolean_storage(y) + opt = optim.optimize_boolean_storage(y, buffer_size = buffer_size) assert opt.type == "i1" assert opt.placeholder is None - -