From ddf476b2a7fe67b8ce951a999b5c56b73273b20f Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Thu, 3 Oct 2019 10:50:38 +0200 Subject: [PATCH] Move AttrSeries from reporting.utils to (new) reporting.attrseries --- doc/source/reporting.rst | 6 +- ixmp/reporting/__init__.py | 9 ++- ixmp/reporting/attrseries.py | 124 ++++++++++++++++++++++++++++++++ ixmp/reporting/utils.py | 134 +++-------------------------------- ixmp/testing.py | 2 +- 5 files changed, 145 insertions(+), 130 deletions(-) create mode 100644 ixmp/reporting/attrseries.py diff --git a/doc/source/reporting.rst b/doc/source/reporting.rst index 5e040337d..6c3919675 100644 --- a/doc/source/reporting.rst +++ b/doc/source/reporting.rst @@ -130,6 +130,10 @@ Computations .. automodule:: ixmp.reporting.computations :members: + Unless otherwise specified, these methods accept and return + :class:`Quantity <ixmp.reporting.utils.Quantity>` objects for data + arguments/return values. + Calculations: .. autosummary:: @@ -154,7 +158,7 @@ Computations Utilities --------- -.. autoclass:: ixmp.reporting.utils.AttrSeries +.. autoclass:: ixmp.reporting.attrseries.AttrSeries .. automodule:: ixmp.reporting.utils :members: diff --git a/ixmp/reporting/__init__.py b/ixmp/reporting/__init__.py index 4f02eef13..51335b896 100644 --- a/ixmp/reporting/__init__.py +++ b/ixmp/reporting/__init__.py @@ -5,7 +5,7 @@ # The core design pattern uses dask graphs; see # http://docs.dask.org/en/latest/spec.html # - Reporter.graph is a dictionary where: -# - keys are strings or ixmp.reporting.util.Key objects (which compare/hash +# - keys are strings or ixmp.reporting.key.Key objects (which compare/hash # equal to their str() representation), and # - values are 'computations' (the Reporter.add() docstring repeats the # definition of computations from the above URL). 
@@ -37,7 +37,12 @@ import yaml from .key import Key -from .utils import REPLACE_UNITS, keys_for_quantity, rename_dims, ureg +from .utils import ( + REPLACE_UNITS, + keys_for_quantity, + rename_dims, + ureg, +) from . import computations from .describe import describe_recursive diff --git a/ixmp/reporting/attrseries.py b/ixmp/reporting/attrseries.py new file mode 100644 index 000000000..11dee619e --- /dev/null +++ b/ixmp/reporting/attrseries.py @@ -0,0 +1,124 @@ +from collections import OrderedDict +from collections.abc import Collection +from copy import deepcopy + +import pandas as pd +from pandas.core.generic import NDFrame +import xarray as xr + + +class AttrSeries(pd.Series): + """:class:`pandas.Series` subclass imitating :class:`xarray.DataArray`. + + Future versions of :mod:`ixmp.reporting` will use :class:`xarray.DataArray` + as :class:`Quantity`; however, because :mod:`xarray` currently lacks sparse + matrix support, ixmp quantities may be too large for available memory. + + The AttrSeries class provides similar methods and behaviour to + :class:`xarray.DataArray`, such as an `attrs` dictionary for metadata, so + that :mod:`ixmp.reporting.computations` methods can use xarray-like syntax. 
+ """ + + # normal properties + _metadata = ('attrs', ) + + def __init__(self, *args, **kwargs): + if 'attrs' in kwargs: + # Use provided attrs + attrs = kwargs.pop('attrs') + elif hasattr(args[0], 'attrs'): + # Use attrs from an xarray object + attrs = args[0].attrs.copy() + + # pre-convert to a pd.Series to preserve names and labels + args = list(args) + args[0] = args[0].to_series() + else: + # default empty + attrs = OrderedDict() + + super().__init__(*args, **kwargs) + + self.attrs = attrs + + def assign_attrs(self, d): + self.attrs.update(d) + return self + + def assign_coords(self, **kwargs): + return pd.concat([self], keys=kwargs.values(), names=kwargs.keys()) + + @property + def coords(self): + """Read-only.""" + return dict(zip(self.index.names, self.index.levels)) + + @property + def dims(self): + return tuple(self.index.names) + + def sel(self, indexers=None, drop=False, **indexers_kwargs): + indexers = indexers or {} + indexers.update(indexers_kwargs) + if len(indexers) == 1: + level, key = list(indexers.items())[0] + if not isinstance(key, Collection) and not drop: + # When using .loc[] to select 1 label on 1 level, pandas drops + # the level. 
Use .xs() to avoid this behaviour unless drop=True + return AttrSeries(self.xs(key, level=level, drop_level=False)) + + idx = tuple(indexers.get(l, slice(None)) for l in self.index.names) + return AttrSeries(self.loc[idx]) + + def sum(self, *args, **kwargs): + try: + dim = kwargs.pop('dim') + if isinstance(self.index, pd.MultiIndex): + if len(dim) == len(self.index.names): + # assume dimensions = full multi index, do simple sum + obj = self + kwargs = {} + else: + # pivot and sum across columns + obj = self.unstack(dim) + kwargs['axis'] = 1 + else: + if dim != [self.index.name]: + raise ValueError(dim, self.index.name, self) + obj = super() + kwargs['level'] = dim + except KeyError: + obj = super() + return AttrSeries(obj.sum(*args, **kwargs)) + + def squeeze(self, *args, **kwargs): + kwargs.pop('drop') + return super().squeeze(*args, **kwargs) if len(self) > 1 else self + + def as_xarray(self): + return xr.DataArray.from_series(self) + + def transpose(self, *dims): + return self.reorder_levels(dims) + + def to_dataframe(self): + return self.to_frame() + + def to_series(self): + return self + + @property + def _constructor(self): + return AttrSeries + + def __finalize__(self, other, method=None, **kwargs): + """Propagate metadata from other to self. + + This is identical to the version in pandas, except deepcopy() is added + so that the 'attrs' OrderedDict is not double-referenced. 
+ """ + if isinstance(other, NDFrame): + for name in self._metadata: + object.__setattr__(self, name, + deepcopy(getattr(other, name, None))) + return self diff --git a/ixmp/reporting/utils.py b/ixmp/reporting/utils.py index 79240e003..c94b60ff7 100644 --- a/ixmp/reporting/utils.py +++ b/ixmp/reporting/utils.py @@ -1,15 +1,12 @@ -import collections -from collections.abc import Collection -from copy import deepcopy from functools import partial, reduce import logging from operator import mul import pandas as pd -from pandas.core.generic import NDFrame import pint import xarray as xr +from .attrseries import AttrSeries from .key import Key @@ -17,6 +14,12 @@ ureg = pint.UnitRegistry() +# See also: +# - docstring of attrseries.AttrSeries. +# - test_report_size() for a test that shows how non-sparse xr.DataArray +# triggers MemoryError. +Quantity = AttrSeries +# Quantity = xr.DataArray # Replacements to apply to quantity units before parsing by pint REPLACE_UNITS = { @@ -156,123 +159,6 @@ def invalid(unit): return unit -class AttrSeries(pd.Series): - """:class:`pandas.Series` subclass imitating :class:`xarray.DataArray`. - - Future versions of :mod:`ixmp.reporting` will use :class:`xarray.DataArray` - as :class:`Quantity`; however, because :mod:`xarray` currently lacks sparse - matrix support, ixmp quantities may be too large for memory. - - The AttrSeries class provides similar methods and behaviour to - :class:`xarray.DataArray`, such as an `attrs` dictionary for metadata, so - that :mod:`ixmp.reporting.computations` methods can use xarray-like syntax. 
- """ - - # normal properties - _metadata = ('attrs', ) - - def __init__(self, *args, **kwargs): - if 'attrs' in kwargs: - # Use provided attrs - attrs = kwargs.pop('attrs') - elif hasattr(args[0], 'attrs'): - # Use attrs from an xarray object - attrs = args[0].attrs.copy() - - # pre-convert an pd.Series to preserve names and labels - args = list(args) - args[0] = args[0].to_series() - else: - # default empty - attrs = collections.OrderedDict() - - super().__init__(*args, **kwargs) - - self.attrs = attrs - - def assign_attrs(self, d): - self.attrs.update(d) - return self - - def assign_coords(self, **kwargs): - return pd.concat([self], keys=kwargs.values(), names=kwargs.keys()) - - @property - def coords(self): - """Read-only.""" - return dict(zip(self.index.names, self.index.levels)) - - @property - def dims(self): - return tuple(self.index.names) - - def sel(self, indexers=None, drop=False, **indexers_kwargs): - indexers = indexers or {} - indexers.update(indexers_kwargs) - if len(indexers) == 1: - level, key = list(indexers.items())[0] - if not isinstance(key, Collection) and not drop: - # When using .loc[] to select 1 label on 1 level, pandas drops - # the level. 
Use .xs() to avoid this behaviour unless drop=True - return AttrSeries(self.xs(key, level=level, drop_level=False)) - - idx = tuple(indexers.get(l, slice(None)) for l in self.index.names) - return AttrSeries(self.loc[idx]) - - def sum(self, *args, **kwargs): - try: - dim = kwargs.pop('dim') - if isinstance(self.index, pd.MultiIndex): - if len(dim) == len(self.index.names): - # assume dimensions = full multi index, do simple sum - obj = self - kwargs = {} - else: - # pivot and sum across columns - obj = self.unstack(dim) - kwargs['axis'] = 1 - else: - if dim != [self.index.name]: - raise ValueError(dim, self.index.name, self) - obj = super() - kwargs['level'] = dim - except KeyError: - obj = super() - return AttrSeries(obj.sum(*args, **kwargs)) - - def squeeze(self, *args, **kwargs): - kwargs.pop('drop') - return super().squeeze(*args, **kwargs) if len(self) > 1 else self - - def as_xarray(self): - return xr.DataArray.from_series(self) - - def transpose(self, *dims): - return self.reorder_levels(dims) - - def to_dataframe(self): - return self.to_frame() - - def to_series(self): - return self - - @property - def _constructor(self): - return AttrSeries - - def __finalize__(self, other, method=None, **kwargs): - """Propagate metadata from other to self. - - This is identical to the version in pandas, except deepcopy() is added - so that the 'attrs' OrderedDict is not double-referenced. - """ - if isinstance(other, NDFrame): - for name in self._metadata: - object.__setattr__(self, name, - deepcopy(getattr(other, name, None))) - return self - - def data_for_quantity(ix_type, name, column, scenario, filters=None): """Retrieve data from *scenario*. 
@@ -342,7 +228,7 @@ def data_for_quantity(ix_type, name, column, scenario, filters=None): # Convert to a Dataset, assign attrbutes and name # ds = xr.Dataset.from_dataframe(data)[column] # or to a new "Attribute Series" - ds = AttrSeries(data[column]) + ds = Quantity(data[column]) ds = ds \ .assign_attrs(attrs) \ @@ -357,10 +243,6 @@ def data_for_quantity(ix_type, name, column, scenario, filters=None): return ds -# Quantity = xr.DataArray -Quantity = AttrSeries - - def concat(*args, **kwargs): if Quantity is AttrSeries: kwargs.pop('dim') diff --git a/ixmp/testing.py b/ixmp/testing.py index e0f7a3184..07928d04f 100644 --- a/ixmp/testing.py +++ b/ixmp/testing.py @@ -27,7 +27,7 @@ from .config import _config as ixmp_config from .core import Platform, Scenario, IAMC_IDX -from .reporting.utils import Quantity, AttrSeries +from .reporting.utils import AttrSeries, Quantity models = {