diff --git a/.gitignore b/.gitignore index 33760721f..6e63296c2 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ __pycache__/ examples/test_* examples/*.txt examples/*.png +experiments/* # C extensions *.so diff --git a/cvxportfolio/errors.py b/cvxportfolio/errors.py index 388d85fa3..78555cffe 100644 --- a/cvxportfolio/errors.py +++ b/cvxportfolio/errors.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__all__ = ['DataError', 'MissingValuesError', 'ForeCastError', +__all__ = ['DataError', 'MissingTimesError', 'NaNError', 'MissingAssetsError', 'ForeCastError', 'PortfolioOptimizationError', 'Bankruptcy', 'ConvexSpecificationError', 'ConvexityError'] @@ -23,10 +23,20 @@ class DataError(Exception): pass -class MissingValuesError(DataError): - """Cvxportfolio tried to access numpy.nan values.""" +class MissingTimesError(DataError): + """Cvxportfolio couldn't find data for a certain time.""" pass + + +class NaNError(DataError): + """Cvxportfolio tried to access data that includes np.nan.""" + pass + + +class MissingAssetsError(DataError): + """Cvxportfolio couldn't find data for certain assets.""" + pass class ForeCastError(DataError): diff --git a/cvxportfolio/estimator.py b/cvxportfolio/estimator.py index 3302c3e7b..aaaba35ba 100644 --- a/cvxportfolio/estimator.py +++ b/cvxportfolio/estimator.py @@ -22,7 +22,7 @@ import cvxpy as cp -from .errors import MissingValuesError, DataError +from .errors import MissingTimesError, DataError, NaNError, MissingAssetsError from .hyperparameters import HyperParameter from .utils import repr_numpy_pandas @@ -159,6 +159,11 @@ class DataEstimator(PolicyEstimator): to retrieve the last available value at time t by setting this to True. Default is False. :type use_last_available_time: bool + + :raises cvxportfolio.NaNError: If np.nan's are present in result. + :raises cvxportfolio.MissingTimesError: If some times are missing. 
+ :raises cvxportfolio.MissingAssetsError: If some assets are missing. + :raises cvxportfolio.DataError: If data is not in the right form. """ def __init__(self, data, use_last_available_time=False, allow_nans=False, @@ -188,21 +193,11 @@ def _recursive_pre_evaluation(self, universe, backtest_times): def value_checker(self, result): """Ensure that only scalars or arrays without np.nan are returned. - - Args: - result (int, float, or np.array): data produced by self._recursive_values_in_time - - Returns: - result (int, float, or np.array): same data if no np.nan are present and type is correct - - Raises: - cvxportfolio.MissingValuesError: if np.nan's are present in result - cvxportfolio.DataError: if data is not in the right form """ if np.isscalar(result): if np.isnan(result) and not self.allow_nans: - raise MissingValuesError( + raise NaNError( f"{self.__class__.__name__}._recursive_values_in_time result is a np.nan scalar." ) else: @@ -214,7 +209,7 @@ def value_checker(self, result): if hasattr(self.data, 'columns') and len(self.data.columns) == len(result): message += "Specifically, the problem is with symbol(s): " + str( self.data.columns[np.isnan(result)]) - raise MissingValuesError(message) + raise NaNError(message) else: # we pass a copy because it can be accidentally overwritten return np.array(result) @@ -242,39 +237,39 @@ def _universe_subselect(self, data): """ if (self.universe_maybe_noncash is None) or self.ignore_shape_check: - return data + return data.values if hasattr(data, 'values') else data if isinstance(data, pd.Series): try: - return data.loc[self.universe_maybe_noncash] + return data.loc[self.universe_maybe_noncash].values except KeyError: - raise MissingValuesError( - f"The pandas Series found by {self.__class__.__name__} has index {self.data.index}" + raise MissingAssetsError( + f"The pandas Series found by {self.__class__.__name__} has index {data.index}" f" while the current universe {'minus cash' if not self.data_includes_cash else ''}" 
- f" is {self.universe_maybe_noncash}. It was not possibly to reconcile the two.") + f" is {self.universe_maybe_noncash}. It was not possible to reconcile the two.") if isinstance(data, pd.DataFrame): try: - return data.loc[self.universe_maybe_noncash, self.universe_maybe_noncash] + return data.loc[self.universe_maybe_noncash, self.universe_maybe_noncash].values except KeyError: try: - return data.loc[:, self.universe_maybe_noncash] + return data.loc[:, self.universe_maybe_noncash].values except KeyError: try: - return data.loc[self.universe_maybe_noncash, :] + return data.loc[self.universe_maybe_noncash, :].values except KeyError: pass - raise MissingValuesError( - f"The pandas DataFrame found by {self.__class__.__name__} has index {self.data.index}" - f" and columns {self.data.columns}" + raise MissingAssetsError( + f"The pandas DataFrame found by {self.__class__.__name__} has index {data.index}" + f" and columns {data.columns}" f" while the current universe {'minus cash' if not self.data_includes_cash else ''}" - f" is {self.universe_maybe_noncash}. It was not possibly to reconcile the two.") + f" is {self.universe_maybe_noncash}. 
It was not possible to reconcile the two.") if isinstance(data, np.ndarray): dimensions = data.shape if not len(self.universe_maybe_noncash) in dimensions: - raise MissingValuesError( - f"The numpy array found by {self.__class__.__name__} has dimensions {self.data.shape}" + raise MissingAssetsError( + f"The numpy array found by {self.__class__.__name__} has dimensions {data.shape}" f" while the current universe {'minus cash' if not self.data_includes_cash else ''}" f" has size {len(self.universe_maybe_noncash)}.") return data @@ -290,11 +285,7 @@ def internal__recursive_values_in_time(self, t, *args, **kwargs): # if self.data has values_in_time we use it if hasattr(self.data, "values_in_time"): tmp = self.data.values_in_time(t=t, *args, **kwargs) - tmp = self._universe_subselect(tmp) - if hasattr(tmp, 'values'): - return self.value_checker(tmp.values) - else: - return self.value_checker(tmp) + return self.value_checker(self._universe_subselect(tmp) ) # if self.data is pandas and has datetime (first) index if (hasattr(self.data, "loc") and hasattr(self.data, "index") @@ -311,19 +302,17 @@ def internal__recursive_values_in_time(self, t, *args, **kwargs): tmp = self.data.loc[newt] else: tmp = self.data.loc[t] - if hasattr(tmp, "values"): - return self.value_checker(self._universe_subselect(tmp.values)) - else: - return self.value_checker(self._universe_subselect(tmp)) + + return self.value_checker(self._universe_subselect(tmp)) + except (KeyError, IndexError): - raise MissingValuesError( - f"{self.__class__.__name__}._recursive_values_in_time could not find data for requested time." 
- ) + raise MissingTimesError( + f"{self.__class__.__name__}._recursive_values_in_time could not find data for time {t}.") # if data is pandas but no datetime index (constant in time) if hasattr(self.data, "values"): - return self.value_checker(self._universe_subselect(self.data.values)) + return self.value_checker(self._universe_subselect(self.data)) # if data is scalar or numpy return self.value_checker(self._universe_subselect(self.data)) diff --git a/cvxportfolio/policies.py b/cvxportfolio/policies.py index a074da00d..2b5309adb 100644 --- a/cvxportfolio/policies.py +++ b/cvxportfolio/policies.py @@ -161,24 +161,26 @@ class FixedTrades(BaseTradingPolicy): If there are no weights defined for the given day, default to no trades. - Args: - trades_weights (pd.Series or pd.DataFrame): Series of weights - (if constant in time) or DataFrame of trade weights - indexed by time. It trades each day the corresponding vector. + :param trades_weights: target trade weights :math:`z_t` to trade at each period. + If constant in time use a pandas Series indexed by the assets' + names, including the cash account name (``cash_key`` option + to the simulator). If varying in time, use a pandas DataFrame + with datetime index and as columns the assets names including cash. + If a certain time in the backtest is not present in the data provided + the policy defaults to not trading in that period. 
+ :type trades_weights: pd.Series or pd.DataFrame """ def __init__(self, trades_weights): """Trade the tradevec vector (dollars) or tradeweight weights.""" - self.trades_weights = DataEstimator(trades_weights) + self.trades_weights = DataEstimator(trades_weights, data_includes_cash=True) def _recursive_values_in_time(self, t, current_weights, **kwargs): """We need to override recursion b/c we catch exception.""" try: super()._recursive_values_in_time(t=t, current_weights=current_weights, **kwargs) - return pd.Series( - self.trades_weights.current_value, - current_weights.index) - except MissingValuesError: + return pd.Series(self.trades_weights.current_value, current_weights.index) + except MissingTimesError: return pd.Series(0., current_weights.index) @@ -188,15 +190,20 @@ class FixedWeights(BaseTradingPolicy): If there are no weights defined for the given day, default to no trades. - Args: - target_weights (pd.Series or pd.DataFrame): Series of weights - (if constant in time) or DataFrame of trade weights - indexed by time. It trades each day to the corresponding vector. + :param target_weights: target weights :math:`w_t^+` to trade to at each period. + If constant in time use a pandas Series indexed by the assets' + names, including the cash account name (``cash_key`` option + to the simulator). If varying in time, use a pandas DataFrame + with datetime index and as columns the assets names including cash. + If a certain time in the backtest is not present in the data provided + the policy defaults to not trading in that period. 
+ :type target_weights: pd.Series or pd.DataFrame + """ def __init__(self, target_weights): """Trade the tradevec vector (dollars) or tradeweight weights.""" - self.target_weights = DataEstimator(target_weights) + self.target_weights = DataEstimator(target_weights, data_includes_cash=True) def _recursive_values_in_time(self, t, current_weights, **kwargs): """We need to override recursion b/c we catch exception.""" @@ -204,7 +211,7 @@ def _recursive_values_in_time(self, t, current_weights, **kwargs): super()._recursive_values_in_time(t=t, current_weights=current_weights, **kwargs) return pd.Series(self.target_weights.current_value, current_weights.index) - current_weights - except MissingValuesError: + except MissingTimesError: return pd.Series(0., current_weights.index) diff --git a/cvxportfolio/returns.py b/cvxportfolio/returns.py index 738786b1b..7a8ca47c2 100644 --- a/cvxportfolio/returns.py +++ b/cvxportfolio/returns.py @@ -122,7 +122,7 @@ class ReturnsForecast(BaseReturnsModel): while ``decay`` close to one a `slow` signal. The default value is 1. :type decay: float - :raises cvxportfolio.MissingValuesError: If the class accesses + :raises cvxportfolio.NaNError: If the class accesses user-provided elements of ``r_hat`` that are :class:`numpy.nan`. 
:Example: diff --git a/cvxportfolio/tests/test_estimator.py b/cvxportfolio/tests/test_estimator.py index 254595580..4b1d8be86 100644 --- a/cvxportfolio/tests/test_estimator.py +++ b/cvxportfolio/tests/test_estimator.py @@ -19,7 +19,7 @@ import unittest from cvxportfolio.estimator import DataEstimator # , ParameterEstimator -from cvxportfolio.errors import MissingValuesError, DataError +from cvxportfolio.errors import MissingTimesError, DataError, NaNError, MissingAssetsError import cvxportfolio as cvx @@ -39,7 +39,7 @@ def test_callable(self): self.assertEqual(estimator._recursive_values_in_time(time), 1.0) estimator = DataEstimator(PlaceholderCallable(np.nan)) - with self.assertRaises(MissingValuesError): + with self.assertRaises(NaNError): estimator._recursive_values_in_time(t=time) data = np.arange(10.0) @@ -48,7 +48,7 @@ def test_callable(self): np.all(estimator._recursive_values_in_time(t=time) == data)) data[1] = np.nan - with self.assertRaises(MissingValuesError): + with self.assertRaises(NaNError): estimator._recursive_values_in_time(time) def test_scalar(self): @@ -60,7 +60,7 @@ def test_scalar(self): self.assertTrue(estimator._recursive_values_in_time(t=time) == 1.0) estimator = DataEstimator(np.nan) - with self.assertRaises(MissingValuesError): + with self.assertRaises(NaNError): estimator._recursive_values_in_time(t=time) def test_array(self): @@ -73,7 +73,7 @@ def test_array(self): data[1] = np.nan estimator = DataEstimator(data) - with self.assertRaises(MissingValuesError): + with self.assertRaises(NaNError): estimator._recursive_values_in_time(t=time) def test_series_dataframe_notime(self): @@ -98,21 +98,21 @@ def test_series_timeindex(self): self.assertTrue(estimator._recursive_values_in_time( "2022-01-05") == data.loc["2022-01-05"]) - with self.assertRaises(MissingValuesError): + with self.assertRaises(MissingTimesError): estimator._recursive_values_in_time("2022-02-05") estimator = DataEstimator(data, use_last_available_time=True) 
self.assertTrue(estimator._recursive_values_in_time( "2022-02-05") == data.iloc[-1]) - with self.assertRaises(MissingValuesError): + with self.assertRaises(MissingTimesError): estimator._recursive_values_in_time("2021-02-05") data["2022-01-05"] = np.nan estimator = DataEstimator(data) self.assertTrue(estimator._recursive_values_in_time( "2022-01-04") == data.loc["2022-01-04"]) - with self.assertRaises(MissingValuesError): + with self.assertRaises(NaNError): estimator._recursive_values_in_time("2022-01-05") def test_dataframe_timeindex(self): @@ -124,7 +124,7 @@ def test_dataframe_timeindex(self): self.assertTrue(np.all(estimator._recursive_values_in_time( "2022-01-05") == data.loc["2022-01-05"])) - with self.assertRaises(MissingValuesError): + with self.assertRaises(MissingTimesError): estimator._recursive_values_in_time("2021-01-05") estimator = DataEstimator(data, use_last_available_time=True) @@ -133,10 +133,80 @@ def test_dataframe_timeindex(self): data.loc["2022-01-05", 3] = np.nan estimator = DataEstimator(data, use_last_available_time=True) - with self.assertRaises(MissingValuesError): + with self.assertRaises(MissingTimesError): estimator._recursive_values_in_time("2021-01-05") + + def test_series_notime_assetselect(self): + """Test _universe_subselect.""" + universe = ['a','b','c'] + t = pd.Timestamp('2000-01-01') + + # data includes cash acct + data = pd.Series(range(len(universe)), index=universe) + estimator = DataEstimator(data, data_includes_cash=True) + estimator._recursive_pre_evaluation(universe, backtest_times=[t]) + result = estimator._recursive_values_in_time(t) + assert np.all(result==data.values) + + # data excludes cash acct + data = pd.Series(range(len(universe)), index=universe) + estimator = DataEstimator(data) + estimator._recursive_pre_evaluation(universe, backtest_times=[t]) + result = estimator._recursive_values_in_time(t) + assert np.all(result==data.values[:2]) + + # shuffled universe + estimator = DataEstimator(data.iloc[::-1]) + 
estimator._recursive_pre_evaluation(universe, backtest_times=[t]) + result = estimator._recursive_values_in_time(t) + assert np.all(result==data.values[:2]) + + # wrong universe + data = pd.Series(range(len(universe)), index=universe) + estimator = DataEstimator(data) + estimator._recursive_pre_evaluation(['d', 'e', 'f'], backtest_times=[t]) + with self.assertRaises(MissingAssetsError): + result = estimator._recursive_values_in_time(t) + + # selection of universe + data = pd.Series(range(len(universe)), index=universe) + estimator = DataEstimator(data, data_includes_cash=True) + estimator._recursive_pre_evaluation(['b'], backtest_times=[t]) + result = estimator._recursive_values_in_time(t) + assert np.all(result==data.values[1]) + + def test_ndarray_assetselect(self): + "Test errors if ndarray is not of right size." + data = np.zeros((2,3)) + t = pd.Timestamp('2000-01-01') + + # with universe of size 2 + estimator = DataEstimator(data, data_includes_cash=True) + estimator._recursive_pre_evaluation(['a','b'], backtest_times=[t]) + result = estimator._recursive_values_in_time(t) + assert np.all(result==data) + + # with universe of size 3 + estimator = DataEstimator(data, data_includes_cash=True) + estimator._recursive_pre_evaluation(['a','b','c'], backtest_times=[t]) + result = estimator._recursive_values_in_time(t) + assert np.all(result==data) + + # error with universe of size 4 + estimator = DataEstimator(data, data_includes_cash=True) + estimator._recursive_pre_evaluation(['a','b','c', 'd'], backtest_times=[t]) + with self.assertRaises(MissingAssetsError): + result = estimator._recursive_values_in_time(t) + + # all ok if skipping check + estimator = DataEstimator(data, data_includes_cash=True, ignore_shape_check=True) + estimator._recursive_pre_evaluation(['a','b','c', 'd'], backtest_times=[t]) + result = estimator._recursive_values_in_time(t) + assert np.all(result==data) + def test_dataframe_multindex(self): + "We also check that _universe_subselect works fine" 
timeindex = pd.date_range("2022-01-01", "2022-01-30") second_level = ["hello", "ciao", "hola"] index = pd.MultiIndex.from_product([timeindex, second_level]) @@ -146,19 +216,57 @@ def test_dataframe_multindex(self): self.assertTrue(np.all(estimator._recursive_values_in_time( "2022-01-05") == data.loc["2022-01-05"])) + # use_last_available_time estimator = DataEstimator(data, use_last_available_time=True) self.assertTrue(np.all(estimator._recursive_values_in_time( "2022-02-05") == data.loc["2022-01-30"])) self.assertTrue(np.all(estimator._recursive_values_in_time( "2022-01-05") == data.loc["2022-01-05"])) - with self.assertRaises(MissingValuesError): + with self.assertRaises(MissingTimesError): estimator._recursive_values_in_time("2020-01-05") - + + # universe subselect + t = "2022-01-01" + data = pd.DataFrame(np.random.randn(len(index), 10), index=index) + estimator = DataEstimator(data, data_includes_cash=True) + estimator._recursive_pre_evaluation(universe=second_level, backtest_times=[t]) + result = estimator._recursive_values_in_time(t) + self.assertTrue(np.all(result == data.loc[t])) + + # result has same second_level as columns + data = pd.DataFrame(np.random.randn(len(index), len(second_level)), index=index, columns=second_level) + estimator = DataEstimator(data, data_includes_cash=True) + estimator._recursive_pre_evaluation(universe=second_level, backtest_times=[t]) + result = estimator._recursive_values_in_time(t) + self.assertTrue(np.all(result == data.loc[t])) + + # universe are columns + uni = ['a', 'b'] + data = pd.DataFrame(np.random.randn(len(index), 2), index=index, columns=uni) + estimator = DataEstimator(data, data_includes_cash=True) + estimator._recursive_pre_evaluation(universe=uni, backtest_times=[t]) + result = estimator._recursive_values_in_time(t) + self.assertTrue(np.all(result == data.loc[t])) + + # wrong universe + data = pd.DataFrame(np.random.randn(len(index), 2), index=index, columns=uni) + estimator = DataEstimator(data, 
data_includes_cash=True) + estimator._recursive_pre_evaluation(universe=uni + ['c'], backtest_times=[t]) + with self.assertRaises(MissingAssetsError): + result = estimator._recursive_values_in_time(t) + + + + # if timeindex is not first level it is not picked up index = pd.MultiIndex.from_product([second_level, timeindex]) data = pd.DataFrame(np.random.randn(len(index), 10), index=index) estimator = DataEstimator(data) assert np.all(estimator._recursive_values_in_time( "2020-01-05") == data.values) + + + + def test_parameter_estimator(self): timeindex = pd.date_range("2022-01-01", "2022-01-30")