fixed GH issue #106 by adding _universe_subselect method to DataEstimator
cvxgrp · Sep 5, 2023 · 469951e · 469951e
1 parent 26c66bd
commit 469951e
Show file tree

Hide file tree

Showing 7 changed files with 112 additions and 39 deletions.
diff --git a/cvxportfolio/benchmark.py b/cvxportfolio/benchmark.py
@@ -39,7 +39,9 @@ class Benchmark(BaseBenchmark, DataEstimator):
     """
 
     def __init__(self, benchmark_weights):
-        DataEstimator.__init__(self, benchmark_weights)
+        DataEstimator.__init__(self, 
+            benchmark_weights, 
+            data_includes_cash=True)
 
 
 class CashBenchmark(BaseBenchmark):

diff --git a/cvxportfolio/constraints.py b/cvxportfolio/constraints.py
@@ -442,7 +442,8 @@ class FactorMaxLimit(BaseWeightConstraint, InequalityConstraint):
     def __init__(self, factor_exposure, limit):
         self.factor_exposure = DataEstimator(
             factor_exposure, compile_parameter=True)
-        self.limit = DataEstimator(limit, compile_parameter=True)
+        self.limit = DataEstimator(limit, compile_parameter=True, 
+            ignore_shape_check=True)
 
     def _compile_constr_to_cvxpy(self, w_plus, z, w_plus_minus_w_bm):
         "Compile left hand side of the constraint expression."
@@ -478,7 +479,8 @@ class FactorMinLimit(BaseWeightConstraint, InequalityConstraint):
     def __init__(self, factor_exposure, limit):
         self.factor_exposure = DataEstimator(
             factor_exposure, compile_parameter=True)
-        self.limit = DataEstimator(limit, compile_parameter=True)
+        self.limit = DataEstimator(limit, compile_parameter=True,
+            ignore_shape_check=True)
 
     def _compile_constr_to_cvxpy(self, w_plus, z, w_plus_minus_w_bm):
         "Compile left hand side of the constraint expression."

diff --git a/cvxportfolio/costs.py b/cvxportfolio/costs.py
@@ -380,7 +380,7 @@ def _simulate(self, t, h_plus, current_and_past_returns, t_next, **kwargs):
         # TODO this is a temporary fix, we should plug this into a recursive tree
         for est in [self.short_fees, self.long_fees, self.dividends]:
             if not (est is None):
-                est._recursive_pre_evaluation(universe=h_plus.index[:-1], backtest_times=[t])
+                est._recursive_pre_evaluation(universe=h_plus.index, backtest_times=[t])
                 est._recursive_values_in_time(t=t)
 
         if not (self.short_fees is None):

diff --git a/cvxportfolio/data.py b/cvxportfolio/data.py
@@ -560,6 +560,7 @@ def __init__(
 
         self.base_location = base_location
         self.use_last_available_time = use_last_available_time
+        self.universe_maybe_noncash = None # fix, but we should retire this class
 
     def _recursive_pre_evaluation(self, *args, **kwargs):
         self.data = self.update_and_load(self.symbol)
diff --git a/cvxportfolio/estimator.py b/cvxportfolio/estimator.py
@@ -146,38 +146,45 @@ class DataEstimator(PolicyEstimator):
     by its `_recursive_values_in_time` method, which is the way `cvxportfolio`
     objects use this class to get data.
 
-    Args:
-        data (object, pandas.Series, pandas.DataFrame): Data expressed
-            preferably as pandas Series or DataFrame where the first
-            index is a pandas.DateTimeIndex. Otherwise you can
-            pass a callable object which implements the _recursive_values_in_time method
-            (with the standard signature) and returns the corresponding value in time,
-             or a constant float, numpy.array, or even pandas Series or DataFrame not
-            indexed by time (e.g., a covariance matrix where both index and columns
-            are the stock symbols).
-        use_last_available_time (bool): if the pandas index exists
-            and is a pandas.DateTimeIndex you can instruct self._recursive_values_in_time
-            to retrieve the last available value at time t by setting
-            this to True. Default is False.
-
+    :param data: Data expressed preferably as pandas Series or DataFrame 
+        where the first index is a pandas.DateTimeIndex. Otherwise you can
+        pass a callable object which implements the _recursive_values_in_time method
+        (with the standard signature) and returns the corresponding value in time,
+        or a constant float, numpy.array, or even pandas Series or DataFrame not
+        indexed by time (e.g., a covariance matrix where both index and columns
+        are the stock symbols).
+    :type data: object, pandas.Series, pandas.DataFrame 
+    :param use_last_available_time: if the pandas index exists
+        and is a pandas.DateTimeIndex you can instruct self._recursive_values_in_time
+        to retrieve the last available value at time t by setting
+        this to True. Default is False.
+    :type use_last_available_time: bool 
     """
 
     def __init__(self, data, use_last_available_time=False, allow_nans=False,
-                 compile_parameter=False, non_negative=False, positive_semi_definite=False):
+                 compile_parameter=False, non_negative=False, positive_semi_definite=False,
+                 data_includes_cash=False, # affects _universe_subselect
+                 ignore_shape_check=False # affects _universe_subselect
+                 ):
         self.data = data
         self.use_last_available_time = use_last_available_time
         self.allow_nans = allow_nans
         self.compile_parameter = compile_parameter
         self.non_negative = non_negative
         self.positive_semi_definite = positive_semi_definite
+        self.universe_maybe_noncash = None
+        self.data_includes_cash = data_includes_cash
+        self.ignore_shape_check = ignore_shape_check
 
     def _recursive_pre_evaluation(self, universe, backtest_times):
         # super()._recursive_pre_evaluation(universe, backtest_times)
         if self.compile_parameter:
             value = self.internal__recursive_values_in_time(
                 t=backtest_times[0])
             self.parameter = cp.Parameter(value.shape if hasattr(value, "shape") else (),
-                                          PSD=self.positive_semi_definite, nonneg=self.non_negative)
+                                          PSD=self.positive_semi_definite, nonneg=self.non_negative)          
+
+        self.universe_maybe_noncash = universe if self.data_includes_cash else universe[:-1]
 
     def value_checker(self, result):
         """Ensure that only scalars or arrays without np.nan are returned.
@@ -215,50 +222,110 @@ def value_checker(self, result):
         raise DataError(
             f"{self.__class__.__name__}._recursive_values_in_time result is not a scalar or array."
         )
+
+    def _universe_subselect(self, data):
+        """This function subselects from ``data`` the relevant universe.
+        
+        See github issue #106.
+        
+        If data is a pandas Series we subselect its index. If we fail
+        we throw an error. If data is a pandas DataFrame (covariance, exposure matrix) 
+        we try to subselect its index and columns. If we fail on either
+        we ignore the failure, but if we fail on both we throw an error.
+        If data is a numpy 1-d array we check that its length is the same as the 
+        universe's.
+        If it is a 2-d array we check that at least one dimension is the
+        same as the universe's.
+        If the universe is None we skip all checks. (We may revisit this choice.) This only happens
+        if the DataEstimator instance is not part of a PolicyEstimator tree 
+        (a usecase which we will probably drop).
+        """
+
+        if (self.universe_maybe_noncash is None) or self.ignore_shape_check:
+            return data
+
+        if isinstance(data, pd.Series):
+            try:
+                return data.loc[self.universe_maybe_noncash]
+            except KeyError:
+                raise MissingValuesError(
+                f"The pandas Series found by {self.__class__.__name__} has index {self.data.index}"
+                f" while the current universe (minus cash) is {self.universe_maybe_noncash}."
+                " It was not possibly to reconcile the two.")
+
+        if isinstance(data, pd.DataFrame):
+            try:
+                return data.loc[self.universe_maybe_noncash, self.universe_maybe_noncash]
+            except KeyError:
+                try:
+                    return data.loc[:, self.universe_maybe_noncash]
+                except KeyError:
+                    try:
+                        return data.loc[self.universe_maybe_noncash, :]
+                    except KeyError:
+                        pass
+            raise MissingValuesError(
+                f"The pandas DataFrame found by {self.__class__.__name__} has index {self.data.index}"
+                f" and columns {self.data.columns}"
+                f" while the current universe (minus cash) is {self.universe_maybe_noncash}."
+                " It was not possibly to reconcile the two.")
+
+        if isinstance(data, np.ndarray):
+            dimensions = data.shape
+            if not len(self.universe_maybe_noncash) in dimensions:
+                raise MissingValuesError(
+                    f"The numpy array found by {self.__class__.__name__} has dimensions {self.data.shape}"
+                    f" while the current universe (minus cash) has size {len(self.universe_maybe_noncash)}.")
+            return data
+
+        # scalar
+        return data
+
+
 
     def internal__recursive_values_in_time(self, t, *args, **kwargs):
         """Internal method called by `self._recursive_values_in_time`."""
 
+        # if self.data has values_in_time we use it
         if hasattr(self.data, "values_in_time"):
-            _ = self.data.values_in_time(t=t, *args, **kwargs)
-            if hasattr(_, 'values'):
-                return self.value_checker(_.values)
+            tmp = self.data.values_in_time(t=t, *args, **kwargs)
+            tmp = self._universe_subselect(tmp)
+            if hasattr(tmp, 'values'):
+                return self.value_checker(tmp.values)
             else:
-                return self.value_checker(_)
+                return self.value_checker(tmp)
 
+        # if self.data is pandas and has datetime (first) index
         if (hasattr(self.data, "loc") and hasattr(self.data, "index")
             and (isinstance(self.data.index, pd.DatetimeIndex)
-                 or (
-                isinstance(self.data.index, pd.MultiIndex)
-                and isinstance(self.data.index.levels[0], pd.DatetimeIndex)
-            )
-        )
-        ):
+                 or (isinstance(self.data.index, pd.MultiIndex) and 
+                     isinstance(self.data.index.levels[0], pd.DatetimeIndex)))):
             try:
                 if self.use_last_available_time:
                     if isinstance(self.data.index, pd.MultiIndex):
                         newt = self.data.index.levels[0][
-                            self.data.index.levels[0] <= t
-                        ][-1]
+                            self.data.index.levels[0] <= t][-1]
                     else:
                         newt = self.data.index[self.data.index <= t][-1]
                     tmp = self.data.loc[newt]
                 else:
                     tmp = self.data.loc[t]
                 if hasattr(tmp, "values"):
-                    return self.value_checker(tmp.values)
+                    return self.value_checker(self._universe_subselect(tmp.values))
                 else:
-                    return self.value_checker(tmp)
+                    return self.value_checker(self._universe_subselect(tmp))
 
             except (KeyError, IndexError):
                 raise MissingValuesError(
                     f"{self.__class__.__name__}._recursive_values_in_time could not find data for requested time."
                 )
 
+        # if data is pandas but no datetime index (constant in time)
         if hasattr(self.data, "values"):
-            return self.value_checker(self.data.values)
+            return self.value_checker(self._universe_subselect(self.data.values))
 
-        return self.value_checker(self.data)
+        # if data is scalar or numpy
+        return self.value_checker(self._universe_subselect(self.data))
 
     def _recursive_values_in_time(self, t, *args, **kwargs):
         """Obtain value of `self.data` at time t or right before.

diff --git a/cvxportfolio/risks.py b/cvxportfolio/risks.py
@@ -281,7 +281,7 @@ class FactorModelCovariance(BaseRiskModel):
     def __init__(self, F=None, d=None, Sigma_F=None, num_factors=1, kelly=True):
         self.F = F if F is None else DataEstimator(F, compile_parameter=True)
         self.d = d if d is None else DataEstimator(d)
-        self.Sigma_F = Sigma_F if Sigma_F is None else DataEstimator(Sigma_F)
+        self.Sigma_F = Sigma_F if Sigma_F is None else DataEstimator(Sigma_F, ignore_shape_check=True)
         if (self.F is None) or (self.d is None):
             self._fit = True
             self.Sigma = HistoricalFactorizedCovariance(kelly=kelly)

diff --git a/cvxportfolio/tests/test_estimator.py b/cvxportfolio/tests/test_estimator.py
@@ -165,10 +165,11 @@ def test_parameter_estimator(self):
         second_level = ["hello", "ciao", "hola"]
         index = pd.MultiIndex.from_product([timeindex, second_level])
         data = pd.DataFrame(np.random.randn(len(index), 10), index=index)
-        estimator = DataEstimator(data, compile_parameter=True)
+        estimator = DataEstimator(data, compile_parameter=True, 
+            data_includes_cash=True)
         self.assertTrue(not hasattr(estimator, "parameter"))
         estimator._recursive_pre_evaluation(
-            universe=None, backtest_times=timeindex)
+            universe=data.columns, backtest_times=timeindex)
         # assert hasattr(estimator, 'parameter')
         self.assertTrue(hasattr(estimator, "parameter"))
         estimator._recursive_values_in_time("2022-01-05")