Merge pull request #108 from cvxgrp/gh_issue_107

Gh issue #107 and also fixed #106
cvxgrp · Sep 5, 2023 · 3239fdf · 3239fdf
2 parents 211f0bc + 85c3c6b
commit 3239fdf
Show file tree

Hide file tree

Showing 18 changed files with 457 additions and 287 deletions.
diff --git a/Makefile b/Makefile
@@ -16,7 +16,7 @@ env:
 	$(BINDIR)/python -m pip install -r requirements.txt
 
 test:
-	$(BINDIR)/python -m unittest $(PROJECT)/tests/*.py
+	$(BINDIR)/coverage run -m unittest $(PROJECT)/tests/*.py
 
 pytest:
 	$(BINDIR)/pytest $(PROJECT)/tests/*.py
@@ -36,6 +36,10 @@ cleanenv:
 docs:
 	$(BINDIR)/sphinx-build -E docs $(BUILDDIR); open build/index.html
 
+coverage: test
+	$(BINDIR)/coverage html
+	open htmlcov/index.html
+
 pep8:
 	# use autopep8 to make innocuous fixes 
 	$(BINDIR)/autopep8 -i $(PROJECT)/*.py $(PROJECT)/tests/*.py

diff --git a/cvxportfolio/benchmark.py b/cvxportfolio/benchmark.py
@@ -39,7 +39,9 @@ class Benchmark(BaseBenchmark, DataEstimator):
     """
 
     def __init__(self, benchmark_weights):
-        DataEstimator.__init__(self, benchmark_weights)
+        DataEstimator.__init__(self, 
+            benchmark_weights, 
+            data_includes_cash=True)
 
 
 class CashBenchmark(BaseBenchmark):

diff --git a/cvxportfolio/constraints.py b/cvxportfolio/constraints.py
@@ -442,7 +442,8 @@ class FactorMaxLimit(BaseWeightConstraint, InequalityConstraint):
     def __init__(self, factor_exposure, limit):
         self.factor_exposure = DataEstimator(
             factor_exposure, compile_parameter=True)
-        self.limit = DataEstimator(limit, compile_parameter=True)
+        self.limit = DataEstimator(limit, compile_parameter=True, 
+            ignore_shape_check=True)
 
     def _compile_constr_to_cvxpy(self, w_plus, z, w_plus_minus_w_bm):
         "Compile left hand side of the constraint expression."
@@ -478,7 +479,8 @@ class FactorMinLimit(BaseWeightConstraint, InequalityConstraint):
     def __init__(self, factor_exposure, limit):
         self.factor_exposure = DataEstimator(
             factor_exposure, compile_parameter=True)
-        self.limit = DataEstimator(limit, compile_parameter=True)
+        self.limit = DataEstimator(limit, compile_parameter=True,
+            ignore_shape_check=True)
 
     def _compile_constr_to_cvxpy(self, w_plus, z, w_plus_minus_w_bm):
         "Compile left hand side of the constraint expression."

diff --git a/cvxportfolio/costs.py b/cvxportfolio/costs.py
diff --git a/cvxportfolio/data.py b/cvxportfolio/data.py
@@ -560,6 +560,7 @@ def __init__(
 
         self.base_location = base_location
         self.use_last_available_time = use_last_available_time
+        self.universe_maybe_noncash = None # fix, but we should retire this class
 
     def _recursive_pre_evaluation(self, *args, **kwargs):
         self.data = self.update_and_load(self.symbol)
diff --git a/cvxportfolio/estimator.py b/cvxportfolio/estimator.py
@@ -146,38 +146,45 @@ class DataEstimator(PolicyEstimator):
     by its `_recursive_values_in_time` method, which is the way `cvxportfolio`
     objects use this class to get data.
 
-    Args:
-        data (object, pandas.Series, pandas.DataFrame): Data expressed
-            preferably as pandas Series or DataFrame where the first
-            index is a pandas.DateTimeIndex. Otherwise you can
-            pass a callable object which implements the _recursive_values_in_time method
-            (with the standard signature) and returns the corresponding value in time,
-             or a constant float, numpy.array, or even pandas Series or DataFrame not
-            indexed by time (e.g., a covariance matrix where both index and columns
-            are the stock symbols).
-        use_last_available_time (bool): if the pandas index exists
-            and is a pandas.DateTimeIndex you can instruct self._recursive_values_in_time
-            to retrieve the last available value at time t by setting
-            this to True. Default is False.
-
+    :param data: Data expressed preferably as pandas Series or DataFrame
+        where the first index is a pandas.DatetimeIndex. Otherwise you can
+        pass an object which implements the values_in_time method
+        (with the standard signature) and returns the corresponding value in time,
+        or a constant float, numpy.array, or even pandas Series or DataFrame not
+        indexed by time (e.g., a covariance matrix where both index and columns
+        are the stock symbols).
+    :type data: object, pandas.Series, pandas.DataFrame
+    :param use_last_available_time: if the pandas index exists
+        and is a pandas.DatetimeIndex you can instruct self._recursive_values_in_time
+        to retrieve the last available value at time t by setting
+        this to True. Default is False.
+    :type use_last_available_time: bool
     """
 
     def __init__(self, data, use_last_available_time=False, allow_nans=False,
-                 compile_parameter=False, non_negative=False, positive_semi_definite=False):
+                 compile_parameter=False, non_negative=False, positive_semi_definite=False,
+                 data_includes_cash=False, # affects _universe_subselect
+                 ignore_shape_check=False # affects _universe_subselect
+                 ):
         self.data = data
         self.use_last_available_time = use_last_available_time
         self.allow_nans = allow_nans
         self.compile_parameter = compile_parameter
         self.non_negative = non_negative
         self.positive_semi_definite = positive_semi_definite
+        self.universe_maybe_noncash = None
+        self.data_includes_cash = data_includes_cash
+        self.ignore_shape_check = ignore_shape_check
 
     def _recursive_pre_evaluation(self, universe, backtest_times):
         # super()._recursive_pre_evaluation(universe, backtest_times)
         if self.compile_parameter:
             value = self.internal__recursive_values_in_time(
                 t=backtest_times[0])
             self.parameter = cp.Parameter(value.shape if hasattr(value, "shape") else (),
-                                          PSD=self.positive_semi_definite, nonneg=self.non_negative)
+                                          PSD=self.positive_semi_definite, nonneg=self.non_negative)          
+
+        self.universe_maybe_noncash = universe if self.data_includes_cash else universe[:-1]
 
     def value_checker(self, result):
         """Ensure that only scalars or arrays without np.nan are returned.
@@ -215,50 +222,111 @@ def value_checker(self, result):
         raise DataError(
             f"{self.__class__.__name__}._recursive_values_in_time result is not a scalar or array."
         )
+
+    def _universe_subselect(self, data):
+        """Subselect from ``data`` the part relevant to the current universe.
+
+        See github issue #106.
+
+        A pandas Series is subselected on its index. A pandas DataFrame
+        (covariance, exposure matrix) is subselected on both index and
+        columns, falling back to columns only, then to index only. For a
+        numpy array we only check that one of its dimensions matches the
+        universe's size. Scalars, and any data seen while the universe is
+        None or ``ignore_shape_check`` is set, are returned unchanged.
+
+        :param data: the value to subselect (pandas, numpy, or scalar).
+        :returns: ``data``, restricted to the universe where applicable.
+        :raises MissingValuesError: if ``data`` can't be reconciled with
+            the universe.
+        """
+
+        if (self.universe_maybe_noncash is None) or self.ignore_shape_check:
+            return data
+
+        # Messages describe ``data`` itself; ``self.data`` may lack .index/.shape.
+        if isinstance(data, pd.Series):
+            try:
+                return data.loc[self.universe_maybe_noncash]
+            except KeyError:
+                raise MissingValuesError(
+                    f"The pandas Series found by {self.__class__.__name__} has index {data.index}"
+                    f" while the current universe {'minus cash' if not self.data_includes_cash else ''}"
+                    f" is {self.universe_maybe_noncash}. It was not possible to reconcile the two.")
+
+        if isinstance(data, pd.DataFrame):
+            try:
+                return data.loc[self.universe_maybe_noncash, self.universe_maybe_noncash]
+            except KeyError:
+                try:
+                    return data.loc[:, self.universe_maybe_noncash]
+                except KeyError:
+                    try:
+                        return data.loc[self.universe_maybe_noncash, :]
+                    except KeyError:
+                        pass
+            raise MissingValuesError(
+                f"The pandas DataFrame found by {self.__class__.__name__} has index {data.index}"
+                f" and columns {data.columns}"
+                f" while the current universe {'minus cash' if not self.data_includes_cash else ''}"
+                f" is {self.universe_maybe_noncash}. It was not possible to reconcile the two.")
+
+        if isinstance(data, np.ndarray):
+            if len(self.universe_maybe_noncash) not in data.shape:
+                raise MissingValuesError(
+                    f"The numpy array found by {self.__class__.__name__} has dimensions {data.shape}"
+                    f" while the current universe {'minus cash' if not self.data_includes_cash else ''}"
+                    f" has size {len(self.universe_maybe_noncash)}.")
+            return data
+
+        # scalar
+        return data
+
+
 
     def internal__recursive_values_in_time(self, t, *args, **kwargs):
         """Internal method called by `self._recursive_values_in_time`."""
 
+        # if self.data has values_in_time we use it
         if hasattr(self.data, "values_in_time"):
-            _ = self.data.values_in_time(t=t, *args, **kwargs)
-            if hasattr(_, 'values'):
-                return self.value_checker(_.values)
+            tmp = self.data.values_in_time(t=t, *args, **kwargs)
+            tmp = self._universe_subselect(tmp)
+            if hasattr(tmp, 'values'):
+                return self.value_checker(tmp.values)
             else:
-                return self.value_checker(_)
+                return self.value_checker(tmp)
 
+        # if self.data is pandas and has datetime (first) index
         if (hasattr(self.data, "loc") and hasattr(self.data, "index")
             and (isinstance(self.data.index, pd.DatetimeIndex)
-                 or (
-                isinstance(self.data.index, pd.MultiIndex)
-                and isinstance(self.data.index.levels[0], pd.DatetimeIndex)
-            )
-        )
-        ):
+                 or (isinstance(self.data.index, pd.MultiIndex) and 
+                     isinstance(self.data.index.levels[0], pd.DatetimeIndex)))):
             try:
                 if self.use_last_available_time:
                     if isinstance(self.data.index, pd.MultiIndex):
                         newt = self.data.index.levels[0][
-                            self.data.index.levels[0] <= t
-                        ][-1]
+                            self.data.index.levels[0] <= t][-1]
                     else:
                         newt = self.data.index[self.data.index <= t][-1]
                     tmp = self.data.loc[newt]
                 else:
                     tmp = self.data.loc[t]
                 if hasattr(tmp, "values"):
-                    return self.value_checker(tmp.values)
+                    return self.value_checker(self._universe_subselect(tmp.values))
                 else:
-                    return self.value_checker(tmp)
+                    return self.value_checker(self._universe_subselect(tmp))
 
             except (KeyError, IndexError):
                 raise MissingValuesError(
                     f"{self.__class__.__name__}._recursive_values_in_time could not find data for requested time."
                 )
 
+        # if data is pandas but no datetime index (constant in time)
         if hasattr(self.data, "values"):
-            return self.value_checker(self.data.values)
+            return self.value_checker(self._universe_subselect(self.data.values))
 
-        return self.value_checker(self.data)
+        # if data is scalar or numpy
+        return self.value_checker(self._universe_subselect(self.data))
 
     def _recursive_values_in_time(self, t, *args, **kwargs):
         """Obtain value of `self.data` at time t or right before.

diff --git a/cvxportfolio/hyperparameters.py b/cvxportfolio/hyperparameters.py
@@ -91,6 +91,12 @@ def _collect_hyperparameters(self):
             if hasattr(el, '_collect_hyperparameters'):
                 result += el._collect_hyperparameters()
         return result
+
+    def __repr__(self):
+        """Concatenate ``'<left> * <right>'`` for each aligned pair of terms."""
+        pairs = zip(self.left, self.right)
+        pieces = [str(le) + ' * ' + str(ri) for le, ri in pairs]
+        return ''.join(pieces)
 
 
 class RangeHyperParameter(HyperParameter):
@@ -100,29 +106,34 @@ class RangeHyperParameter(HyperParameter):
     its subclasses for ones that you can use.
     """
 
-    def __init__(self, values_range, initial_value):
-        if not (initial_value in values_range):
+    def __init__(self, values_range, current_value):
+        if not (current_value in values_range):
             raise SyntaxError('Initial value must be in the provided range')
         self.values_range = values_range
-        self.current_value = initial_value
+        self.current_value = current_value
+
+    def __repr__(self):
+        """Show the class name with ``values_range`` and ``current_value``."""
+        name = self.__class__.__name__
+        return f'{name}(values_range={self.values_range}, current_value={self.current_value})'
 
 
 class GammaRisk(RangeHyperParameter):
     """Multiplier of a risk term."""
 
-    def __init__(self, values_range=GAMMA_RISK_RANGE, initial_value=1.):
-        super().__init__(values_range, initial_value)
+    def __init__(self, values_range=GAMMA_RISK_RANGE, current_value=1.):
+        super().__init__(values_range, current_value)
 
 
 class GammaTrade(RangeHyperParameter):
     """Multiplier of a transaction cost term."""
 
-    def __init__(self, values_range=GAMMA_COST_RANGE, initial_value=1.):
-        super().__init__(values_range, initial_value)
+    def __init__(self, values_range=GAMMA_COST_RANGE, current_value=1.):
+        super().__init__(values_range, current_value)
 
 
 class GammaHold(RangeHyperParameter):
     """Multiplier of a holding cost term."""
 
-    def __init__(self, values_range=GAMMA_COST_RANGE, initial_value=1.):
-        super().__init__(values_range, initial_value)
+    def __init__(self, values_range=GAMMA_COST_RANGE, current_value=1.):
+        super().__init__(values_range, current_value)
diff --git a/cvxportfolio/policies.py b/cvxportfolio/policies.py
@@ -345,21 +345,21 @@ def __init__(self, objective, constraints=[], include_cash_return=True, planning
             if not (hasattr(constraints, '__iter__') and len(constraints) and (hasattr(constraints[0], '__iter__') and len(objective) == len(constraints))):
                 raise SyntaxError(
                     'If you pass objective as a list, constraints should be a list of lists of the same length.')
-            self.planning_horizon = len(objective)
+            self._planning_horizon = len(objective)
             self.objective = objective
             self.constraints = constraints
         else:
             if not np.isscalar(planning_horizon):
                 raise SyntaxError(
                     'If `objective` and `constraints` are the same for all steps you must specify `planning_horizon`.')
-            self.planning_horizon = planning_horizon
+            self._planning_horizon = planning_horizon
             self.objective = [copy.deepcopy(objective) for i in range(
                 planning_horizon)] if planning_horizon > 1 else [objective]
             self.constraints = [copy.deepcopy(constraints) for i in range(
                 planning_horizon)] if planning_horizon > 1 else [constraints]
 
-        self.include_cash_return = include_cash_return
-        if self.include_cash_return:
+        self._include_cash_return = include_cash_return
+        if self._include_cash_return:
             self.objective = [el + CashReturn() for el in self.objective]
         self.terminal_constraint = terminal_constraint
         self.benchmark = benchmark() if isinstance(benchmark, type) else benchmark
@@ -394,7 +394,7 @@ def compile_and_check_constraint(constr, i):
         self.cvxpy_constraints = sum(self.cvxpy_constraints, [])
         self.cvxpy_constraints += [cp.sum(z) == 0 for z in self.z_at_lags]
         w = self.w_current
-        for i in range(self.planning_horizon):
+        for i in range(self._planning_horizon):
             self.cvxpy_constraints.append(
                 self.w_plus_at_lags[i] == self.z_at_lags[i] + w)
             self.cvxpy_constraints.append(
@@ -433,11 +433,11 @@ def _recursive_pre_evaluation(self, universe, backtest_times):
         # self.portfolio_value = cp.Parameter(nonneg=True)
         self.w_current = cp.Parameter(len(universe))
         self.z_at_lags = [cp.Variable(len(universe))
-                          for i in range(self.planning_horizon)]
+                          for i in range(self._planning_horizon)]
         self.w_plus_at_lags = [cp.Variable(
-            len(universe)) for i in range(self.planning_horizon)]
+            len(universe)) for i in range(self._planning_horizon)]
         self.w_plus_minus_w_bm_at_lags = [cp.Variable(
-            len(universe)) for i in range(self.planning_horizon)]
+            len(universe)) for i in range(self._planning_horizon)]
 
         # simulator will overwrite this with cached loaded from disk
         self.cache = {}
@@ -499,7 +499,7 @@ def _collect_hyperparameters(self):
             result += el._collect_hyperparameters()
         for el in self.constraints:
             for constr in el:
-                result += el._collect_hyperparameters()
+                result += constr._collect_hyperparameters()
         return result
 
 

diff --git a/cvxportfolio/result.py b/cvxportfolio/result.py
@@ -204,7 +204,7 @@ def __repr__(self):
             "Per-period absolute growth rate": self._print_growth_rate(self.growth_rates.mean()),
             "Per-period excess growth rate": self._print_growth_rate(self.excess_growth_rates.mean()),
             # stats
-            "Sharpe ratio (w/ excess returns)": self.sharpe_ratio,
+            "Sharpe ratio": self.sharpe_ratio,
             "Worst drawdown (%)": self.drawdown.min() * 100,
             "Average drawdown (%)": self.drawdown.mean() * 100,
             "Per-period Turnover (%)": self.turnover.mean() * 100,