Skip to content

Commit

Permalink
fixed GH issue #106 by adding _universe_subselect method to DataEstim…
Browse files Browse the repository at this point in the history
…ator
  • Loading branch information
enzbus committed Sep 5, 2023
1 parent 26c66bd commit 469951e
Show file tree
Hide file tree
Showing 7 changed files with 112 additions and 39 deletions.
4 changes: 3 additions & 1 deletion cvxportfolio/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ class Benchmark(BaseBenchmark, DataEstimator):
"""

def __init__(self, benchmark_weights):
DataEstimator.__init__(self, benchmark_weights)
DataEstimator.__init__(self,
benchmark_weights,
data_includes_cash=True)


class CashBenchmark(BaseBenchmark):
Expand Down
6 changes: 4 additions & 2 deletions cvxportfolio/constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,8 @@ class FactorMaxLimit(BaseWeightConstraint, InequalityConstraint):
def __init__(self, factor_exposure, limit):
self.factor_exposure = DataEstimator(
factor_exposure, compile_parameter=True)
self.limit = DataEstimator(limit, compile_parameter=True)
self.limit = DataEstimator(limit, compile_parameter=True,
ignore_shape_check=True)

def _compile_constr_to_cvxpy(self, w_plus, z, w_plus_minus_w_bm):
"Compile left hand side of the constraint expression."
Expand Down Expand Up @@ -478,7 +479,8 @@ class FactorMinLimit(BaseWeightConstraint, InequalityConstraint):
def __init__(self, factor_exposure, limit):
self.factor_exposure = DataEstimator(
factor_exposure, compile_parameter=True)
self.limit = DataEstimator(limit, compile_parameter=True)
self.limit = DataEstimator(limit, compile_parameter=True,
ignore_shape_check=True)

def _compile_constr_to_cvxpy(self, w_plus, z, w_plus_minus_w_bm):
"Compile left hand side of the constraint expression."
Expand Down
2 changes: 1 addition & 1 deletion cvxportfolio/costs.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ def _simulate(self, t, h_plus, current_and_past_returns, t_next, **kwargs):
# TODO this is a temporary fix, we should plug this into a recursive tree
for est in [self.short_fees, self.long_fees, self.dividends]:
if not (est is None):
est._recursive_pre_evaluation(universe=h_plus.index[:-1], backtest_times=[t])
est._recursive_pre_evaluation(universe=h_plus.index, backtest_times=[t])
est._recursive_values_in_time(t=t)

if not (self.short_fees is None):
Expand Down
1 change: 1 addition & 0 deletions cvxportfolio/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,7 @@ def __init__(

self.base_location = base_location
self.use_last_available_time = use_last_available_time
self.universe_maybe_noncash = None # fix, but we should retire this class

def _recursive_pre_evaluation(self, *args, **kwargs):
self.data = self.update_and_load(self.symbol)
131 changes: 99 additions & 32 deletions cvxportfolio/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,38 +146,45 @@ class DataEstimator(PolicyEstimator):
by its `_recursive_values_in_time` method, which is the way `cvxportfolio`
objects use this class to get data.
Args:
data (object, pandas.Series, pandas.DataFrame): Data expressed
preferably as pandas Series or DataFrame where the first
index is a pandas.DateTimeIndex. Otherwise you can
pass a callable object which implements the _recursive_values_in_time method
(with the standard signature) and returns the corresponding value in time,
or a constant float, numpy.array, or even pandas Series or DataFrame not
indexed by time (e.g., a covariance matrix where both index and columns
are the stock symbols).
use_last_available_time (bool): if the pandas index exists
and is a pandas.DateTimeIndex you can instruct self._recursive_values_in_time
to retrieve the last available value at time t by setting
this to True. Default is False.
:param data: Data expressed preferably as pandas Series or DataFrame
where the first index is a pandas.DateTimeIndex. Otherwise you can
pass a callable object which implements the _recursive_values_in_time method
(with the standard signature) and returns the corresponding value in time,
or a constant float, numpy.array, or even pandas Series or DataFrame not
indexed by time (e.g., a covariance matrix where both index and columns
are the stock symbols).
:type data: object, pandas.Series, pandas.DataFrame
:param use_last_available_time: if the pandas index exists
and is a pandas.DateTimeIndex you can instruct self._recursive_values_in_time
to retrieve the last available value at time t by setting
this to True. Default is False.
:type use_last_available_time: bool
"""

def __init__(self, data, use_last_available_time=False, allow_nans=False,
compile_parameter=False, non_negative=False, positive_semi_definite=False):
compile_parameter=False, non_negative=False, positive_semi_definite=False,
data_includes_cash=False, # affects _universe_subselect
ignore_shape_check=False # affects _universe_subselect
):
self.data = data
self.use_last_available_time = use_last_available_time
self.allow_nans = allow_nans
self.compile_parameter = compile_parameter
self.non_negative = non_negative
self.positive_semi_definite = positive_semi_definite
self.universe_maybe_noncash = None
self.data_includes_cash = data_includes_cash
self.ignore_shape_check = ignore_shape_check

def _recursive_pre_evaluation(self, universe, backtest_times):
# super()._recursive_pre_evaluation(universe, backtest_times)
if self.compile_parameter:
value = self.internal__recursive_values_in_time(
t=backtest_times[0])
self.parameter = cp.Parameter(value.shape if hasattr(value, "shape") else (),
PSD=self.positive_semi_definite, nonneg=self.non_negative)
PSD=self.positive_semi_definite, nonneg=self.non_negative)

self.universe_maybe_noncash = universe if self.data_includes_cash else universe[:-1]

def value_checker(self, result):
"""Ensure that only scalars or arrays without np.nan are returned.
Expand Down Expand Up @@ -215,50 +222,110 @@ def value_checker(self, result):
raise DataError(
f"{self.__class__.__name__}._recursive_values_in_time result is not a scalar or array."
)

def _universe_subselect(self, data):
"""This function subselects from ``data`` the relevant universe.
See github issue #106.
If data is a pandas Series we subselect its index. If we fail
we throw an error. If data is a pandas DataFrame (covariance, exposure matrix)
we try to subselect its index and columns. If we fail on either
we ignore the failure, but if we fail on both we throw an error.
If data is a numpy 1-d array we check that its length is the same as the
universe's.
If it is a 2-d array we check that at least one dimension is the
same as the universe's.
If the universe is None we skip all checks. (We may revisit this choice.) This only happens
if the DataEstimator instance is not part of a PolicyEstimator tree
(a usecase which we will probably drop).
"""

if (self.universe_maybe_noncash is None) or self.ignore_shape_check:
return data

if isinstance(data, pd.Series):
try:
return data.loc[self.universe_maybe_noncash]
except KeyError:
raise MissingValuesError(
f"The pandas Series found by {self.__class__.__name__} has index {self.data.index}"
f" while the current universe (minus cash) is {self.universe_maybe_noncash}."
" It was not possibly to reconcile the two.")

if isinstance(data, pd.DataFrame):
try:
return data.loc[self.universe_maybe_noncash, self.universe_maybe_noncash]
except KeyError:
try:
return data.loc[:, self.universe_maybe_noncash]
except KeyError:
try:
return data.loc[self.universe_maybe_noncash, :]
except KeyError:
pass
raise MissingValuesError(
f"The pandas DataFrame found by {self.__class__.__name__} has index {self.data.index}"
f" and columns {self.data.columns}"
f" while the current universe (minus cash) is {self.universe_maybe_noncash}."
" It was not possibly to reconcile the two.")

if isinstance(data, np.ndarray):
dimensions = data.shape
if not len(self.universe_maybe_noncash) in dimensions:
raise MissingValuesError(
f"The numpy array found by {self.__class__.__name__} has dimensions {self.data.shape}"
f" while the current universe (minus cash) has size {len(self.universe_maybe_noncash)}.")
return data

# scalar
return data



def internal__recursive_values_in_time(self, t, *args, **kwargs):
"""Internal method called by `self._recursive_values_in_time`."""

# if self.data has values_in_time we use it
if hasattr(self.data, "values_in_time"):
_ = self.data.values_in_time(t=t, *args, **kwargs)
if hasattr(_, 'values'):
return self.value_checker(_.values)
tmp = self.data.values_in_time(t=t, *args, **kwargs)
tmp = self._universe_subselect(tmp)
if hasattr(tmp, 'values'):
return self.value_checker(tmp.values)
else:
return self.value_checker(_)
return self.value_checker(tmp)

# if self.data is pandas and has datetime (first) index
if (hasattr(self.data, "loc") and hasattr(self.data, "index")
and (isinstance(self.data.index, pd.DatetimeIndex)
or (
isinstance(self.data.index, pd.MultiIndex)
and isinstance(self.data.index.levels[0], pd.DatetimeIndex)
)
)
):
or (isinstance(self.data.index, pd.MultiIndex) and
isinstance(self.data.index.levels[0], pd.DatetimeIndex)))):
try:
if self.use_last_available_time:
if isinstance(self.data.index, pd.MultiIndex):
newt = self.data.index.levels[0][
self.data.index.levels[0] <= t
][-1]
self.data.index.levels[0] <= t][-1]
else:
newt = self.data.index[self.data.index <= t][-1]
tmp = self.data.loc[newt]
else:
tmp = self.data.loc[t]
if hasattr(tmp, "values"):
return self.value_checker(tmp.values)
return self.value_checker(self._universe_subselect(tmp.values))
else:
return self.value_checker(tmp)
return self.value_checker(self._universe_subselect(tmp))

except (KeyError, IndexError):
raise MissingValuesError(
f"{self.__class__.__name__}._recursive_values_in_time could not find data for requested time."
)

# if data is pandas but no datetime index (constant in time)
if hasattr(self.data, "values"):
return self.value_checker(self.data.values)
return self.value_checker(self._universe_subselect(self.data.values))

return self.value_checker(self.data)
# if data is scalar or numpy
return self.value_checker(self._universe_subselect(self.data))

def _recursive_values_in_time(self, t, *args, **kwargs):
"""Obtain value of `self.data` at time t or right before.
Expand Down
2 changes: 1 addition & 1 deletion cvxportfolio/risks.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ class FactorModelCovariance(BaseRiskModel):
def __init__(self, F=None, d=None, Sigma_F=None, num_factors=1, kelly=True):
self.F = F if F is None else DataEstimator(F, compile_parameter=True)
self.d = d if d is None else DataEstimator(d)
self.Sigma_F = Sigma_F if Sigma_F is None else DataEstimator(Sigma_F)
self.Sigma_F = Sigma_F if Sigma_F is None else DataEstimator(Sigma_F, ignore_shape_check=True)
if (self.F is None) or (self.d is None):
self._fit = True
self.Sigma = HistoricalFactorizedCovariance(kelly=kelly)
Expand Down
5 changes: 3 additions & 2 deletions cvxportfolio/tests/test_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,10 +165,11 @@ def test_parameter_estimator(self):
second_level = ["hello", "ciao", "hola"]
index = pd.MultiIndex.from_product([timeindex, second_level])
data = pd.DataFrame(np.random.randn(len(index), 10), index=index)
estimator = DataEstimator(data, compile_parameter=True)
estimator = DataEstimator(data, compile_parameter=True,
data_includes_cash=True)
self.assertTrue(not hasattr(estimator, "parameter"))
estimator._recursive_pre_evaluation(
universe=None, backtest_times=timeindex)
universe=data.columns, backtest_times=timeindex)
# assert hasattr(estimator, 'parameter')
self.assertTrue(hasattr(estimator, "parameter"))
estimator._recursive_values_in_time("2022-01-05")
Expand Down

0 comments on commit 469951e

Please sign in to comment.