diff --git a/examples/d3rlpy_training_api_continuous_stochastic.py b/examples/d3rlpy_training_api_continuous_stochastic.py
index e72d07f..da8aff4 100644
--- a/examples/d3rlpy_training_api_continuous_stochastic.py
+++ b/examples/d3rlpy_training_api_continuous_stochastic.py
@@ -21,6 +21,10 @@
 from offline_rl_ope.PropensityModels.torch import FullGuassian, TorchRegTrainer
 from offline_rl_ope.api.d3rlpy.Policy import PolicyFactory

+from offline_rl_ope.OPEEstimators import (
+    EmpiricalMeanDenom, PassWeightDenom, WeightedEmpiricalMeanDenom
+    )
+
 # Import callbacks
 from offline_rl_ope.api.d3rlpy.Callbacks import (
@@ -144,22 +148,31 @@
 scorers = {}

 scorers.update({"vanilla_is_loss": ISEstimatorScorer(
-    discount=gamma, cache=is_callback, is_type="vanilla", norm_weights=False)})
+    discount=gamma, cache=is_callback, is_type="vanilla",
+    empirical_denom=EmpiricalMeanDenom(), weight_denom=PassWeightDenom()
+    )})

 scorers.update({"pd_is_loss": ISEstimatorScorer(
     discount=gamma, cache=is_callback, is_type="per_decision",
-    norm_weights=False)})
+    empirical_denom=EmpiricalMeanDenom(), weight_denom=PassWeightDenom()
+    )})

 scorers.update({"vanilla_wis_loss": ISEstimatorScorer(
-    discount=gamma, cache=is_callback, is_type="vanilla", norm_weights=True)})
+    discount=gamma, cache=is_callback, is_type="vanilla",
+    empirical_denom=WeightedEmpiricalMeanDenom(),
+    weight_denom=PassWeightDenom()
+    )})

 scorers.update({"vanilla_wis_loss_smooth": ISEstimatorScorer(
-    discount=gamma, cache=is_callback, is_type="vanilla", norm_weights=True,
-    norm_kwargs={"smooth_eps":0.0000001})})
+    discount=gamma, cache=is_callback, is_type="vanilla",
+    empirical_denom=WeightedEmpiricalMeanDenom(smooth_eps=0.0000001),
+    weight_denom=PassWeightDenom(),
+    )})

 scorers.update({"pd_wis_loss": ISEstimatorScorer(
     discount=gamma, cache=is_callback, is_type="per_decision",
-    norm_weights=True)})
+    empirical_denom=EmpiricalMeanDenom(), weight_denom=PassWeightDenom()
+    )})

 for scr in fqe_scorers:
     scorers.update({scr: QueryScorer(cache=fqe_callback, query_key=scr)})
diff --git a/examples/d3rlpy_training_api_discrete.py b/examples/d3rlpy_training_api_discrete.py
index 94dd738..3f7dd28 100644
--- a/examples/d3rlpy_training_api_discrete.py
+++ b/examples/d3rlpy_training_api_discrete.py
@@ -14,6 +14,9 @@
 import shutil

 from offline_rl_ope.api.d3rlpy.Policy import PolicyFactory
+from offline_rl_ope.OPEEstimators import (
+    EmpiricalMeanDenom, PassWeightDenom, WeightedEmpiricalMeanDenom
+    )

 # Import callbacks
 from offline_rl_ope.api.d3rlpy.Callbacks import (
@@ -115,22 +118,31 @@
 scorers = {}

 scorers.update({"vanilla_is_loss": ISEstimatorScorer(
-    discount=gamma, cache=is_callback, is_type="vanilla", norm_weights=False)})
+    discount=gamma, cache=is_callback, is_type="vanilla",
+    empirical_denom=EmpiricalMeanDenom(), weight_denom=PassWeightDenom()
+    )})

 scorers.update({"pd_is_loss": ISEstimatorScorer(
     discount=gamma, cache=is_callback, is_type="per_decision",
-    norm_weights=False)})
+    empirical_denom=EmpiricalMeanDenom(), weight_denom=PassWeightDenom()
+    )})

 scorers.update({"vanilla_wis_loss": ISEstimatorScorer(
-    discount=gamma, cache=is_callback, is_type="vanilla", norm_weights=True)})
+    discount=gamma, cache=is_callback, is_type="vanilla",
+    empirical_denom=WeightedEmpiricalMeanDenom(),
+    weight_denom=PassWeightDenom()
+    )})

 scorers.update({"vanilla_wis_loss_smooth": ISEstimatorScorer(
-    discount=gamma, cache=is_callback, is_type="vanilla", norm_weights=True,
-    norm_kwargs={"smooth_eps":0.0000001})})
+    discount=gamma, cache=is_callback, is_type="vanilla",
+    empirical_denom=WeightedEmpiricalMeanDenom(smooth_eps=0.0000001),
+    weight_denom=PassWeightDenom(),
+    )})

 scorers.update({"pd_wis_loss": ISEstimatorScorer(
     discount=gamma, cache=is_callback, is_type="per_decision",
-    norm_weights=True)})
+    empirical_denom=EmpiricalMeanDenom(), weight_denom=PassWeightDenom()
+    )})

 for act in unique_pol_acts:
     scorers.update(
diff --git a/examples/static_torch_deterministic_continuous.py b/examples/static_torch_deterministic_continuous.py
index 2629aba..cc8e400 100644
--- a/examples/static_torch_deterministic_continuous.py
+++ b/examples/static_torch_deterministic_continuous.py
@@ -16,8 +16,9 @@
 from offline_rl_ope.Dataset import ISEpisode
 from offline_rl_ope.components.Policy import Policy, GreedyDeterministic
 from offline_rl_ope.components.ImportanceSampler import ISWeightOrchestrator
-from offline_rl_ope.OPEEstimators import (
-    ISEstimator, WDR, D3rlpyQlearnDM)
+from offline_rl_ope.OPEEstimators import D3rlpyQlearnDM
+from offline_rl_ope.api.StandardEstimators import (
+    VanillaISPDIS, WIS, WDR)
 from offline_rl_ope.PropensityModels.torch import FullGuassian, TorchRegTrainer
 from offline_rl_ope.LowerBounds.HCOPE import get_lower_bound
@@ -171,14 +172,16 @@ def __call__(

 fqe_dm_model = D3rlpyQlearnDM(model=fqe)

-is_estimator = ISEstimator(norm_weights=False, cache_traj_rewards=True)
-wis_estimator = ISEstimator(norm_weights=True)
-wis_estimator_smooth = ISEstimator(norm_weights=True, norm_kwargs={
-    "smooth_eps":0.0000001
-    })
+is_estimator = VanillaISPDIS(cache_traj_rewards=True)
+wis_estimator = WIS()
+wis_estimator_smooth = WIS(smooth_eps=0.0000001)

 w_dr_estimator = WDR(
-    dm_model=fqe_dm_model,
+    dm_model=fqe_dm_model
     )
+w_dr_estimator_smooth = WDR(
+    dm_model=fqe_dm_model, smooth_eps=0.0000001
+    )
+

 res = is_estimator.predict(
@@ -229,11 +232,21 @@
 print(res)

 res = w_dr_estimator.predict(
-    weights=is_weight_calculator["per_decision"].traj_is_weights,
+    weights=is_weight_calculator["vanilla"].traj_is_weights,
     rewards=[ep.reward for ep in episodes], discount=0.99,
     is_msk=is_weight_calculator.weight_msk,
     states=[ep.state for ep in episodes],
     actions=[ep.action for ep in episodes],
     )
 print(res)
+
+res = w_dr_estimator_smooth.predict(
+    weights=is_weight_calculator["vanilla"].traj_is_weights,
+    rewards=[ep.reward for ep in episodes], discount=0.99,
+    is_msk=is_weight_calculator.weight_msk,
+    states=[ep.state for ep in episodes],
+    actions=[ep.action for ep in episodes],
+    )
+print(res)
+
diff --git a/examples/static_torch_stochastic_continuous.py b/examples/static_torch_stochastic_continuous.py
index 85431fb..64d13e2 100644
--- a/examples/static_torch_stochastic_continuous.py
+++ b/examples/static_torch_stochastic_continuous.py
@@ -16,8 +16,9 @@
 from offline_rl_ope.Dataset import ISEpisode
 from offline_rl_ope.components.Policy import Policy
 from offline_rl_ope.components.ImportanceSampler import ISWeightOrchestrator
-from offline_rl_ope.OPEEstimators import (
-    ISEstimator, WDR, D3rlpyQlearnDM)
+from offline_rl_ope.OPEEstimators import D3rlpyQlearnDM
+from offline_rl_ope.api.StandardEstimators import (
+    VanillaISPDIS, WIS, WDR)
 from offline_rl_ope.PropensityModels.torch import FullGuassian, TorchRegTrainer
 from offline_rl_ope.LowerBounds.HCOPE import get_lower_bound
@@ -153,14 +154,15 @@

 fqe_dm_model = D3rlpyQlearnDM(model=fqe)

-is_estimator = ISEstimator(norm_weights=False, cache_traj_rewards=True)
-wis_estimator = ISEstimator(norm_weights=True)
-wis_estimator_smooth = ISEstimator(norm_weights=True, norm_kwargs={
-    "smooth_eps":0.0000001
-    })
+is_estimator = VanillaISPDIS(cache_traj_rewards=True)
+wis_estimator = WIS()
+wis_estimator_smooth = WIS(smooth_eps=0.0000001)

 w_dr_estimator = WDR(
     dm_model=fqe_dm_model
     )
+w_dr_estimator_smooth = WDR(
+    dm_model=fqe_dm_model, smooth_eps=0.0000001
+    )

 res = is_estimator.predict(
@@ -211,7 +213,16 @@
 print(res)

 res = w_dr_estimator.predict(
-    weights=is_weight_calculator["per_decision"].traj_is_weights,
+    weights=is_weight_calculator["vanilla"].traj_is_weights,
+    rewards=[ep.reward for ep in episodes], discount=0.99,
+    is_msk=is_weight_calculator.weight_msk,
+    states=[ep.state for ep in episodes],
+    actions=[ep.action for ep in episodes],
+    )
+print(res)
+
+res = w_dr_estimator_smooth.predict(
+    weights=is_weight_calculator["vanilla"].traj_is_weights,
     rewards=[ep.reward for ep in episodes], discount=0.99,
     is_msk=is_weight_calculator.weight_msk,
     states=[ep.state for ep in episodes],
diff --git a/examples/static_xgboost_discrete.py b/examples/static_xgboost_discrete.py
index 73b9793..c7b5bf3 100644
--- a/examples/static_xgboost_discrete.py
+++ b/examples/static_xgboost_discrete.py
@@ -15,8 +15,9 @@
 from offline_rl_ope.components.Policy import (
     GreedyDeterministic, Policy, NumpyPolicyFuncWrapper)
 from offline_rl_ope.components.ImportanceSampler import ISWeightOrchestrator
-from offline_rl_ope.OPEEstimators import (
-    ISEstimator, WDR, D3rlpyQlearnDM)
+from offline_rl_ope.OPEEstimators import D3rlpyQlearnDM
+from offline_rl_ope.api.StandardEstimators import (
+    VanillaISPDIS, WIS, WDR)
 from offline_rl_ope.PropensityModels.sklearn import (
     SklearnDiscrete)
 from offline_rl_ope.LowerBounds.HCOPE import get_lower_bound
@@ -118,15 +119,15 @@
 fqe_dm_model = D3rlpyQlearnDM(model=discrete_fqe)

-is_estimator = ISEstimator(norm_weights=False, cache_traj_rewards=True)
-wis_estimator = ISEstimator(norm_weights=True)
-wis_estimator_smooth = ISEstimator(norm_weights=True, norm_kwargs={
-    "smooth_eps":0.0000001
-    })
+is_estimator = VanillaISPDIS(cache_traj_rewards=True)
+wis_estimator = WIS()
+wis_estimator_smooth = WIS(smooth_eps=0.0000001)

 w_dr_estimator = WDR(
     dm_model=fqe_dm_model
     )
-
+w_dr_estimator_smooth = WDR(
+    dm_model=fqe_dm_model, smooth_eps=0.0000001
+    )

 res = is_estimator.predict(
     rewards=[ep.reward for ep in episodes], discount=0.99,
@@ -176,7 +177,16 @@
 print(res)

 res = w_dr_estimator.predict(
-    weights=is_weight_calculator["per_decision"].traj_is_weights,
+    weights=is_weight_calculator["vanilla"].traj_is_weights,
+    rewards=[ep.reward for ep in episodes], discount=0.99,
+    is_msk=is_weight_calculator.weight_msk,
+    states=[ep.state for ep in episodes],
+    actions=[ep.action for ep in episodes],
+    )
+print(res)
+
+res = w_dr_estimator_smooth.predict(
+    weights=is_weight_calculator["vanilla"].traj_is_weights,
     rewards=[ep.reward for ep in episodes], discount=0.99,
     is_msk=is_weight_calculator.weight_msk,
     states=[ep.state for ep in episodes],
diff --git a/src/offline_rl_ope/OPEEstimators/DoublyRobust.py b/src/offline_rl_ope/OPEEstimators/DoublyRobust.py
index 48055c9..f44a5b1 100644
--- a/src/offline_rl_ope/OPEEstimators/DoublyRobust.py
+++ b/src/offline_rl_ope/OPEEstimators/DoublyRobust.py
@@ -85,7 +85,6 @@ def predict_traj_rewards(
             rewards=rewards, discount=discount, h=h)
         # weights dim is (n_trajectories, max_length)
         weights = self.process_weights(weights=weights, is_msk=is_msk)
-        print(f"weights:{weights}")
         v:List[Float[torch.Tensor, "max_length 1"]] = []
         q:List[Float[torch.Tensor, "max_length 1"]] = []
         for s,a in zip(states, actions):
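For anyone updating their own scripts against this change, a minimal sketch of the replacement pattern used in the example files above. Keyword names are taken from this diff; `fqe_dm_model`, `episodes` and `is_weight_calculator` are assumed to be built exactly as in examples/static_torch_stochastic_continuous.py, so treat this as illustrative rather than a drop-in script:

    # Sketch only: mirrors the migration applied to the example scripts in this patch.
    from offline_rl_ope.api.StandardEstimators import VanillaISPDIS, WIS, WDR

    # Replaces ISEstimator(norm_weights=False, cache_traj_rewards=True)
    is_estimator = VanillaISPDIS(cache_traj_rewards=True)
    # Replaces ISEstimator(norm_weights=True) and norm_kwargs={"smooth_eps": ...}
    wis_estimator = WIS()
    wis_estimator_smooth = WIS(smooth_eps=0.0000001)
    # WDR now takes the smoothing term directly alongside the direct-method model
    w_dr_estimator_smooth = WDR(dm_model=fqe_dm_model, smooth_eps=0.0000001)

    res = w_dr_estimator_smooth.predict(
        weights=is_weight_calculator["vanilla"].traj_is_weights,
        rewards=[ep.reward for ep in episodes], discount=0.99,
        is_msk=is_weight_calculator.weight_msk,
        states=[ep.state for ep in episodes],
        actions=[ep.action for ep in episodes],
        )
    print(res)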
diff --git a/src/offline_rl_ope/api/StandardEstimators/IS.py b/src/offline_rl_ope/api/StandardEstimators/IS.py
index 75269d2..24b3f7a 100644
--- a/src/offline_rl_ope/api/StandardEstimators/IS.py
+++ b/src/offline_rl_ope/api/StandardEstimators/IS.py
@@ -95,7 +95,6 @@ def __init__(
         smooth_eps:float = 0.0
     ) -> None:
         """_summary_
-        - https://arxiv.org/pdf/1906.03735 (snis when weights are IS)
-        - https://arxiv.org/pdf/1906.03735 (snsis when weights are PD)
         Args:
             clip_weights (bool, optional): _description_. Defaults to False.
diff --git a/src/offline_rl_ope/api/d3rlpy/Scorers/IS.py b/src/offline_rl_ope/api/d3rlpy/Scorers/IS.py
index 656cb24..e7a5f9a 100644
--- a/src/offline_rl_ope/api/d3rlpy/Scorers/IS.py
+++ b/src/offline_rl_ope/api/d3rlpy/Scorers/IS.py
@@ -6,7 +6,9 @@
 from d3rlpy.metrics import EvaluatorProtocol
 from d3rlpy.dataset import ReplayBuffer

-from ....OPEEstimators import ISEstimator
+from ....OPEEstimators import (
+    ISEstimator, WeightDenomBase, EmpiricalMeanDenomBase
+    )
 from .base import OPEEstimatorScorerBase
 from ..Callbacks.IS import ISCallback
@@ -19,18 +21,24 @@
 class ISEstimatorScorer(OPEEstimatorScorerBase, ISEstimator):

-    def __init__(self, discount, cache:ISCallback, is_type:str,
-                 norm_weights: bool, clip_weights:bool=False,
-                 clip: float = 0.0, norm_kwargs:Dict[str,Any] = {},
-                 episodes:Optional[Sequence[EpisodeBase]] = None
-                 ) -> None:
+    def __init__(
+        self,
+        discount:float,
+        cache:ISCallback,
+        is_type:str,
+        empirical_denom: EmpiricalMeanDenomBase,
+        weight_denom: WeightDenomBase,
+        clip_weights:bool=False,
+        clip: float = 0.0,
+        episodes:Optional[Sequence[EpisodeBase]] = None
+        ) -> None:
         OPEEstimatorScorerBase.__init__(self, cache=cache, episodes=episodes)
         ISEstimator.__init__(
             self,
-            norm_weights=norm_weights,
+            empirical_denom=empirical_denom,
+            weight_denom=weight_denom,
             clip_weights=clip_weights,
             clip=clip,
-            norm_kwargs=norm_kwargs
             )
         self.is_type = is_type
         self.discount = discount
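On the d3rlpy scorer side, the same change replaces norm_weights/norm_kwargs with explicit denominator objects. A minimal sketch following the updated training examples; the import path for ISEstimatorScorer follows the file touched in this diff, and `is_callback` stands in for the ISCallback constructed in those examples:

    from offline_rl_ope.OPEEstimators import (
        EmpiricalMeanDenom, PassWeightDenom, WeightedEmpiricalMeanDenom
        )
    from offline_rl_ope.api.d3rlpy.Scorers.IS import ISEstimatorScorer

    gamma = 0.99  # discount factor used throughout the example scripts

    # Ordinary (unweighted) IS: plain empirical mean, weights passed through.
    vanilla_scorer = ISEstimatorScorer(
        discount=gamma, cache=is_callback, is_type="vanilla",
        empirical_denom=EmpiricalMeanDenom(), weight_denom=PassWeightDenom()
        )

    # Weighted IS: weighted empirical mean over the importance weights, optionally smoothed.
    wis_scorer = ISEstimatorScorer(
        discount=gamma, cache=is_callback, is_type="vanilla",
        empirical_denom=WeightedEmpiricalMeanDenom(smooth_eps=0.0000001),
        weight_denom=PassWeightDenom()
        )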