From f00c8cf950317a2b6be66d0a48b6811efe7de461 Mon Sep 17 00:00:00 2001
From: Joshua Spear
Date: Mon, 9 Oct 2023 10:54:55 +0100
Subject: [PATCH] fixed static example, updated meta data and bumped version

---
 examples/static.py             | 53 ++++++++++++++++++++++++----------
 setup.py                       |  5 ++--
 src/offline_rl_ope/_version.py |  2 +-
 3 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/examples/static.py b/examples/static.py
index 079ac07..3881728 100644
--- a/examples/static.py
+++ b/examples/static.py
@@ -1,14 +1,14 @@
-from d3rlpy.algos import DQN
+from d3rlpy.algos import DQNConfig
 import pickle
 from d3rlpy.datasets import get_cartpole
 import numpy as np
 from sklearn.multioutput import MultiOutputClassifier
 from sklearn.multiclass import OneVsRestClassifier
-from d3rlpy.ope import DiscreteFQE
-from d3rlpy.metrics.scorer import (soft_opc_scorer,
-    initial_state_value_estimation_scorer)
+from d3rlpy.ope import DiscreteFQE, FQEConfig
+from d3rlpy.metrics import (SoftOPCEvaluator,
+    InitialStateValueEstimationEvaluator)
+from d3rlpy.dataset import BasicTransitionPicker
 from xgboost import XGBClassifier
-import math
 import torch
 
 from offline_rl_ope.Dataset import ISEpisode
@@ -25,7 +25,7 @@
 
 # setup algorithm
 gamma = 0.99
-dqn = DQN(gamma=gamma, target_update_interval=100)
+dqn = DQNConfig(gamma=gamma, target_update_interval=100).create()
 
 unique_pol_acts = np.arange(0,env.action_space.n)
 
@@ -54,25 +54,46 @@ def eval_pdf(self, indep_vals:np.array, dep_vals:np.array):
                 objective="binary:logistic")))
 
 # Fit the behaviour model
-behav_est.fit(X=dataset.observations, Y=dataset.actions.reshape(-1,1))
+observations = []
+actions = []
+tp = BasicTransitionPicker()
+for ep in dataset.episodes:
+    for i in range(ep.transition_count):
+        _transition = tp(ep,i)
+        observations.append(_transition.observation.reshape(1,-1))
+        actions.append(_transition.action)
+
+observations = np.concatenate(observations)
+actions = np.concatenate(actions)
+
+behav_est.fit(X=observations, Y=actions.reshape(-1,1))
 
 gbt_est = GbtEst(estimator=behav_est)
 gbt_policy_be = BehavPolicy(policy_class=gbt_est, collect_res=False)
 
 
-dqn.fit(dataset.episodes, n_epochs=1)
+no_obs_steps = int(len(actions)*0.025)
+n_epochs=1
+n_steps_per_epoch = no_obs_steps
+n_steps = no_obs_steps*n_epochs
+dqn.fit(dataset, n_steps=n_steps, n_steps_per_epoch=n_steps_per_epoch,
+        with_timestamp=False)
 
 fqe_scorers = {
-    "soft_opc": soft_opc_scorer(70),
-    "init_state_val": initial_state_value_estimation_scorer
+    "soft_opc": SoftOPCEvaluator(
+        return_threshold=70,
+        episodes=dataset.episodes
+    ),
+    "init_state_val": InitialStateValueEstimationEvaluator(
+        episodes=dataset.episodes
+    )
 }
 
-fqe_init_kwargs = {"use_gpu": False, "discrete_action": True,
-                   "q_func_factory": 'mean', "learning_rate": 1e-4
-                   }
-discrete_fqe = DiscreteFQE(algo=dqn, **fqe_init_kwargs)
-discrete_fqe.fit(dataset.episodes, eval_episodes=dataset.episodes,
-                 scorers=fqe_scorers, n_epochs=1)
+fqe_config = FQEConfig(learning_rate=1e-4)
+#discrete_fqe = DiscreteFQE(algo=dqn, **fqe_init_kwargs)
+discrete_fqe = DiscreteFQE(algo=dqn, config=fqe_config, device=False)
+
+discrete_fqe.fit(dataset, evaluators=fqe_scorers, n_steps=no_obs_steps)
 
 
 # Static OPE evaluation
diff --git a/setup.py b/setup.py
index 88b3bd1..ee492b2 100644
--- a/setup.py
+++ b/setup.py
@@ -24,9 +24,8 @@
     license='MIT',
     classifiers=[],
     package_dir={"": "src"},
-    python_requires="",
+    python_requires=">=3.11",
     install_requires=[
-        #"d3rlpy @ git+https://github.com/takuseno/d3rlpy.git"
-        "d3rlpy==2.0.4"
+        "d3rlpy>=2.0.4"
     ],
 )
\ No newline at end of file
diff --git a/src/offline_rl_ope/_version.py b/src/offline_rl_ope/_version.py
index e845d64..5152aea 100644
--- a/src/offline_rl_ope/_version.py
+++ b/src/offline_rl_ope/_version.py
@@ -1 +1 @@
-__version__ = "3.0.0"
\ No newline at end of file
+__version__ = "3.0.1"
\ No newline at end of file