From 4ee76c177b70c823fd7f79537bfa799edc73e338 Mon Sep 17 00:00:00 2001
From: Raghu Rajan
Date: Wed, 19 Jun 2024 12:43:05 +0200
Subject: [PATCH] MAJOR: Version changed to 1.0.0 with breaking API changes:
 switch to gymnasium instead of gym, with the associated changes to the
 step() and reset() return values, etc.; upgrade numpy and random number
 generation; tests still need to be updated; and the minigrid example in
 example.py is failing.

---
 .../mdp_playground/envs/gym_env_wrapper.html  |  12 +-
 .../envs/mujoco_env_wrapper.html              |   4 +-
 .../mdp_playground/envs/rl_toy_env.html       |   2 +-
 .../spaces/test_image_multi_discrete.html     |   4 +-
 example.py                                    |  58 ++--
 mdp_playground/__init__.py                    |   4 +-
 .../config_processor/config_processor.py      |  14 +-
 mdp_playground/envs/__init__.py               |   2 +-
 mdp_playground/envs/gym_env_wrapper.py        |  66 +++--
 mdp_playground/envs/mujoco_env_wrapper.py     |  14 +-
 mdp_playground/envs/rl_toy_env.py             |  46 +--
 mdp_playground/spaces/box_extended.py         |   4 +-
 mdp_playground/spaces/discrete_extended.py    |   4 +-
 mdp_playground/spaces/grid_action_space.py    |   6 +-
 mdp_playground/spaces/image_continuous.py     |   4 +-
 mdp_playground/spaces/image_multi_discrete.py |  14 +-
 .../spaces/multi_discrete_extended.py         |   4 +-
 .../spaces/test_image_continuous.py           |   2 +-
 .../spaces/test_image_multi_discrete.py       |   6 +-
 mdp_playground/spaces/tuple_extended.py       |   4 +-
 setup.py                                      |   4 +-
 tests/test_gym_env_wrapper.py                 | 177 ++++--------
 tests/test_mdp_playground.py                  | 268 +++++++++---------
 tests/test_run_experiments.py                 |   2 +-
 tests/test_version.py                         |   2 +-
 25 files changed, 340 insertions(+), 387 deletions(-)

diff --git a/docs/_build/html/_modules/mdp_playground/envs/gym_env_wrapper.html b/docs/_build/html/_modules/mdp_playground/envs/gym_env_wrapper.html
index a11160e..916f361 100644
--- a/docs/_build/html/_modules/mdp_playground/envs/gym_env_wrapper.html
+++ b/docs/_build/html/_modules/mdp_playground/envs/gym_env_wrapper.html
@@ -602,7 +602,7 @@

# print("Setting Mujoco self.frame_skip, self._ctrl_cost_weight, self._forward_reward_weight to", self.frame_skip, self._ctrl_cost_weight, self._forward_reward_weight, "corresponding to time_unit in config.")
[docs] def step(self, action): - # next_state, reward, done, info = super(GymEnvWrapper, self).step(action) + # next_state, reward, done, trunc, info = super(GymEnvWrapper, self).step(action) self.total_transitions_episode += 1 if self.config["state_space_type"] == "discrete" and self.transition_noise > 0.0: @@ -689,7 +689,7 @@

         int
             The seed returned by Gym
         """
-        # If seed is None, you get a randomly generated seed from gym.utils...
+        # If seed is None, you get a randomly generated seed from gymnasium.utils...
         self.np_random, self.seed_ = gym.utils.seeding.np_random(seed) #random
         print("Env SEED set to: " + str(seed) + ". Returned seed from Gym: " + str(self.seed_))
@@ -701,8 +701,8 @@

 # from mdp_playground.envs.gym_env_wrapper import get_gym_wrapper
-# from gym.envs.atari import AtariEnv
-# from gym.wrappers import AtariPreprocessing
+# from gymnasium.envs.atari import AtariEnv
+# from gymnasium.wrappers import AtariPreprocessing
 # AtariPreprocessing()
 # AtariEnvWrapper = get_gym_wrapper(AtariEnv)
 # from ray.tune.registry import register_env
@@ -711,7 +711,7 @@

 # ob = aew.reset()
 # from mdp_playground.envs.gym_env_wrapper import GymEnvWrapper
-# from gym.envs.atari import AtariEnv
+# from gymnasium.envs.atari import AtariEnv
 # ae = AtariEnv(**{'game': 'beam_rider', 'obs_type': 'image', 'frameskip': 1})
 # aew = GymEnvWrapper(ae, **{'reward_noise': lambda a: a.normal(0, 0.1), 'transition_noise': 0.1, 'delay': 1, 'frame_skip': 4, "atari_preprocessing": True, "state_space_type": "discrete", 'seed': 0})
 # ob = aew.reset()
@@ -720,7 +720,7 @@

 # total_reward = 0.0
 # for i in range(200):
 # act = aew.action_space.sample()
-# next_state, reward, done, info = aew.step(act)
+# next_state, reward, done, trunc, info = aew.step(act)
 # print(reward, done, act)
 # if reward > 10:
 # print("reward in step:", i, reward)
diff --git a/docs/_build/html/_modules/mdp_playground/envs/mujoco_env_wrapper.html b/docs/_build/html/_modules/mdp_playground/envs/mujoco_env_wrapper.html
index dd9a434..9d5fb80 100644
--- a/docs/_build/html/_modules/mdp_playground/envs/mujoco_env_wrapper.html
+++ b/docs/_build/html/_modules/mdp_playground/envs/mujoco_env_wrapper.html
@@ -438,7 +438,7 @@

-# from gym.envs.mujoco.mujoco_env import MujocoEnv
+# from gymnasium.envs.mujoco.mujoco_env import MujocoEnv
 from gym.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv
 from gym.envs.mujoco.pusher import PusherEnv
 from gym.envs.mujoco.reacher import ReacherEnv
@@ -516,7 +516,7 @@ 

 # from mdp_playground.envs.mujoco_env_wrapper import get_mujoco_wrapper #hack #
-# from gym.envs.mujoco.reacher import ReacherEnv
+# from gymnasium.envs.mujoco.reacher import ReacherEnv
 # ReacherWrapperV2 = get_mujoco_wrapper(ReacherEnv)
 # config = {"time_unit": 0.2}
 # rw2 = ReacherWrapperV2(**config)
diff --git a/docs/_build/html/_modules/mdp_playground/envs/rl_toy_env.html b/docs/_build/html/_modules/mdp_playground/envs/rl_toy_env.html
index 98d7c9f..5aa8b15 100644
--- a/docs/_build/html/_modules/mdp_playground/envs/rl_toy_env.html
+++ b/docs/_build/html/_modules/mdp_playground/envs/rl_toy_env.html
@@ -1967,7 +1967,7 @@

         int
             The seed returned by Gym
         """
-        # If seed is None, you get a randomly generated seed from gym.utils...
+        # If seed is None, you get a randomly generated seed from gymnasium.utils...
         self.np_random, self.seed_ = gym.utils.seeding.np_random(seed) #random
         print("Env SEED set to: " + str(seed) + ". Returned seed from Gym: " + str(self.seed_))
         return self.seed_
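The seeding changes throughout this patch follow one pattern: gymnasium's gym.utils.seeding.np_random(seed) returns a numpy.random.Generator, which has no randint() method, so the legacy RandomState-style randint(...) calls become integers(...), with .item() used where a plain Python int is needed. A minimal sketch of the pattern (illustrative only, not part of the patch):

    import sys
    import gymnasium as gym

    # np_random is a numpy.random.Generator; seed_ is the entropy actually used
    np_random, seed_ = gym.utils.seeding.np_random(0)

    # Old gym/RandomState style (not available on a Generator):
    #   space_seed = np_random.randint(sys.maxsize)
    # New Generator style, as used throughout this patch:
    space_seed = np_random.integers(sys.maxsize).item()
    print(type(np_random).__name__, seed_, space_seed)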

diff --git a/docs/_build/html/_modules/mdp_playground/spaces/test_image_multi_discrete.html b/docs/_build/html/_modules/mdp_playground/spaces/test_image_multi_discrete.html
index e19c1eb..07851ce 100644
--- a/docs/_build/html/_modules/mdp_playground/spaces/test_image_multi_discrete.html
+++ b/docs/_build/html/_modules/mdp_playground/spaces/test_image_multi_discrete.html
@@ -442,8 +442,8 @@

import numpy as np from mdp_playground.spaces.image_multi_discrete import ImageMultiDiscrete from gym.spaces import Discrete, MultiDiscrete -# import gym -# from gym.spaces import MultiDiscrete +# import gymnasium as gym +# from gymnasium.spaces import MultiDiscrete # # from .space import Space # import PIL.ImageDraw as ImageDraw # import PIL.Image as Image diff --git a/example.py b/example.py index 0c140d4..dc3bb01 100644 --- a/example.py +++ b/example.py @@ -59,7 +59,7 @@ def discrete_environment_example(): config["repeats_in_sequences"] = False config["generate_random_mdp"] = True - env = RLToyEnv(**config) # Calls env.reset() automatically. So, in general, + env = RLToyEnv(**config) # Calls env.reset()[0] automatically. So, in general, # there is no need to call it after this. # The environment maintains an augmented state which contains the underlying @@ -73,7 +73,7 @@ def discrete_environment_example(): "the transition:" ) action = env.action_space.sample() - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) env.close() @@ -113,7 +113,7 @@ def discrete_environment_image_representations_example(): "the transition:" ) action = env.action_space.sample() - next_state_image, reward, done, info = env.step(action) + next_state_image, reward, done, trunc, info = env.step(action) augmented_state_dict = env.get_augmented_state() next_state = augmented_state_dict["curr_state"] # Underlying MDP state holds # the current discrete state. @@ -159,7 +159,7 @@ def discrete_environment_diameter_image_representations_example(): "the transition:" ) action = env.action_space.sample() - next_state_image, reward, done, info = env.step(action) + next_state_image, reward, done, trunc, info = env.step(action) augmented_state_dict = env.get_augmented_state() next_state = augmented_state_dict["curr_state"] # Underlying MDP state holds # the current discrete state. @@ -192,14 +192,14 @@ def continuous_environment_example_move_to_a_point(): config["reward_function"] = "move_to_a_point" env = RLToyEnv(**config) - state = env.reset().copy() + state = env.reset()[0].copy() print( "Taking a step in the environment with a random action and printing " "the transition:" ) action = env.action_space.sample() - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) env.close() @@ -231,7 +231,7 @@ def continuous_environment_example_move_to_a_point_irrelevant_image(): config["relevant_indices"] = [0, 1] env = RLToyEnv(**config) - state = env.reset() + state = env.reset()[0] augmented_state_dict = env.get_augmented_state() state = augmented_state_dict["curr_state"].copy() # Underlying MDP state holds # the current continuous state. @@ -241,7 +241,7 @@ def continuous_environment_example_move_to_a_point_irrelevant_image(): "the transition:" ) action = env.action_space.sample() - next_state_image, reward, done, info = env.step(action) + next_state_image, reward, done, trunc, info = env.step(action) augmented_state_dict = env.get_augmented_state() next_state = augmented_state_dict["curr_state"].copy() # Underlying MDP state holds # the current continuous state. 
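The example.py changes above all follow from the Gymnasium API: reset() now returns (observation, info) and step() returns a 5-tuple with separate terminated and truncated flags, which the patch unpacks as done, trunc. A minimal sketch of the new calling convention (CartPole-v1 is just a placeholder environment here, not one used by this patch):

    import gymnasium as gym

    env = gym.make("CartPole-v1")
    obs, info = env.reset(seed=0)   # was: obs = env.reset()
    for _ in range(5):
        action = env.action_space.sample()
        # was: obs, reward, done, info = env.step(action)
        obs, reward, done, trunc, info = env.step(action)
        if done or trunc:
            obs, info = env.reset()
    env.close()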
@@ -274,14 +274,14 @@ def continuous_environment_example_move_along_a_line(): config["reward_function"] = "move_along_a_line" env = RLToyEnv(**config) - state = env.reset().copy() + state = env.reset()[0].copy() print( "Taking a step in the environment with a random action and printing " "the transition:" ) action = env.action_space.sample() - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) env.close() @@ -305,12 +305,12 @@ def grid_environment_example(): for i in range(len(actions)): action = actions[i] - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) next_state = env.get_augmented_state()["augmented_state"][-1] print("sars', done =", state, action, reward, next_state, done) state = next_state - env.reset() + env.reset()[0] env.close() @@ -334,12 +334,12 @@ def grid_environment_image_representations_example(): for i in range(len(actions)): action = actions[i] - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) next_state = env.get_augmented_state()["augmented_state"][-1] print("sars', done =", state, action, reward, next_state, done) state = next_state - env.reset() + env.reset()[0] env.close() display_image(next_obs) @@ -356,18 +356,18 @@ def atari_wrapper_example(): } from mdp_playground.envs import GymEnvWrapper - import gym + import gymnasium as gym ae = gym.make("QbertNoFrameskip-v4") env = GymEnvWrapper(ae, **config) - state = env.reset() + state = env.reset()[0] print( "Taking 10 steps in the environment with a random action and printing the transition:" ) for i in range(10): action = env.action_space.sample() - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print( "s.shape a r s'.shape, done =", state.shape, @@ -403,18 +403,18 @@ def mujoco_wrapper_example(): # of the Mujoco base_class. 
try: from mdp_playground.envs import get_mujoco_wrapper - from gym.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv + from gymnasium.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv wrapped_mujoco_env = get_mujoco_wrapper(HalfCheetahEnv) env = wrapped_mujoco_env(**config) - state = env.reset() + state = env.reset()[0] print( "Taking a step in the environment with a random action and printing the transition:" ) action = env.action_space.sample() - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) env.close() @@ -440,22 +440,22 @@ def minigrid_wrapper_example(): } from mdp_playground.envs.gym_env_wrapper import GymEnvWrapper - import gym + import gymnasium as gym - from gym_minigrid.wrappers import RGBImgPartialObsWrapper, ImgObsWrapper + from minigrid.wrappers import RGBImgPartialObsWrapper, ImgObsWrapper env = gym.make("MiniGrid-Empty-8x8-v0") env = RGBImgPartialObsWrapper(env) # Get pixel observations env = ImgObsWrapper(env) # Get rid of the 'mission' field env = GymEnvWrapper(env, **config) - obs = env.reset() # This now produces an RGB tensor only + obs = env.reset()[0] # This now produces an RGB tensor only print( "Taking a step in the environment with a random action and printing the transition:" ) action = env.action_space.sample() - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) print( "s.shape ar s'.shape, done =", obs.shape, @@ -481,17 +481,17 @@ def procgen_wrapper_example(): } from mdp_playground.envs.gym_env_wrapper import GymEnvWrapper - import gym + import gymnasium as gym env = gym.make("procgen:procgen-coinrun-v0") env = GymEnvWrapper(env, **config) - obs = env.reset() + obs = env.reset()[0] print( "Taking a step in the environment with a random action and printing the transition:" ) action = env.action_space.sample() - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) print( "s.shape ar s'.shape, done =", obs.shape, @@ -577,7 +577,7 @@ def procgen_wrapper_example(): # Using gym.make() example 1 import mdp_playground - import gym + import gymnasium as gym gym.make("RLToy-v0") @@ -591,6 +591,6 @@ def procgen_wrapper_example(): "maximally_connected": True, } ) - env.reset() + env.reset()[0] for i in range(10): print(env.step(env.action_space.sample())) diff --git a/mdp_playground/__init__.py b/mdp_playground/__init__.py index 76d4bbe..63e6414 100644 --- a/mdp_playground/__init__.py +++ b/mdp_playground/__init__.py @@ -1,4 +1,4 @@ -from gym.envs.registration import register +from gymnasium.envs.registration import register register( id="RLToy-v0", @@ -11,4 +11,4 @@ max_episode_steps=100, ) -__version__ = "0.0.2" +__version__ = "1.0.0" diff --git a/mdp_playground/config_processor/config_processor.py b/mdp_playground/config_processor/config_processor.py index 376f1e5..ea10434 100644 --- a/mdp_playground/config_processor/config_processor.py +++ b/mdp_playground/config_processor/config_processor.py @@ -786,7 +786,7 @@ def combined_processing(*static_configs, varying_configs, framework="ray", algor from mdp_playground.envs.mujoco_env_wrapper import ( get_mujoco_wrapper, ) # hack - from gym.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv + from gymnasium.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv wrapped_mujoco_env = get_mujoco_wrapper(HalfCheetahEnv) register_env( @@ -802,7 +802,7 @@ def combined_processing(*static_configs, 
varying_configs, framework="ray", algor from mdp_playground.envs.mujoco_env_wrapper import ( get_mujoco_wrapper, ) # hack - from gym.envs.mujoco.hopper_v3 import HopperEnv + from gymnasium.envs.mujoco.hopper_v3 import HopperEnv wrapped_mujoco_env = get_mujoco_wrapper(HopperEnv) register_env( @@ -818,7 +818,7 @@ def combined_processing(*static_configs, varying_configs, framework="ray", algor from mdp_playground.envs.mujoco_env_wrapper import ( get_mujoco_wrapper, ) # hack - from gym.envs.mujoco.pusher import PusherEnv + from gymnasium.envs.mujoco.pusher import PusherEnv wrapped_mujoco_env = get_mujoco_wrapper(PusherEnv) register_env( @@ -834,7 +834,7 @@ def combined_processing(*static_configs, varying_configs, framework="ray", algor from mdp_playground.envs.mujoco_env_wrapper import ( get_mujoco_wrapper, ) # hack - from gym.envs.mujoco.reacher import ReacherEnv + from gymnasium.envs.mujoco.reacher import ReacherEnv wrapped_mujoco_env = get_mujoco_wrapper(ReacherEnv) register_env( @@ -993,7 +993,7 @@ def post_processing(framework="ray"): def create_gym_env_wrapper_atari(config): - from gym.envs.atari import AtariEnv + from gymnasium.envs.atari import AtariEnv from mdp_playground.envs.gym_env_wrapper import GymEnvWrapper ae = AtariEnv(**config["AtariEnv"]) @@ -1008,9 +1008,9 @@ def create_gym_env_wrapper_frame_stack_atari(config): # hack ###TODO remove? """When using frameStack GymEnvWrapper should wrap AtariEnv using wrap_deepmind_ray and therefore this function sets "wrap_deepmind_ray": True and 'frame_skip': 1 inside config so as to keep config same as for create_gym_env_wrapper_atari above and reduce manual errors when switching between the 2.""" config["wrap_deepmind_ray"] = True # hack config["frame_skip"] = 1 # hack - from gym.envs.atari import AtariEnv + from gymnasium.envs.atari import AtariEnv from mdp_playground.envs.gym_env_wrapper import GymEnvWrapper - import gym + import gymnasium as gym game = config["AtariEnv"]["game"] game = "".join([g.capitalize() for g in game.split("_")]) diff --git a/mdp_playground/envs/__init__.py b/mdp_playground/envs/__init__.py index 42ec031..b393fad 100644 --- a/mdp_playground/envs/__init__.py +++ b/mdp_playground/envs/__init__.py @@ -1,5 +1,5 @@ from mdp_playground.envs.rl_toy_env import RLToyEnv -from gym import error +from gymnasium import error try: from mdp_playground.envs.gym_env_wrapper import GymEnvWrapper diff --git a/mdp_playground/envs/gym_env_wrapper.py b/mdp_playground/envs/gym_env_wrapper.py index 4f8e521..885e19b 100644 --- a/mdp_playground/envs/gym_env_wrapper.py +++ b/mdp_playground/envs/gym_env_wrapper.py @@ -1,9 +1,9 @@ -import gym +import gymnasium as gym import copy import numpy as np import sys -from gym.spaces import Box, Tuple -from gym.wrappers import AtariPreprocessing +from gymnasium.spaces import Box, Tuple +from gymnasium.wrappers import AtariPreprocessing from mdp_playground.envs.rl_toy_env import RLToyEnv import warnings import PIL.ImageDraw as ImageDraw @@ -31,8 +31,12 @@ class GymEnvWrapper(gym.Env): """ - # Should not be a gym.Wrapper because 1) gym.Wrapper has member variables observation_space and action_space while here with irrelevant_features we would have multiple observation_spaces and this could cause conflict with code that assumes any subclass of gym.Wrapper should have these member variables. 
- # However, it _should_ be at least a gym.Env + # Should not be a gym.Wrapper because 1) gym.Wrapper has member variables + # observation_space and action_space while here with irrelevant_features + # we would have multiple observation_spaces and this could cause conflict + # with code that assumes any subclass of gym.Wrapper should have these member + # variables. However, it _should_ be at least a gym.Env. + # Following comment based on the old get_gym_wrapper(base_class) code: # Does it need to be a subclass of base_class because some external code # may check if it's an AtariEnv, for instance, and do further stuff based # on that? @@ -46,16 +50,16 @@ def __init__(self, env, **config): if "seed" in config: seed_int = config["seed"] - self.seed(seed_int) # seed + self.seed(seed_int) # #seed # IMP Move below code from here to seed()? Because if seed is called # during the run of an env, the expectation is that all obs., act. space, - # etc. seeds are set? Only Atari in Gym seems to do something similar, the - # others I saw there don't seem to set seed for obs., act. spaces. + # etc. seeds are set during that call? Only Atari in Gym seems to do something + # similar, the others I saw there don't seem to set seed for obs., act. spaces. self.env.seed( seed_int - ) # seed ###IMP Apparently Atari also has a seed. :/ Without this, for beam_rider(?), about 1 in 5 times I got reward of 88.0 and 44.0 the remaining times with the same action sequence!! With setting this seed, I got the same reward of 44.0 when I ran about 20 times.; ##TODO If this is really a wrapper, should it be modifying the seed of the env? - obs_space_seed = self.np_random.randint(sys.maxsize) # random - act_space_seed = self.np_random.randint(sys.maxsize) # random + ) # #seed ###IMP Apparently Atari also has a seed. :/ Without this, for beam_rider(?), about 1 in 5 times I got reward of 88.0 and 44.0 the remaining times with the same action sequence!! With setting this seed, I got the same reward of 44.0 when I ran about 20 times.; ##TODO If this is really a wrapper, should it be modifying the seed of the env? 
+ obs_space_seed = self.np_random.integers(sys.maxsize).item() # random + act_space_seed = self.np_random.integers(sys.maxsize).item() # random self.env.observation_space.seed(obs_space_seed) # seed self.env.action_space.seed(act_space_seed) # seed @@ -203,7 +207,7 @@ def __init__(self, env, **config): # self.irrelevant_features = config["irrelevant_features"] irr_toy_env_conf = config["irrelevant_features"] if "seed" not in irr_toy_env_conf: - irr_toy_env_conf["seed"] = self.np_random.randint(sys.maxsize) # random + irr_toy_env_conf["seed"] = self.np_random.integers(sys.maxsize).item() # random if config["state_space_type"] == "discrete": pass @@ -323,7 +327,7 @@ def __init__(self, env, **config): # print("Setting Mujoco self.frame_skip, self._ctrl_cost_weight, self._forward_reward_weight to", self.frame_skip, self._ctrl_cost_weight, self._forward_reward_weight, "corresponding to time_unit in config.") def step(self, action): - # next_state, reward, done, info = super(GymEnvWrapper, self).step(action) + # next_state, reward, done, trunc, info = super(GymEnvWrapper, self).step(action) self.total_transitions_episode += 1 if self.config["state_space_type"] == "discrete": @@ -356,22 +360,22 @@ def step(self, action): if "irrelevant_features" in self.config: if self.config["state_space_type"] == "discrete": - next_state, reward, done, info = self.env.step(action[0]) - next_state_irr, _, done_irr, _ = self.irr_toy_env.step(action[1]) + next_state, reward, done, trunc, info = self.env.step(action[0]) + next_state_irr, _, done_irr, trunc_irr, _ = self.irr_toy_env.step(action[1]) next_state = tuple([next_state, next_state_irr]) else: # env_act_shape is the shape of the underlying env's action space and we # sub-select those dimensions from the total action space next and apply # to the underlying env: - next_state, reward, done, info = self.env.step( + next_state, reward, done, trunc, info = self.env.step( action[: self.env_act_shape[0]] ) - next_state_irr, _, done_irr, _ = self.irr_toy_env.step( + next_state_irr, _, done_irr, trunc_irr, _ = self.irr_toy_env.step( action[self.env_act_shape[0] :] ) next_state = np.concatenate((next_state, next_state_irr)) else: - next_state, reward, done, info = self.env.step(action) + next_state, reward, done, trunc, info = self.env.step(action) if self.config["state_space_type"] == "continuous": next_state += noise_in_transition @@ -403,7 +407,7 @@ def step(self, action): reward *= self.reward_scale reward += self.reward_shift - return next_state, reward, done, info + return next_state, reward, done, trunc, info def reset(self): # on episode "end" stuff (to not be invoked when reset() called when @@ -441,15 +445,15 @@ def reset(self): if "irrelevant_features" in self.config: if self.config["state_space_type"] == "discrete": - reset_state = self.env.reset() - reset_state_irr = self.irr_toy_env.reset() + reset_state = self.env.reset()[0] + reset_state_irr = self.irr_toy_env.reset()[0] reset_state = tuple([reset_state, reset_state_irr]) else: - reset_state = self.env.reset() - reset_state_irr = self.irr_toy_env.reset() + reset_state = self.env.reset()[0] + reset_state_irr = self.irr_toy_env.reset()[0] reset_state = np.concatenate((reset_state, reset_state_irr)) else: - reset_state = self.env.reset() + reset_state = self.env.reset()[0] if self.image_transforms: reset_state = self.get_transformed_image(reset_state) @@ -470,7 +474,7 @@ def seed(self, seed=None): int The seed returned by Gym """ - # If seed is None, you get a randomly generated seed from gym.utils... 
+ # If seed is None, you get a randomly generated seed from gymnasium.utils... self.np_random, self.seed_ = gym.utils.seeding.np_random(seed) # random print( "Env SEED set to: " @@ -544,8 +548,8 @@ def get_transformed_image(self, env_img): if "shift" in self.image_transforms: max_shift_w = (tot_width - R) // 2 max_shift_h = (tot_height - R) // 2 - add_shift_w = self.np_random.randint(-max_shift_w + 1, max_shift_w) - add_shift_h = self.np_random.randint(-max_shift_h + 1, max_shift_h) + add_shift_w = self.np_random.integers(-max_shift_w + 1, max_shift_w).item() + add_shift_h = self.np_random.integers(-max_shift_h + 1, max_shift_h).item() # print("add_shift_w, add_shift_h", add_shift_w, add_shift_h) add_shift_w = int(add_shift_w / sh_quant) * sh_quant add_shift_h = int(add_shift_h / sh_quant) * sh_quant @@ -582,8 +586,8 @@ def get_transformed_image(self, env_img): # from mdp_playground.envs.gym_env_wrapper import get_gym_wrapper -# from gym.envs.atari import AtariEnv -# from gym.wrappers import AtariPreprocessing +# from gymnasium.envs.atari import AtariEnv +# from gymnasium.wrappers import AtariPreprocessing # AtariPreprocessing() # AtariEnvWrapper = get_gym_wrapper(AtariEnv) # from ray.tune.registry import register_env @@ -592,7 +596,7 @@ def get_transformed_image(self, env_img): # ob = aew.reset() # from mdp_playground.envs.gym_env_wrapper import GymEnvWrapper -# from gym.envs.atari import AtariEnv +# from gymnasium.envs.atari import AtariEnv # ae = AtariEnv(**{'game': 'beam_rider', 'obs_type': 'image', 'frameskip': 1}) # aew = GymEnvWrapper(ae, **{'reward_noise': lambda a: a.normal(0, 0.1), 'transition_noise': 0.1, 'delay': 1, 'frame_skip': 4, "atari_preprocessing": True, "state_space_type": "discrete", 'seed': 0}) # ob = aew.reset() @@ -601,7 +605,7 @@ def get_transformed_image(self, env_img): # total_reward = 0.0 # for i in range(200): # act = aew.action_space.sample() -# next_state, reward, done, info = aew.step(act) +# next_state, reward, done, trunc, info = aew.step(act) # print(reward, done, act) # if reward > 10: # print("reward in step:", i, reward) diff --git a/mdp_playground/envs/mujoco_env_wrapper.py b/mdp_playground/envs/mujoco_env_wrapper.py index 748db1f..6c9c91c 100644 --- a/mdp_playground/envs/mujoco_env_wrapper.py +++ b/mdp_playground/envs/mujoco_env_wrapper.py @@ -1,7 +1,7 @@ -# from gym.envs.mujoco.mujoco_env import MujocoEnv -from gym.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv -from gym.envs.mujoco.pusher import PusherEnv -from gym.envs.mujoco.reacher import ReacherEnv +# from gymnasium.envs.mujoco.mujoco_env import MujocoEnv +from gymnasium.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv +from gymnasium.envs.mujoco.pusher import PusherEnv +from gymnasium.envs.mujoco.reacher import ReacherEnv import copy @@ -104,20 +104,20 @@ def __init__(self, **config): # Gets passed env_config from run_experiments.py ) def step(self, action): # hack - obs, reward, done, info = super(MujocoEnvWrapper, self).step(action) + obs, reward, done, trunc, info = super(MujocoEnvWrapper, self).step(action) if ( self.base_class in [PusherEnv, ReacherEnv] and "time_unit" in self.config ): reward *= self.time_unit - return obs, reward, done, info + return obs, reward, done, trunc, info return MujocoEnvWrapper # from mdp_playground.envs.mujoco_env_wrapper import get_mujoco_wrapper #hack # -# from gym.envs.mujoco.reacher import ReacherEnv +# from gymnasium.envs.mujoco.reacher import ReacherEnv # ReacherWrapperV2 = get_mujoco_wrapper(ReacherEnv) # config = {"time_unit": 0.2} # rw2 = 
ReacherWrapperV2(**config) diff --git a/mdp_playground/envs/rl_toy_env.py b/mdp_playground/envs/rl_toy_env.py index 7091ec0..9ae01e9 100644 --- a/mdp_playground/envs/rl_toy_env.py +++ b/mdp_playground/envs/rl_toy_env.py @@ -12,7 +12,7 @@ import scipy from scipy import stats from scipy.spatial import distance -import gym +import gymnasium as gym from mdp_playground.spaces import ( BoxExtended, DiscreteExtended, @@ -287,7 +287,7 @@ def __init__(self, **config): self.seed_int = config["seed"] need_to_gen_seeds = True else: - raise TypeError("Unsupported data type for seed: ", type(config["seed"])) + raise TypeError("Unsupported data type for seed, actual config: ", type(config["seed"]), config) # #seed #TODO move to seed() so that obs., act. space, etc. have their # seeds reset too when env seed is reset? @@ -300,25 +300,25 @@ def __init__(self, **config): # separation of the relevant and irrelevant dimensions!! _And_ the seed # remaining the same for the underlying discrete environment makes it # easier to write tests! - self.seed_dict["relevant_state_space"] = self.np_random.randint( + self.seed_dict["relevant_state_space"] = self.np_random.integers( sys.maxsize - ) # #random - self.seed_dict["relevant_action_space"] = self.np_random.randint( + ).item() # #random + self.seed_dict["relevant_action_space"] = self.np_random.integers( sys.maxsize - ) # #random - self.seed_dict["irrelevant_state_space"] = self.np_random.randint( + ).item() # #random + self.seed_dict["irrelevant_state_space"] = self.np_random.integers( sys.maxsize - ) # #random - self.seed_dict["irrelevant_action_space"] = self.np_random.randint( + ).item() # #random + self.seed_dict["irrelevant_action_space"] = self.np_random.integers( sys.maxsize - ) # #random + ).item() # #random # #IMP This is currently used to sample only for continuous spaces and not used for discrete spaces by the Environment. User might want to sample from it for multi-discrete environments. #random - self.seed_dict["state_space"] = self.np_random.randint(sys.maxsize) + self.seed_dict["state_space"] = self.np_random.integers(sys.maxsize).item() # #IMP This IS currently used to sample random actions by the RL agent for both discrete and continuous environments (but not used anywhere by the Environment). 
#random - self.seed_dict["action_space"] = self.np_random.randint(sys.maxsize) - self.seed_dict["image_representations"] = self.np_random.randint( + self.seed_dict["action_space"] = self.np_random.integers(sys.maxsize).item() + self.seed_dict["image_representations"] = self.np_random.integers( sys.maxsize - ) # #random + ).item() # #random # print("Mersenne0, dummy_eval:", self.np_random.get_state()[2], "dummy_eval" in config) else: # if seed dict was passed self.seed(self.seed_dict["env"]) @@ -1928,8 +1928,8 @@ def step(self, action, imaginary_rollout=False): Returns ------- - int or np.array, double, boolean, dict - The next state, reward, whether the episode terminated and additional info dict at the end of the current transition + int or np.array, double, boolean, boolean, dict + The next state, reward, whether the episode terminated, whether it was truncated and additional info dict at the end of the current transition """ # For imaginary transitions, discussion: @@ -2043,7 +2043,11 @@ def step(self, action, imaginary_rollout=False): + str(self.reward) ) - return self.curr_obs, self.reward, self.done, self.get_augmented_state() + # The following returns False for the truncated variable as early termination of episodes is handled + # using max_episode_steps in the environment wrapper gymnasium.wrappers.TimeLimit when using + # the env RLToyFinitieHorizon. In the experiments from the paper, early termination was handled by + # Ray Rllib's horizon parameter. + return self.curr_obs, self.reward, self.done, False, self.get_augmented_state() def get_augmented_state(self): """Intended to return the full augmented state which would be Markovian. (However, it's not Markovian wrt the noise in P and R because we're not returning the underlying RNG.) Currently, returns the augmented state which is the sequence of length "delay + sequence_length + 1" of past states for both discrete and continuous environments. Additonally, the current state derivatives are also returned for continuous environments. @@ -2229,7 +2233,7 @@ def reset(self): + str(self.sequence_length) ) - return self.curr_obs + return self.curr_obs, {} def seed(self, seed=None): """Initialises the Numpy RNG for the environment by calling a utility for this in Gym. @@ -2246,7 +2250,11 @@ def seed(self, seed=None): int The seed returned by Gym """ - # If seed is None, you get a randomly generated seed from gym.utils... + # If seed is None, you get a randomly generated seed from gymnasium.utils... 
+ # As of 2024.06.18: + # seed_seq = np.random.SeedSequence(seed) + # np_seed = seed_seq.entropy + # rng = RandomNumberGenerator(np.random.PCG64(seed_seq)) self.np_random, self.seed_ = gym.utils.seeding.np_random(seed) # #random print( "Env SEED set to: " diff --git a/mdp_playground/spaces/box_extended.py b/mdp_playground/spaces/box_extended.py index 140a0f9..11661f8 100644 --- a/mdp_playground/spaces/box_extended.py +++ b/mdp_playground/spaces/box_extended.py @@ -1,6 +1,6 @@ import numpy as np -import gym -from gym.spaces import Box +import gymnasium as gym +from gymnasium.spaces import Box class BoxExtended(Box): diff --git a/mdp_playground/spaces/discrete_extended.py b/mdp_playground/spaces/discrete_extended.py index 4d6c5ad..f827493 100644 --- a/mdp_playground/spaces/discrete_extended.py +++ b/mdp_playground/spaces/discrete_extended.py @@ -1,6 +1,6 @@ import numpy as np -import gym -from gym.spaces import Discrete +import gymnasium as gym +from gymnasium.spaces import Discrete class DiscreteExtended(Discrete): diff --git a/mdp_playground/spaces/grid_action_space.py b/mdp_playground/spaces/grid_action_space.py index e5b6697..92661ef 100644 --- a/mdp_playground/spaces/grid_action_space.py +++ b/mdp_playground/spaces/grid_action_space.py @@ -1,5 +1,5 @@ import numpy as np -import gym +import gymnasium as gym from mdp_playground.spaces import BoxExtended @@ -15,8 +15,8 @@ def sample(self): # Select which dimension will have action (only 1 dimension can have # motion in traditional grid worlds). This also is more consistent with # Manhattan dist reward defined for grid worlds in rl_toy_env.py - ind = self.np_random.randint(self.high.size) - val = self.np_random.randint(3) + ind = self.np_random.integers(self.high.size).item() + val = self.np_random.integers(3).item() samp[ind] = val - 1 # Shift into grid action range of [-1, 0, 1] return samp.astype(int) diff --git a/mdp_playground/spaces/image_continuous.py b/mdp_playground/spaces/image_continuous.py index 1f4577d..946ca4f 100644 --- a/mdp_playground/spaces/image_continuous.py +++ b/mdp_playground/spaces/image_continuous.py @@ -1,7 +1,7 @@ import warnings import numpy as np -import gym -from gym.spaces import Box, Space +import gymnasium as gym +from gymnasium.spaces import Box, Space import PIL.ImageDraw as ImageDraw import PIL.Image as Image from PIL.Image import FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM diff --git a/mdp_playground/spaces/image_multi_discrete.py b/mdp_playground/spaces/image_multi_discrete.py index 8a363b7..61af6c5 100644 --- a/mdp_playground/spaces/image_multi_discrete.py +++ b/mdp_playground/spaces/image_multi_discrete.py @@ -1,7 +1,7 @@ import warnings import numpy as np -import gym -from gym.spaces import Box, Discrete, MultiDiscrete, Space +import gymnasium as gym +from gymnasium.spaces import Box, Discrete, MultiDiscrete, Space import PIL.ImageDraw as ImageDraw import PIL.Image as Image from PIL.Image import FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM @@ -160,8 +160,8 @@ def generate_image(self, discrete_state): # , state_space_size, polygon_sides if "shift" in self.transforms: max_shift_w = self.width / 2 - R max_shift_h = self.height / 2 - R - add_shift_w = self.np_random.randint(-max_shift_w + 1, max_shift_w) - add_shift_h = self.np_random.randint(-max_shift_h + 1, max_shift_h) + add_shift_w = self.np_random.integers(-max_shift_w + 1, max_shift_w).item() + add_shift_h = self.np_random.integers(-max_shift_h + 1, max_shift_h).item() add_shift_w = (add_shift_w // sh_quant) * sh_quant add_shift_h = (add_shift_h // sh_quant) * sh_quant 
# print("shift_w, shift_h", add_shift_w, add_shift_h) @@ -236,15 +236,15 @@ def generate_image(self, discrete_state): # , state_space_size, polygon_sides "rotate" in self.transforms ): # TODO rotation can lead to image going out of bounds. # rotation_ = (360 / polygon_sides) * (discrete_state / state_space_size) # Need to divide by polygon_sides because - rotation = self.np_random.randint(360) + rotation = self.np_random.integers(360).item() rotation = (rotation // ro_quant) * ro_quant # print("rotation", rotation) image_ = image_.rotate(rotation) # image_.rotate( if "flip" in self.transforms: - if self.np_random.randint(2) == 0: # Only flip half the times - if self.np_random.randint(2) == 0: + if self.np_random.integers(2).item() == 0: # Only flip half the times + if self.np_random.integers(2).item() == 0: image_ = image_.transpose(FLIP_LEFT_RIGHT) else: image_ = image_.transpose(FLIP_TOP_BOTTOM) diff --git a/mdp_playground/spaces/multi_discrete_extended.py b/mdp_playground/spaces/multi_discrete_extended.py index 5e669d9..0544b62 100644 --- a/mdp_playground/spaces/multi_discrete_extended.py +++ b/mdp_playground/spaces/multi_discrete_extended.py @@ -1,6 +1,6 @@ import numpy as np -import gym -from gym.spaces import MultiDiscrete +import gymnasium as gym +from gymnasium.spaces import MultiDiscrete class MultiDiscreteExtended(MultiDiscrete): diff --git a/mdp_playground/spaces/test_image_continuous.py b/mdp_playground/spaces/test_image_continuous.py index 1d5e5cb..49b123f 100644 --- a/mdp_playground/spaces/test_image_continuous.py +++ b/mdp_playground/spaces/test_image_continuous.py @@ -1,7 +1,7 @@ import unittest import numpy as np from mdp_playground.spaces.image_continuous import ImageContinuous -from gym.spaces import Box +from gymnasium.spaces import Box # import PIL.ImageDraw as ImageDraw import PIL.Image as Image diff --git a/mdp_playground/spaces/test_image_multi_discrete.py b/mdp_playground/spaces/test_image_multi_discrete.py index 5116cd7..4db4f76 100644 --- a/mdp_playground/spaces/test_image_multi_discrete.py +++ b/mdp_playground/spaces/test_image_multi_discrete.py @@ -1,10 +1,10 @@ import unittest import numpy as np from mdp_playground.spaces.image_multi_discrete import ImageMultiDiscrete -from gym.spaces import Discrete, MultiDiscrete +from gymnasium.spaces import Discrete, MultiDiscrete -# import gym -# from gym.spaces import MultiDiscrete +# import gymnasium as gym +# from gymnasium.spaces import MultiDiscrete # # from .space import Space # import PIL.ImageDraw as ImageDraw # import PIL.Image as Image diff --git a/mdp_playground/spaces/tuple_extended.py b/mdp_playground/spaces/tuple_extended.py index 9dc9dce..c8aaf65 100644 --- a/mdp_playground/spaces/tuple_extended.py +++ b/mdp_playground/spaces/tuple_extended.py @@ -1,6 +1,6 @@ import numpy as np -import gym -from gym.spaces import Tuple +import gymnasium as gym +from gymnasium.spaces import Tuple class TupleExtended(Tuple): diff --git a/setup.py b/setup.py index 69442e8..d9fbe2d 100644 --- a/setup.py +++ b/setup.py @@ -82,7 +82,7 @@ setup( name="mdp-playground", - version="0.0.2", + version="1.0.0", author=AUTHORS, author_email=AUTHOR_EMAIL, description="A python package to design and debug RL agents", @@ -109,7 +109,7 @@ py_modules=[], python_requires=">=3.6", setup_requires=["numpy"], - install_requires=["dill", "numpy"], + install_requires=["dill", "numpy", "scipy", "pillow", "gymnasium"], extras_require={ "extras": extras_require, "extras_disc": extras_require_disc, diff --git a/tests/test_gym_env_wrapper.py 
b/tests/test_gym_env_wrapper.py index 9912748..1159f11 100644 --- a/tests/test_gym_env_wrapper.py +++ b/tests/test_gym_env_wrapper.py @@ -4,6 +4,7 @@ from mdp_playground.envs.gym_env_wrapper import GymEnvWrapper import unittest import pytest +import gymnasium as gym import sys @@ -27,11 +28,6 @@ def test_r_delay(self): """ """ print("\033[32;1;4mTEST_REWARD_DELAY\033[0m") config = { - "AtariEnv": { - "game": "beam_rider", # "breakout", - "obs_type": "image", - "frameskip": 1, - }, "delay": 1, # "GymEnvWrapper": { "atari_preprocessing": True, @@ -46,9 +42,7 @@ def test_r_delay(self): # config["log_filename"] = log_filename - from gym.envs.atari import AtariEnv - - ae = AtariEnv(**{"game": "beam_rider", "obs_type": "image", "frameskip": 1}) + ae = gym.make("BeamRiderNoFrameskip-v4") aew = GymEnvWrapper(ae, **config) ob = aew.reset() print("observation_space.shape:", ob.shape) @@ -56,7 +50,7 @@ def test_r_delay(self): total_reward = 0.0 for i in range(200): act = aew.action_space.sample() - next_state, reward, done, info = aew.step(act) + next_state, reward, done, trunc, info = aew.step(act) print("step, reward, done, act:", i, reward, done, act) if i == 154 or i == 159: assert reward == 44.0, ( @@ -72,11 +66,6 @@ def test_r_shift(self): """ """ print("\033[32;1;4mTEST_REWARD_SHIFT\033[0m") config = { - "AtariEnv": { - "game": "beam_rider", # "breakout", - "obs_type": "image", - "frameskip": 1, - }, "reward_shift": 1, # "GymEnvWrapper": { "atari_preprocessing": True, @@ -91,9 +80,8 @@ def test_r_shift(self): # config["log_filename"] = log_filename - from gym.envs.atari import AtariEnv - ae = AtariEnv(**{"game": "beam_rider", "obs_type": "image", "frameskip": 1}) + ae = gym.make("BeamRiderNoFrameskip-v4") aew = GymEnvWrapper(ae, **config) ob = aew.reset() print("observation_space.shape:", ob.shape) @@ -101,7 +89,7 @@ def test_r_shift(self): total_reward = 0.0 for i in range(200): act = aew.action_space.sample() - next_state, reward, done, info = aew.step(act) + next_state, reward, done, trunc, info = aew.step(act) print("step, reward, done, act:", i, reward, done, act) if i == 153 or i == 158: assert reward == 45.0, ( @@ -119,11 +107,6 @@ def test_r_scale(self): """ """ print("\033[32;1;4mTEST_REWARD_SCALE\033[0m") config = { - "AtariEnv": { - "game": "beam_rider", # "breakout", - "obs_type": "image", - "frameskip": 1, - }, "reward_scale": 2, # "GymEnvWrapper": { "atari_preprocessing": True, @@ -138,9 +121,7 @@ def test_r_scale(self): # config["log_filename"] = log_filename - from gym.envs.atari import AtariEnv - - ae = AtariEnv(**{"game": "beam_rider", "obs_type": "image", "frameskip": 1}) + ae = gym.make("BeamRiderNoFrameskip-v4") aew = GymEnvWrapper(ae, **config) ob = aew.reset() print("observation_space.shape:", ob.shape) @@ -148,7 +129,7 @@ def test_r_scale(self): total_reward = 0.0 for i in range(200): act = aew.action_space.sample() - next_state, reward, done, info = aew.step(act) + next_state, reward, done, trunc, info = aew.step(act) print("step, reward, done, act:", i, reward, done, act) if i == 153 or i == 158: assert reward == 88.0, ( @@ -167,11 +148,6 @@ def test_r_scale(self): # """ """ # print("\033[32;1;4mTEST_TERM_STATE_REWARD\033[0m") # config = { - # "AtariEnv": { - # "game": "beam_rider", # "breakout", - # "obs_type": "image", - # "frameskip": 1, - # }, # "term_state_reward": 200, # # "GymEnvWrapper": { # "atari_preprocessing": True, @@ -186,9 +162,7 @@ def test_r_scale(self): # # config["log_filename"] = log_filename - # from gym.envs.atari import AtariEnv - - # ae = 
AtariEnv(**{"game": "beam_rider", "obs_type": "image", "frameskip": 1}) + # ae = gym.make("BeamRiderNoFrameskip-v4") # aew = GymEnvWrapper(ae, **config) # ob = aew.reset() # print("observation_space.shape:", ob.shape) @@ -196,7 +170,7 @@ def test_r_scale(self): # total_reward = 0.0 # for i in range(200): # act = aew.action_space.sample() - # next_state, reward, done, info = aew.step(act) + # next_state, reward, done, trunc, info = aew.step(act) # print("step, reward, done, act:", i, reward, done, act) # if i == 153 or i == 158: # assert reward == 88.0, ( @@ -210,57 +184,50 @@ def test_r_scale(self): # print("total_reward:", total_reward) # aew.reset() - def test_r_delay_ray_frame_stack(self): - """ - Uses wrap_deepmind_ray to frame stack Atari - """ - print("\033[32;1;4mTEST_REWARD_DELAY_RAY_FRAME_STACK\033[0m") - config = { - "AtariEnv": { - "game": "beam_rider", # "breakout", - "obs_type": "image", - "frameskip": 1, - }, - "delay": 1, - # "GymEnvWrapper": { - "wrap_deepmind_ray": True, - "frame_skip": 1, - "atari_preprocessing": True, - "frame_skip": 4, - "grayscale_obs": False, - "state_space_type": "discrete", - "action_space_type": "discrete", - "seed": 0, - # }, - # 'seed': 0, #seed - } - - # config["log_filename"] = log_filename + # Disabled tests for Ray Rllib for now, too much maintenance overhead. + # def test_r_delay_ray_frame_stack(self): + # """ + # Uses wrap_deepmind_ray to frame stack Atari + # """ + # print("\033[32;1;4mTEST_REWARD_DELAY_RAY_FRAME_STACK\033[0m") + # config = { + # "delay": 1, + # # "GymEnvWrapper": { + # "wrap_deepmind_ray": True, + # "frame_skip": 1, + # "atari_preprocessing": True, + # "frame_skip": 4, + # "grayscale_obs": False, + # "state_space_type": "discrete", + # "action_space_type": "discrete", + # "seed": 0, + # # }, + # # 'seed': 0, #seed + # } - from gym.envs.atari import AtariEnv - import gym + # # config["log_filename"] = log_filename - game = "beam_rider" - game = "".join([g.capitalize() for g in game.split("_")]) - ae = gym.make("{}NoFrameskip-v4".format(game)) - aew = GymEnvWrapper(ae, **config) - ob = aew.reset() - print("observation_space.shape:", ob.shape) - # print(ob) - total_reward = 0.0 - for i in range(200): - act = aew.action_space.sample() - next_state, reward, done, info = aew.step(act) - print("step, reward, done, act:", i, reward, done, act) - if i == 142 or i == 159: - assert reward == 44.0, ( - "1-step delayed reward in step: " - + str(i) - + " should have been 44.0." - ) - total_reward += reward - print("total_reward:", total_reward) - aew.reset() + # game = "beam_rider" + # game = "".join([g.capitalize() for g in game.split("_")]) + # ae = gym.make("{}NoFrameskip-v4".format(game)) + # aew = GymEnvWrapper(ae, **config) + # ob = aew.reset() + # print("observation_space.shape:", ob.shape) + # # print(ob) + # total_reward = 0.0 + # for i in range(200): + # act = aew.action_space.sample() + # next_state, reward, done, trunc, info = aew.step(act) + # print("step, reward, done, act:", i, reward, done, act) + # if i == 142 or i == 159: + # assert reward == 44.0, ( + # "1-step delayed reward in step: " + # + str(i) + # + " should have been 44.0." 
+ # ) + # total_reward += reward + # print("total_reward:", total_reward) + # aew.reset() def test_r_delay_p_noise_r_noise(self): """ @@ -268,11 +235,6 @@ def test_r_delay_p_noise_r_noise(self): """ print("\033[32;1;4mTEST_MULTIPLE\033[0m") config = { - "AtariEnv": { - "game": "beam_rider", # "breakout", - "obs_type": "image", - "frameskip": 1, - }, "delay": 1, "reward_noise": lambda a: a.normal(0, 0.1), "transition_noise": 0.1, @@ -289,9 +251,7 @@ def test_r_delay_p_noise_r_noise(self): # config["log_filename"] = log_filename - from gym.envs.atari import AtariEnv - - ae = AtariEnv(**{"game": "beam_rider", "obs_type": "image", "frameskip": 1}) + ae = gym.make("BeamRiderNoFrameskip-v4") aew = GymEnvWrapper(ae, **config) ob = aew.reset() print("observation_space.shape:", ob.shape) @@ -299,7 +259,7 @@ def test_r_delay_p_noise_r_noise(self): total_reward = 0.0 for i in range(200): act = aew.action_space.sample() - next_state, reward, done, info = aew.step(act) + next_state, reward, done, trunc, info = aew.step(act) print("step, reward, done, act:", i, reward, done, act) # Testing hardcoded values at these timesteps implicitly tests that there # were 21 noisy transitions in total and noise inserted in rewards. @@ -329,11 +289,6 @@ def test_discrete_irr_features(self): """ """ print("\033[32;1;4mTEST_DISC_IRR_FEATURES\033[0m") config = { - "AtariEnv": { - "game": "beam_rider", # "breakout", - "obs_type": "image", - "frameskip": 1, - }, "delay": 1, # "GymEnvWrapper": { "atari_preprocessing": True, @@ -359,9 +314,7 @@ def test_discrete_irr_features(self): # config["log_filename"] = log_filename - from gym.envs.atari import AtariEnv - - ae = AtariEnv(**{"game": "beam_rider", "obs_type": "image", "frameskip": 1}) + ae = gym.make("BeamRiderNoFrameskip-v4") aew = GymEnvWrapper(ae, **config) ob = aew.reset() print("type(observation_space):", type(ob)) @@ -369,7 +322,7 @@ def test_discrete_irr_features(self): total_reward = 0.0 for i in range(200): act = aew.action_space.sample() - next_state, reward, done, info = aew.step(act) + next_state, reward, done, trunc, info = aew.step(act) print( "step, reward, done, act, next_state[1]:", i, @@ -392,11 +345,6 @@ def test_image_transforms(self): """ """ print("\033[32;1;4mTEST_IMAGE_TRANSFORMS\033[0m") config = { - "AtariEnv": { - "game": "beam_rider", # "breakout", - "obs_type": "image", - "frameskip": 1, - }, "image_transforms": "shift,scale,rotate", # "image_sh_quant": 2, "image_width": 40, @@ -414,9 +362,7 @@ def test_image_transforms(self): # config["log_filename"] = log_filename - from gym.envs.atari import AtariEnv - - ae = AtariEnv(**{"game": "beam_rider", "obs_type": "image", "frameskip": 1}) + ae = gym.make("BeamRiderNoFrameskip-v4") aew = GymEnvWrapper(ae, **config) ob = aew.reset() print("observation_space.shape:", ob.shape) @@ -425,7 +371,7 @@ def test_image_transforms(self): total_reward = 0.0 for i in range(200): act = aew.action_space.sample() - next_state, reward, done, info = aew.step(act) + next_state, reward, done, trunc, info = aew.step(act) print("step, reward, done, act:", i, reward, done, act) if i == 153 or i == 158: assert reward == 44.0, ( @@ -440,11 +386,6 @@ def test_cont_irr_features(self): """ """ print("\033[32;1;4mTEST_CONT_IRR_FEATURES\033[0m") config = { - # "AtariEnv": { - # "game": 'beam_rider', #"breakout", - # 'obs_type': 'image', - # 'frameskip': 1, - # }, # 'delay': 1, # "GymEnvWrapper": { "state_space_type": "continuous", @@ -471,7 +412,7 @@ def test_cont_irr_features(self): # config["log_filename"] = log_filename from 
mdp_playground.envs.mujoco_env_wrapper import get_mujoco_wrapper # hack - from gym.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv + from gymnasium.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv HalfCheetahWrapperV3 = get_mujoco_wrapper(HalfCheetahEnv) base_env_config = {} @@ -512,7 +453,7 @@ def test_cont_irr_features(self): for i in range(200): act = hc3w.action_space.sample() - next_state, reward, done, info = hc3w.step(act) + next_state, reward, done, trunc, info = hc3w.step(act) print( "step, reward, done, act, next_state:", i, reward, done, act, next_state ) diff --git a/tests/test_mdp_playground.py b/tests/test_mdp_playground.py index 308c246..c687667 100644 --- a/tests/test_mdp_playground.py +++ b/tests/test_mdp_playground.py @@ -59,14 +59,14 @@ def test_continuous_dynamics_move_along_a_line(self): # Test 1: general dynamics and reward env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] self.assertEqual( type(state), np.ndarray, "Type of continuous state should be numpy.ndarray." ) for i in range(20): # action = env.action_space.sample() action = np.array([1, 1, 1, 1]) # just to test if acting "in a line" works - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) np.testing.assert_allclose( 0.0, reward, atol=1e-5, err_msg="Step: " + str(i) @@ -77,7 +77,7 @@ def test_continuous_dynamics_move_along_a_line(self): ) # test_ = np.allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False) # self.assertAlmostEqual(state, np.array([21.59339006, 20.68189965, 21.49608203, 20.19183292]), places=3) # Error - env.reset() + env.reset()[0] env.close() # Test 2: sequence lengths # TODO done in next test. @@ -86,14 +86,14 @@ def test_continuous_dynamics_move_along_a_line(self): # of optimal actions leads to good reward. Also implicitly tests sequence # lengths. 
env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] prev_reward = None for i in range(40): if i < 20: action = env.action_space.sample() else: action = np.array([1, 1, 1, 1]) - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) if i >= 29: np.testing.assert_allclose( @@ -119,21 +119,21 @@ def test_continuous_dynamics_move_along_a_line(self): ) state = next_state.copy() prev_reward = reward - env.reset() + env.reset()[0] env.close() # Test 4: same as 3 above except with delay print("\033[32;1;4mTEST_CONTINUOUS_DYNAMICS_DELAY\033[0m") config["delay"] = 1 env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] prev_reward = None for i in range(40): if i < 20: action = env.action_space.sample() else: action = np.array([1, 1, 1, 1]) - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) if i >= 30: np.testing.assert_allclose( @@ -157,7 +157,7 @@ def test_continuous_dynamics_move_along_a_line(self): ) state = next_state.copy() prev_reward = reward - env.reset() + env.reset()[0] env.close() # Test 5: R noise - same as 1 above except with reward noise @@ -165,12 +165,12 @@ def test_continuous_dynamics_move_along_a_line(self): config["reward_noise"] = lambda a: a.normal(0, 0.5) config["delay"] = 0 env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] expected_rewards = [-0.70707351, 0.44681, 0.150735, -0.346204, 0.80687] for i in range(5): # action = env.action_space.sample() action = np.array([1, 1, 1, 1]) # just to test if acting "in a line" works - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) np.testing.assert_allclose( expected_rewards[i], reward, atol=1e-6, err_msg="Step: " + str(i) @@ -179,7 +179,7 @@ def test_continuous_dynamics_move_along_a_line(self): np.testing.assert_allclose( state, np.array([6.59339006, 5.68189965, 6.49608203, 5.19183292]), atol=1e-5 ) - env.reset() + env.reset()[0] env.close() # Test 6: for dynamics and reward in presence of irrelevant dimensions @@ -189,13 +189,13 @@ def test_continuous_dynamics_move_along_a_line(self): config["relevant_indices"] = [0, 1, 2, 6] config["action_space_relevant_indices"] = [0, 1, 2, 6] env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] for i in range(20): action = env.action_space.sample() action[ config["action_space_relevant_indices"] ] = 1.0 # test to see if acting "in a line" works for relevant dimensions - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) np.testing.assert_allclose( 0.0, reward, atol=1e-5, err_msg="Step: " + str(i) @@ -205,19 +205,19 @@ def test_continuous_dynamics_move_along_a_line(self): state[config["relevant_indices"]], np.array([21.59339006, 20.68189965, 21.49608203, 
19.835966]), ) - env.reset() + env.reset()[0] env.close() # Test that random actions in relevant action space along with linear # actions in irrelevant action space leads to bad reward for # move_along_a_line reward function env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] for i in range(20): action = env.action_space.sample() # test to see if acting "in a line" for irrelevant dimensions and not for relevant dimensions produces bad reward action[[3, 4, 5]] = 1.0 - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) if i > 10: assert reward < -0.8, ( @@ -227,24 +227,24 @@ def test_continuous_dynamics_move_along_a_line(self): + str(reward) ) state = next_state.copy() - env.reset() + env.reset()[0] env.close() # Test using config values: state_space_max and action_space_max config["state_space_max"] = 5 # Will be a Box in the range [-max, max] config["action_space_max"] = 1 # Will be a Box in the range [-max, max] env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] for _ in range(20): # action = env.action_space.sample() action = np.array([-1] * 7) # just to test if acting "in a line" works - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) state = next_state.copy() np.testing.assert_allclose(state, np.array([-5] * 7)) # test_ = np.allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False) # self.assertAlmostEqual(state, np.array([21.59339006, 20.68189965, 21.49608203, 20.19183292]), places=3) # Error - env.reset() + env.reset()[0] env.close() # Test for terminal states in presence of irrelevant dimensions @@ -259,14 +259,14 @@ def test_continuous_dynamics_move_along_a_line(self): ] config["term_state_edge"] = 1.0 env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] state_derivatives = copy.deepcopy(env.state_derivatives) # augmented_state = copy.deepcopy(env.augmented_state) for _ in range(20): # action = env.action_space.sample() action = np.array([1] * 7) # just to test if acting "in a line" works - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) if _ == 1: assert done, "Terminal state should have been reached at step " + str(_) @@ -281,13 +281,13 @@ def test_continuous_dynamics_move_along_a_line(self): ) # 5 because of state_space_max # test_ = np.allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False) # self.assertAlmostEqual(state, np.array([21.59339006, 20.68189965, 21.49608203, 20.19183292]), places=3) # Error - env.reset() + env.reset()[0] env.close() # Test P noise config["transition_noise"] = lambda a: a.normal([0] * 7, [0.5] * 7) env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] state_derivatives = copy.deepcopy(env.state_derivatives) # augmented_state = copy.deepcopy(env.augmented_state) @@ -366,7 +366,7 @@ def 
test_continuous_dynamics_move_along_a_line(self): for i in range(3): # action = env.action_space.sample() action = np.array([1] * 7) # just to test if acting "in a line" works - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) np.testing.assert_allclose( state_derivatives[0], env.augmented_state[-2] @@ -382,7 +382,7 @@ def test_continuous_dynamics_move_along_a_line(self): # augmented_state = copy.deepcopy(env.augmented_state) # test_ = np.allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False) # self.assertAlmostEqual(state, np.array([21.59339006, 20.68189965, 21.49608203, 20.19183292]), places=3) # Error - env.reset() + env.reset()[0] env.close() # TODO Write test for continuous for checking reward with/without irrelevant dimensions, delay, r noise, seq_len? @@ -418,7 +418,7 @@ def test_continuous_dynamics_order(self): state_derivatives = copy.deepcopy(env.state_derivatives) action = np.array([2.0, 1.0]) - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) np.testing.assert_allclose( next_state - state, (1 / 6) * np.array([1, 0.5]) * 1e-6, atol=1e-7 @@ -437,7 +437,7 @@ def test_continuous_dynamics_order(self): state_derivatives = copy.deepcopy(env.state_derivatives) action = np.array([2.0, 1.0]) - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) np.testing.assert_allclose( next_state - state, (7 / 6) * np.array([1, 0.5]) * 1e-6, atol=1e-7 @@ -456,7 +456,7 @@ def test_continuous_dynamics_order(self): # TODO Test for more timesteps? (>seq_len so that reward function kicks in) or higher order derivatives (.DONE) - env.reset() + env.reset()[0] env.close() def test_continuous_dynamics_target_point_dense(self): @@ -489,11 +489,11 @@ def test_continuous_dynamics_target_point_dense(self): # Test : dense reward env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] for i in range(20): # action = env.action_space.sample() action = np.array([0.5] * 2) # just to test if acting "in a line" works - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) # At each step, the distance reduces by ~0.035355 to the final point of # this trajectory which is also the target point by design for this test. 
@@ -503,7 +503,7 @@ def test_continuous_dynamics_target_point_dense(self): ) state = next_state.copy() np.testing.assert_allclose(state, np.array([-0.29792, 1.71012]), atol=1e-6) - env.reset() + env.reset()[0] env.close() # Test irrelevant dimensions @@ -513,11 +513,11 @@ def test_continuous_dynamics_target_point_dense(self): config["action_space_relevant_indices"] = [1, 2] config["target_point"] = [1.71012, 0.941906] env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] for i in range(20): # action = env.action_space.sample() action = np.array([0.5] * 5) # just to test if acting "in a line" works - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) # At each step, the distance reduces by ~0.035355 to the final point of # this trajectory which is also the target point by design for this test. @@ -532,24 +532,24 @@ def test_continuous_dynamics_target_point_dense(self): atol=1e-6, ) # check 1 extra step away from target point gives -ve reward - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) # At each step, the distance reduces by ~0.035355 to the final point of # this trajectory which is also the target point by design for this np.testing.assert_allclose( -0.035355, reward, atol=1e-5, err_msg="Step: " + str(i) ) - env.reset() + env.reset()[0] env.close() # Test delay config["delay"] = 10 env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] for i in range(20): # action = env.action_space.sample() action = np.array([0.5] * 5) # just to test if acting "in a line" works - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) if i < 10: np.testing.assert_allclose( @@ -568,7 +568,7 @@ def test_continuous_dynamics_target_point_dense(self): np.array([-0.29792, 1.71012, 0.941906, -0.034626, 0.493934]), atol=1e-6, ) - env.reset() + env.reset()[0] env.close() def test_continuous_dynamics_target_point_sparse(self): @@ -603,11 +603,11 @@ def test_continuous_dynamics_target_point_sparse(self): # Test : sparse reward env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] for i in range(20): # action = env.action_space.sample() action = np.array([0.5] * 2) # just to test if acting "in a line" works - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) if i < 17: np.testing.assert_allclose( @@ -621,17 +621,17 @@ def test_continuous_dynamics_target_point_sparse(self): np.testing.assert_allclose(state, np.array([-0.29792, 1.71012]), atol=1e-6) # test_ = np.allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False) # self.assertAlmostEqual(state, np.array([21.59339006, 20.68189965, 21.49608203, 20.19183292]), places=3) # Error - env.reset() + env.reset()[0] env.close() # Test delay config["delay"] = 10 env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # 
env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] for i in range(35): # action = env.action_space.sample() action = np.array([0.5] * 2) # just to test if acting "in a line" works - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) if i < 27 or i > 31: np.testing.assert_allclose( @@ -645,7 +645,7 @@ def test_continuous_dynamics_target_point_sparse(self): np.testing.assert_allclose(state, np.array([0.07708, 2.08512]), atol=1e-6) # test_ = np.allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False) # self.assertAlmostEqual(state, np.array([21.59339006, 20.68189965, 21.49608203, 20.19183292]), places=3) # Error - env.reset() + env.reset()[0] env.close() # Test irrelevant dimensions @@ -655,11 +655,11 @@ def test_continuous_dynamics_target_point_sparse(self): config["action_space_relevant_indices"] = [1, 2] config["target_point"] = [1.71012, 0.941906] env = RLToyEnv(**config) - state = env.get_augmented_state()["curr_state"].copy() # env.reset() + state = env.get_augmented_state()["curr_state"].copy() # env.reset()[0] for i in range(35): # action = env.action_space.sample() action = np.array([0.5] * 5) # just to test if acting "in a line" works - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) if i < 27 or i > 31: # At each step, the distance reduces by ~0.035355 to the final point of @@ -676,7 +676,7 @@ def test_continuous_dynamics_target_point_sparse(self): np.testing.assert_allclose( state, np.array([0.07708, 2.08512, 1.316906, 0.340374, 0.868934]), atol=1e-6 ) - env.reset() + env.reset()[0] env.close() def test_continuous_image_representations(self): @@ -724,7 +724,7 @@ def test_continuous_image_representations(self): for i in range(5): # action = env.action_space.sample() action = np.array([-0.45, 0.8]) # just to test if acting "in a line" works - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) next_state = env.get_augmented_state()["augmented_state"][-1] print("sars', done =", state, action, reward, next_state, done) state = next_state.copy() @@ -747,7 +747,7 @@ def test_continuous_image_representations(self): # test_ = np.allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False) # self.assertAlmostEqual(state, np.array([21.59339006, 20.68189965, 21.49608203, 20.19183292]), places=3) # Error - env.reset() + env.reset()[0] env.close() def test_grid_image_representations(self): @@ -793,7 +793,7 @@ def test_grid_image_representations(self): for i in range(len(actions)): # action = env.action_space.sample() action = actions[i] - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) next_state = env.get_augmented_state()["augmented_state"][-1] print("sars', done =", state, action, reward, next_state, done) state = next_state.copy() @@ -816,7 +816,7 @@ def test_grid_image_representations(self): for i in range(4): # action = env.action_space.sample() action = [0, 1] - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) next_state = env.get_augmented_state()["augmented_state"][-1] print("sars', done =", state, action, reward, next_state, done) state = next_state.copy() @@ -826,7 +826,7 @@ def test_grid_image_representations(self): assert state == [5, 7], 
str(state) # test_ = np.allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False) # self.assertAlmostEqual(state, np.array([21.59339006, 20.68189965, 21.49608203, 20.19183292]), places=3) # Error - env.reset() + env.reset()[0] env.close() # Test 2: Almost the same as above, but with make_denser @@ -848,7 +848,7 @@ def test_grid_image_representations(self): tot_rew = 0 for i in range(len(actions)): action = actions[i] - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) next_state = env.get_augmented_state()["augmented_state"][-1] print("sars', done =", state, action, reward, next_state, done) state = next_state.copy() @@ -856,7 +856,7 @@ def test_grid_image_representations(self): assert tot_rew == 6.0, str(tot_rew) - env.reset() + env.reset()[0] env.close() # Test 3: Almost the same as 2, but with terminal states @@ -885,7 +885,7 @@ def test_grid_image_representations(self): tot_rew = 0 for i in range(len(actions)): action = actions[i] - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) next_state = env.get_augmented_state()["augmented_state"][-1] print("sars', done =", state, action, reward, next_state, done) state = next_state.copy() @@ -893,7 +893,7 @@ def test_grid_image_representations(self): assert tot_rew == 5.5, str(tot_rew) - env.reset() + env.reset()[0] env.close() # Test 4: Almost the same as 3, but with irrelevant features @@ -922,7 +922,7 @@ def test_grid_image_representations(self): tot_rew = 0 for i in range(len(actions)): action = actions[i] + [0, 0] - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) next_state = env.curr_state print("sars', done =", state, action, reward, next_state, done) state = next_state.copy() @@ -943,7 +943,7 @@ def test_grid_image_representations(self): for i in range(len(actions)): action = [0, 0] + actions[i] - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) next_state = env.curr_state print("sars', done =", state, action, reward, next_state, done) state = next_state.copy() @@ -951,7 +951,7 @@ def test_grid_image_representations(self): assert tot_rew == 0.5, str(tot_rew) - env.reset() + env.reset()[0] env.close() # Test 5: With transition noise @@ -981,7 +981,7 @@ def test_grid_image_representations(self): tot_rew = 0 for i in range(len(actions)): action = actions[i] + [0, 0] - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) next_state = env.curr_state print("sars', done =", state, action, reward, next_state, done) state = next_state.copy() @@ -994,7 +994,7 @@ def test_grid_image_representations(self): assert tot_rew == 2.75, str(tot_rew) - env.reset() + env.reset()[0] env.close() def test_grid_env(self): @@ -1038,7 +1038,7 @@ def test_grid_env(self): tot_rew = 0 for i in range(len(actions)): action = actions[i] - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) next_state = env.get_augmented_state()["augmented_state"][-1] print("sars', done =", state, action, reward, next_state, done) self.assertEqual( @@ -1054,7 +1054,7 @@ def test_grid_env(self): assert tot_rew == 8.25, str(tot_rew) - env.reset() + env.reset()[0] env.close() # Test 2: Almost the same as 1, but with irrelevant features and no terminal reward @@ -1081,7 +1081,7 @@ def test_grid_env(self): tot_rew = 0 for i in range(len(actions)): action = actions[i] + 
[0, 0] - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) next_state = env.get_augmented_state()["augmented_state"][-1] print("sars', done =", state, action, reward, next_state, done) self.assertEqual( @@ -1098,7 +1098,7 @@ def test_grid_env(self): # Perform actions only in irrelevant space and noop in relevant space for i in range(len(actions)): action = [0, 0] + actions[i] - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) next_state = env.get_augmented_state()["augmented_state"][-1] print("sars', done =", state, action, reward, next_state, done) state = next_state.copy() @@ -1106,7 +1106,7 @@ def test_grid_env(self): assert tot_rew == 9, str(tot_rew) - env.reset() + env.reset()[0] env.close() # Test 3: Almost the same as 1, but with delay @@ -1136,7 +1136,7 @@ def test_grid_env(self): tot_rew = 0 for i in range(len(actions)): action = actions[i] - next_obs, reward, done, info = env.step(action) + next_obs, reward, done, trunc, info = env.step(action) next_state = env.get_augmented_state()["augmented_state"][-1] print("sars', done =", state, action, reward, next_state, done) self.assertEqual( @@ -1152,7 +1152,7 @@ def test_grid_env(self): assert tot_rew == 6.75, str(tot_rew) - env.reset() + env.reset()[0] env.close() def test_discrete_dynamics(self): @@ -1185,7 +1185,7 @@ def test_discrete_dynamics(self): ) # TODO Move this and the test_continuous_dynamics type checks to separate unit tests action = 2 - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) self.assertEqual( next_state, @@ -1195,7 +1195,7 @@ def test_discrete_dynamics(self): state = next_state action = 4 - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) self.assertEqual( next_state, @@ -1205,7 +1205,7 @@ def test_discrete_dynamics(self): state = next_state action = 1 - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) self.assertEqual( next_state, @@ -1221,7 +1221,7 @@ def test_discrete_dynamics(self): # Try a random action to see that terminal state leads back to same terminal state action = env.action_space.sample() - next_state, reward, done, info = env.step(action) + next_state, reward, done, trunc, info = env.step(action) print("sars', done =", state, action, reward, next_state, done) self.assertEqual( next_state, @@ -1230,7 +1230,7 @@ def test_discrete_dynamics(self): ) state = next_state - env.reset() + env.reset()[0] env.close() def test_discrete_reward_delay(self): @@ -1269,13 +1269,13 @@ def test_discrete_reward_delay(self): 5, 2, 3, - np.random.randint(config["action_space_size"]), + np.random.default_rng().integers(config["action_space_size"]), 4, ] # 2nd last action is random just to check that last delayed reward works with any action expected_rewards = [0, 0, 0, 1, 1, 0, 1, 0, 0] expected_states = [0, 2, 2, 5, 2, 5, 5, 0, 6] for i in range(len(expected_rewards)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) self.assertEqual( reward, @@ -1289,7 +1289,7 @@ def test_discrete_reward_delay(self): # for 
2nd last time step due to random action. state = next_state - env.reset() + env.reset()[0] env.close() def test_discrete_rewardable_sequences(self): @@ -1327,12 +1327,12 @@ def test_discrete_rewardable_sequences(self): 3, 4, 2, - np.random.randint(config["action_space_size"]), + np.random.default_rng().integers(config["action_space_size"]), 5, ] # expected_rewards = [0, 0, 1, 0, 1, 0, 0, 0] for i in range(len(expected_rewards)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) self.assertEqual( reward, @@ -1343,7 +1343,7 @@ def test_discrete_rewardable_sequences(self): ) state = next_state - env.reset() + env.reset()[0] env.close() def test_discrete_p_noise(self): @@ -1374,7 +1374,7 @@ def test_discrete_p_noise(self): env = RLToyEnv(**config) state = env.get_augmented_state()["curr_state"] - actions = [6, 6, 2, np.random.randint(config["action_space_size"])] # + actions = [6, 6, 2, np.random.default_rng().integers(config["action_space_size"])] # expected_states = [ 2, 6, @@ -1382,7 +1382,7 @@ def test_discrete_p_noise(self): 3, ] # Last state 3 is fixed for this test because of fixed seed for Env which selects the next noisy state. for i in range(len(actions)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) self.assertEqual( next_state, @@ -1393,7 +1393,7 @@ def test_discrete_p_noise(self): ) state = next_state - env.reset() + env.reset()[0] env.close() def test_discrete_r_noise(self): @@ -1432,7 +1432,7 @@ def test_discrete_r_noise(self): 0.086749, ] # 2nd state produces 'true' reward for i in range(len(actions)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) np.testing.assert_allclose( reward, @@ -1445,7 +1445,7 @@ def test_discrete_r_noise(self): state = next_state - env.reset() + env.reset()[0] env.close() # TODO Test for make_denser; also one for creating multiple instances of an Env with the same config dict (can lead to issues because the dict is shared as I found with Ray's A3C imple.) 
@@ -1493,7 +1493,7 @@ def test_discrete_multiple_meta_features(self): 3, 4, 2, - np.random.randint(config["action_space_size"]), + np.random.default_rng().integers(config["action_space_size"]), 5, ] # expected_rewards = [0, 0, 0, 1, 0, 1, 0, 0] @@ -1512,7 +1512,7 @@ def test_discrete_multiple_meta_features(self): expected_rewards[i] + expected_reward_noises[i] ) * config["reward_scale"] + config["reward_shift"] for i in range(len(expected_rewards)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) np.testing.assert_allclose( reward, @@ -1524,7 +1524,7 @@ def test_discrete_multiple_meta_features(self): ) state = next_state - env.reset() + env.reset()[0] env.close() # Commented out the following 2 tests after changing implementation of @@ -1566,12 +1566,12 @@ def test_discrete_multiple_meta_features(self): # actions = [[1, 1, 0], [0, 1, 0], [1, 0 ,1], [1, 0 ,0], [1, 0, 1], [0, 1, 0], [0, 1, 1], [0, 0, 1], [1, 0, 0]] # expected_rewards = [0, 0, 0, 1, 1, 0, 1, 0, 0] # for i in range(len(expected_rewards)): - # next_state, reward, done, info = env.step(actions[i]) + # next_state, reward, done, trunc, info = env.step(actions[i]) # print("sars', done =", state, actions[i], reward, next_state, done) # self.assertEqual(reward, expected_rewards[i], "Expected reward mismatch in time step: " + str(i + 1) + " when reward delay = 3.") # state = next_state # - # env.reset() + # env.reset()[0] # env.close() # def test_discrete_multi_discrete_irrelevant_dimensions(self): @@ -1615,12 +1615,12 @@ def test_discrete_multiple_meta_features(self): # actions = [[1, 1, 0], [0, 1, 0], [1, 0 ,1], [1, 0 ,0], [1, 0, 1], [0, 1, 0], [0, 1, 1], [0, 0, 1], [1, 0, 0]] # expected_rewards = [0, 0, 0, 0, 1, 1, 0, 1, 0] # for i in range(len(expected_rewards)): - # next_state, reward, done, info = env.step(actions[i]) + # next_state, reward, done, trunc, info = env.step(actions[i]) # print("sars', done =", state, actions[i], reward, next_state, done) # self.assertEqual(reward, expected_rewards[i], "Expected reward mismatch in time step: " + str(i + 1) + " when reward delay = 3.") # state = next_state # - # env.reset() + # env.reset()[0] # env.close() # # except AssertionError as e: @@ -1636,13 +1636,13 @@ def test_discrete_multiple_meta_features(self): # expected_rewards = [0, 0, 0, 1, 1, 0, 1, 0, 0] # expected_states = [[0, 0, 0, 3], [0, 1, 0, 1], [0, 1, 0, 1], [1, 0, 1, 3], [0, 1, 0, 2], [1, 0, 1, 0], [1, 0, 1, 1], [0, 0, 0, 4], [1, 0, 0, 2]] # for i in range(len(expected_rewards)): - # next_state, reward, done, info = env.step(actions[i]) + # next_state, reward, done, trunc, info = env.step(actions[i]) # print("sars', done =", state, actions[i], reward, next_state, done) # self.assertEqual(reward, expected_rewards[i], "Expected reward mismatch in time step: " + str(i + 1) + " when reward delay = 3.") # self.assertEqual(state, expected_states[i], "Expected state mismatch in time step: " + str(i + 1) + " when reward delay = 3.") # state = next_state # - # env.reset() + # env.reset()[0] # env.close() # Test: This test lets even irrelevant dimensions be multi-dimensional @@ -1657,13 +1657,13 @@ def test_discrete_multiple_meta_features(self): # expected_rewards = [0, 0, 0, 1, 1, 0, 1, 0, 0] # expected_states = [[0, 0, 0, 0, 3], [0, 1, 0, 0, 1], [0, 1, 0, 0, 1], [1, 0, 1, 0, 3], [0, 1, 0, 0, 2], [1, 0, 1, 0, 0], [1, 0, 1, 0, 1], [0, 0, 0, 0, 4], [1, 0, 0, 0, 2]] # for i in 
range(len(expected_rewards)): - # next_state, reward, done, info = env.step(actions[i]) + # next_state, reward, done, trunc, info = env.step(actions[i]) # print("sars', done =", state, actions[i], reward, next_state, done) # self.assertEqual(reward, expected_rewards[i], "Expected reward mismatch in time step: " + str(i + 1) + " when reward delay = 3.") # self.assertEqual(state, expected_states[i], "Expected state mismatch in time step: " + str(i + 1) + " when reward delay = 3.") # state = next_state # - # env.reset() + # env.reset()[0] # env.close() def test_discrete_irr_features(self): @@ -1694,11 +1694,11 @@ def test_discrete_irr_features(self): state = env.get_augmented_state()["curr_state"] actions = [[7, 0], [5, 0], [5, 0], [1, 2]] + [ - [5, np.random.randint(config["action_space_size"][1])] + [5, np.random.default_rng().integers(config["action_space_size"][1])] ] * 5 expected_rewards = [0, 1, 0, 1, 0, 0, 0, 0, 0] for i in range(len(expected_rewards)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) self.assertEqual( reward, @@ -1710,7 +1710,7 @@ def test_discrete_irr_features(self): ) state = next_state - env.reset() + env.reset()[0] env.close() def test_discrete_image_representations(self): @@ -1758,7 +1758,7 @@ def test_discrete_image_representations(self): 3, 4, 2, - np.random.randint(config["action_space_size"]), + np.random.default_rng().integers(config["action_space_size"]), 5, ] # expected_rewards = [0, 0, 0, 1, 0, 1, 0, 0] @@ -1782,7 +1782,7 @@ def test_discrete_image_representations(self): expected_rewards[i] + expected_reward_noises[i] ) * config["reward_scale"] + config["reward_shift"] for i in range(len(expected_rewards)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) assert next_state.shape == ( 100, 100, @@ -1813,7 +1813,7 @@ def test_discrete_image_representations(self): ) state = next_state - env.reset() + env.reset()[0] env.close() def test_discrete_reward_every_n_steps(self): @@ -1854,12 +1854,12 @@ def test_discrete_reward_every_n_steps(self): 6, 1, 0, - np.random.randint(config["action_space_size"]), + np.random.default_rng().integers(config["action_space_size"]), 5, ] # expected_rewards = [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0] for i in range(len(expected_rewards)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) self.assertEqual( reward, @@ -1870,7 +1870,7 @@ def test_discrete_reward_every_n_steps(self): ) state = next_state - env.reset() + env.reset()[0] env.close() # With delay @@ -1891,12 +1891,12 @@ def test_discrete_reward_every_n_steps(self): 6, 1, 0, - np.random.randint(config["action_space_size"]), + np.random.default_rng().integers(config["action_space_size"]), 5, ] # expected_rewards = [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0] for i in range(len(expected_rewards)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) self.assertEqual( reward, @@ -1907,7 +1907,7 @@ def test_discrete_reward_every_n_steps(self): ) state = next_state - env.reset() + env.reset()[0] env.close() # With delay >= sequence length @@ -1928,12 +1928,12 @@ def test_discrete_reward_every_n_steps(self): 6, 1, 0, - 
np.random.randint(config["action_space_size"]), + np.random.default_rng().integers(config["action_space_size"]), 5, ] # expected_rewards = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0] for i in range(len(expected_rewards)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) self.assertEqual( reward, @@ -1944,7 +1944,7 @@ def test_discrete_reward_every_n_steps(self): ) state = next_state - env.reset() + env.reset()[0] env.close() def test_discrete_custom_P_R(self): @@ -1965,9 +1965,9 @@ def test_discrete_custom_P_R(self): config["reward_scale"] = 2.0 config["use_custom_mdp"] = True - np.random.seed(0) # seed - config["transition_function"] = np.random.randint(8, size=(8, 5)) - config["reward_function"] = np.random.randint(4, size=(8, 5)) + # np.random.seed(0) # seed + config["transition_function"] = np.random.default_rng().integers(8, size=(8, 5)) + config["reward_function"] = np.random.default_rng().integers(4, size=(8, 5)) config["init_state_dist"] = np.array([1 / 8 for i in range(8)]) env = RLToyEnv(**config) @@ -1983,12 +1983,12 @@ def test_discrete_custom_P_R(self): 4, 1, 0, - np.random.randint(config["action_space_size"]), + np.random.default_rng(0).integers(config["action_space_size"]), 4, ] # expected_rewards = [0, 0, 6, 4, 4, 0, 4, 6, 6, 2, 0] for i in range(len(expected_rewards)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) self.assertEqual( reward, @@ -1997,13 +1997,13 @@ def test_discrete_custom_P_R(self): ) state = next_state - env.reset() + env.reset()[0] env.close() # np.random.seed(0) #seed config["delay"] = 2 - P = np.random.randint(8, size=(8, 5)) - R = np.random.randint(4, size=(8, 5)) + P = np.random.default_rng().integers(8, size=(8, 5)) + R = np.random.default_rng().integers(4, size=(8, 5)) config["transition_function"] = lambda s, a: P[s, a] config["reward_function"] = lambda s, a: R[s[-2], a] config["init_state_dist"] = np.array([1 / 8 for i in range(8)]) @@ -2021,12 +2021,12 @@ def test_discrete_custom_P_R(self): 4, 1, 0, - np.random.randint(config["action_space_size"]), + np.random.default_rng().integers(config["action_space_size"]), 4, ] # expected_rewards = [0, 0, 0, 2, 2, 0, 0, 6, 0, 4, 6] for i in range(len(expected_rewards)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) self.assertEqual( reward, @@ -2035,7 +2035,7 @@ def test_discrete_custom_P_R(self): ) state = next_state - env.reset() + env.reset()[0] env.close() def test_continuous_custom_P_R(self): @@ -2054,7 +2054,7 @@ def test_continuous_custom_P_R(self): config["delay"] = 1 config["use_custom_mdp"] = True - np.random.seed(0) # seed + # np.random.seed(0) # seed config["transition_function"] = lambda s, a: s + a config["reward_function"] = lambda s, a: s[-2][0] # config["init_state_dist"] = np.array([1 / 8 for i in range(8)]) @@ -2074,7 +2074,7 @@ def test_continuous_custom_P_R(self): -1.564964, ] # , -0.564964] for i in range(len(expected_rewards)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) np.testing.assert_allclose( reward, @@ -2084,7 +2084,7 @@ def 
test_continuous_custom_P_R(self): ) state = next_state - env.reset() + env.reset()[0] env.close() # def test_discrete_imaginary_rollouts(self): @@ -2119,13 +2119,13 @@ def test_continuous_custom_P_R(self): # expected_rewards = [0, 0, 0, 0, 0, 0]#, 1, 0, 0] # expected_states = [9, 2, 4, 5, 8, 9] # [2, 4, 5, 8, 9] is a rewardable sequence. init state is 9 and action 0 leads to state 2. # for i in range(len(expected_rewards)): - # next_state, reward, done, info = env.step(actions[i]) + # next_state, reward, done, trunc, info = env.step(actions[i]) # print("sars', done =", state, actions[i], reward, next_state, done) # self.assertEqual(reward, expected_rewards[i], "Expected reward mismatch in time step: " + str(i + 1) + " when reward delay = " + str(config["delay"])) # self.assertEqual(state, expected_states[i], "Expected state mismatch in time step: " + str(i + 1) + " when reward delay = " + str(config["delay"])) # state = next_state # - # env.reset() + # env.reset()[0] # env.close() def test_discrete_r_dist(self): @@ -2163,7 +2163,7 @@ def test_discrete_r_dist(self): 1.424395, ] # 1st, 3rd and 4th states produce 'true' rewards, every reward has been shifted by 1 for i in range(len(actions)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) np.testing.assert_allclose( reward, @@ -2176,7 +2176,7 @@ def test_discrete_r_dist(self): state = next_state - env.reset() + env.reset()[0] env.close() def test_discrete_diameter(self): @@ -2248,7 +2248,7 @@ def test_discrete_diameter(self): 1, ] # 1st, 3rd and 4th states produce 'true' rewards, every reward has been shifted by 1 for i in range(len(actions)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) np.testing.assert_allclose( reward, @@ -2262,7 +2262,7 @@ def test_discrete_diameter(self): state = next_state - env.reset() + env.reset()[0] env.close() # Sub-test 2 Have sequence length greater than the diameter and check selected rewardable sequences @@ -2333,7 +2333,7 @@ def test_discrete_diameter(self): 0, ] # 1st, 3rd and 4th states produce 'true' rewards, every reward has been shifted by 1 for i in range(len(actions)): - next_state, reward, done, info = env.step(actions[i]) + next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) np.testing.assert_allclose( reward, @@ -2347,7 +2347,7 @@ def test_discrete_diameter(self): state = next_state - env.reset() + env.reset()[0] env.close() # Unit tests diff --git a/tests/test_run_experiments.py b/tests/test_run_experiments.py index a427c06..226993e 100644 --- a/tests/test_run_experiments.py +++ b/tests/test_run_experiments.py @@ -71,7 +71,7 @@ def test_dqn_test_expt(self): # from glob import glob # expt_list = glob("experiments/*.py") - # # sel_expt_list = np.random.randint(0, len(expt_list), 10) + # # sel_expt_list = np.random.integers(0, len(expt_list), 10) # expt_list = np.random.permutation(expt_list) # for i in range(2): # conf_file = expt_list[i] diff --git a/tests/test_version.py b/tests/test_version.py index 03d5fd6..7100a93 100644 --- a/tests/test_version.py +++ b/tests/test_version.py @@ -2,4 +2,4 @@ def test_version(): - assert __version__ == "0.0.2" + assert __version__ == "1.0.0"
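Note on the call convention used throughout the updated tests above: in Gymnasium, reset() returns an (observation, info) pair and step() returns a 5-tuple in which the old single done flag is split into terminated and truncated. That is why the tests now unpack "done, trunc, info" from step() and take env.reset()[0] where only the observation is needed. Below is a minimal sketch of the same migration on a stock Gymnasium environment; CartPole-v1 and the variable names are used purely for illustration and are not part of this patch.

    import gymnasium as gym

    # Old Gym API:
    #   obs = env.reset()
    #   next_obs, reward, done, info = env.step(action)

    # Gymnasium API, as applied in the updated tests:
    env = gym.make("CartPole-v1")
    obs, info = env.reset(seed=0)      # reset() now returns (obs, info)
    terminated = truncated = False
    while not (terminated or truncated):
        action = env.action_space.sample()
        # step() now returns a 5-tuple; the old done flag is terminated or truncated
        obs, reward, terminated, truncated, info = env.step(action)
    env.close()

Code that still needs a single episode-end flag, as in the assertions and print statements above, can reconstruct it as done = terminated or truncated.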
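The tests also move from the legacy np.random.seed / np.random.randint calls to NumPy's Generator API (np.random.default_rng and Generator.integers). A small sketch of that pattern follows; the seed 0 and the names rng, P and R mirror the custom P/R test above but are only illustrative here.

    import numpy as np

    # Legacy API:
    #   np.random.seed(0)
    #   P = np.random.randint(8, size=(8, 5))

    # Generator API:
    rng = np.random.default_rng(0)      # seed once and reuse the generator for reproducible draws
    P = rng.integers(8, size=(8, 5))    # like randint: integers in [0, 8)
    R = rng.integers(4, size=(8, 5))
    noise = rng.normal(0, 0.5, size=7)  # analogous to the reward/transition noise lambdas in the tests

Note that default_rng uses a different bit generator than the legacy seeded np.random functions, so even with the same seed the draws will not match the old stream; hard-coded expected values that were computed under the legacy seed have to be regenerated rather than carried over.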