diff --git a/.gitignore b/.gitignore index 535d27c..960daf1 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ __pycache__/ MUJOCO_LOG.TXT *.pdf +log* + *.swp *.csv .directory @@ -114,4 +116,4 @@ venv.bak/ #whitelist !tests/files/mdpp_12744267_SAC_target_radius/*.csv -!misc/sample_recorded_data/*/*.csv \ No newline at end of file +!misc/sample_recorded_data/*/*.csv diff --git a/example.py b/example.py index fc4c148..ebe3e27 100644 --- a/example.py +++ b/example.py @@ -1,4 +1,5 @@ """We collect here some examples of basic usage for MDP Playground. +Example call: python example.py --do_not_display_images --log_level INFO Calling this file as a script, invokes the following examples: one for basic discrete environments @@ -10,7 +11,7 @@ one for basic grid environments one for grid environments with image representations one for wrapping Atari env qbert - one for wrapping Mujoco env HalfCheetah + one for wrapping Mujoco envs HalfCheetah, Pusher, Reacher one for wrapping MiniGrid env # Currently commented out due to some errors one for wrapping ProcGen env # Currently commented out due to some errors two examples at the end showing how to create toy envs using gym.make() @@ -25,6 +26,7 @@ from mdp_playground.envs import RLToyEnv import numpy as np +display_images = True def display_image(obs, mode="RGB"): # Display the image observation associated with the next state @@ -121,7 +123,8 @@ def discrete_environment_image_representations_example(): env.close() - display_image(next_state_image, mode="L") + if display_images: + display_image(next_state_image, mode="L") def discrete_environment_diameter_image_representations_example(): @@ -167,7 +170,8 @@ def discrete_environment_diameter_image_representations_example(): env.close() - display_image(next_state_image, mode="L") + if display_images: + display_image(next_state_image, mode="L") def continuous_environment_example_move_to_a_point(): @@ -249,8 +253,9 @@ def continuous_environment_example_move_to_a_point_irrelevant_image(): env.close() - img1 = display_image(next_state_image, mode="RGB") - img1.save("cont_env_irrelevant_image.pdf") + if display_images: + img1 = display_image(next_state_image, mode="RGB") + # img1.save("cont_env_irrelevant_image.pdf") def continuous_environment_example_move_along_a_line(): @@ -342,7 +347,8 @@ def grid_environment_image_representations_example(): env.reset()[0] env.close() - display_image(next_obs) + if display_images: + display_image(next_obs) def atari_wrapper_example(): @@ -351,7 +357,7 @@ def atari_wrapper_example(): "seed": 0, "delay": 1, "transition_noise": 0.25, - "reward_noise": lambda a: a.normal(0, 0.1), + "reward_noise": lambda s, a, rng: rng.normal(0, 0.1), "state_space_type": "discrete", } @@ -380,7 +386,8 @@ def atari_wrapper_example(): env.close() - display_image(next_state) + if display_images: + display_image(next_state) def mujoco_wrapper_examples(): @@ -435,11 +442,13 @@ def mujoco_wrapper_examples(): state = env.reset(seed=gym_wrap_config["seed"])[0] print( - "Taking a step in the environment with a random action and printing the transition:" + "Taking steps in the HalfCheetah environment with a random action and printing the transition:" ) - action = env.action_space.sample() - next_state, reward, done, trunc, info = env.step(action) - print("sars', done =", state, action, reward, next_state, done) + for i in range(3): + action = env.action_space.sample() + next_state, reward, done, trunc, info = env.step(action) + print("sars', done =", state, action, reward, next_state, done) + state 
= next_state env.close() @@ -453,14 +462,16 @@ def mujoco_wrapper_examples(): import gymnasium as gym env = GymEnvWrapper(env, **gym_wrap_config) - state = env.reset(seed=gym_wrap_config["seed"])[0] + state = env.reset(seed=gym_wrap_config["seed"] + 1)[0] print( - "Taking a step in the environment with a random action and printing the transition:" + "Taking steps in the Pusher environment with a random action and printing the transition:" ) - action = env.action_space.sample() - next_state, reward, done, trunc, info = env.step(action) - print("sars', done =", state, action, reward, next_state, done) + for i in range(3): + action = env.action_space.sample() + next_state, reward, done, trunc, info = env.step(action) + print("sars', done =", state, action, reward, next_state, done) + state = next_state env.close() @@ -474,14 +485,16 @@ def mujoco_wrapper_examples(): import gymnasium as gym env = GymEnvWrapper(env, **gym_wrap_config) - state = env.reset(seed=gym_wrap_config["seed"])[0] + state = env.reset(seed=gym_wrap_config["seed"] + 2)[0] print( - "Taking a step in the environment with a random action and printing the transition:" + "Taking steps in the Reacher environment with a random action and printing the transition:" ) - action = env.action_space.sample() - next_state, reward, done, trunc, info = env.step(action) - print("sars', done =", state, action, reward, next_state, done) + for i in range(3): + action = env.action_space.sample() + next_state, reward, done, trunc, info = env.step(action) + print("sars', done =", state, action, reward, next_state, done) + state = next_state env.close() @@ -501,7 +514,7 @@ def minigrid_wrapper_example(): "seed": 0, "delay": 1, "transition_noise": 0.25, - "reward_noise": lambda a: a.normal(0, 0.1), + "reward_noise": lambda s, a, rng: rng.normal(0, 0.1), "state_space_type": "discrete", } @@ -533,7 +546,8 @@ def minigrid_wrapper_example(): env.close() - display_image(next_obs) + if display_images: + display_image(next_obs) def procgen_wrapper_example(): @@ -542,7 +556,7 @@ def procgen_wrapper_example(): "seed": 0, "delay": 1, "transition_noise": 0.25, - "reward_noise": lambda a: a.normal(0, 0.1), + "reward_noise": lambda s, a, rng: rng.normal(0, 0.1), "state_space_type": "discrete", } @@ -569,76 +583,106 @@ def procgen_wrapper_example(): env.close() - display_image(next_obs) + if display_images: + display_image(next_obs) if __name__ == "__main__": + # Use argparse to set display_images to False if you don't want to display images + # and to set log level. 
+ import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--display_images", "-di", help="Display image observations (available for some examples)", action="store_true") + parser.add_argument("--do_not_display_images", "-n", help="Do not display image observations (available for some examples)", action="store_false", dest="display_images") + parser.add_argument("--log_level", type=str, default="DEBUG", help="Set the log level") + parser.set_defaults(display_images=True) + args = parser.parse_args() + display_images = args.display_images + + # Set up logging globally for the MDP Playground library: + import logging + logger = logging.getLogger("mdp_playground") + logger.setLevel(args.log_level) + if not logger.handlers: + log_filename = "log_file.txt" + log_file_handler = logging.FileHandler(log_filename) + log_file_handler.setFormatter(logging.Formatter('%(message)s - %(levelname)s - %(name)s - %(asctime)s', datefmt='%m.%d.%Y %I:%M:%S %p')) + logger.addHandler(log_file_handler) + # Add a console handler: + console_handler = logging.StreamHandler() + console_handler.setFormatter(logging.Formatter('%(message)s')) + # Have less verbose logging to console: + console_handler.setLevel(logging.INFO) + logger.addHandler(console_handler) + logger.info("Begin logging to: %s", log_filename) + + # Colour print set_ansi_escape = "\033[33;1m" # Yellow, bold reset_ansi_escape = "\033[0m" - print(set_ansi_escape + "Running discrete environment\n" + reset_ansi_escape) + logger.info(set_ansi_escape + "Running discrete environment\n" + reset_ansi_escape) discrete_environment_example() - print( + logger.info( set_ansi_escape + "\nRunning discrete environment with image representations\n" + reset_ansi_escape ) discrete_environment_image_representations_example() - print( + logger.info( set_ansi_escape + "\nRunning discrete environment with diameter and image representations\n" + reset_ansi_escape ) discrete_environment_diameter_image_representations_example() - print( + logger.info( set_ansi_escape + "\nRunning continuous environment: move_to_a_point\n" + reset_ansi_escape ) continuous_environment_example_move_to_a_point() - print( + logger.info( set_ansi_escape + "\nRunning continuous environment: move_to_a_point with irrelevant features and image representations\n" + reset_ansi_escape ) continuous_environment_example_move_to_a_point_irrelevant_image() - print( + logger.info( set_ansi_escape + "\nRunning continuous environment: move_along_a_line\n" + reset_ansi_escape ) continuous_environment_example_move_along_a_line() - print( + logger.info( set_ansi_escape + "\nRunning grid environment: move_to_a_point\n" + reset_ansi_escape ) grid_environment_example() - print( + logger.info( set_ansi_escape + "\nRunning grid environment: move_to_a_point " "with image representations\n" + reset_ansi_escape ) grid_environment_image_representations_example() - print(set_ansi_escape + "\nRunning Atari wrapper example:\n" + reset_ansi_escape) + logger.info(set_ansi_escape + "\nRunning Atari wrapper example:\n" + reset_ansi_escape) atari_wrapper_example() - print(set_ansi_escape + "\nRunning Mujoco wrapper example:\n" + reset_ansi_escape) + logger.info(set_ansi_escape + "\nRunning Mujoco wrapper example:\n" + reset_ansi_escape) mujoco_wrapper_examples() - print(set_ansi_escape + "\nRunning MiniGrid wrapper example:\n" + reset_ansi_escape) + # logger.info(set_ansi_escape + "\nRunning MiniGrid wrapper example:\n" + reset_ansi_escape) # minigrid_wrapper_example() - # print(set_ansi_escape + "\nRunning ProcGen 
wrapper example:\n" + reset_ansi_escape) + # logger.info(set_ansi_escape + "\nRunning ProcGen wrapper example:\n" + reset_ansi_escape) # procgen_wrapper_example() # Using gym.make() example 1 @@ -660,4 +704,4 @@ def procgen_wrapper_example(): ) env.reset()[0] for i in range(10): - print(env.step(env.action_space.sample())) + logger.info(env.step(env.action_space.sample())) diff --git a/mdp_playground/envs/gym_env_wrapper.py b/mdp_playground/envs/gym_env_wrapper.py index 95791d3..d940f1e 100644 --- a/mdp_playground/envs/gym_env_wrapper.py +++ b/mdp_playground/envs/gym_env_wrapper.py @@ -9,6 +9,7 @@ import PIL.ImageDraw as ImageDraw import PIL.Image as Image from PIL.Image import FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM +import logging # def get_gym_wrapper(base_class): @@ -23,7 +24,7 @@ class GymEnvWrapper(gym.Env): reward scale reward shift terminal state reward - image_transforms (for discrete environments) + image_transforms (for discrete environments with image observations) The wrapper is pretty general and can be applied to any Gym Environment. The environment should be instantiated and passed as the 1st argument to the __init__ method of this class. If using this wrapper with Atari, additional keys may be added specifying either atari_preprocessing = True or wrap_deepmind_ray = True. These would use the AtariPreprocessing wrapper from OpenAI Gym or wrap_deepmind() wrapper from Ray Rllib. @@ -36,6 +37,7 @@ class GymEnvWrapper(gym.Env): # we would have multiple observation_spaces and this could cause conflict # with code that assumes any subclass of gym.Wrapper should have these member # variables. However, it _should_ be at least a gym.Env. + # Following comment based on the old get_gym_wrapper(base_class) code: # Does it need to be a subclass of base_class because some external code # may check if it's an AtariEnv, for instance, and do further stuff based @@ -46,6 +48,21 @@ def __init__(self, env, **config): # self.env = config["env"] self.env = env + if "log_level" not in config: + self.log_level = logging.NOTSET + else: + self.log_level = config["log_level"] + + self.logger = logging.getLogger(__name__) + self.logger.setLevel(self.log_level) + + if "log_filename" in config: + if not self.logger.handlers: + self.log_filename = config["log_filename"] + log_file_handler = logging.FileHandler(self.log_filename) + self.logger.addHandler(log_file_handler) + print("Logger logging to:", self.log_filename) + seed_int = None if "seed" in config: seed_int = config["seed"] @@ -74,30 +91,27 @@ def __init__(self, env, **config): self.delay = 0 if "transition_noise" in config: - self.transition_noise = config["transition_noise"] if config["state_space_type"] == "continuous": if callable(config["transition_noise"]): self.transition_noise = config["transition_noise"] else: p_noise_std = config["transition_noise"] - self.transition_noise = lambda a: a.normal(0, p_noise_std) - else: + self.transition_noise = lambda s, a, rng: rng.normal(0, p_noise_std, size=s.shape) + else: # discrete env + self.transition_noise = config["transition_noise"] assert self.transition_noise <= 1.0 and self.transition_noise >= 0.0, ( "transition_noise must be a value in [0.0, 1.0] when env is discrete, it was:" + str(self.transition_noise) ) else: - if config["state_space_type"] == "discrete": - self.transition_noise = 0.0 - else: - self.transition_noise = lambda a: 0.0 + self.transition_noise = None if "reward_noise" in config: if callable(config["reward_noise"]): self.reward_noise = config["reward_noise"] else: reward_noise_std 
= config["reward_noise"] - self.reward_noise = lambda a: a.normal(0, reward_noise_std) + self.reward_noise = lambda s, a, rng: rng.normal(0, reward_noise_std) else: self.reward_noise = None @@ -121,14 +135,15 @@ def __init__(self, env, **config): else: assert ( config["state_space_type"] == "discrete" - ), "Image transforms are only applicable to discrete envs." + ), "Image transforms are only supported for discrete envs with \ + image observations." self.image_transforms = config["image_transforms"] if len(self.env.observation_space.shape) != 3: warnings.warn( "The length of observation_space.shape =" + self.env.observation_space.shape - + "It was expected" - + "to be 3 for environments with image representations." + + "It was expected to be 3 (width, height, channels) " + + "for environments with image representations." ) if "image_padding" in config: @@ -332,7 +347,7 @@ def step(self, action): self.total_transitions_episode += 1 if self.config["state_space_type"] == "discrete": - if self.transition_noise > 0.0: + if self.transition_noise: probs = ( np.ones(shape=(self.env.action_space.n,)) * self.transition_noise @@ -347,24 +362,24 @@ def step(self, action): # print("NOISE inserted", old_action, action) self.total_noisy_transitions_episode += 1 else: # cont. envs - if self.transition_noise is not None: - noise_in_transition = ( - self.transition_noise(self._np_random) - if self.transition_noise - else 0 - ) # #random - self.total_abs_noise_in_transition_episode += np.abs( - noise_in_transition - ) - else: - noise_in_transition = 0.0 + noise_in_transition = ( + self.transition_noise(self.curr_state, action, self._np_random) + if self.transition_noise + else 0.0 + ) # #random + self.total_abs_noise_in_transition_episode += np.abs( + noise_in_transition + ) + self.logger.debug("total_transitions_episode: " + str(self.total_transitions_episode) + + " Noise in transition: " + str(noise_in_transition)) if "irrelevant_features" in self.config: if self.config["state_space_type"] == "discrete": next_state, reward, done, trunc, info = self.env.step(action[0]) next_state_irr, _, done_irr, trunc_irr, _ = self.irr_toy_env.step(action[1]) next_state = tuple([next_state, next_state_irr]) - else: + next_obs = next_state + else: # cont. env # env_act_shape is the shape of the underlying env's action space and we # sub-select those dimensions from the total action space next and apply # to the underlying env: @@ -375,13 +390,20 @@ def step(self, action): action[self.env_act_shape[0] :] ) next_state = np.concatenate((next_state, next_state_irr)) - else: + next_obs = next_state.copy() + else: # no irrelevant features next_state, reward, done, trunc, info = self.env.step(action) - if self.config["state_space_type"] == "continuous": - next_state += noise_in_transition + if self.config["state_space_type"] == "discrete": + next_obs = next_state + else: # cont. env + next_obs = next_state.copy() + + # I think this adds noise whether or not irrelevant features are present. #TODO Add test + if self.config["state_space_type"] == "continuous": + next_obs += noise_in_transition if self.image_transforms: - next_state = self.get_transformed_image(next_state) + next_obs = self.get_transformed_image(next_state) if done: # if episode is finished return the rewards that were delayed and not @@ -401,14 +423,17 @@ def step(self, action): # action and time_step as well. Would need to change implementation to # have a queue for the rewards achieved and then pick the reward that was # generated delay timesteps ago. 
- noise_in_reward = self.reward_noise(self._np_random) if self.reward_noise else 0 + noise_in_reward = self.reward_noise(self.curr_state, action, self._np_random) if self.reward_noise else 0 + self.logger.info("Noise in reward: " + str(noise_in_reward)) self.total_abs_noise_in_reward_episode += np.abs(noise_in_reward) self.total_reward_episode += reward reward += noise_in_reward reward *= self.reward_scale reward += self.reward_shift - return next_state, reward, done, trunc, info + self.logger.debug("sas'o'r: " + str(self.curr_state) + "\n" + str(action) + "\n" + str(next_state) + "\n" + str(next_obs) + " \n" + str(reward)) + self.curr_state = next_state + return next_obs, reward, done, trunc, info def reset(self, seed=None): ''' @@ -420,7 +445,7 @@ def reset(self, seed=None): # episode end reached by reaching a terminal state, but reset() may have # been called in the middle of an episode): if not self.total_episodes == 0: - print( + self.logger.info( "Noise stats for previous episode num.: " + str(self.total_episodes) + " (total abs. noise in rewards, total abs. noise in transitions, total reward, total noisy transitions, total transitions): " @@ -463,6 +488,8 @@ def reset(self, seed=None): if self.image_transforms: reset_state = (self.get_transformed_image(reset_state[0]), reset_state[1]) + # Need to store the state to be able to calculate the state- and action-dependent noise in step() + self.curr_state = reset_state[0] return reset_state # return super(GymEnvWrapper, self).reset() diff --git a/mdp_playground/envs/rl_toy_env.py b/mdp_playground/envs/rl_toy_env.py index 994586c..ddc2bf0 100644 --- a/mdp_playground/envs/rl_toy_env.py +++ b/mdp_playground/envs/rl_toy_env.py @@ -38,12 +38,12 @@ class RLToyEnv(gym.Env): Delays each reward by this number of timesteps. Default value: 0. sequence_length : int >= 1 Intrinsic sequence length of the reward function of an environment. For discrete environments, randomly selected sequences of this length are set to be rewardable at initialisation if use_custom_mdp = false and generate_random_mdp = true. Default value: 1. - transition_noise : float in range [0, 1] or Python function(rng) + transition_noise : float in range [0, 1] or Python function(state, action, rng) For discrete environments, it is a float that specifies the fraction of times the environment transitions to a noisy next state at each timestep, independently and uniformly at random. - For continuous environments, if it's a float, it's used as the standard deviation of an i.i.d. normal distribution of noise. If it is a Python function with one argument, it is added to next state. The argument is the Random Number Generator (RNG) of the environment which is an np.random.RandomState object. This RNG should be used to perform calls to the desired random function to be used as noise to ensure reproducibility. Default value: 0. - reward_noise : float or Python function(rng) + For continuous environments, if it's a float, it's used as the standard deviation of an i.i.d. normal distribution of noise. If it is a Python function, it should have 3 arguments and return a noise value that is added to next state. The arguments are provided to it by the existing code in this class and are the current state, the current action and the Random Number Generator (RNG) of the environment which is an np.random.RandomState object. This RNG is used to ensure reproducibility. Default value: 0. 
+ reward_noise : float or Python function(state, action, rng) If it's a float, it's used as the standard deviation of an i.i.d. normal distribution of noise. - If it's a Python function with one argument, it is added to the reward given at every time step. The argument is the Random Number Generator (RNG) of the environment which is an np.random.RandomState object. This RNG should be used to perform calls to the desired random function to be used as noise to ensure reproducibility. Default value: 0. + If it is a Python function, it should have 3 arguments and return a noise value that is added to the reward. The arguments are provided to it by the existing code in this class and are the current state, the current action and the Random Number Generator (RNG) of the environment which is an np.random.RandomState object. This RNG is used to ensure reproducibility. Default value: 0. reward_density : float in range [0, 1] The fraction of possible sequences of a given length that will be selected to be rewardable at initialisation time. Default value: 0.25. reward_scale : float @@ -61,7 +61,7 @@ class RLToyEnv(gym.Env): For discrete envs, this is handled by an mdp_playground.spaces.ImageMultiDiscrete object. It associates the image of an n + 3 sided polygon for a categorical state n. More details can be found in the documentation for the ImageMultiDiscrete class. For continuous and grid envs, this is handled by an mdp_playground.spaces.ImageContinuous object. More details can be found in the documentation for the ImageContinuous class. irrelevant_features : boolean - If True, an additional irrelevant sub-space (irrelevant to achieving rewards) is present as part of the observation space. This sub-space has its own transition dynamics independent of the dynamics of the relevant sub-space. + If True, an additional irrelevant sub-space (irrelevant to achieving rewards) is present as part of the observation space. This sub-space has its own transition dynamics independent of the dynamics of the relevant sub-space. No noise is currently added to irrelevant_features parts in continuous or grid spaces. Default value: False. For discrete environments, additionally, state_space_size must be specified as a list. For continuous environments, the option relevant_indices must be specified. This option specifies the dimensions relevant to achieving rewards. For grid environments, nothing additional needs to be done as relevant grid shape is also used as the irrelevant grid shape.
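For illustration, a minimal sketch of how the 3-argument noise callables documented above could be passed to a continuous toy env. This is a sketch, not part of the diff: it assumes RLToyEnv(**config) construction with the config keys shown in example.py and the sample logs below, and the config values themselves are only illustrative.

import logging
from mdp_playground.envs import RLToyEnv

config = {
    "seed": 0,
    "state_space_type": "continuous",
    "state_space_dim": 2,
    "transition_dynamics_order": 1,
    "inertia": 1,
    "time_unit": 1,
    "make_denser": True,
    "reward_function": "move_to_a_point",
    "target_point": [0, 0],
    "target_radius": 0.05,
    "state_space_max": 10,
    "action_space_max": 1,
    "action_loss_weight": 0.0,
    # New-style noise callables: the env calls these with
    # (current state, current action, env RNG), as per the docstring above.
    "transition_noise": lambda s, a, rng: rng.normal(0, 0.1, size=s.shape),
    "reward_noise": lambda s, a, rng: rng.normal(0, 0.1),
    "log_level": logging.INFO,
}

env = RLToyEnv(**config)  # assumption: keyword-argument construction as in example.py
state = env.reset()[0]
for _ in range(3):
    action = env.action_space.sample()
    next_state, reward, done, trunc, info = env.step(action)
    print("sars', done =", state, action, reward, next_state, done)
    state = next_state
env.close()

For discrete envs, transition_noise stays a float in [0, 1] as documented above; only reward_noise takes the callable form there.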
@@ -243,16 +243,16 @@ def __init__(self, **config): # Set other default settings for config to use if config is passed without any values for them if "log_level" not in config: - self.log_level = logging.CRITICAL # #logging.NOTSET + self.log_level = logging.NOTSET # #logging.CRITICAL else: self.log_level = config["log_level"] # print('self.log_level', self.log_level) - logging.getLogger(__name__).setLevel(self.log_level) # fmtr = logging.Formatter(fmt='%(message)s - %(levelname)s - %(name)s - %(asctime)s', datefmt='%m.%d.%Y %I:%M:%S %p', style='%') # sh = logging.StreamHandler() # sh.setFormatter(fmt=fmtr) self.logger = logging.getLogger(__name__) + self.logger.setLevel(self.log_level) # print("Logging stuff:", self.logger, self.logger.handlers, __name__) # Example output of above: [] mdp_playground.envs.rl_toy_env # self.logger.addHandler(sh) @@ -389,7 +389,7 @@ def __init__(self, **config): self.reward_noise = config["reward_noise"] else: reward_noise_std = config["reward_noise"] - self.reward_noise = lambda a: a.normal(0, reward_noise_std) + self.reward_noise = lambda s, a, rng: rng.normal(0, reward_noise_std) else: self.reward_noise = None @@ -399,7 +399,7 @@ def __init__(self, **config): self.transition_noise = config["transition_noise"] else: p_noise_std = config["transition_noise"] - self.transition_noise = lambda a: a.normal(0, p_noise_std) + self.transition_noise = lambda s, a, rng: rng.normal(0, p_noise_std, s.shape) else: # discrete case self.transition_noise = config["transition_noise"] else: # no transition noise @@ -792,13 +792,13 @@ def __init__(self, **config): self.reset(seed=self.seed_dict["env"]) ) # #TODO Maybe not call it here, since Gym seems to expect to _always_ call this method when using an environment; make this seedable? DO NOT do seed dependent initialization in reset() otherwise the initial state distrbution will always be at the same state at every call to reset()!! (Gym env has its own seed? Yes, it does, as does also space); - self.logger.info( + self.logger.debug( "self.augmented_state, len: " + str(self.augmented_state) + ", " + str(len(self.augmented_state)) ) - self.logger.info( + self.logger.debug( "MDP Playground toy env instantiated with config: " + str(self.config) ) print("MDP Playground toy env instantiated with config: " + str(self.config)) @@ -1286,7 +1286,7 @@ def get_sequences(maximum, length, fraction, repeats=False, diameter=1): # replace=True) # Be careful that sequence_length is less than state space # size sequences.append(specific_sequence) - self.logger.info( + self.logger.debug( "Total no. of rewarded sequences:" + str(len(sequences)) + "Out of" @@ -1303,7 +1303,7 @@ def get_sequences(maximum, length, fraction, repeats=False, diameter=1): for i in range(length): permutations.append(maximum - (i // diameter)) # permutations = list(range(maximum + 1 - length, maximum + 1)) - self.logger.info( + self.logger.debug( "No. of choices for each element in a" " possible sequence (Total no. of permutations will be a" " product of this), no. of possible perms per independent" @@ -1412,7 +1412,7 @@ def get_sequences(maximum, length, fraction, repeats=False, diameter=1): " rewardable sequence when it was generated. No. of" " times a clash was detected:" + str(total_clashes) ) - self.logger.info( + self.logger.debug( "Total no. 
of rewarded sequences:" + str(len(sequences)) + "Out of" @@ -1556,7 +1556,7 @@ def transition_function(self, state, action): new_next_state = self.observation_spaces[0].sample(prob=probs) # random # print("noisy old next_state, new_next_state", next_state, new_next_state) if next_state != new_next_state: - self.logger.info( + self.logger.debug( "NOISE inserted! old next_state, new_next_state" + str(next_state) + str(new_next_state) @@ -1618,19 +1618,22 @@ def transition_function(self, state, action): + str(action) + " out of range of action space. Applying 0 action!!" ) + # if "transition_noise" in self.config: noise_in_transition = ( - self.transition_noise(self._np_random) if self.transition_noise else + self.transition_noise(state, action, self._np_random) if self.transition_noise else np.zeros(self.state_space_dim) ) # #random self.total_abs_noise_in_transition_episode += np.abs(noise_in_transition) + self.logger.debug("total_transitions_episode: " + str(self.total_transitions_episode) + + " Noise in transition: " + str(noise_in_transition)) next_state += noise_in_transition # ##IMP Noise is only applied to # Store the noise in transition for easier testing self.noise_in_transition = noise_in_transition # state and not to higher order derivatives # TODO Check if next_state is within state space bounds if not self.observation_space.contains(next_state): - self.logger.info( + self.logger.debug( "next_state out of bounds. next_state, clipping to" + str(next_state) + str( @@ -1674,7 +1677,7 @@ def transition_function(self, state, action): while True: # Be careful of infinite loops new_action = list(self.action_space.sample()) # #random if new_action != action: - self.logger.info( + self.logger.debug( "NOISE inserted! old action, new_action" + str(action) + str(new_action) @@ -1690,11 +1693,11 @@ def transition_function(self, state, action): # actions -1, 0, 1 represent back, noop, forward respt. next_state.append(state[i] + action[i]) if next_state[i] < 0: - self.logger.info("Underflow in grid next state. Bouncing back.") + self.logger.debug("Underflow in grid next state. Bouncing back.") next_state[i] = 0 if next_state[i] >= self.grid_shape[i]: - self.logger.info("Overflow in grid next state. Bouncing back.") + self.logger.debug("Overflow in grid next state. 
Bouncing back.") next_state[i] = self.grid_shape[i] - 1 else: # if action is from outside allowed action_space @@ -1792,7 +1795,7 @@ def reward_function(self, state, action): # print("self.reward_buffer", self.reward_buffer) del self.reward_buffer[0] - self.logger.info("rew" + str(reward)) + self.logger.debug("rew" + str(reward)) elif self.config["state_space_type"] == "continuous": # ##TODO Make reward for along a line case to be length of line @@ -1826,7 +1829,7 @@ def reward_function(self, state, action): ] data_mean = data_.mean(axis=0) uu, dd, vv = np.linalg.svd(data_ - data_mean) - self.logger.info( + self.logger.debug( "uu.shape, dd.shape, vv.shape =" + str(uu.shape) + str(dd.shape) @@ -1852,7 +1855,7 @@ def reward_function(self, state, action): total_deviation += dist_of_pt_from_line( data_pt, line_end_pts[0], line_end_pts[-1] ) - self.logger.info( + self.logger.debug( "total_deviation of pts from fit line:" + str(total_deviation) ) @@ -1923,8 +1926,8 @@ def reward_function(self, state, action): # print("self.reward_buffer", self.reward_buffer) del self.reward_buffer[0] - noise_in_reward = self.reward_noise(self._np_random) if self.reward_noise else 0 - # #random ###TODO Would be better to parameterise this in terms of state, action and time_step as well. Would need to change implementation to have a queue for the rewards achieved and then pick the reward that was generated delay timesteps ago. + noise_in_reward = self.reward_noise(state, action, self._np_random) if self.reward_noise else 0 + # #random ### TODO Would be better to parameterise this in terms of state, action and time_step as well. Would need to change implementation to have a queue for the rewards achieved and then pick the reward that was generated delay timesteps ago. self.total_abs_noise_in_reward_episode += np.abs(noise_in_reward) self.total_reward_episode += reward self.logger.info("Reward: " + str(reward) + " Noise in reward: " + str(noise_in_reward)) @@ -2024,6 +2027,8 @@ def step(self, action, imaginary_rollout=False): # print("NOISE inserted! old next_state_irrelevant, new_next_state_irrelevant", next_state_irrelevant, new_next_state_irrelevant) # self.total_noisy_transitions_irrelevant_episode += 1 next_state_irrelevant = new_next_state_irrelevant + # No noise is currently added to irrelevant_features parts in continuous or grid spaces + # Transform discrete back to multi-discrete if needed if self.config["state_space_type"] == "discrete": @@ -2049,14 +2054,14 @@ def step(self, action, imaginary_rollout=False): self.reward += ( self.term_state_reward * self.reward_scale ) # Scale before or after? - self.logger.info( + self.logger.debug( "sas'r: " + str(self.augmented_state[-2]) - + " " + + "\n" + str(action) - + " " + + "\n" + str(self.augmented_state[-1]) - + " " + + "\n" + str(self.reward) ) @@ -2147,11 +2152,11 @@ def reset(self, seed=None): p=self.config["irrelevant_init_state_dist"], ) # #random self.curr_state = (self.curr_state_relevant, self.curr_state_irrelevant) - self.logger.info( + self.logger.debug( "RESET called. Relevant part of state reset to:" + str(self.curr_state_relevant) ) - self.logger.info( + self.logger.debug( "Irrelevant part of state reset to:" + str(self.curr_state_irrelevant) ) @@ -2176,7 +2181,7 @@ def reset(self, seed=None): for i in range(len(self.term_spaces)): if self.term_spaces[i].contains(self.curr_state): j = i - self.logger.info( + self.logger.debug( "A state was sampled in term state subspace." " Therefore, resampling. 
State was, subspace was:" + str(self.curr_state) @@ -2213,7 +2218,7 @@ def reset(self, seed=None): self.curr_state = self.feature_space.sample().astype(int) # #random self.curr_state_relevant = list(self.curr_state[[0, 1]]) # #hardcoded if self.is_terminal_state(self.curr_state_relevant): - self.logger.info( + self.logger.debug( "A terminal state was sampled. Therefore," " resampling. State was:" + str(self.curr_state) ) @@ -2238,9 +2243,12 @@ def reset(self, seed=None): self.reached_terminal = False self.total_abs_noise_in_reward_episode = 0 - self.total_abs_noise_in_transition_episode = ( - 0 # only present in continuous spaces - ) + if self.config["state_space_type"] == "continuous": + self.total_abs_noise_in_transition_episode = ( + np.zeros(shape=(self.state_space_dim)) # only present in continuous spaces + ) + else: + self.total_abs_noise_in_transition_episode = None self.total_noisy_transitions_episode = 0 # only present in discrete spaces self.total_reward_episode = 0 self.total_transitions_episode = 0 @@ -2271,7 +2279,7 @@ def seed(self, seed=None): # If seed is None, you get a randomly generated seed from gymnasium.utils... # As of 2024.06.18: # seed_seq = np.random.SeedSequence(seed) - # np_seed = seed_seq.entropy + # np_seed = seed_seq.entropy # Is just the same as the seed above # rng = RandomNumberGenerator(np.random.PCG64(seed_seq)) self._np_random, self.seed_ = gym.utils.seeding.np_random(seed) # #random print( diff --git a/temp.txt b/temp.txt new file mode 100644 index 0000000..b7eaef6 --- /dev/null +++ b/temp.txt @@ -0,0 +1,1985 @@ +Begin logging to: log_file.txt +Running discrete environment + +Seeds set to:{'env': 0, 'relevant_state_space': 5874934615388537134, 'relevant_action_space': 2488343231644625808, 'irrelevant_state_space': 377914054924498011, 'irrelevant_action_space': 152440531369162766, 'state_space': 7501093982645987484, 'action_space': 8418684267946577446, 'image_representations': 5595227450766711102} +Inited terminal states to self.config['terminal_states']: [7 6]. Total 2 +self.relevant_init_state_dist:[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 + 0. 0. 
] +specific_sequence that will be rewarded(3, 4, 1) +specific_sequence that will be rewarded(1, 4, 3) +specific_sequence that will be rewarded(2, 3, 0) +specific_sequence that will be rewarded(3, 5, 4) +specific_sequence that will be rewarded(4, 5, 1) +specific_sequence that will be rewarded(5, 2, 1) +specific_sequence that will be rewarded(5, 4, 2) +specific_sequence that will be rewarded(0, 1, 4) +specific_sequence that will be rewarded(1, 2, 4) +specific_sequence that will be rewarded(3, 0, 1) +specific_sequence that will be rewarded(5, 3, 1) +specific_sequence that will be rewarded(3, 5, 1) +specific_sequence that will be rewarded(4, 3, 5) +specific_sequence that will be rewarded(4, 5, 3) +specific_sequence that will be rewarded(0, 1, 2) +specific_sequence that will be rewarded(1, 5, 2) +specific_sequence that will be rewarded(3, 5, 2) +specific_sequence that will be rewarded(1, 4, 2) +specific_sequence that will be rewarded(4, 0, 2) +specific_sequence that will be rewarded(0, 1, 5) +specific_sequence that will be rewarded(2, 0, 1) +specific_sequence that will be rewarded(3, 1, 2) +specific_sequence that will be rewarded(0, 2, 4) +specific_sequence that will be rewarded(3, 1, 0) +specific_sequence that will be rewarded(5, 2, 3) +specific_sequence that will be rewarded(2, 5, 0) +specific_sequence that will be rewarded(0, 4, 1) +specific_sequence that will be rewarded(2, 4, 3) +specific_sequence that will be rewarded(4, 0, 5) +specific_sequence that will be rewarded(4, 5, 2) +RESET called. curr_state reset to: 3 + self.delay, self.sequence_length:13 +Reward: 0.0 Noise in reward: -0.06605243164565094 + +Running discrete environment with image representations + +Seeds set to:{'env': 0, 'relevant_state_space': 5874934615388537134, 'relevant_action_space': 2488343231644625808, 'irrelevant_state_space': 377914054924498011, 'irrelevant_action_space': 152440531369162766, 'state_space': 7501093982645987484, 'action_space': 8418684267946577446, 'image_representations': 5595227450766711102} +Inited terminal states to self.config['terminal_states']: [7 6]. Total 2 +self.relevant_init_state_dist:[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 + 0. 0. 
] +specific_sequence that will be rewarded(3, 4, 1) +specific_sequence that will be rewarded(1, 4, 3) +specific_sequence that will be rewarded(2, 3, 0) +specific_sequence that will be rewarded(3, 5, 4) +specific_sequence that will be rewarded(4, 5, 1) +specific_sequence that will be rewarded(5, 2, 1) +specific_sequence that will be rewarded(5, 4, 2) +specific_sequence that will be rewarded(0, 1, 4) +specific_sequence that will be rewarded(1, 2, 4) +specific_sequence that will be rewarded(3, 0, 1) +specific_sequence that will be rewarded(5, 3, 1) +specific_sequence that will be rewarded(3, 5, 1) +specific_sequence that will be rewarded(4, 3, 5) +specific_sequence that will be rewarded(4, 5, 3) +specific_sequence that will be rewarded(0, 1, 2) +specific_sequence that will be rewarded(1, 5, 2) +specific_sequence that will be rewarded(3, 5, 2) +specific_sequence that will be rewarded(1, 4, 2) +specific_sequence that will be rewarded(4, 0, 2) +specific_sequence that will be rewarded(0, 1, 5) +specific_sequence that will be rewarded(2, 0, 1) +specific_sequence that will be rewarded(3, 1, 2) +specific_sequence that will be rewarded(0, 2, 4) +specific_sequence that will be rewarded(3, 1, 0) +specific_sequence that will be rewarded(5, 2, 3) +specific_sequence that will be rewarded(2, 5, 0) +specific_sequence that will be rewarded(0, 4, 1) +specific_sequence that will be rewarded(2, 4, 3) +specific_sequence that will be rewarded(4, 0, 5) +specific_sequence that will be rewarded(4, 5, 2) +RESET called. curr_state reset to: 3 + self.delay, self.sequence_length:13 +Reward: 0.0 Noise in reward: -0.06605243164565094 + +Running discrete environment with diameter and image representations + +Seeds set to:{'env': 3, 'relevant_state_space': 789974133212406139, 'relevant_action_space': 2184191404571879930, 'irrelevant_state_space': 7390452496230446618, 'irrelevant_action_space': 5369497044354532241, 'state_space': 868183486707206022, 'action_space': 3994890908985562243, 'image_representations': 4418468347491149040} +Inited terminal states to self.config['terminal_states']: [3 7]. Total 1 +self.relevant_init_state_dist:[0.16666667 0.16666667 0.16666667 0. 0.16666667 0.16666667 + 0.16666667 0. ] +specific_sequence that will be rewarded(2, 4, 0) +specific_sequence that will be rewarded(2, 4, 1) +specific_sequence that will be rewarded(1, 5, 2) +specific_sequence that will be rewarded(0, 5, 1) +specific_sequence that will be rewarded(6, 1, 4) +specific_sequence that will be rewarded(6, 2, 4) +specific_sequence that will be rewarded(5, 2, 4) +specific_sequence that will be rewarded(6, 1, 5) +RESET called. curr_state reset to: 0 + self.delay, self.sequence_length:13 +Reward: 0.0 Noise in reward: -1.2778325156570909 + +Running continuous environment: move_to_a_point + +Seeds set to:{'env': 0, 'relevant_state_space': 5874934615388537134, 'relevant_action_space': 2488343231644625808, 'irrelevant_state_space': 377914054924498011, 'irrelevant_action_space': 152440531369162766, 'state_space': 7501093982645987484, 'action_space': 8418684267946577446, 'image_representations': 5595227450766711102} +RESET called. curr_state reset to: [4.7930346 7.204931 ] + self.delay, self.sequence_length:01 +Noise stats for previous episode num.: 1 (total abs. noise in rewards, total abs. noise in transitions, total reward, total noisy transitions, total transitions): 0 [0. 0.] 0 0 0 +RESET called. 
curr_state reset to: [-4.655305 -2.974354] + self.delay, self.sequence_length:01 +Reward: -0.7221389 Noise in reward: 0 + +Running continuous environment: move_to_a_point with irrelevant features and image representations + +Seeds set to:{'env': 0, 'relevant_state_space': 5874934615388537134, 'relevant_action_space': 2488343231644625808, 'irrelevant_state_space': 377914054924498011, 'irrelevant_action_space': 152440531369162766, 'state_space': 7501093982645987484, 'action_space': 8418684267946577446, 'image_representations': 5595227450766711102} +RESET called. curr_state reset to: [ 4.7930346 7.204931 -4.655305 -2.974354 ] + self.delay, self.sequence_length:01 +Passed config: {'seed': 0, 'state_space_type': 'discrete', 'action_space_size': 8, 'delay': 1, 'sequence_length': 3, 'reward_scale': 2.5, 'reward_shift': -1.75, 'reward_noise': 0.5, 'transition_noise': 0.1, 'reward_density': 0.25, 'make_denser': False, 'terminal_state_density': 0.25, 'maximally_connected': True, 'repeats_in_sequences': False, 'generate_random_mdp': True} + +========================================================Initialising Toy MDP======================================================== +Current working directory: /home/rajanr/mdp-playground +Env SEED set to: 0. Returned seed from Gym: 0 +transition_matrix inited to: +[[0 2 4 7 1 6 5 3] + [6 1 5 2 7 3 4 0] + [6 5 3 2 4 1 0 7] + [0 5 1 4 2 6 3 7] + [3 0 2 4 5 7 6 1] + [6 0 3 7 2 5 1 4] + [6 6 6 6 6 6 6 6] + [7 7 7 7 7 7 7 7]] +Python type of state: +rewardable_sequences: {(3, 4, 1): 1.0, (1, 4, 3): 1.0, (2, 3, 0): 1.0, (3, 5, 4): 1.0, (4, 5, 1): 1.0, (5, 2, 1): 1.0, (5, 4, 2): 1.0, (0, 1, 4): 1.0, (1, 2, 4): 1.0, (3, 0, 1): 1.0, (5, 3, 1): 1.0, (3, 5, 1): 1.0, (4, 3, 5): 1.0, (4, 5, 3): 1.0, (0, 1, 2): 1.0, (1, 5, 2): 1.0, (3, 5, 2): 1.0, (1, 4, 2): 1.0, (4, 0, 2): 1.0, (0, 1, 5): 1.0, (2, 0, 1): 1.0, (3, 1, 2): 1.0, (0, 2, 4): 1.0, (3, 1, 0): 1.0, (5, 2, 3): 1.0, (2, 5, 0): 1.0, (0, 4, 1): 1.0, (2, 4, 3): 1.0, (4, 0, 5): 1.0, (4, 5, 2): 1.0} +MDP Playground toy env instantiated with config: {'seed': 0, 'state_space_type': 'discrete', 'action_space_size': 8, 'delay': 1, 'sequence_length': 3, 'reward_scale': 2.5, 'reward_shift': -1.75, 'reward_noise': 0.5, 'transition_noise': 0.1, 'reward_density': 0.25, 'make_denser': False, 'terminal_state_density': 0.25, 'maximally_connected': True, 'repeats_in_sequences': False, 'generate_random_mdp': True, 'terminal_states': array([7, 6]), 'relevant_init_state_dist': array([0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, + 0.16666667, 0. , 0. ]), 'transition_function': . at 0x7b47a29bd090>} +Taking a step in the environment with a random action and printing the transition: +sars', done = 3 1 -1.9151310791141274 5 False +Passed config: {'seed': 0, 'state_space_type': 'discrete', 'action_space_size': 8, 'image_representations': True, 'delay': 1, 'sequence_length': 3, 'reward_scale': 2.5, 'reward_shift': -1.75, 'reward_noise': 0.5, 'transition_noise': 0.1, 'reward_density': 0.25, 'make_denser': False, 'terminal_state_density': 0.25, 'maximally_connected': True, 'repeats_in_sequences': False, 'generate_random_mdp': True} + +========================================================Initialising Toy MDP======================================================== +Current working directory: /home/rajanr/mdp-playground +Env SEED set to: 0. 
Returned seed from Gym: 0 +transition_matrix inited to: +[[0 2 4 7 1 6 5 3] + [6 1 5 2 7 3 4 0] + [6 5 3 2 4 1 0 7] + [0 5 1 4 2 6 3 7] + [3 0 2 4 5 7 6 1] + [6 0 3 7 2 5 1 4] + [6 6 6 6 6 6 6 6] + [7 7 7 7 7 7 7 7]] +Python type of state: +rewardable_sequences: {(3, 4, 1): 1.0, (1, 4, 3): 1.0, (2, 3, 0): 1.0, (3, 5, 4): 1.0, (4, 5, 1): 1.0, (5, 2, 1): 1.0, (5, 4, 2): 1.0, (0, 1, 4): 1.0, (1, 2, 4): 1.0, (3, 0, 1): 1.0, (5, 3, 1): 1.0, (3, 5, 1): 1.0, (4, 3, 5): 1.0, (4, 5, 3): 1.0, (0, 1, 2): 1.0, (1, 5, 2): 1.0, (3, 5, 2): 1.0, (1, 4, 2): 1.0, (4, 0, 2): 1.0, (0, 1, 5): 1.0, (2, 0, 1): 1.0, (3, 1, 2): 1.0, (0, 2, 4): 1.0, (3, 1, 0): 1.0, (5, 2, 3): 1.0, (2, 5, 0): 1.0, (0, 4, 1): 1.0, (2, 4, 3): 1.0, (4, 0, 5): 1.0, (4, 5, 2): 1.0} +MDP Playground toy env instantiated with config: {'seed': 0, 'state_space_type': 'discrete', 'action_space_size': 8, 'image_representations': True, 'delay': 1, 'sequence_length': 3, 'reward_scale': 2.5, 'reward_shift': -1.75, 'reward_noise': 0.5, 'transition_noise': 0.1, 'reward_density': 0.25, 'make_denser': False, 'terminal_state_density': 0.25, 'maximally_connected': True, 'repeats_in_sequences': False, 'generate_random_mdp': True, 'terminal_states': array([7, 6]), 'relevant_init_state_dist': array([0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, + 0.16666667, 0. , 0. ]), 'transition_function': . at 0x7b47a29bd240>} +Taking a step in the environment with a random action and printing the transition: +sars', done = 3 1 -1.9151310791141274 5 False +Passed config: {'seed': 3, 'state_space_type': 'discrete', 'action_space_size': 4, 'image_representations': True, 'delay': 1, 'diameter': 2, 'sequence_length': 3, 'reward_scale': 2.5, 'reward_shift': -1.75, 'reward_noise': 0.5, 'transition_noise': 0.1, 'reward_density': 0.25, 'make_denser': False, 'terminal_state_density': 0.25, 'maximally_connected': True, 'repeats_in_sequences': False, 'generate_random_mdp': True} + +========================================================Initialising Toy MDP======================================================== +Current working directory: /home/rajanr/mdp-playground +Env SEED set to: 3. Returned seed from Gym: 3 +transition_matrix inited to: +[[6 7 5 4] + [6 5 4 7] + [5 4 6 7] + [3 3 3 3] + [1 2 0 3] + [1 0 2 3] + [0 2 3 1] + [7 7 7 7]] +Python type of state: +rewardable_sequences: {(2, 4, 0): 1.0, (2, 4, 1): 1.0, (1, 5, 2): 1.0, (0, 5, 1): 1.0, (6, 1, 4): 1.0, (6, 2, 4): 1.0, (5, 2, 4): 1.0, (6, 1, 5): 1.0} +MDP Playground toy env instantiated with config: {'seed': 3, 'state_space_type': 'discrete', 'action_space_size': 4, 'image_representations': True, 'delay': 1, 'diameter': 2, 'sequence_length': 3, 'reward_scale': 2.5, 'reward_shift': -1.75, 'reward_noise': 0.5, 'transition_noise': 0.1, 'reward_density': 0.25, 'make_denser': False, 'terminal_state_density': 0.25, 'maximally_connected': True, 'repeats_in_sequences': False, 'generate_random_mdp': True, 'terminal_states': array([3, 7]), 'relevant_init_state_dist': array([0.16666667, 0.16666667, 0.16666667, 0. , 0.16666667, + 0.16666667, 0.16666667, 0. ]), 'transition_function': . 
at 0x7b47a29bcf70>} +Taking a step in the environment with a random action and printing the transition: +sars', done = 0 3 -4.944581289142727 4 False +Passed config: {'seed': 0, 'state_space_type': 'continuous', 'state_space_dim': 2, 'transition_dynamics_order': 1, 'inertia': 1, 'time_unit': 1, 'make_denser': True, 'target_point': [0, 0], 'target_radius': 0.05, 'state_space_max': 10, 'action_space_max': 1, 'action_loss_weight': 0.0, 'reward_function': 'move_to_a_point'} + +========================================================Initialising Toy MDP======================================================== +Current working directory: /home/rajanr/mdp-playground +Env SEED set to: 0. Returned seed from Gym: 0 +MDP Playground toy env instantiated with config: {'seed': 0, 'state_space_type': 'continuous', 'state_space_dim': 2, 'transition_dynamics_order': 1, 'inertia': 1, 'time_unit': 1, 'make_denser': True, 'target_point': [0, 0], 'target_radius': 0.05, 'state_space_max': 10, 'action_space_max': 1, 'action_loss_weight': 0.0, 'reward_function': 'move_to_a_point', 'relevant_indices': range(0, 2)} +Taking a step in the environment with a random action and printing the transition: +sars', done = [-4.655305 -2.974354] [-0.517571 -0.52710384] -0.7221389 [-5.172876 -3.501458] False +Passed config: {'seed': 0, 'state_space_type': 'continuous', 'state_space_dim': 4, 'transition_dynamics_order': 1, 'inertia': 1, 'time_unit': 1, 'make_denser': True, 'target_point': [0, 0], 'target_radius': 0.05, 'state_space_max': 10, 'action_space_max': 1, 'action_loss_weight': 0.0, 'reward_function': 'move_to_a_point', 'image_representations': True, 'irrelevant_features': True, 'relevant_indices': [0, 1]} + +========================================================Initialising Toy MDP======================================================== +Current working directory: /home/rajanr/mdp-playground +Env SEED set to: 0. Returned seed from Gym: 0 +Noise stats for previous episode num.: 1 (total abs. noise in rewards, total abs. noise in transitions, total reward, total noisy transitions, total transitions): 0 [0. 0. 0. 0.] 0 0 0 +RESET called. curr_state reset to: [-1.0326167 0.95215625 -9.128669 5.937066 ] + self.delay, self.sequence_length:01 +Reward: -0.202806 Noise in reward: 0 + +Running continuous environment: move_along_a_line + +Seeds set to:{'env': 0, 'relevant_state_space': 5874934615388537134, 'relevant_action_space': 2488343231644625808, 'irrelevant_state_space': 377914054924498011, 'irrelevant_action_space': 152440531369162766, 'state_space': 7501093982645987484, 'action_space': 8418684267946577446, 'image_representations': 5595227450766711102} +RESET called. curr_state reset to: [-2.957625 1.4144933 0.14378542 -0.54651916] + self.delay, self.sequence_length:010 +Noise stats for previous episode num.: 1 (total abs. noise in rewards, total abs. noise in transitions, total reward, total noisy transitions, total transitions): 0 [0. 0. 0. 0.] 0 0 0 +RESET called. curr_state reset to: [-0.9091467 -0.7278909 -0.53039795 0.8469715 ] + self.delay, self.sequence_length:010 +Reward: 0.0 Noise in reward: -0.0535669373161111 + +Running grid environment: move_to_a_point + +Seeds set to:{'env': 0, 'relevant_state_space': 5874934615388537134, 'relevant_action_space': 2488343231644625808, 'irrelevant_state_space': 377914054924498011, 'irrelevant_action_space': 152440531369162766, 'state_space': 7501093982645987484, 'action_space': 8418684267946577446, 'image_representations': 5595227450766711102} +RESET called. 
curr_state reset to: [5 6] + self.delay, self.sequence_length:01 +Reward: -1.0 Noise in reward: 0 +Reward: -1.0 Noise in reward: 0 +Reward: -1.0 Noise in reward: 0 +Reward: 1.0 Noise in reward: 0 +/home/rajanr/mdp-playground/mdp_playground/envs/rl_toy_env.py:1705: UserWarning: WARNING: Action [0.5, -0.5] out of range of action space. Applying noop action!! + warnings.warn( +Reward: 0.0 Noise in reward: 0 +/home/rajanr/mdp-playground/mdp_playground/envs/rl_toy_env.py:1705: UserWarning: WARNING: Action [1, 2] out of range of action space. Applying noop action!! + warnings.warn( +Reward: 0.0 Noise in reward: 0 +/home/rajanr/mdp-playground/mdp_playground/envs/rl_toy_env.py:1705: UserWarning: WARNING: Action [1, 1] out of range of action space. Applying noop action!! + warnings.warn( +Reward: 0.0 Noise in reward: 0 +Reward: 0.0 Noise in reward: 0 +Noise stats for previous episode num.: 1 (total abs. noise in rewards, total abs. noise in transitions, total reward, total noisy transitions, total transitions): 0 None -2.0 0 8 +RESET called. curr_state reset to: [2 2] + self.delay, self.sequence_length:01 + +Running grid environment: move_to_a_point with image representations + +Seeds set to:{'env': 0, 'relevant_state_space': 5874934615388537134, 'relevant_action_space': 2488343231644625808, 'irrelevant_state_space': 377914054924498011, 'irrelevant_action_space': 152440531369162766, 'state_space': 7501093982645987484, 'action_space': 8418684267946577446, 'image_representations': 5595227450766711102} +RESET called. curr_state reset to: [5 6] + self.delay, self.sequence_length:01 +Reward: -1.0 Noise in reward: 0 +Reward: -1.0 Noise in reward: 0 +Reward: -1.0 Noise in reward: 0 +Reward: 1.0 Noise in reward: 0 +Reward: 0.0 Noise in reward: 0 +Reward: 0.0 Noise in reward: 0 +Noise stats for previous episode num.: 1 (total abs. noise in rewards, total abs. noise in transitions, total reward, total noisy transitions, total transitions): 0 None -2.0 0 6 +RESET called. curr_state reset to: [2 2] + self.delay, self.sequence_length:01 + +Running Atari wrapper example: + +A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7) +[Powered by Stella] +DEBUG:mdp_playground.envs.gym_env_wrapper:sas'or: [[[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + ... + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]]] 0 [[[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + ... + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]]] [[[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + ... + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... 
+ ... [middle of the Atari wrapper example's DEBUG sas'or dumps elided: they consist of all-zero image observation arrays together with the sampled action and noisy reward for each step; the tail of the final dump follows] ...
+ [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + ... + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]]] 0.10425133694426776 +DEBUG:mdp_playground.envs.gym_env_wrapper:sas'or: [[[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + ... + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]]] 0 [[[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + ... + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]]] [[[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + ... + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]] + + [[0 0 0] + [0 0 0] + [0 0 0] + ... + [0 0 0] + [0 0 0] + [0 0 0]]] 0.1366463470549686 + +Running Mujoco wrapper example: + +INFO:mdp_playground: +Running Mujoco wrapper example: + +DEBUG:mdp_playground.envs.gym_env_wrapper:total_transitions_episode: 1 Noise in transition: [ 0.16010566 0.02622503 -0.13391734 0.09039876 0.32600001 0.23677024 + -0.17593381 -0.31635537 -0.15581862 0.01033149 -0.58125769 -0.05469792 + -0.31147774 -0.18306684 -0.13606475 -0.07907504 0.10290763] +DEBUG:mdp_playground.envs.gym_env_wrapper:sas'or: [-0.04604266 -0.0918053 -0.09669447 0.06265405 0.08255112 0.02132716 + 0.04589931 0.008725 -0.12654215 -0.06232745 0.0041326 -0.23250308 + -0.02187917 -0.12459109 -0.07322674 -0.0544259 -0.03163002] [-0.04805083 0.3459275 -0.2552678 0.2436387 -0.4248672 -0.01168279] [ 0.10950414 -0.06336018 -0.2275261 0.18317325 0.3461872 0.28841474 + -0.18688053 -0.30771495 -0.13111082 -0.28758849 -0.47827629 0.28402364 + 1.37572029 -3.87323739 1.81738214 -3.64180002 0.10067344] [ 0.10950414 -0.06336018 -0.2275261 0.18317325 0.3461872 0.28841474 + -0.18688053 -0.30771495 -0.13111082 -0.28758849 -0.47827629 0.28402364 + 1.37572029 -3.87323739 1.81738214 -3.64180002 0.10067344] -0.024188971266994975 +DEBUG:mdp_playground.envs.gym_env_wrapper:total_transitions_episode: 1 Noise in transition: [ 0.16010566 0.02622503 -0.13391734 0.09039876 0.32600001 0.23677024 + -0.17593381 -0.31635537 -0.15581862 0.01033149 -0.58125769 -0.05469792 + -0.31147774 -0.18306684 -0.13606475 -0.07907504 0.10290763 0.26062834 + -0.03213367 0.34161587 -0.16629867 0.08787752 0.22586755] +DEBUG:mdp_playground.envs.gym_env_wrapper:sas'or: [ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 + 0.00000000e+00 0.00000000e+00 0.00000000e+00 3.13270239e-03 + 4.12755577e-03 1.06635776e-03 2.29496561e-03 4.36249915e-04 + 4.35072424e-03 3.15853554e-03 8.21000000e-01 -6.00000000e-01 + 0.00000000e+00 2.56611054e-01 -3.37707943e-01 -2.75000000e-01 
+ 4.50000000e-01 -5.00000000e-02 -3.23000000e-01] [-0.09610166 0.691855 -0.5105356 0.4872774 -0.8497344 -0.02336558 + -0.56485224] [ 0.16013751 0.02655736 -0.13673393 0.090879 0.32045312 0.23664117 + -0.17957025 -0.31526854 -0.12720501 -0.17721969 -0.58767688 -0.42326442 + -0.32038087 -0.4256351 0.6849352 -0.67905798 0.10269957 0.5172394 + -0.36984161 0.06661587 0.28370133 0.03787752 -0.09713245] [ 0.16013751 0.02655736 -0.13673393 0.090879 0.32045312 0.23664117 + -0.17957025 -0.31526854 -0.12720501 -0.17721969 -0.58767688 -0.42326442 + -0.32038087 -0.4256351 0.6849352 -0.67905798 0.10269957 0.5172394 + -0.36984161 0.06661587 0.28370133 0.03787752 -0.09713245] -0.44646933818807744 +MDP Playground toy env instantiated with config: {'seed': 0, 'state_space_type': 'continuous', 'state_space_dim': 4, 'transition_dynamics_order': 1, 'inertia': 1, 'time_unit': 1, 'make_denser': True, 'target_point': [0, 0], 'target_radius': 0.05, 'state_space_max': 10, 'action_space_max': 1, 'action_loss_weight': 0.0, 'reward_function': 'move_to_a_point', 'image_representations': True, 'irrelevant_features': True, 'relevant_indices': [0, 1]} +Taking a step in the environment with a random action and printing the transition: +sars', done = [-1.0326167 0.95215625 -9.128669 5.937066 ] [-0.517571 -0.52710384 -0.11979694 -0.59993315] -0.202806 [-1.5501877 0.4250524 -9.248466 5.337133 ] False +Passed config: {'seed': 0, 'state_space_type': 'continuous', 'state_space_dim': 4, 'transition_dynamics_order': 1, 'inertia': 1, 'time_unit': 1, 'delay': 0, 'sequence_length': 10, 'reward_scale': 1.0, 'reward_noise': 0.1, 'transition_noise': 0.1, 'reward_function': 'move_along_a_line'} + +========================================================Initialising Toy MDP======================================================== +Current working directory: /home/rajanr/mdp-playground +Env SEED set to: 0. Returned seed from Gym: 0 +MDP Playground toy env instantiated with config: {'seed': 0, 'state_space_type': 'continuous', 'state_space_dim': 4, 'transition_dynamics_order': 1, 'inertia': 1, 'time_unit': 1, 'delay': 0, 'sequence_length': 10, 'reward_scale': 1.0, 'reward_noise': 0.1, 'transition_noise': 0.1, 'reward_function': 'move_along_a_line', 'relevant_indices': range(0, 4)} +Taking a step in the environment with a random action and printing the transition: +sars', done = [-0.9091467 -0.7278909 -0.53039795 0.8469715 ] [-2.6563277 0.6132033 -0.574738 1.8175125] -0.0535669373161111 [-3.5529015 -0.12789811 -1.0410937 2.674974 ] False +Passed config: {'seed': 0, 'state_space_type': 'grid', 'grid_shape': (8, 8), 'reward_function': 'move_to_a_point', 'make_denser': True, 'target_point': [5, 5]} + +========================================================Initialising Toy MDP======================================================== +Current working directory: /home/rajanr/mdp-playground +Env SEED set to: 0. 
Returned seed from Gym: 0 +MDP Playground toy env instantiated with config: {'seed': 0, 'state_space_type': 'grid', 'grid_shape': (8, 8), 'reward_function': 'move_to_a_point', 'make_denser': True, 'target_point': [5, 5]} +sars', done = [np.int64(5), np.int64(6)] [0, 1] -1.0 [np.int64(5), np.int64(7)] False +sars', done = [np.int64(5), np.int64(7)] [-1, 0] -1.0 [np.int64(4), np.int64(7)] False +sars', done = [np.int64(4), np.int64(7)] [-1, 0] -1.0 [np.int64(3), np.int64(7)] False +sars', done = [np.int64(3), np.int64(7)] [1, 0] 1.0 [np.int64(4), np.int64(7)] False +sars', done = [np.int64(4), np.int64(7)] [0.5, -0.5] 0.0 [np.int64(4), np.int64(7)] False +sars', done = [np.int64(4), np.int64(7)] [1, 2] 0.0 [np.int64(4), np.int64(7)] False +sars', done = [np.int64(4), np.int64(7)] [1, 1] 0.0 [np.int64(4), np.int64(7)] False +sars', done = [np.int64(4), np.int64(7)] [0, 1] 0.0 [np.int64(4), np.int64(7)] False +Passed config: {'seed': 0, 'state_space_type': 'grid', 'grid_shape': (8, 8), 'reward_function': 'move_to_a_point', 'make_denser': True, 'target_point': [5, 5], 'image_representations': True, 'terminal_states': [[5, 5], [2, 3], [2, 4], [3, 3], [3, 4]]} + +========================================================Initialising Toy MDP======================================================== +Current working directory: /home/rajanr/mdp-playground +Env SEED set to: 0. Returned seed from Gym: 0 +MDP Playground toy env instantiated with config: {'seed': 0, 'state_space_type': 'grid', 'grid_shape': (8, 8), 'reward_function': 'move_to_a_point', 'make_denser': True, 'target_point': [5, 5], 'image_representations': True, 'terminal_states': [[5, 5], [2, 3], [2, 4], [3, 3], [3, 4]]} +sars', done = [np.int64(5), np.int64(6)] [0, 1] -1.0 [np.int64(5), np.int64(7)] False +sars', done = [np.int64(5), np.int64(7)] [-1, 0] -1.0 [np.int64(4), np.int64(7)] False +sars', done = [np.int64(4), np.int64(7)] [-1, 0] -1.0 [np.int64(3), np.int64(7)] False +sars', done = [np.int64(3), np.int64(7)] [1, 0] 1.0 [np.int64(4), np.int64(7)] False +sars', done = [np.int64(4), np.int64(7)] [0.5, -0.5] 0.0 [np.int64(4), np.int64(7)] False +sars', done = [np.int64(4), np.int64(7)] [1, 2] 0.0 [np.int64(4), np.int64(7)] False +Logger name: mdp_playground.envs.gym_env_wrapper +Logger level set to: 0 +Env SEED set to: 0. Returned seed from Gym: 0 +Taking 10 steps in the environment with a random action and printing the transition: +s.shape a r s'.shape, done = (210, 160, 3) 0 0.010490011715303971 (210, 160, 3) False +s.shape a r s'.shape, done = (210, 160, 3) 2 0.03615950549094848 (210, 160, 3) False +s.shape a r s'.shape, done = (210, 160, 3) 3 0.09470809631292422 (210, 160, 3) False +s.shape a r s'.shape, done = (210, 160, 3) 5 -0.12654214710460526 (210, 160, 3) False +s.shape a r s'.shape, done = (210, 160, 3) 5 0.00413259793472436 (210, 160, 3) False +s.shape a r s'.shape, done = (210, 160, 3) 1 -0.021879166393254573 (210, 160, 3) False +s.shape a r s'.shape, done = (210, 160, 3) 2 -0.07322673547034517 (210, 160, 3) False +s.shape a r s'.shape, done = (210, 160, 3) 4 -0.031630015636915455 (210, 160, 3) False +s.shape a r s'.shape, done = (210, 160, 3) 5 0.10425133694426776 (210, 160, 3) False +s.shape a r s'.shape, done = (210, 160, 3) 0 0.1366463470549686 (210, 160, 3) False +Setting Mujoco self.action_space.low, self.action_space.high from: [-1. -1. -1. -1. -1. -1.] [1. 1. 1. 1. 1. 1.] 
+to: [-0.5 -0.5 -0.5 -0.5 -0.5 -0.5] [0.5 0.5 0.5 0.5 0.5 0.5] +Original frame_skip for Mujoco Env: 5 +Setting Mujoco self.frame_skip to 2 corresponding to time_unit in config. +Setting Mujoco self._ctrl_cost_weight, self._forward_reward_weight to 0.05 0.5 corresponding to time_unit in config. +Logger name: mdp_playground.envs.gym_env_wrapper +Logger level set to: 0 +Env SEED set to: 0. Returned seed from Gym: 0 +Taking a step in the environment with a random action and printing the transition: +sars', done = [-0.04604266 -0.0918053 -0.09669447 0.06265405 0.08255112 0.02132716 + 0.04589931 0.008725 -0.12654215 -0.06232745 0.0041326 -0.23250308 + -0.02187917 -0.12459109 -0.07322674 -0.0544259 -0.03163002] [-0.04805083 0.3459275 -0.2552678 0.2436387 -0.4248672 -0.01168279] -0.024188971266994975 [ 0.10950414 -0.06336018 -0.2275261 0.18317325 0.3461872 0.28841474 + -0.18688053 -0.30771495 -0.13111082 -0.28758849 -0.47827629 0.28402364 + 1.37572029 -3.87323739 1.81738214 -3.64180002 0.10067344] False +Setting Mujoco self.action_space.low, self.action_space.high from: [-2. -2. -2. -2. -2. -2. -2.] [2. 2. 2. 2. 2. 2. 2.] +to: [-1. -1. -1. -1. -1. -1. -1.] [1. 1. 1. 1. 1. 1. 1.] +Original frame_skip for Mujoco Env: 5 +Setting Mujoco self.frame_skip to 2 corresponding to time_unit in config. +Current mujoco env is not HalfCheetah v4, so only modified frameskip when changing time_unit. Not changing the _ctrl_cost_weight or _forward_reward_weight. It may make sense to also modify these variables depending on their relation with the time_unit. You will need to look deeper into how the reward function is defined to know if this is needed. +Logger name: mdp_playground.envs.gym_env_wrapper +Logger level set to: 0 +Env SEED set to: 0. Returned seed from Gym: 0 +Taking a step in the environment with a random action and printing the transition: +sars', done = [ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 + 0.00000000e+00 0.00000000e+00 0.00000000e+00 3.13270239e-03 + 4.12755577e-03 1.06635776e-03 2.29496561e-03 4.36249915e-04 + 4.35072424e-03 3.15853554e-03 8.21000000e-01 -6.00000000e-01 + 0.00000000e+00 2.56611054e-01 -3.37707943e-01 -2.75000000e-01 + 4.50000000e-01 -5.00000000e-02 -3.23000000e-01] [-0.09610166 0.691855 -0.5105356 0.4872774 -0.8497344 -0.02336558 + -0.56485224] -0.44646933818807744 DEBUG:mdp_playground.envs.gym_env_wrapper:total_transitions_episode: 1 Noise in transition: [ 0.16010566 0.02622503 -0.13391734 0.09039876 0.32600001 0.23677024 + -0.17593381 -0.31635537 -0.15581862 0.01033149 -0.58125769] +DEBUG:mdp_playground.envs.gym_env_wrapper:sas'or: [ 9.99624853e-01 9.98940224e-01 2.73889120e-02 -4.60263911e-02 + 4.26543103e-02 9.17986244e-02 4.36249915e-04 4.35072424e-03 + 1.67289045e-01 -9.11111494e-02 0.00000000e+00] [-0.04805083 0.3459275 ] [ 1.15974342 1.02531983 -0.10700366 0.04785959 0.36865432 0.32856887 + -0.27129071 0.3762751 0.01147739 -0.08049602 -0.58125769] [ 1.15974342 1.02531983 -0.10700366 0.04785959 0.36865432 0.32856887 + -0.27129071 0.3762751 0.01147739 -0.08049602 -0.58125769] -0.15623291976154724 +Seeds set to:{'env': None, 'relevant_state_space': 9004017047643832299, 'relevant_action_space': 4834550647764529712, 'irrelevant_state_space': 3717188456764710370, 'irrelevant_action_space': 8356878982445552517, 'state_space': 3589039499206773928, 'action_space': 6966239785405507687, 'image_representations': 6875145353107871371} +WARNING:mdp_playground.envs.rl_toy_env:Seeds set to:{'env': None, 'relevant_state_space': 9004017047643832299, 
'relevant_action_space': 4834550647764529712, 'irrelevant_state_space': 3717188456764710370, 'irrelevant_action_space': 8356878982445552517, 'state_space': 3589039499206773928, 'action_space': 6966239785405507687, 'image_representations': 6875145353107871371} +Inited terminal states to self.config['terminal_states']: [7 6]. Total 2 +WARNING:mdp_playground.envs.rl_toy_env:Inited terminal states to self.config['terminal_states']: [7 6]. Total 2 +self.relevant_init_state_dist:[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 + 0. 0. ] +WARNING:mdp_playground.envs.rl_toy_env:self.relevant_init_state_dist:[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 + 0. 0. ] +DEBUG:mdp_playground.envs.rl_toy_env:No. of choices for each element in a possible sequence (Total no. of permutations will be a product of this), no. of possible perms per independent set: [6], 6 +DEBUG:mdp_playground.envs.rl_toy_env:Number of generated sequences that did not clash with an existing one when it was generated:0 +DEBUG:mdp_playground.envs.rl_toy_env:Total no. of rewarded sequences:1Out of6per independent set +specific_sequence that will be rewarded(4,) +WARNING:mdp_playground.envs.rl_toy_env:specific_sequence that will be rewarded(4,) +RESET called. curr_state reset to: 3 +INFO:mdp_playground.envs.rl_toy_env:RESET called. curr_state reset to: 3 + self.delay, self.sequence_length:01 +INFO:mdp_playground.envs.rl_toy_env: self.delay, self.sequence_length:01 +DEBUG:mdp_playground.envs.rl_toy_env:self.augmented_state, len: [nan, 3], 2 +DEBUG:mdp_playground.envs.rl_toy_env:MDP Playground toy env instantiated with config: {'state_space_size': 8, 'action_space_size': 8, 'state_space_type': 'discrete', 'action_space_type': 'discrete', 'terminal_state_density': 0.25, 'maximally_connected': True, 'terminal_states': array([7, 6]), 'relevant_init_state_dist': array([0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, + 0.16666667, 0. , 0. ]), 'transition_function': . at 0x7b479f72e440>} +Seeds set to:{'env': None, 'relevant_state_space': 7063060476891979449, 'relevant_action_space': 1709824039924999301, 'irrelevant_state_space': 6643936351744273560, 'irrelevant_action_space': 4629118677738699341, 'state_space': 6986896399128717799, 'action_space': 1441446296418599557, 'image_representations': 228710266027316918} +WARNING:mdp_playground.envs.rl_toy_env:Seeds set to:{'env': None, 'relevant_state_space': 7063060476891979449, 'relevant_action_space': 1709824039924999301, 'irrelevant_state_space': 6643936351744273560, 'irrelevant_action_space': 4629118677738699341, 'state_space': 6986896399128717799, 'action_space': 1441446296418599557, 'image_representations': 228710266027316918} +Inited terminal states to self.config['terminal_states']: [7 6]. Total 2 +WARNING:mdp_playground.envs.rl_toy_env:Inited terminal states to self.config['terminal_states']: [7 6]. Total 2 +self.relevant_init_state_dist:[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 + 0. 0. ] +WARNING:mdp_playground.envs.rl_toy_env:self.relevant_init_state_dist:[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 + 0. 0. ] +DEBUG:mdp_playground.envs.rl_toy_env:No. of choices for each element in a possible sequence (Total no. of permutations will be a product of this), no. of possible perms per independent set: [6], 6 +DEBUG:mdp_playground.envs.rl_toy_env:Number of generated sequences that did not clash with an existing one when it was generated:0 +DEBUG:mdp_playground.envs.rl_toy_env:Total no. 
of rewarded sequences:1Out of6per independent set +specific_sequence that will be rewarded(1,) +WARNING:mdp_playground.envs.rl_toy_env:specific_sequence that will be rewarded(1,) +RESET called. curr_state reset to: 4 +INFO:mdp_playground.envs.rl_toy_env:RESET called. curr_state reset to: 4 + self.delay, self.sequence_length:01 +INFO:mdp_playground.envs.rl_toy_env: self.delay, self.sequence_length:01 +DEBUG:mdp_playground.envs.rl_toy_env:self.augmented_state, len: [nan, 4], 2 +DEBUG:mdp_playground.envs.rl_toy_env:MDP Playground toy env instantiated with config: {'state_space_size': 8, 'action_space_size': 8, 'state_space_type': 'discrete', 'action_space_type': 'discrete', 'maximally_connected': True, 'terminal_states': array([7, 6]), 'relevant_init_state_dist': array([0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, + 0.16666667, 0. , 0. ]), 'transition_function': . at 0x7b482737b640>} +/home/rajanr/anaconda3/envs/unc/lib/python3.10/site-packages/gymnasium/utils/passive_env_checker.py:181: DeprecationWarning: WARN: Current gymnasium version requires that `Env.reset` can be passed `options` to allow the environment initialisation to be passed additional information. + logger.deprecation( +Noise stats for previous episode num.: 1 (total abs. noise in rewards, total abs. noise in transitions, total reward, total noisy transitions, total transitions): 0 None 0 0 0 +INFO:mdp_playground.envs.rl_toy_env:Noise stats for previous episode num.: 1 (total abs. noise in rewards, total abs. noise in transitions, total reward, total noisy transitions, total transitions): 0 None 0 0 0 +RESET called. curr_state reset to: 5 +INFO:mdp_playground.envs.rl_toy_env:RESET called. curr_state reset to: 5 + self.delay, self.sequence_length:01 +INFO:mdp_playground.envs.rl_toy_env: self.delay, self.sequence_length:01 +DEBUG:mdp_playground.envs.rl_toy_env:state_considered for reward:[5, 4] with delay 0 +DEBUG:mdp_playground.envs.rl_toy_env:rew0.0 +Reward: 0.0 Noise in reward: 0 +INFO:mdp_playground.envs.rl_toy_env:Reward: 0.0 Noise in reward: 0 +DEBUG:mdp_playground.envs.rl_toy_env:sas'r: 5 3 4 0.0 +(4, 0.0, False, False, {'curr_state': 4, 'curr_obs': 4, 'augmented_state': [5, 4]}) +INFO:mdp_playground:(4, 0.0, False, False, {'curr_state': 4, 'curr_obs': 4, 'augmented_state': [5, 4]}) +DEBUG:mdp_playground.envs.rl_toy_env:state_considered for reward:[4, 6] with delay 0 +DEBUG:mdp_playground.envs.rl_toy_env:rew0.0 +Reward: 0.0 Noise in reward: 0 +INFO:mdp_playground.envs.rl_toy_env:Reward: 0.0 Noise in reward: 0 +DEBUG:mdp_playground.envs.rl_toy_env:sas'r: 4 0 6 0.0 +(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [4, 6]}) +INFO:mdp_playground:(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [4, 6]}) +DEBUG:mdp_playground.envs.rl_toy_env:state_considered for reward:[6, 6] with delay 0 +DEBUG:mdp_playground.envs.rl_toy_env:rew0.0 +Reward: 0.0 Noise in reward: 0 +INFO:mdp_playground.envs.rl_toy_env:Reward: 0.0 Noise in reward: 0 +DEBUG:mdp_playground.envs.rl_toy_env:sas'r: 6 0 6 0.0 +(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +INFO:mdp_playground:(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +DEBUG:mdp_playground.envs.rl_toy_env:state_considered for reward:[6, 6] with delay 0 +DEBUG:mdp_playground.envs.rl_toy_env:rew0.0 +Reward: 0.0 Noise in reward: 0 +INFO:mdp_playground.envs.rl_toy_env:Reward: 0.0 Noise in reward: 0 +DEBUG:mdp_playground.envs.rl_toy_env:sas'r: 6 5 6 0.0 +(6, 0.0, 
True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +INFO:mdp_playground:(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +DEBUG:mdp_playground.envs.rl_toy_env:state_considered for reward:[6, 6] with delay 0 +DEBUG:mdp_playground.envs.rl_toy_env:rew0.0 +Reward: 0.0 Noise in reward: 0 +INFO:mdp_playground.envs.rl_toy_env:Reward: 0.0 Noise in reward: 0 +DEBUG:mdp_playground.envs.rl_toy_env:sas'r: 6 2 6 0.0 +(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +INFO:mdp_playground:(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +DEBUG:mdp_playground.envs.rl_toy_env:state_considered for reward:[6, 6] with delay 0 +DEBUG:mdp_playground.envs.rl_toy_env:rew0.0 +Reward: 0.0 Noise in reward: 0 +INFO:mdp_playground.envs.rl_toy_env:Reward: 0.0 Noise in reward: 0 +DEBUG:mdp_playground.envs.rl_toy_env:sas'r: 6 0 6 0.0 +(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +INFO:mdp_playground:(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +DEBUG:mdp_playground.envs.rl_toy_env:state_considered for reward:[6, 6] with delay 0 +DEBUG:mdp_playground.envs.rl_toy_env:rew0.0 +Reward: 0.0 Noise in reward: 0 +INFO:mdp_playground.envs.rl_toy_env:Reward: 0.0 Noise in reward: 0 +DEBUG:mdp_playground.envs.rl_toy_env:sas'r: 6 4 6 0.0 +(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +INFO:mdp_playground:(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +DEBUG:mdp_playground.envs.rl_toy_env:state_considered for reward:[6, 6] with delay 0 +DEBUG:mdp_playground.envs.rl_toy_env:rew0.0 +Reward: 0.0 Noise in reward: 0 +INFO:mdp_playground.envs.rl_toy_env:Reward: 0.0 Noise in reward: 0 +DEBUG:mdp_playground.envs.rl_toy_env:sas'r: 6 4 6 0.0 +(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +INFO:mdp_playground:(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +DEBUG:mdp_playground.envs.rl_toy_env:state_considered for reward:[6, 6] with delay 0 +DEBUG:mdp_playground.envs.rl_toy_env:rew0.0 +Reward: 0.0 Noise in reward: 0 +INFO:mdp_playground.envs.rl_toy_env:Reward: 0.0 Noise in reward: 0 +DEBUG:mdp_playground.envs.rl_toy_env:sas'r: 6 5 6 0.0 +(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +INFO:mdp_playground:(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +DEBUG:mdp_playground.envs.rl_toy_env:state_considered for reward:[6, 6] with delay 0 +DEBUG:mdp_playground.envs.rl_toy_env:rew0.0 +Reward: 0.0 Noise in reward: 0 +INFO:mdp_playground.envs.rl_toy_env:Reward: 0.0 Noise in reward: 0 +DEBUG:mdp_playground.envs.rl_toy_env:sas'r: 6 4 6 0.0 +(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +INFO:mdp_playground:(6, 0.0, True, False, {'curr_state': 6, 'curr_obs': 6, 'augmented_state': [6, 6]}) +[ 0.16013751 0.02655736 -0.13673393 0.090879 0.32045312 0.23664117 + -0.17957025 -0.31526854 -0.12720501 -0.17721969 -0.58767688 -0.42326442 + -0.32038087 -0.4256351 0.6849352 -0.67905798 0.10269957 0.5172394 + -0.36984161 0.06661587 0.28370133 0.03787752 -0.09713245] False +Setting Mujoco self.action_space.low, self.action_space.high from: [-1. -1.] [1. 1.] +to: [-0.5 -0.5] [0.5 0.5] +Original frame_skip for Mujoco Env: 2 +Setting Mujoco self.frame_skip to 1 corresponding to time_unit in config. 
+Current mujoco env is not HalfCheetah v4, so only modified frameskip when changing time_unit. Not changing the _ctrl_cost_weight or _forward_reward_weight. It may make sense to also modify these variables depending on their relation with the time_unit. You will need to look deeper into how the reward function is defined to know if this is needed. +Logger name: mdp_playground.envs.gym_env_wrapper +Logger level set to: 0 +Env SEED set to: 0. Returned seed from Gym: 0 +Taking a step in the environment with a random action and printing the transition: +sars', done = [ 9.99624853e-01 9.98940224e-01 2.73889120e-02 -4.60263911e-02 + 4.26543103e-02 9.17986244e-02 4.36249915e-04 4.35072424e-03 + 1.67289045e-01 -9.11111494e-02 0.00000000e+00] [-0.04805083 0.3459275 ] -0.15623291976154724 [ 1.15974342 1.02531983 -0.10700366 0.04785959 0.36865432 0.32856887 + -0.27129071 0.3762751 0.01147739 -0.08049602 -0.58125769] False +Passed config: {} + +========================================================Initialising Toy MDP======================================================== +Current working directory: /home/rajanr/mdp-playground +Env SEED set to: None. Returned seed from Gym: 69231007977469579473528112087359450090 +transition_matrix inited to: +[[1 3 7 4 6 0 5 2] + [1 0 7 4 5 2 6 3] + [6 5 4 1 7 3 2 0] + [7 0 6 4 3 5 2 1] + [6 3 5 0 1 2 4 7] + [6 4 5 0 7 1 3 2] + [6 6 6 6 6 6 6 6] + [7 7 7 7 7 7 7 7]] +Python type of state: +rewardable_sequences: {(4,): 1.0} +MDP Playground toy env instantiated with config: {'state_space_size': 8, 'action_space_size': 8, 'state_space_type': 'discrete', 'action_space_type': 'discrete', 'terminal_state_density': 0.25, 'maximally_connected': True, 'terminal_states': array([7, 6]), 'relevant_init_state_dist': array([0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, + 0.16666667, 0. , 0. ]), 'transition_function': . at 0x7b479f72e440>} +Passed config: {'state_space_size': 8, 'action_space_size': 8, 'state_space_type': 'discrete', 'action_space_type': 'discrete', 'maximally_connected': True} + +========================================================Initialising Toy MDP======================================================== +Current working directory: /home/rajanr/mdp-playground +Env SEED set to: None. Returned seed from Gym: 210298697754387723685784222489111545910 +transition_matrix inited to: +[[3 1 2 5 0 4 7 6] + [1 0 5 7 3 2 4 6] + [7 4 5 1 0 3 6 2] + [2 7 3 4 1 6 5 0] + [6 3 5 7 4 1 2 0] + [5 3 6 4 7 2 1 0] + [6 6 6 6 6 6 6 6] + [7 7 7 7 7 7 7 7]] +Python type of state: +rewardable_sequences: {(1,): 1.0} +MDP Playground toy env instantiated with config: {'state_space_size': 8, 'action_space_size': 8, 'state_space_type': 'discrete', 'action_space_type': 'discrete', 'maximally_connected': True, 'terminal_states': array([7, 6]), 'relevant_init_state_dist': array([0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, + 0.16666667, 0. , 0. ]), 'transition_function': . at 0x7b482737b640>}
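For reference, the discrete toy env whose last "Passed config" is printed above can be re-created directly. The following is a minimal sketch, assuming RLToyEnv accepts the config keys as keyword arguments and that reset()/step() follow the Gymnasium tuple API seen in the transitions logged above; the 'seed' key is an addition for reproducibility and was not part of that logged config.

from mdp_playground.envs import RLToyEnv

# Config mirroring the last "Passed config" in the log; 'seed' added here only
# so that repeated runs produce the same transition matrix and start states.
config = {
    "seed": 0,
    "state_space_size": 8,
    "action_space_size": 8,
    "state_space_type": "discrete",
    "action_space_type": "discrete",
    "maximally_connected": True,
}

env = RLToyEnv(**config)  # assumption: config keys are passed as kwargs
state, _ = env.reset()

for _ in range(3):
    action = env.action_space.sample()
    # step() is assumed to return (obs, reward, done, trunc, info), matching
    # the 5-element tuples printed in the log above.
    next_state, reward, done, trunc, info = env.step(action)
    print("sars', done =", state, action, reward, next_state, done)
    state = next_state

env.close()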