From ce5aa6a8f4960abcc8440c858c765a17c76d4fd0 Mon Sep 17 00:00:00 2001
From: Raghu Rajan
Date: Thu, 21 Nov 2024 17:37:59 +0100
Subject: [PATCH] Allow dtype_s and dtype_o of toy envs to be set for the
 underlying state space and observation space, respectively (action_space is
 currently set to the same dtype as the state space); partially fix some test
 cases.

---
 mdp_playground/envs/rl_toy_env.py | 56 +++++++++++++++++++------------
 tests/test_mdp_playground.py      | 17 +++++-----
 2 files changed, 44 insertions(+), 29 deletions(-)

diff --git a/mdp_playground/envs/rl_toy_env.py b/mdp_playground/envs/rl_toy_env.py
index fd97960..88b8001 100644
--- a/mdp_playground/envs/rl_toy_env.py
+++ b/mdp_playground/envs/rl_toy_env.py
@@ -176,6 +176,8 @@ class RLToyEnv(gym.Env):
         The externally visible observation space for the environment.
     action_space : Gym.Space
         The externally visible action space for the environment.
+    feature_space : Gym.Space
+        In case of continuous and grid environments, this is the underlying state space. ##TODO Unify this across all types of environments.
     rewardable_sequences : dict
         holds the rewardable sequences. The keys are tuples of rewardable sequences and values are the rewards handed out. When make_denser is True for discrete environments, this dict also holds the rewardable partial sequences.
@@ -519,7 +521,6 @@ def __init__(self, **config):
         elif config["state_space_type"] == "grid":
             assert "grid_shape" in config
             self.grid_shape = config["grid_shape"]
-            self.grid_np_data_type = np.int64
         else:
             raise ValueError("Unknown state_space_type")
 
@@ -546,9 +547,9 @@ def __init__(self, **config):
         else:
             self.repeats_in_sequences = config["repeats_in_sequences"]
 
-        self.dtype = np.float32 if "dtype" not in config else config["dtype"]
 
         if config["state_space_type"] == "discrete":
+            self.dtype_s = np.int64 if "dtype_s" not in config else config["dtype_s"]
             if self.irrelevant_features:
                 assert (
                     len(config["action_space_size"]) == 2
@@ -570,6 +571,7 @@ def __init__(self, **config):
                 )
             # assert (np.array(self.state_space_size) % np.array(self.diameter) == 0).all(), "state_space_size should be a multiple of the diameter to allow for the generation of regularly connected MDPs."
elif config["state_space_type"] == "continuous": + self.dtype_s = np.float32 if "dtype_s" not in config else config["dtype_s"] self.action_space_dim = self.state_space_dim if self.irrelevant_features: assert ( @@ -580,10 +582,18 @@ def __init__(self, **config): config["relevant_indices"] = range(self.state_space_dim) # config["irrelevant_indices"] = list(set(range(len(config["state_space_dim"]))) - set(config["relevant_indices"])) elif config["state_space_type"] == "grid": + self.dtype_s = np.int64 if "dtype_s" not in config else config["dtype_s"] # Repeat the grid for the irrelevant part as well if self.irrelevant_features: self.grid_shape = self.grid_shape * 2 + # Set the dtype for the observation space: + if self.image_representations: + self.dtype_o = np.float32 if "dtype_o" not in config else config["dtype_o"] + else: + self.dtype_o = self.dtype_s if "dtype_o" not in config else config["dtype_o"] + + if ("init_state_dist" in config) and ("relevant_init_state_dist" not in config): config["relevant_init_state_dist"] = config["init_state_dist"] @@ -614,7 +624,7 @@ def __init__(self, **config): assert self.sequence_length == 1 if "target_point" in config: self.target_point = np.array( - config["target_point"], dtype=self.dtype + config["target_point"], dtype=self.dtype_s ) assert self.target_point.shape == ( len(config["relevant_indices"]), @@ -640,6 +650,7 @@ def __init__(self, **config): DiscreteExtended( self.state_space_size[0], seed=self.seed_dict["relevant_state_space"], + # dtype=self.dtype_o, # Gymnasium seems to hardcode as np.int64 ) ] # #seed #hardcoded, many time below as well self.action_spaces = [ @@ -671,7 +682,7 @@ def __init__(self, **config): # self.action_spaces[i] = DiscreteExtended(self.action_space_size[i], # seed=self.seed_dict["irrelevant_action_space"]) #seed - if self.image_representations: + if self.image_representations: # for discrete envs # underlying_obs_space = MultiDiscreteExtended(self.state_space_size, seed=self.seed_dict["state_space"]) #seed self.observation_space = ImageMultiDiscrete( self.state_space_size, @@ -714,7 +725,7 @@ def __init__(self, **config): self.state_space_max, shape=(self.state_space_dim,), seed=self.seed_dict["state_space"], - dtype=self.dtype, + dtype=self.dtype_s, ) # #seed # hack #TODO # low and high are 1st 2 and required arguments # for instantiating BoxExtended @@ -729,7 +740,7 @@ def __init__(self, **config): self.action_space_max, shape=(self.action_space_dim,), seed=self.seed_dict["action_space"], - dtype=self.dtype, + dtype=self.dtype_s, ) # #seed # hack #TODO @@ -754,7 +765,7 @@ def __init__(self, **config): 0 * underlying_space_maxes, underlying_space_maxes, seed=self.seed_dict["state_space"], - dtype=self.dtype, + dtype=self.dtype_s, ) # #seed lows = np.array([-1] * len(self.grid_shape)) @@ -893,7 +904,7 @@ def init_terminal_states(self): # print("Term state lows, highs:", lows, highs) self.term_spaces.append( BoxExtended( - low=lows, high=highs, seed=self.seed_, dtype=self.dtype + low=lows, high=highs, seed=self.seed_, dtype=self.dtype_s ) ) # #seed #hack #TODO self.logger.debug( @@ -931,7 +942,7 @@ def init_terminal_states(self): highs = term_state # #hardcoded self.term_spaces.append( BoxExtended( - low=lows, high=highs, seed=self.seed_, dtype=self.grid_np_data_type + low=lows, high=highs, seed=self.seed_, dtype=self.dtype_s ) ) # #seed #hack #TODO @@ -1657,7 +1668,7 @@ def transition_function(self, state, action): # for a "wall", but would need to take care of multiple # reflections near a corner/edge. 
                # Resets all higher order derivatives to 0
-                zero_state = np.array([0.0] * (self.state_space_dim), dtype=self.dtype)
+                zero_state = np.array([0.0] * (self.state_space_dim), dtype=self.dtype_s)
                # #####IMP to have copy() otherwise it's the same array
                # (in memory) at every position in the list:
                self.state_derivatives = [
@@ -1666,7 +1677,7 @@ def transition_function(self, state, action):
             self.state_derivatives[0] = next_state
 
             if self.config["reward_function"] == "move_to_a_point":
-                next_state_rel = np.array(next_state, dtype=self.dtype)[
+                next_state_rel = np.array(next_state, dtype=self.dtype_s)[
                     self.config["relevant_indices"]
                 ]
                 dist_ = np.linalg.norm(next_state_rel - self.target_point)
@@ -1678,7 +1689,7 @@ def transition_function(self, state, action):
             # Need to check that dtype is int because Gym doesn't
             if (
                 self.action_space.contains(action)
-                and np.array(action).dtype == self.grid_np_data_type
+                and np.array(action).dtype == self.dtype_s
             ):
                 if self.transition_noise:
                     # self._np_random.choice only works for 1-D arrays
@@ -1820,7 +1831,7 @@ def reward_function(self, state, action):
                 # of the formulae and see that programmatic results match: should
                 # also have a unit version of 4. for dist_of_pt_from_line() and
                 # an integration version here for total_deviation calc.?.
-                data_ = np.array(state_considered, dtype=self.dtype)[
+                data_ = np.array(state_considered, dtype=self.dtype_s)[
                     1 + delay : self.augmented_state_length,
                     self.config["relevant_indices"],
                 ]
@@ -1863,10 +1874,10 @@ def reward_function(self, state, action):
                 # that. #TODO Generate it randomly to have random Rs?
                 if self.make_denser:
                     old_relevant_state = np.array(
-                        state_considered, dtype=self.dtype
+                        state_considered, dtype=self.dtype_s
                     )[-2, self.config["relevant_indices"]]
                     new_relevant_state = np.array(
-                        state_considered, dtype=self.dtype
+                        state_considered, dtype=self.dtype_s
                    )[-1, self.config["relevant_indices"]]
                     reward = -np.linalg.norm(new_relevant_state - self.target_point)
                     # Should allow other powers of the distance from target_point,
@@ -1879,7 +1890,7 @@ def reward_function(self, state, action):
                 # TODO also make_denser, sparse rewards only at target
                 else:  # sparse reward
                     new_relevant_state = np.array(
-                        state_considered, dtype=self.dtype
+                        state_considered, dtype=self.dtype_s
                     )[-1, self.config["relevant_indices"]]
                     if (
                         np.linalg.norm(new_relevant_state - self.target_point)
@@ -1890,7 +1901,7 @@ def reward_function(self, state, action):
                 # stay in the radius and earn more reward.
 
                 reward -= self.action_loss_weight * np.linalg.norm(
-                    np.array(action, dtype=self.dtype)
+                    np.array(action, dtype=self.dtype_s)
                 )
 
         elif self.config["state_space_type"] == "grid":
@@ -2044,8 +2055,8 @@ def step(self, action, imaginary_rollout=False):
         if self.image_representations:
             next_obs = self.observation_space.get_concatenated_image(next_state)
 
-        self.curr_state = next_state
-        self.curr_obs = next_obs
+        self.curr_state = self.dtype_s(next_state)
+        self.curr_obs = self.dtype_o(next_obs)
 
         # #### TODO curr_state is external state, while we need to check relevant state for terminality! Done - by using augmented_state now instead of curr_state!
         self.done = (
@@ -2199,7 +2210,7 @@ def reset(self, seed=None):
 
         # if not self.use_custom_mdp:
         # init the state derivatives needed for continuous spaces
-        zero_state = np.array([0.0] * (self.state_space_dim), dtype=self.dtype)
+        zero_state = np.array([0.0] * (self.state_space_dim), dtype=self.dtype_s)
         self.state_derivatives = [
             zero_state.copy() for i in range(self.dynamics_order + 1)
         ]  # #####IMP to have copy()
@@ -2217,7 +2228,7 @@ def reset(self, seed=None):
             while True:  # Be careful about infinite loops
                 term_space_was_sampled = False
                 # curr_state is an np.array while curr_state_relevant is a list
-                self.curr_state = self.feature_space.sample().astype(int)  # #random
+                self.curr_state = self.feature_space.sample().astype(self.dtype_s)  # #random
                 self.curr_state_relevant = list(self.curr_state[[0, 1]])  # #hardcoded
                 if self.is_terminal_state(self.curr_state_relevant):
                     self.logger.debug(
@@ -2241,6 +2252,9 @@ def reset(self, seed=None):
         else:
             self.curr_obs = self.curr_state
 
+        self.curr_state = self.dtype_s(self.curr_state)
+        self.curr_obs = self.dtype_o(self.curr_obs)
+
         self.logger.info("RESET called. curr_state reset to: " + str(self.curr_state))
 
         self.reached_terminal = False
diff --git a/tests/test_mdp_playground.py b/tests/test_mdp_playground.py
index 035d190..6271875 100644
--- a/tests/test_mdp_playground.py
+++ b/tests/test_mdp_playground.py
@@ -173,7 +173,7 @@ def test_continuous_dynamics_move_along_a_line(self):
         # Test 5: R noise - same as Test 1 above except with reward noise and with only 5 steps
         # instead of 20.
         print("\nTest 5: \033[32;1;4mTEST_CONTINUOUS_DYNAMICS_R_NOISE\033[0m")
-        config["reward_noise"] = lambda a: a.normal(0, 0.5)
+        config["reward_noise"] = lambda s, a, rng: rng.normal(0, 0.5)
         config["delay"] = 0
         env = RLToyEnv(**config)
         state = env.get_augmented_state()["curr_state"].copy()  # env.reset()[0]
@@ -303,7 +303,7 @@ def test_continuous_dynamics_move_along_a_line(self):
 
         # Test P noise
         print("\nTest 9: \033[32;1;4mTEST_CONTINUOUS_DYNAMICS_P_NOISE\033[0m")
-        config["transition_noise"] = lambda a: a.normal([0] * 7, [0.5] * 7)
+        config["transition_noise"] = lambda s, a, rng: rng.normal([0] * 7, [0.5] * 7)
         # Reset seed to have states far away from state maxes so that it is easier to
         # test stuff below, but in the end, the state is clipped to [-5, 5] anyway
         # while testing, so this wasn't really needed.
@@ -1243,9 +1243,10 @@ def test_discrete_dynamics(self):
         config["generate_random_mdp"] = True
         env = RLToyEnv(**config)
         state = env.get_augmented_state()["curr_state"]
-        self.assertEqual(
-            type(state), int, "Type of discrete state should be int."
-        )  # TODO Move this and the test_continuous_dynamics type checks to separate unit tests
+        if type(state) != int:
+            self.assertEqual(
+                state.dtype, env.observation_space.dtype, "Type of discrete state should be: " + str(env.observation_space.dtype)
+            )  # TODO Move this and the test_continuous_dynamics type checks to separate unit tests
 
         action = 2
         next_state, reward, done, trunc, info = env.step(action)
@@ -1482,7 +1483,7 @@ def test_discrete_r_noise(self):
         config["delay"] = 0
         config["sequence_length"] = 1
         config["reward_scale"] = 1.0
-        config["reward_noise"] = lambda a: a.normal(0, 0.5)
+        config["reward_noise"] = lambda s, a, rng: rng.normal(0, 0.5)
         config["generate_random_mdp"] = True
 
         config["log_level"] = logging.INFO
@@ -1545,7 +1546,7 @@ def test_discrete_multiple_meta_features(self):
         config["reward_scale"] = 2.5
         config["reward_shift"] = -1.75
         # config["transition_noise"] = 0.1
-        config["reward_noise"] = lambda a: a.normal(0, 0.5)
+        config["reward_noise"] = lambda s, a, rng: rng.normal(0, 0.5)
         config["generate_random_mdp"] = True
 
         env = RLToyEnv(**config)
@@ -1804,7 +1805,7 @@ def test_discrete_image_representations(self):
         config["reward_scale"] = 2.5
         config["reward_shift"] = -1.75
         # config["transition_noise"] = 0.1
-        config["reward_noise"] = lambda a: a.normal(0, 0.5)
+        config["reward_noise"] = lambda s, a, rng: rng.normal(0, 0.5)
         config["generate_random_mdp"] = True
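
A minimal usage sketch of the two new config keys, dtype_s and dtype_o. The non-dtype keys are copied from the test cases above and the repository's README example; treat the exact minimal set of required keys as an assumption, since it is not shown in this patch.

    import logging
    import numpy as np
    from mdp_playground.envs import RLToyEnv

    config = {}
    config["seed"] = 0
    config["state_space_type"] = "discrete"
    config["action_space_size"] = 8
    config["delay"] = 0
    config["sequence_length"] = 1
    config["reward_scale"] = 1.0
    config["generate_random_mdp"] = True
    config["log_level"] = logging.INFO

    # The new keys from this patch:
    # dtype of the underlying state space (discrete/grid action spaces mirror it):
    config["dtype_s"] = np.int64
    # dtype of the observations; defaults to np.float32 when
    # image_representations is set, else to dtype_s:
    config["dtype_o"] = np.int64

    env = RLToyEnv(**config)
    state = env.get_augmented_state()["curr_state"]
    next_state, reward, done, trunc, info = env.step(2)

    # Mirrors the updated check in test_discrete_dynamics above:
    if type(state) != int:
        assert state.dtype == env.observation_space.dtype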