From c3c8b5ba5cb390232911843c028deb99e1867018 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CJa4822=E2=80=9D?= <3471606159@qq.com>
Date: Tue, 26 Sep 2023 19:57:22 -0700
Subject: [PATCH 1/3] Fix bug where d4rl ignores the seed parameter

---
 d4rl/locomotion/ant.py      | 36 +++++++++++++++++++-----------------
 d4rl/locomotion/maze_env.py | 20 ++++++++++----------
 d4rl/locomotion/wrappers.py |  3 +++
 3 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/d4rl/locomotion/ant.py b/d4rl/locomotion/ant.py
index 8b1f2923..be90afcc 100644
--- a/d4rl/locomotion/ant.py
+++ b/d4rl/locomotion/ant.py
@@ -184,28 +184,30 @@ def __init__(self, goal_sampler=None, expose_all_qpos=True,
     self.v2_resets = v2_resets
 
   def reset(self):
-    if self.v2_resets:
-      """
-      The target goal for evaluation in antmazes is randomized.
-      antmazes-v0 and -v1 resulted in really high-variance evaluations
-      because the target goal was set once at the seed level. This led to
-      each run running evaluations with one particular goal. To accurately
-      cover each goal, this requires about 50-100 seeds, which might be
-      computationally infeasible. As an alternate fix, to reduce variance
-      in result reporting, we are creating the v2 environments
-      which use the same offline dataset as v0 environments, with the distinction
-      that the randomization of goals during evaluation is performed at the level of
-      each rollout. Thus running a few seeds, but performing the final evaluation
-      over 100-200 episodes will give a valid estimate of an algorithm's performance.
-      """
-      self.set_target()
+    # if self.v2_resets:
+    #   """
+    #   The target goal for evaluation in antmazes is randomized.
+    #   antmazes-v0 and -v1 resulted in really high-variance evaluations
+    #   because the target goal was set once at the seed level. This led to
+    #   each run running evaluations with one particular goal. To accurately
+    #   cover each goal, this requires about 50-100 seeds, which might be
+    #   computationally infeasible. As an alternate fix, to reduce variance
+    #   in result reporting, we are creating the v2 environments
+    #   which use the same offline dataset as v0 environments, with the distinction
+    #   that the randomization of goals during evaluation is performed at the level of
+    #   each rollout. Thus running a few seeds, but performing the final evaluation
+    #   over 100-200 episodes will give a valid estimate of an algorithm's performance.
+    #   """
+    #   self.set_target()
+    self.set_target()
     return super().reset()
 
   def set_target(self, target_location=None):
     return self.set_target_goal(target_location)
 
-  def seed(self, seed=0):
-    mujoco_env.MujocoEnv.seed(self, seed)
+  def seed(self, seed=None):
+    # print(f"MazeEnv seed = {seed}")
+    mujoco_env.MujocoEnv.seed(self, seed)
 
 def make_ant_maze_env(**kwargs):
   env = AntMazeEnv(**kwargs)
diff --git a/d4rl/locomotion/maze_env.py b/d4rl/locomotion/maze_env.py
index c6010f2b..1027a6fc 100644
--- a/d4rl/locomotion/maze_env.py
+++ b/d4rl/locomotion/maze_env.py
@@ -214,13 +214,13 @@ def _xy_to_rowcol(self, xy):
   def _get_reset_location(self,):
     prob = (1.0 - self._np_maze_map) / np.sum(1.0 - self._np_maze_map)
     prob_row = np.sum(prob, 1)
-    row_sample = np.random.choice(np.arange(self._np_maze_map.shape[0]), p=prob_row)
-    col_sample = np.random.choice(np.arange(self._np_maze_map.shape[1]), p=prob[row_sample] * 1.0 / prob_row[row_sample])
+    row_sample = self.np_random.choice(np.arange(self._np_maze_map.shape[0]), p=prob_row)
+    col_sample = self.np_random.choice(np.arange(self._np_maze_map.shape[1]), p=prob[row_sample] * 1.0 / prob_row[row_sample])
     reset_location = self._rowcol_to_xy((row_sample, col_sample))
 
     # Add some random noise
-    random_x = np.random.uniform(low=0, high=0.5) * 0.5 * self._maze_size_scaling
-    random_y = np.random.uniform(low=0, high=0.5) * 0.5 * self._maze_size_scaling
+    random_x = self.np_random.uniform(low=0, high=0.5) * 0.5 * self._maze_size_scaling
+    random_y = self.np_random.uniform(low=0, high=0.5) * 0.5 * self._maze_size_scaling
 
     return (max(reset_location[0] + random_x, 0), max(reset_location[1] + random_y, 0))
 
@@ -229,8 +229,8 @@ def _rowcol_to_xy(self, rowcol, add_random_noise=False):
     x = col * self._maze_size_scaling - self._init_torso_x
     y = row * self._maze_size_scaling - self._init_torso_y
     if add_random_noise:
-      x = x + np.random.uniform(low=0, high=self._maze_size_scaling * 0.25)
-      y = y + np.random.uniform(low=0, high=self._maze_size_scaling * 0.25)
+      x = x + self.np_random.uniform(low=0, high=self._maze_size_scaling * 0.25)
+      y = y + self.np_random.uniform(low=0, high=self._maze_size_scaling * 0.25)
     return (x, y)
 
   def goal_sampler(self, np_random, only_free_cells=True, interpolate=True):
@@ -247,11 +247,11 @@ def goal_sampler(self, np_random, only_free_cells=True, interpolate=True):
     # If there is a 'goal' designated, use that. Otherwise, any valid cell can
     # be a goal.
     sample_choices = goal_cells if goal_cells else valid_cells
-    cell = sample_choices[np_random.choice(len(sample_choices))]
+    cell = sample_choices[self.np_random.choice(len(sample_choices))]
     xy = self._rowcol_to_xy(cell, add_random_noise=True)
 
-    random_x = np.random.uniform(low=0, high=0.5) * 0.25 * self._maze_size_scaling
-    random_y = np.random.uniform(low=0, high=0.5) * 0.25 * self._maze_size_scaling
+    random_x = self.np_random.uniform(low=0, high=0.5) * 0.25 * self._maze_size_scaling
+    random_y = self.np_random.uniform(low=0, high=0.5) * 0.25 * self._maze_size_scaling
 
     xy = (max(xy[0] + random_x, 0), max(xy[1] + random_y, 0))
 
@@ -259,7 +259,7 @@ def goal_sampler(self, np_random, only_free_cells=True, interpolate=True):
 
   def set_target_goal(self, goal_input=None):
     if goal_input is None:
-      self.target_goal = self.goal_sampler(np.random)
+      self.target_goal = self.goal_sampler(self.np_random)
     else:
       self.target_goal = goal_input
 
diff --git a/d4rl/locomotion/wrappers.py b/d4rl/locomotion/wrappers.py
index 45b371cd..6d6d2173 100644
--- a/d4rl/locomotion/wrappers.py
+++ b/d4rl/locomotion/wrappers.py
@@ -22,6 +22,9 @@ def reset(self, **kwargs):
 
     def step(self, action):
         return self._wrapped_env.step(action)
+
+    def seed(self, seed=None):
+        return self._wrapped_env.seed(seed)
 
     def render(self, *args, **kwargs):
         return self._wrapped_env.render(*args, **kwargs)

From 8ef2562be13a10b92627006f129569e3fbb7cec0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CJa4822=E2=80=9D?= <3471606159@qq.com>
Date: Sat, 30 Sep 2023 13:10:25 -0700
Subject: [PATCH 2/3] Fix bug where d4rl ignores the seed parameter

---
 d4rl/locomotion/ant.py | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/d4rl/locomotion/ant.py b/d4rl/locomotion/ant.py
index be90afcc..3d2162b5 100644
--- a/d4rl/locomotion/ant.py
+++ b/d4rl/locomotion/ant.py
@@ -184,22 +184,21 @@ def __init__(self, goal_sampler=None, expose_all_qpos=True,
     self.v2_resets = v2_resets
 
   def reset(self):
-    # if self.v2_resets:
-    #   """
-    #   The target goal for evaluation in antmazes is randomized.
-    #   antmazes-v0 and -v1 resulted in really high-variance evaluations
-    #   because the target goal was set once at the seed level. This led to
-    #   each run running evaluations with one particular goal. To accurately
-    #   cover each goal, this requires about 50-100 seeds, which might be
-    #   computationally infeasible. As an alternate fix, to reduce variance
-    #   in result reporting, we are creating the v2 environments
-    #   which use the same offline dataset as v0 environments, with the distinction
-    #   that the randomization of goals during evaluation is performed at the level of
-    #   each rollout. Thus running a few seeds, but performing the final evaluation
-    #   over 100-200 episodes will give a valid estimate of an algorithm's performance.
-    #   """
-    #   self.set_target()
-    self.set_target()
+    if self.v2_resets:
+      """
+      The target goal for evaluation in antmazes is randomized.
+      antmazes-v0 and -v1 resulted in really high-variance evaluations
+      because the target goal was set once at the seed level. This led to
+      each run running evaluations with one particular goal. To accurately
+      cover each goal, this requires about 50-100 seeds, which might be
+      computationally infeasible. As an alternate fix, to reduce variance
+      in result reporting, we are creating the v2 environments
+      which use the same offline dataset as v0 environments, with the distinction
+      that the randomization of goals during evaluation is performed at the level of
+      each rollout. Thus running a few seeds, but performing the final evaluation
+      over 100-200 episodes will give a valid estimate of an algorithm's performance.
+      """
+      self.set_target()
     return super().reset()
 
   def set_target(self, target_location=None):

From 163a54080612f1809a63fe6c01382d457e07707b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CJa4822=E2=80=9D?= <3471606159@qq.com>
Date: Sat, 30 Sep 2023 13:15:05 -0700
Subject: [PATCH 3/3] Fix bug where d4rl ignores the seed parameter

---
 d4rl/locomotion/ant.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/d4rl/locomotion/ant.py b/d4rl/locomotion/ant.py
index 3d2162b5..b9f7c0a1 100644
--- a/d4rl/locomotion/ant.py
+++ b/d4rl/locomotion/ant.py
@@ -205,7 +205,6 @@ def set_target(self, target_location=None):
     return self.set_target_goal(target_location)
 
   def seed(self, seed=None):
-    # print(f"MazeEnv seed = {seed}")
     mujoco_env.MujocoEnv.seed(self, seed)
 
 def make_ant_maze_env(**kwargs):
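
Reviewer note (not part of the patch series): a minimal sketch for sanity-checking the fix after applying all three patches. It assumes d4rl is installed with the older gym API (env.seed) and that an AntMaze id such as 'antmaze-umaze-v2' is registered; the env id and the exact wrapper chain are illustrative assumptions, not something these patches define.

import gym
import numpy as np
import d4rl  # noqa: F401  (importing d4rl registers the antmaze environments)

def first_reset(seed):
    # With these patches, env.seed() should be forwarded through the locomotion
    # wrapper to MujocoEnv.seed, and reset/goal sampling should draw from
    # self.np_random rather than the global np.random.
    env = gym.make('antmaze-umaze-v2')  # assumed env id; other antmaze tasks should behave the same
    env.seed(seed)
    return env.reset()

obs_a = first_reset(0)
obs_b = first_reset(0)
# Identical seeds should now produce identical first resets, which was not the
# case while the seed parameter was ignored.
assert np.allclose(obs_a, obs_b)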