From c3c8b5ba5cb390232911843c028deb99e1867018 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CJa4822=E2=80=9D?= <3471606159@qq.com>
Date: Tue, 26 Sep 2023 19:57:22 -0700
Subject: [PATCH 1/3] Fix bug where d4rl ignores the seed parameter

---
 d4rl/locomotion/ant.py      | 36 +++++++++++++++++++-----------------
 d4rl/locomotion/maze_env.py | 20 ++++++++++----------
 d4rl/locomotion/wrappers.py |  3 +++
 3 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/d4rl/locomotion/ant.py b/d4rl/locomotion/ant.py
index 8b1f2923..be90afcc 100644
--- a/d4rl/locomotion/ant.py
+++ b/d4rl/locomotion/ant.py
@@ -184,28 +184,30 @@ def __init__(self, goal_sampler=None, expose_all_qpos=True,
     self.v2_resets = v2_resets
 
   def reset(self):
-    if self.v2_resets:
-      """
-      The target goal for evaluation in antmazes is randomized.
-      antmazes-v0 and -v1 resulted in really high-variance evaluations
-      because the target goal was set once at the seed level. This led to
-      each run running evaluations with one particular goal. To accurately
-      cover each goal, this requires about 50-100 seeds, which might be
-      computationally infeasible. As an alternate fix, to reduce variance
-      in result reporting, we are creating the v2 environments
-      which use the same offline dataset as v0 environments, with the distinction
-      that the randomization of goals during evaluation is performed at the level of
-      each rollout. Thus running a few seeds, but performing the final evaluation
-      over 100-200 episodes will give a valid estimate of an algorithm's performance.
-      """
-      self.set_target()
+    # if self.v2_resets:
+    #   """
+    #   The target goal for evaluation in antmazes is randomized.
+    #   antmazes-v0 and -v1 resulted in really high-variance evaluations
+    #   because the target goal was set once at the seed level. This led to
+    #   each run running evaluations with one particular goal. To accurately
+    #   cover each goal, this requires about 50-100 seeds, which might be
+    #   computationally infeasible. As an alternate fix, to reduce variance
+    #   in result reporting, we are creating the v2 environments
+    #   which use the same offline dataset as v0 environments, with the distinction
+    #   that the randomization of goals during evaluation is performed at the level of
+    #   each rollout. Thus running a few seeds, but performing the final evaluation
+    #   over 100-200 episodes will give a valid estimate of an algorithm's performance.
+    #   """
+    #   self.set_target()
+    self.set_target()
     return super().reset()
 
   def set_target(self, target_location=None):
     return self.set_target_goal(target_location)
 
-  def seed(self, seed=0):
-    mujoco_env.MujocoEnv.seed(self, seed)
+  def seed(self, seed=None):
+    # print(f"MazeEnv seed = {seed}")
+    mujoco_env.MujocoEnv.seed(self, seed)
 
 def make_ant_maze_env(**kwargs):
   env = AntMazeEnv(**kwargs)
diff --git a/d4rl/locomotion/maze_env.py b/d4rl/locomotion/maze_env.py
index c6010f2b..1027a6fc 100644
--- a/d4rl/locomotion/maze_env.py
+++ b/d4rl/locomotion/maze_env.py
@@ -214,13 +214,13 @@ def _xy_to_rowcol(self, xy):
   def _get_reset_location(self,):
     prob = (1.0 - self._np_maze_map) / np.sum(1.0 - self._np_maze_map)
     prob_row = np.sum(prob, 1)
-    row_sample = np.random.choice(np.arange(self._np_maze_map.shape[0]), p=prob_row)
-    col_sample = np.random.choice(np.arange(self._np_maze_map.shape[1]), p=prob[row_sample] * 1.0 / prob_row[row_sample])
+    row_sample = self.np_random.choice(np.arange(self._np_maze_map.shape[0]), p=prob_row)
+    col_sample = self.np_random.choice(np.arange(self._np_maze_map.shape[1]), p=prob[row_sample] * 1.0 / prob_row[row_sample])
     reset_location = self._rowcol_to_xy((row_sample, col_sample))
 
     # Add some random noise
-    random_x = np.random.uniform(low=0, high=0.5) * 0.5 * self._maze_size_scaling
-    random_y = np.random.uniform(low=0, high=0.5) * 0.5 * self._maze_size_scaling
+    random_x = self.np_random.uniform(low=0, high=0.5) * 0.5 * self._maze_size_scaling
+    random_y = self.np_random.uniform(low=0, high=0.5) * 0.5 * self._maze_size_scaling
 
     return (max(reset_location[0] + random_x, 0), max(reset_location[1] + random_y, 0))
 
@@ -229,8 +229,8 @@ def _rowcol_to_xy(self, rowcol, add_random_noise=False):
     x = col * self._maze_size_scaling - self._init_torso_x
     y = row * self._maze_size_scaling - self._init_torso_y
     if add_random_noise:
-      x = x + np.random.uniform(low=0, high=self._maze_size_scaling * 0.25)
-      y = y + np.random.uniform(low=0, high=self._maze_size_scaling * 0.25)
+      x = x + self.np_random.uniform(low=0, high=self._maze_size_scaling * 0.25)
+      y = y + self.np_random.uniform(low=0, high=self._maze_size_scaling * 0.25)
     return (x, y)
 
   def goal_sampler(self, np_random, only_free_cells=True, interpolate=True):
@@ -247,11 +247,11 @@ def goal_sampler(self, np_random, only_free_cells=True, interpolate=True):
     # If there is a 'goal' designated, use that. Otherwise, any valid cell can
     # be a goal.
     sample_choices = goal_cells if goal_cells else valid_cells
-    cell = sample_choices[np_random.choice(len(sample_choices))]
+    cell = sample_choices[self.np_random.choice(len(sample_choices))]
     xy = self._rowcol_to_xy(cell, add_random_noise=True)
 
-    random_x = np.random.uniform(low=0, high=0.5) * 0.25 * self._maze_size_scaling
-    random_y = np.random.uniform(low=0, high=0.5) * 0.25 * self._maze_size_scaling
+    random_x = self.np_random.uniform(low=0, high=0.5) * 0.25 * self._maze_size_scaling
+    random_y = self.np_random.uniform(low=0, high=0.5) * 0.25 * self._maze_size_scaling
 
     xy = (max(xy[0] + random_x, 0), max(xy[1] + random_y, 0))
 
@@ -259,7 +259,7 @@ def goal_sampler(self, np_random, only_free_cells=True, interpolate=True):
 
   def set_target_goal(self, goal_input=None):
     if goal_input is None:
-      self.target_goal = self.goal_sampler(np.random)
+      self.target_goal = self.goal_sampler(self.np_random)
     else:
       self.target_goal = goal_input
 
diff --git a/d4rl/locomotion/wrappers.py b/d4rl/locomotion/wrappers.py
index 45b371cd..6d6d2173 100644
--- a/d4rl/locomotion/wrappers.py
+++ b/d4rl/locomotion/wrappers.py
@@ -22,6 +22,9 @@ def reset(self, **kwargs):
 
     def step(self, action):
         return self._wrapped_env.step(action)
+
+    def seed(self, seed=None):
+        return self._wrapped_env.seed(seed)
 
     def render(self, *args, **kwargs):
         return self._wrapped_env.render(*args, **kwargs)

From 8ef2562be13a10b92627006f129569e3fbb7cec0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CJa4822=E2=80=9D?= <3471606159@qq.com>
Date: Sat, 30 Sep 2023 13:10:25 -0700
Subject: [PATCH 2/3] Fix bug where d4rl ignores the seed parameter

---
 d4rl/locomotion/ant.py | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/d4rl/locomotion/ant.py b/d4rl/locomotion/ant.py
index be90afcc..3d2162b5 100644
--- a/d4rl/locomotion/ant.py
+++ b/d4rl/locomotion/ant.py
@@ -184,22 +184,21 @@ def __init__(self, goal_sampler=None, expose_all_qpos=True,
     self.v2_resets = v2_resets
 
   def reset(self):
-    # if self.v2_resets:
-    #   """
-    #   The target goal for evaluation in antmazes is randomized.
-    #   antmazes-v0 and -v1 resulted in really high-variance evaluations
-    #   because the target goal was set once at the seed level. This led to
-    #   each run running evaluations with one particular goal. To accurately
-    #   cover each goal, this requires about 50-100 seeds, which might be
-    #   computationally infeasible. As an alternate fix, to reduce variance
-    #   in result reporting, we are creating the v2 environments
-    #   which use the same offline dataset as v0 environments, with the distinction
-    #   that the randomization of goals during evaluation is performed at the level of
-    #   each rollout. Thus running a few seeds, but performing the final evaluation
-    #   over 100-200 episodes will give a valid estimate of an algorithm's performance.
-    #   """
-    #   self.set_target()
-    self.set_target()
+    if self.v2_resets:
+      """
+      The target goal for evaluation in antmazes is randomized.
+      antmazes-v0 and -v1 resulted in really high-variance evaluations
+      because the target goal was set once at the seed level. This led to
+      each run running evaluations with one particular goal. To accurately
+      cover each goal, this requires about 50-100 seeds, which might be
+      computationally infeasible. As an alternate fix, to reduce variance
+      in result reporting, we are creating the v2 environments
+      which use the same offline dataset as v0 environments, with the distinction
+      that the randomization of goals during evaluation is performed at the level of
+      each rollout. Thus running a few seeds, but performing the final evaluation
+      over 100-200 episodes will give a valid estimate of an algorithm's performance.
+      """
+      self.set_target()
     return super().reset()
 
   def set_target(self, target_location=None):

From 163a54080612f1809a63fe6c01382d457e07707b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CJa4822=E2=80=9D?= <3471606159@qq.com>
Date: Sat, 30 Sep 2023 13:15:05 -0700
Subject: [PATCH 3/3] Fix bug where d4rl ignores the seed parameter

---
 d4rl/locomotion/ant.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/d4rl/locomotion/ant.py b/d4rl/locomotion/ant.py
index 3d2162b5..b9f7c0a1 100644
--- a/d4rl/locomotion/ant.py
+++ b/d4rl/locomotion/ant.py
@@ -205,7 +205,6 @@ def set_target(self, target_location=None):
     return self.set_target_goal(target_location)
 
   def seed(self, seed=None):
-    # print(f"MazeEnv seed = {seed}")
     mujoco_env.MujocoEnv.seed(self, seed)
 
 def make_ant_maze_env(**kwargs):
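
Reviewer note (not part of the patch series): a minimal sketch for sanity-checking the fix after applying all three patches. It assumes d4rl is installed with the older gym API (env.seed) and that an AntMaze id such as 'antmaze-umaze-v2' is registered; the env id and the exact wrapper chain are illustrative assumptions, not something these patches define.

import gym
import numpy as np
import d4rl  # noqa: F401  (importing d4rl registers the antmaze environments)

def first_reset(seed):
    # With these patches, env.seed() should be forwarded through the locomotion
    # wrapper to MujocoEnv.seed, and reset/goal sampling should draw from
    # self.np_random rather than the global np.random.
    env = gym.make('antmaze-umaze-v2')  # assumed env id; other antmaze tasks should behave the same
    env.seed(seed)
    return env.reset()

obs_a = first_reset(0)
obs_b = first_reset(0)
# Identical seeds should now produce identical first resets, which was not the
# case while the seed parameter was ignored.
assert np.allclose(obs_a, obs_b)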