From b9047fbdd246143def84e8aa2bdb97e4bcc1811f Mon Sep 17 00:00:00 2001
From: Alexander Soare
Date: Fri, 22 Mar 2024 13:25:23 +0000
Subject: [PATCH 1/4] fix environment seeding

---
 lerobot/common/envs/abstract.py  | 12 ++++++++++--
 lerobot/common/envs/aloha/env.py |  7 ++-----
 lerobot/common/envs/pusht/env.py |  8 ++++----
 3 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/lerobot/common/envs/abstract.py b/lerobot/common/envs/abstract.py
index a449e23f..01250d1c 100644
--- a/lerobot/common/envs/abstract.py
+++ b/lerobot/common/envs/abstract.py
@@ -4,6 +4,8 @@ from typing import Optional
 from tensordict import TensorDict
 from torchrl.envs import EnvBase

+from lerobot.common.utils import set_seed
+
 class AbstractEnv(EnvBase):
     def __init__(
@@ -34,7 +36,13 @@ class AbstractEnv(EnvBase):
         self._make_env()
         self._make_spec()
-        self._current_seed = self.set_seed(seed)
+
+        # self._next_seed will be used for the next reset. It is recommended that when self.set_seed is called
+        # you store the return value in self._next_seed (it will be a new randomly generated seed).
+        self._next_seed = seed
+        # Don't store the result of this in self._next_seed, as we want to make sure that the first time
+        # self._reset is called, we use seed.
+        self.set_seed(seed)

         if self.num_prev_obs > 0:
             self._prev_obs_image_queue = deque(maxlen=self.num_prev_obs)
@@ -59,4 +67,4 @@ class AbstractEnv(EnvBase):
         raise NotImplementedError("Abstract method")

     def _set_seed(self, seed: Optional[int]):
-        raise NotImplementedError("Abstract method")
+        set_seed(seed)
diff --git a/lerobot/common/envs/aloha/env.py b/lerobot/common/envs/aloha/env.py
index af2b354b..a001ca55 100644
--- a/lerobot/common/envs/aloha/env.py
+++ b/lerobot/common/envs/aloha/env.py
@@ -126,9 +126,8 @@ class AlohaEnv(AbstractEnv):
             logging.warning(f"{self.__class__.__name__}._reset ignores the provided tensordict.")
             AlohaEnv._reset_warning_issued = True

-        # we need to handle seed iteration, since self._env.reset() rely an internal _seed.
-        self._current_seed += 1
-        self.set_seed(self._current_seed)
+        # Seed the environment and update the seed to be used for the next reset.
+        self._next_seed = self.set_seed(self._next_seed)

         # TODO(rcadene): do not use global variable for this
         if "sim_transfer_cube" in self.task:
@@ -137,8 +136,6 @@ class AlohaEnv(AbstractEnv):
             BOX_POSE[0] = np.concatenate(sample_insertion_pose())  # used in sim reset

         raw_obs = self._env.reset()
-        # TODO(rcadene): add assert
-        # assert self._current_seed == self._env._seed

         obs = self._format_raw_obs(raw_obs.observation)
diff --git a/lerobot/common/envs/pusht/env.py b/lerobot/common/envs/pusht/env.py
index 3824a5d2..070c718f 100644
--- a/lerobot/common/envs/pusht/env.py
+++ b/lerobot/common/envs/pusht/env.py
@@ -106,11 +106,9 @@ class PushtEnv(AbstractEnv):
             logging.warning(f"{self.__class__.__name__}._reset ignores the provided tensordict.")
             PushtEnv._reset_warning_issued = True

-        # we need to handle seed iteration, since self._env.reset() rely an internal _seed.
-        self._current_seed += 1
-        self.set_seed(self._current_seed)
+        # Seed the environment and update the seed to be used for the next reset.
+        self._next_seed = self.set_seed(self._next_seed)

         raw_obs = self._env.reset()
-        assert self._current_seed == self._env._seed

         obs = self._format_raw_obs(raw_obs)
@@ -239,5 +237,7 @@
     def _set_seed(self, seed: Optional[int]):
+        # Set global seed.
         set_seed(seed)
+        # Set PushTImageEnv seed as it relies on its own internal _seed attribute.
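The seeding scheme introduced in PATCH 1/4 works as a chain: set_seed(seed) seeds the global RNGs and, per the comment in AbstractEnv.__init__, returns a freshly generated seed, which is stored in self._next_seed and consumed by the next _reset. The snippet below is a minimal, self-contained sketch of that pattern, not part of the patch: set_seed here is a simplified stand-in for lerobot.common.utils.set_seed (its return-a-new-seed behaviour is assumed from the comment above), and ToyEnv is a hypothetical class, not the real AbstractEnv.

    import random

    import numpy as np


    def set_seed(seed):
        # Stand-in for lerobot.common.utils.set_seed: seed the global RNGs, then return
        # a freshly generated seed for the next call (assumed behaviour, per the patch).
        random.seed(seed)
        np.random.seed(seed)
        return random.randint(0, 2**32 - 1)


    class ToyEnv:
        # Hypothetical environment following the AbstractEnv seeding pattern above.
        def __init__(self, seed):
            self._next_seed = seed  # the very first reset uses the user-provided seed
            set_seed(seed)  # seed globals now, but don't overwrite self._next_seed

        def reset(self):
            # Seed the environment and update the seed to be used for the next reset.
            self._next_seed = set_seed(self._next_seed)
            return np.random.rand(3)  # stand-in observation


    # Two runs started from the same seed produce the same chain of per-reset seeds,
    # hence the same episodes, without reusing one seed for every episode.
    env_a, env_b = ToyEnv(seed=0), ToyEnv(seed=0)
    for _ in range(3):
        assert np.allclose(env_a.reset(), env_b.reset())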
         self._env.seed(seed)
From 15ff3b3af811abb72f980b808fca6142ee3b557d Mon Sep 17 00:00:00 2001
From: Alexander Soare
Date: Fri, 22 Mar 2024 15:06:57 +0000
Subject: [PATCH 2/4] add fixes for reproducibility

---
 lerobot/common/envs/pusht/pusht_env.py       |  2 +-
 lerobot/common/envs/pusht/pusht_image_env.py |  3 +-
 lerobot/scripts/eval.py                      | 52 +++++++++++++++-----
 lerobot/scripts/train.py                     |  2 +-
 4 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/lerobot/common/envs/pusht/pusht_env.py b/lerobot/common/envs/pusht/pusht_env.py
index 690bfe12..186f9e31 100644
--- a/lerobot/common/envs/pusht/pusht_env.py
+++ b/lerobot/common/envs/pusht/pusht_env.py
@@ -33,7 +33,7 @@ class PushTEnv(gym.Env):
     def __init__(
         self,
-        legacy=False,
+        legacy=True,  # compatibility with original
         block_cog=None,
         damping=None,
         render_action=True,
diff --git a/lerobot/common/envs/pusht/pusht_image_env.py b/lerobot/common/envs/pusht/pusht_image_env.py
index ec8e177b..4981eb64 100644
--- a/lerobot/common/envs/pusht/pusht_image_env.py
+++ b/lerobot/common/envs/pusht/pusht_image_env.py
@@ -7,7 +7,8 @@ from lerobot.common.envs.pusht.pusht_env import PushTEnv
 class PushTImageEnv(PushTEnv):
     metadata = {"render.modes": ["rgb_array"], "video.frames_per_second": 10}

-    def __init__(self, legacy=False, block_cog=None, damping=None, render_size=96):
+    # Note: legacy defaults to True for compatibility with original
+    def __init__(self, legacy=True, block_cog=None, damping=None, render_size=96):
         super().__init__(
             legacy=legacy, block_cog=block_cog, damping=damping, render_size=render_size, render_action=False
         )
diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py
index 76deb2fe..7beb1a8f 100644
--- a/lerobot/scripts/eval.py
+++ b/lerobot/scripts/eval.py
@@ -1,3 +1,4 @@
+import json
 import logging
 import threading
 import time
@@ -41,6 +42,7 @@ def eval_policy(
     sum_rewards = []
     max_rewards = []
     successes = []
+    seeds = []
     threads = []  # for video saving threads
     episode_counter = 0  # for saving the correct number of videos
@@ -53,11 +55,15 @@ def eval_policy(
             if save_video or (return_first_video and i == 0):  # noqa: B023
                 ep_frames.append(env.render())  # noqa: B023

+        # Clear the policy's action queue before the start of a new rollout.
+        if policy is not None:
+            policy.clear_action_queue()
+
+        env.start()  # needed to be able to get the seeds the first time as BatchedEnvs are lazy
+        seeds.extend(env._next_seed)
         with torch.inference_mode():
             # TODO(alexander-soare): When `break_when_any_done == False` this rolls out for max_steps even when all
             # envs are done the first time. But we only use the first rollout. This is a waste of compute.
-            if policy is not None:
-                policy.clear_action_queue()
             rollout = env.rollout(
                 max_steps=max_steps,
                 policy=policy,
@@ -65,8 +71,8 @@ def eval_policy(
                 callback=maybe_render_frame,
                 break_when_any_done=env.batch_size[0] == 1,
             )
-        # Figure out where in each rollout sequence the first done condition was encountered (results after this won't
-        # be included).
+        # Figure out where in each rollout sequence the first done condition was encountered (results after
+        # this won't be included).
         # Note: this assumes that the shape of the done key is (batch_size, max_steps, 1).
         # Note: this relies on a property of argmax: that it returns the first occurrence as a tiebreaker.
         rollout_steps = rollout["next", "done"].shape[1]
@@ -108,11 +114,31 @@ def eval_policy(
                 thread.join()

     info = {
-        "avg_sum_reward": np.nanmean(sum_rewards[:num_episodes]),
-        "avg_max_reward": np.nanmean(max_rewards[:num_episodes]),
-        "pc_success": np.nanmean(successes[:num_episodes]) * 100,
-        "eval_s": time.time() - start,
-        "eval_ep_s": (time.time() - start) / num_episodes,
+        "micro": [
+            {
+                "episode_ix": i,
+                "sum_reward": sum_reward,
+                "max_reward": max_reward,
+                "success": success,
+                "seed": seed,
+            }
+            for i, (sum_reward, max_reward, success, seed) in enumerate(
+                zip(
+                    sum_rewards[:num_episodes],
+                    max_rewards[:num_episodes],
+                    successes[:num_episodes],
+                    seeds[:num_episodes],
+                    strict=True,
+                )
+            )
+        ],
+        "macro": {
+            "avg_sum_reward": np.nanmean(sum_rewards[:num_episodes]),
+            "avg_max_reward": np.nanmean(max_rewards[:num_episodes]),
+            "pc_success": np.nanmean(successes[:num_episodes]) * 100,
+            "eval_s": time.time() - start,
+            "eval_ep_s": (time.time() - start) / num_episodes,
+        },
     }
     if return_first_video:
         return info, first_video
@@ -156,7 +182,7 @@ def eval(cfg: dict, out_dir=None):
         # when policy is None, rollout a random policy
         policy = None

-    metrics = eval_policy(
+    info = eval_policy(
         env,
         policy=policy,
         save_video=True,
@@ -165,7 +191,11 @@
         max_steps=cfg.env.episode_length,
         num_episodes=cfg.eval_episodes,
     )
-    print(metrics)
+    print(info["macro"])
+
+    # Save info
+    with open(Path(out_dir) / "eval_info.json", "w") as f:
+        json.dump(info, f, indent=2)

     logging.info("End of eval")
diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py
index 82b3cc5d..a9ecbf07 100644
--- a/lerobot/scripts/train.py
+++ b/lerobot/scripts/train.py
@@ -183,7 +183,7 @@ def train(cfg: dict, out_dir=None, job_name=None):
                 video_dir=Path(out_dir) / "eval",
                 save_video=True,
             )
-            log_eval_info(logger, eval_info, step, cfg, offline_buffer, is_offline)
+            log_eval_info(logger, eval_info["macro"], step, cfg, offline_buffer, is_offline)
             if cfg.wandb.enable:
                 logger.log_video(first_video, step, mode="eval")
             logging.info("Resume training")
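After PATCH 2/4, the info dictionary returned by eval_policy (and dumped to eval_info.json) has a two-level layout. Below is a small sketch of how it can be consumed downstream; it is not part of the patch, the numbers are invented, the timing keys (eval_s, eval_ep_s) are omitted, and only the key layout is taken from the hunks above.

    import numpy as np

    # Invented example of the structure produced by eval_policy() after this patch.
    info = {
        "micro": [
            {"episode_ix": 0, "sum_reward": 12.5, "max_reward": 1.0, "success": True, "seed": 1337},
            {"episode_ix": 1, "sum_reward": 3.0, "max_reward": 0.4, "success": False, "seed": 90210},
        ],
        "macro": {"avg_sum_reward": 7.75, "avg_max_reward": 0.7, "pc_success": 50.0},
    }

    # The "macro" entries are plain reductions over the per-episode "micro" records.
    assert np.isclose(info["macro"]["avg_sum_reward"], np.nanmean([ep["sum_reward"] for ep in info["micro"]]))
    assert np.isclose(info["macro"]["pc_success"], np.nanmean([ep["success"] for ep in info["micro"]]) * 100)

    # Each record also carries the seed its episode was reset with, which is what
    # makes an individual episode reproducible after the fact.
    print([ep["seed"] for ep in info["micro"] if not ep["success"]])  # -> [90210]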
From d43fa600a02b324c04442d30917922d41d81c645 Mon Sep 17 00:00:00 2001
From: Alexander Soare
Date: Fri, 22 Mar 2024 15:32:55 +0000
Subject: [PATCH 3/4] only try to start env if it is closed

---
 lerobot/scripts/eval.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py
index 4135570b..f0c2f074 100644
--- a/lerobot/scripts/eval.py
+++ b/lerobot/scripts/eval.py
@@ -90,7 +90,8 @@ def eval_policy(
         if policy is not None:
             policy.clear_action_queue()

-        env.start()  # needed to be able to get the seeds the first time as BatchedEnvs are lazy
+        if env.is_closed:
+            env.start()  # needed to be able to get the seeds the first time as BatchedEnvs are lazy
         seeds.extend(env._next_seed)
         with torch.inference_mode():
From bd40ffc53cebb6ba1d4be27c5d5fb2a0c6e9847e Mon Sep 17 00:00:00 2001
From: Alexander Soare
Date: Fri, 22 Mar 2024 15:43:45 +0000
Subject: [PATCH 4/4] revision

---
 lerobot/scripts/eval.py  | 6 +++---
 lerobot/scripts/train.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py
index f0c2f074..b3d107ab 100644
--- a/lerobot/scripts/eval.py
+++ b/lerobot/scripts/eval.py
@@ -146,7 +146,7 @@ def eval_policy(
     info = {
-        "micro": [
+        "per_episode": [
             {
                 "episode_ix": i,
                 "sum_reward": sum_reward,
                 "max_reward": max_reward,
                 "success": success,
                 "seed": seed,
             }
@@ -164,7 +164,7 @@ def eval_policy(
                 )
             )
         ],
-        "macro": {
+        "aggregated": {
             "avg_sum_reward": np.nanmean(sum_rewards[:num_episodes]),
             "avg_max_reward": np.nanmean(max_rewards[:num_episodes]),
             "pc_success": np.nanmean(successes[:num_episodes]) * 100,
@@ -218,7 +218,7 @@ def eval(cfg: dict, out_dir=None, stats_path=None):
         max_steps=cfg.env.episode_length,
         num_episodes=cfg.eval_episodes,
     )
-    print(info["macro"])
+    print(info["aggregated"])

     # Save info
     with open(Path(out_dir) / "eval_info.json", "w") as f:
         json.dump(info, f, indent=2)
diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py
index a9ecbf07..cf71ad2e 100644
--- a/lerobot/scripts/train.py
+++ b/lerobot/scripts/train.py
@@ -183,7 +183,7 @@ def train(cfg: dict, out_dir=None, job_name=None):
                 video_dir=Path(out_dir) / "eval",
                 save_video=True,
             )
-            log_eval_info(logger, eval_info["macro"], step, cfg, offline_buffer, is_offline)
+            log_eval_info(logger, eval_info["aggregated"], step, cfg, offline_buffer, is_offline)
             if cfg.wandb.enable:
                 logger.log_video(first_video, step, mode="eval")
             logging.info("Resume training")
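With PATCH 4/4 applied, the saved file uses the final key names ("per_episode" and "aggregated"). The sketch below, not part of the patch, shows how the saved eval_info.json can be read back to find which episodes failed and which seeds to re-run them with; the output directory is hypothetical, since eval.py writes eval_info.json into whatever out_dir it was given.

    import json
    from pathlib import Path

    out_dir = Path("outputs/eval/pusht")  # hypothetical; use the out_dir passed to eval()

    with open(out_dir / "eval_info.json") as f:
        info = json.load(f)

    print(f"success rate: {info['aggregated']['pc_success']:.1f}%")

    # "per_episode" keeps one record per evaluated episode, including the seed used
    # for its reset, so a failing episode can be replayed deterministically.
    failed = [ep for ep in info["per_episode"] if not ep["success"]]
    for ep in sorted(failed, key=lambda ep: ep["max_reward"]):
        print(f"episode {ep['episode_ix']}: max_reward={ep['max_reward']:.3f}, seed={ep['seed']}")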