add fixes for reproducibility

Alexander Soare 2024-03-22 15:06:57 +00:00
parent b9047fbdd2
commit 15ff3b3af8
4 changed files with 45 additions and 14 deletions

View File

@@ -33,7 +33,7 @@ class PushTEnv(gym.Env):
     def __init__(
         self,
-        legacy=False,
+        legacy=True,  # compatibility with original
         block_cog=None,
        damping=None,
        render_action=True,
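
Note: in practice this means a default-constructed env now matches the original implementation. A minimal sketch (the import path and constructor signature are taken from this diff; everything else is illustrative):

    from lerobot.common.envs.pusht.pusht_env import PushTEnv

    env = PushTEnv()                  # legacy=True by default: reproduces the original env's behavior
    env_new = PushTEnv(legacy=False)  # non-legacy behavior must now be requested explicitly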

View File

@@ -7,7 +7,8 @@ from lerobot.common.envs.pusht.pusht_env import PushTEnv
 class PushTImageEnv(PushTEnv):
     metadata = {"render.modes": ["rgb_array"], "video.frames_per_second": 10}

-    def __init__(self, legacy=False, block_cog=None, damping=None, render_size=96):
+    # Note: legacy defaults to True for compatibility with original
+    def __init__(self, legacy=True, block_cog=None, damping=None, render_size=96):
         super().__init__(
             legacy=legacy, block_cog=block_cog, damping=damping, render_size=render_size, render_action=False
         )

View File

@@ -1,3 +1,4 @@
+import json
 import logging
 import threading
 import time
@@ -41,6 +42,7 @@ def eval_policy(
     sum_rewards = []
     max_rewards = []
     successes = []
+    seeds = []
     threads = []  # for video saving threads
     episode_counter = 0  # for saving the correct number of videos
@@ -53,11 +55,15 @@
             if save_video or (return_first_video and i == 0):  # noqa: B023
                 ep_frames.append(env.render())  # noqa: B023

+        # Clear the policy's action queue before the start of a new rollout.
+        if policy is not None:
+            policy.clear_action_queue()
+
+        env.start()  # needed to be able to get the seeds the first time as BatchedEnvs are lazy
+        seeds.extend(env._next_seed)
         with torch.inference_mode():
             # TODO(alexander-soare): When `break_when_any_done == False` this rolls out for max_steps even when all
             # envs are done the first time. But we only use the first rollout. This is a waste of compute.
-            if policy is not None:
-                policy.clear_action_queue()
             rollout = env.rollout(
                 max_steps=max_steps,
                 policy=policy,
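
Note: the explicit `env.start()` is needed because TorchRL's batched envs are lazily instantiated, so `_next_seed` only exists once the sub-envs do; clearing the action queue before (rather than inside) the rollout presumably keeps each episode's policy state deterministic. A hypothetical replay sketch built on the recorded seeds (`make_env` is a stand-in factory that does not appear in this diff; `set_seed` is the standard TorchRL env method):

    # Re-run one recorded episode deterministically from its stored seed.
    replay_env = make_env(cfg)          # hypothetical: however the project builds envs
    replay_env.set_seed(int(seeds[0]))  # TorchRL envs are seeded via set_seed()
    replay_env.reset()                  # same initial state as the evaluated episode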
@@ -65,8 +71,8 @@
                 callback=maybe_render_frame,
                 break_when_any_done=env.batch_size[0] == 1,
             )
-        # Figure out where in each rollout sequence the first done condition was encountered (results after this won't
-        # be included).
+        # Figure out where in each rollout sequence the first done condition was encountered (results after
+        # this won't be included).
         # Note: this assumes that the shape of the done key is (batch_size, max_steps, 1).
         # Note: this relies on a property of argmax: that it returns the first occurrence as a tiebreaker.
         rollout_steps = rollout["next", "done"].shape[1]
@@ -108,11 +114,31 @@
         thread.join()

     info = {
-        "avg_sum_reward": np.nanmean(sum_rewards[:num_episodes]),
-        "avg_max_reward": np.nanmean(max_rewards[:num_episodes]),
-        "pc_success": np.nanmean(successes[:num_episodes]) * 100,
-        "eval_s": time.time() - start,
-        "eval_ep_s": (time.time() - start) / num_episodes,
+        "micro": [
+            {
+                "episode_ix": i,
+                "sum_reward": sum_reward,
+                "max_reward": max_reward,
+                "success": success,
+                "seed": seed,
+            }
+            for i, (sum_reward, max_reward, success, seed) in enumerate(
+                zip(
+                    sum_rewards[:num_episodes],
+                    max_rewards[:num_episodes],
+                    successes[:num_episodes],
+                    seeds[:num_episodes],
+                    strict=True,
+                )
+            )
+        ],
+        "macro": {
+            "avg_sum_reward": np.nanmean(sum_rewards[:num_episodes]),
+            "avg_max_reward": np.nanmean(max_rewards[:num_episodes]),
+            "pc_success": np.nanmean(successes[:num_episodes]) * 100,
+            "eval_s": time.time() - start,
+            "eval_ep_s": (time.time() - start) / num_episodes,
+        },
     }
     if return_first_video:
         return info, first_video
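
Note: the per-episode "micro" records make post-hoc slicing possible. A small sketch against the `info` dict built above (the layout is from this diff; the analysis itself is illustrative):

    # Collect the seeds of unsuccessful episodes for later replay/debugging.
    failed_seeds = [ep["seed"] for ep in info["micro"] if not ep["success"]]

    # The "macro" aggregate should agree with the "micro" records:
    pc_success = 100 * sum(ep["success"] for ep in info["micro"]) / len(info["micro"])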
@@ -156,7 +182,7 @@ def eval(cfg: dict, out_dir=None):
         # when policy is None, rollout a random policy
         policy = None

-    metrics = eval_policy(
+    info = eval_policy(
         env,
         policy=policy,
         save_video=True,
@@ -165,7 +191,11 @@
         max_steps=cfg.env.episode_length,
         num_episodes=cfg.eval_episodes,
     )
-    print(metrics)
+    print(info["macro"])
+
+    # Save info
+    with open(Path(out_dir) / "eval_info.json", "w") as f:
+        json.dump(info, f, indent=2)

     logging.info("End of eval")

View File

@@ -183,7 +183,7 @@ def train(cfg: dict, out_dir=None, job_name=None):
                 video_dir=Path(out_dir) / "eval",
                 save_video=True,
             )
-            log_eval_info(logger, eval_info, step, cfg, offline_buffer, is_offline)
+            log_eval_info(logger, eval_info["macro"], step, cfg, offline_buffer, is_offline)
             if cfg.wandb.enable:
                 logger.log_video(first_video, step, mode="eval")
             logging.info("Resume training")