add fixes for reproducibility

Alexander Soare 2024-03-22 15:06:57 +00:00
parent b9047fbdd2
commit 15ff3b3af8
4 changed files with 45 additions and 14 deletions

View File

@@ -33,7 +33,7 @@ class PushTEnv(gym.Env):
     def __init__(
         self,
-        legacy=False,
+        legacy=True,  # compatibility with original
         block_cog=None,
        damping=None,
        render_action=True,
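
Note: in practice this means a default-constructed env now matches the original implementation. A minimal sketch (the import path and constructor signature are taken from this diff; everything else is illustrative):

    from lerobot.common.envs.pusht.pusht_env import PushTEnv

    env = PushTEnv()                  # legacy=True by default: reproduces the original env's behavior
    env_new = PushTEnv(legacy=False)  # non-legacy behavior must now be requested explicitly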

View File

@@ -7,7 +7,8 @@ from lerobot.common.envs.pusht.pusht_env import PushTEnv
 class PushTImageEnv(PushTEnv):
     metadata = {"render.modes": ["rgb_array"], "video.frames_per_second": 10}

-    def __init__(self, legacy=False, block_cog=None, damping=None, render_size=96):
+    # Note: legacy defaults to True for compatibility with original
+    def __init__(self, legacy=True, block_cog=None, damping=None, render_size=96):
         super().__init__(
             legacy=legacy, block_cog=block_cog, damping=damping, render_size=render_size, render_action=False
         )

View File

@@ -1,3 +1,4 @@
+import json
 import logging
 import threading
 import time
@@ -41,6 +42,7 @@ def eval_policy(
     sum_rewards = []
     max_rewards = []
     successes = []
+    seeds = []
     threads = []  # for video saving threads
     episode_counter = 0  # for saving the correct number of videos
@@ -53,11 +55,15 @@
             if save_video or (return_first_video and i == 0):  # noqa: B023
                 ep_frames.append(env.render())  # noqa: B023

+        # Clear the policy's action queue before the start of a new rollout.
+        if policy is not None:
+            policy.clear_action_queue()
+
+        env.start()  # needed to be able to get the seeds the first time as BatchedEnvs are lazy
+        seeds.extend(env._next_seed)
         with torch.inference_mode():
             # TODO(alexander-soare): When `break_when_any_done == False` this rolls out for max_steps even when all
             # envs are done the first time. But we only use the first rollout. This is a waste of compute.
-            if policy is not None:
-                policy.clear_action_queue()
             rollout = env.rollout(
                 max_steps=max_steps,
                 policy=policy,
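
Note: the explicit `env.start()` is needed because TorchRL's batched envs are lazily instantiated, so `_next_seed` only exists once the sub-envs do; clearing the action queue before (rather than inside) the rollout presumably keeps each episode's policy state deterministic. A hypothetical replay sketch built on the recorded seeds (`make_env` is a stand-in factory that does not appear in this diff; `set_seed` is the standard TorchRL env method):

    # Re-run one recorded episode deterministically from its stored seed.
    replay_env = make_env(cfg)          # hypothetical: however the project builds envs
    replay_env.set_seed(int(seeds[0]))  # TorchRL envs are seeded via set_seed()
    replay_env.reset()                  # same initial state as the evaluated episode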
@@ -65,8 +71,8 @@
                 callback=maybe_render_frame,
                 break_when_any_done=env.batch_size[0] == 1,
             )
-        # Figure out where in each rollout sequence the first done condition was encountered (results after this won't
-        # be included).
+        # Figure out where in each rollout sequence the first done condition was encountered (results after
+        # this won't be included).
         # Note: this assumes that the shape of the done key is (batch_size, max_steps, 1).
         # Note: this relies on a property of argmax: that it returns the first occurrence as a tiebreaker.
         rollout_steps = rollout["next", "done"].shape[1]
@@ -108,11 +114,31 @@
         thread.join()

     info = {
-        "avg_sum_reward": np.nanmean(sum_rewards[:num_episodes]),
-        "avg_max_reward": np.nanmean(max_rewards[:num_episodes]),
-        "pc_success": np.nanmean(successes[:num_episodes]) * 100,
-        "eval_s": time.time() - start,
-        "eval_ep_s": (time.time() - start) / num_episodes,
+        "micro": [
+            {
+                "episode_ix": i,
+                "sum_reward": sum_reward,
+                "max_reward": max_reward,
+                "success": success,
+                "seed": seed,
+            }
+            for i, (sum_reward, max_reward, success, seed) in enumerate(
+                zip(
+                    sum_rewards[:num_episodes],
+                    max_rewards[:num_episodes],
+                    successes[:num_episodes],
+                    seeds[:num_episodes],
+                    strict=True,
+                )
+            )
+        ],
+        "macro": {
+            "avg_sum_reward": np.nanmean(sum_rewards[:num_episodes]),
+            "avg_max_reward": np.nanmean(max_rewards[:num_episodes]),
+            "pc_success": np.nanmean(successes[:num_episodes]) * 100,
+            "eval_s": time.time() - start,
+            "eval_ep_s": (time.time() - start) / num_episodes,
+        },
     }
     if return_first_video:
         return info, first_video
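
Note: the per-episode "micro" records make post-hoc slicing possible. A small sketch against the `info` dict built above (the layout is from this diff; the analysis itself is illustrative):

    # Collect the seeds of unsuccessful episodes for later replay/debugging.
    failed_seeds = [ep["seed"] for ep in info["micro"] if not ep["success"]]

    # The "macro" aggregate should agree with the "micro" records:
    pc_success = 100 * sum(ep["success"] for ep in info["micro"]) / len(info["micro"])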
@@ -156,7 +182,7 @@ def eval(cfg: dict, out_dir=None):
         # when policy is None, rollout a random policy
         policy = None

-    metrics = eval_policy(
+    info = eval_policy(
         env,
         policy=policy,
         save_video=True,
@@ -165,7 +191,11 @@
         max_steps=cfg.env.episode_length,
         num_episodes=cfg.eval_episodes,
     )
-    print(metrics)
+    print(info["macro"])
+
+    # Save info
+    with open(Path(out_dir) / "eval_info.json", "w") as f:
+        json.dump(info, f, indent=2)

     logging.info("End of eval")

View File

@@ -183,7 +183,7 @@ def train(cfg: dict, out_dir=None, job_name=None):
                 video_dir=Path(out_dir) / "eval",
                 save_video=True,
             )
-            log_eval_info(logger, eval_info, step, cfg, offline_buffer, is_offline)
+            log_eval_info(logger, eval_info["macro"], step, cfg, offline_buffer, is_offline)
             if cfg.wandb.enable:
                 logger.log_video(first_video, step, mode="eval")
             logging.info("Resume training")