From c62382413984c937e2cdcba45d215f6f640883c1 Mon Sep 17 00:00:00 2001
From: Michel Aractingi
Date: Tue, 11 Feb 2025 11:34:46 +0100
Subject: [PATCH] - Added JointMaskingActionSpace wrapper in `gym_manipulator`
 to select which joints are controlled, e.g. gripper actions can be disabled
 for some tasks.

- Added NaN detection mechanisms in the actor, learner and gym_manipulator for
  the cases where NaNs appear in the loop.
- Changed `non_blocking` in the `.to(device)` calls to be enabled only on CUDA,
  since non-blocking transfers were causing NaNs when running the policy on MPS.
- Added joint clipping and limits to the env, robot and policy configs.
  TODO: clean this up and keep the limits in a single config file.

Co-authored-by: Adil Zouitine
---
 .../hilserl/classifier/modeling_classifier.py |   4 +-
 lerobot/configs/env/so100_real.yaml           |   4 +-
 lerobot/configs/policy/sac_real.yaml          |  10 +-
 lerobot/configs/robot/so100.yaml              |   9 +-
 lerobot/scripts/server/actor_server.py        |  21 +++-
 lerobot/scripts/server/buffer.py              |  17 ++-
 lerobot/scripts/server/find_joint_limits.py   |   1 +
 lerobot/scripts/server/gym_manipulator.py     | 106 ++++++++++++++++--
 lerobot/scripts/server/learner_server.py      |  20 +++-
 9 files changed, 161 insertions(+), 31 deletions(-)

diff --git a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py
index 58532302..c5485227 100644
--- a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py
+++ b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py
@@ -145,8 +145,8 @@ class Classifier(

         return ClassifierOutput(logits=logits, probabilities=probabilities, hidden_states=encoder_outputs)

-    def predict_reward(self, x):
+    def predict_reward(self, x, threshold=0.6):
         if self.config.num_classes == 2:
-            return (self.forward(x).probabilities > 0.6).float()
+            return (self.forward(x).probabilities > threshold).float()
         else:
             return torch.argmax(self.forward(x).probabilities, dim=1)
diff --git a/lerobot/configs/env/so100_real.yaml b/lerobot/configs/env/so100_real.yaml
index 862ea951..82dcfeea 100644
--- a/lerobot/configs/env/so100_real.yaml
+++ b/lerobot/configs/env/so100_real.yaml
@@ -18,10 +18,12 @@ env:
     control_time_s: 20
     reset_follower_pos: true
     use_relative_joint_positions: true
-    reset_time_s: 10
+    reset_time_s: 5
     display_cameras: false
     delta_action: 0.1
+    joint_masking_action_space: [1, 1, 1, 1, 0, 0] # disable wrist and gripper

 reward_classifier:
   pretrained_path: outputs/classifier/checkpoints/best/pretrained_model
   config_path: lerobot/configs/policy/hilserl_classifier.yaml
+
\ No newline at end of file
diff --git a/lerobot/configs/policy/sac_real.yaml b/lerobot/configs/policy/sac_real.yaml
index cbce0b00..de0ffe9b 100644
--- a/lerobot/configs/policy/sac_real.yaml
+++ b/lerobot/configs/policy/sac_real.yaml
@@ -20,7 +20,7 @@ training:
   lr: 3e-4

   eval_freq: 2500
-  log_freq: 500
+  log_freq: 1
   save_freq: 2000000

   online_steps: 1000000
@@ -31,7 +31,7 @@ training:
   online_env_seed: 10000
   online_buffer_capacity: 1000000
   online_buffer_seed_size: 0
-  online_step_before_learning: 100 #5000
+  online_step_before_learning: 1000 #5000
   do_online_rollout_async: false
   policy_update_freq: 1

@@ -76,8 +76,10 @@ policy:
       mean: [0.485, 0.456, 0.406]
       std: [0.229, 0.224, 0.225]
     observation.state:
-      min: [-88.50586, 23.81836, 0.87890625, -32.16797, 78.66211, 0.53691274]
-      max: [84.55078, 187.11914, 145.98633, 101.60156, 146.60156, 88.18792]
+      min: [-87.09961, 62.402344, 67.23633, 36.035156, 77.34375, 0.53691274]
+      max: [58.183594, 131.83594, 145.98633, 82.08984, 78.22266, 0.60402685]
+      # min: [-88.50586, 23.81836, 0.87890625, -32.16797, 78.66211, 0.53691274]
+      # max: [84.55078, 187.11914, 145.98633, 101.60156, 146.60156, 88.18792]

   output_normalization_modes:
     action: min_max
diff --git a/lerobot/configs/robot/so100.yaml b/lerobot/configs/robot/so100.yaml
index d57ae721..59c52a6d 100644
--- a/lerobot/configs/robot/so100.yaml
+++ b/lerobot/configs/robot/so100.yaml
@@ -15,8 +15,13 @@ calibration_dir: .cache/calibration/so100
 # the number of motors in your follower arms.
 max_relative_target: null
 joint_position_relative_bounds:
-  min: [-88.50586, 23.81836, 0.87890625, -32.16797, 78.66211, 0.53691274]
-  max: [84.55078, 187.11914, 145.98633, 101.60156, 146.60156, 88.18792]
+  min: [-87.09961, 62.402344, 67.23633, 36.035156, 77.34375,
+        0.53691274]
+  max: [58.183594, 131.83594, 145.98633, 82.08984, 78.22266,
+        0.60402685]
+
+  # min: [-88.50586, 23.81836, 0.87890625, -32.16797, 78.66211, 0.53691274]
+  # max: [84.55078, 187.11914, 145.98633, 101.60156, 146.60156, 88.18792]

 leader_arms:
   main:
diff --git a/lerobot/scripts/server/actor_server.py b/lerobot/scripts/server/actor_server.py
index 28c582d2..7ee91b2c 100644
--- a/lerobot/scripts/server/actor_server.py
+++ b/lerobot/scripts/server/actor_server.py
@@ -101,10 +101,14 @@ class ActorServiceServicer(hilserl_pb2_grpc.ActorServiceServicer):
             message = message_queue.get(block=True)

             if message.transition is not None:
-                transition_to_send_to_learner = [
-                    move_transition_to_device(T, device="cpu") for T in message.transition
+                transition_to_send_to_learner: list[Transition] = [
+                    move_transition_to_device(transition=T, device="cpu") for T in message.transition
                 ]
-
+                # Check for NaNs in transitions before sending to learner
+                for transition in transition_to_send_to_learner:
+                    for key, value in transition["state"].items():
+                        if torch.isnan(value).any():
+                            logging.warning(f"Found NaN values in transition {key}")
                 buf = io.BytesIO()
                 torch.save(transition_to_send_to_learner, buf)
                 transition_bytes = buf.getvalue()
@@ -226,7 +230,7 @@ def act_with_policy(cfg: DictConfig, robot: Robot, reward_classifier: nn.Module)
             with TimerManager(
                 elapsed_time_list=list_policy_time, label="Policy inference time", log=False
             ) as timer:  # noqa: F841
-                action = policy.select_action(batch=obs) * 0.0
+                action = policy.select_action(batch=obs)
             policy_fps = 1.0 / (list_policy_time[-1] + 1e-9)

             log_policy_frequency_issue(policy_fps=policy_fps, cfg=cfg, interaction_step=interaction_step)
@@ -238,7 +242,9 @@ def act_with_policy(cfg: DictConfig, robot: Robot, reward_classifier: nn.Module)
             next_obs, reward, done, truncated, info = online_env.step(action)

             # HACK: We have only one env but we want to batch it, it will be resolved with the torch box
-            action = torch.from_numpy(action[0]).to(device, non_blocking=True).unsqueeze(dim=0)
+            action = (
+                torch.from_numpy(action[0]).to(device, non_blocking=device.type == "cuda").unsqueeze(dim=0)
+            )

             sum_reward_episode += float(reward)
@@ -247,6 +253,11 @@ def act_with_policy(cfg: DictConfig, robot: Robot, reward_classifier: nn.Module)
                 # TODO: Check the shape
                 action = info["action_intervention"]

+            # Check for NaN values in observations
+            for key, tensor in obs.items():
+                if torch.isnan(tensor).any():
+                    logging.error(f"[ACTOR] NaN values found in obs[{key}] at step {interaction_step}")
+
             list_transition_to_send_to_learner.append(
                 Transition(
                     state=obs,
diff --git a/lerobot/scripts/server/buffer.py b/lerobot/scripts/server/buffer.py
index 8be21365..6caa9df7 100644
--- a/lerobot/scripts/server/buffer.py
+++ b/lerobot/scripts/server/buffer.py
@@ -43,16 +43,27 @@ class BatchTransition(TypedDict):

 def move_transition_to_device(transition: Transition, device: str = "cpu") -> Transition:
     # Move state tensors to CPU
-    transition["state"] = {key: val.to(device, non_blocking=True) for key, val in transition["state"].items()}
+    device = torch.device(device)
+    transition["state"] = {
+        key: val.to(device, non_blocking=device.type == "cuda") for key, val in transition["state"].items()
+    }

     # Move action to CPU
-    transition["action"] = transition["action"].to(device, non_blocking=True)
+    transition["action"] = transition["action"].to(device, non_blocking=device.type == "cuda")

     # No need to move reward or done, as they are float and bool
+    # No need to move reward or done, as they are float and bool
+    if isinstance(transition["reward"], torch.Tensor):
+        transition["reward"] = transition["reward"].to(device=device, non_blocking=device.type == "cuda")
+
+    if isinstance(transition["done"], torch.Tensor):
+        transition["done"] = transition["done"].to(device, non_blocking=device.type == "cuda")
+
     # Move next_state tensors to CPU
     transition["next_state"] = {
-        key: val.to(device, non_blocking=True) for key, val in transition["next_state"].items()
+        key: val.to(device, non_blocking=device.type == "cuda")
+        for key, val in transition["next_state"].items()
     }

     # If complementary_info is present, move its tensors to CPU
diff --git a/lerobot/scripts/server/find_joint_limits.py b/lerobot/scripts/server/find_joint_limits.py
index 6ec9d89f..1c2443d6 100644
--- a/lerobot/scripts/server/find_joint_limits.py
+++ b/lerobot/scripts/server/find_joint_limits.py
@@ -40,6 +40,7 @@ def find_joint_bounds(
             min = np.min(np.stack(pos_list), 0)
             print(f"Max angle position per joint {max}")
             print(f"Min angle position per joint {min}")
+            break


 if __name__ == "__main__":
diff --git a/lerobot/scripts/server/gym_manipulator.py b/lerobot/scripts/server/gym_manipulator.py
index 5bf51868..09b979c5 100644
--- a/lerobot/scripts/server/gym_manipulator.py
+++ b/lerobot/scripts/server/gym_manipulator.py
@@ -296,12 +296,17 @@ class RewardWrapper(gym.Wrapper):
         # NOTE: We got 15% speedup by compiling the model
         self.reward_classifier = torch.compile(reward_classifier)
+
+        if isinstance(device, str):
+            device = torch.device(device)
         self.device = device

     def step(self, action):
         observation, _, terminated, truncated, info = self.env.step(action)
         images = [
-            observation[key].to(self.device, non_blocking=True) for key in observation if "image" in key
+            observation[key].to(self.device, non_blocking=self.device.type == "cuda")
+            for key in observation
+            if "image" in key
         ]
         start_time = time.perf_counter()
         with torch.inference_mode():
@@ -309,12 +314,76 @@ class RewardWrapper(gym.Wrapper):
                 self.reward_classifier.predict_reward(images) if self.reward_classifier is not None else 0.0
             )
         info["Reward classifer frequency"] = 1 / (time.perf_counter() - start_time)
+
+        if reward == 1.0:
+            terminated = True
         return observation, reward, terminated, truncated, info

     def reset(self, seed=None, options=None):
         return self.env.reset(seed=seed, options=options)


+class JointMaskingActionSpace(gym.ActionWrapper):
+    def __init__(self, env, mask):
+        """
+        Wrapper to mask out dimensions of the action space.
+
+        Args:
+            env: The environment to wrap
+            mask: Binary mask array where 0 indicates dimensions to remove
+        """
+        super().__init__(env)
+
+        # Validate mask matches action space
+
+        # Keep only dimensions where mask is 1
+        self.active_dims = np.where(mask)[0]
+
+        if isinstance(env.action_space, gym.spaces.Box):
+            if len(mask) != env.action_space.shape[0]:
+                raise ValueError("Mask length must match action space dimensions")
+            low = env.action_space.low[self.active_dims]
+            high = env.action_space.high[self.active_dims]
+            self.action_space = gym.spaces.Box(low=low, high=high, dtype=env.action_space.dtype)
+
+        if isinstance(env.action_space, gym.spaces.Tuple):
+            if len(mask) != env.action_space[0].shape[0]:
+                raise ValueError("Mask length must match action space 0 dimensions")
+
+            low = env.action_space[0].low[self.active_dims]
+            high = env.action_space[0].high[self.active_dims]
+            action_space_masked = gym.spaces.Box(low=low, high=high, dtype=env.action_space[0].dtype)
+            self.action_space = gym.spaces.Tuple((action_space_masked, env.action_space[1]))
+            # Create new action space with masked dimensions
+
+    def action(self, action):
+        """
+        Convert masked action back to full action space.
+
+        Args:
+            action: Action in masked space. For Tuple spaces, the first element is masked.
+
+        Returns:
+            Action in original space with masked dims set to 0.
+        """
+
+        # Determine whether we are handling a Tuple space or a Box.
+        if isinstance(self.env.action_space, gym.spaces.Tuple):
+            # Extract the masked component from the tuple.
+            masked_action = action[0] if isinstance(action, tuple) else action
+            # Create a full action for the Box element.
+            full_box_action = np.zeros(self.env.action_space[0].shape, dtype=self.env.action_space[0].dtype)
+            full_box_action[self.active_dims] = masked_action
+            # Return a tuple with the reconstructed Box action and the unchanged remainder.
+            return (full_box_action, action[1])
+        else:
+            # For Box action spaces.
+            masked_action = action if not isinstance(action, tuple) else action[0]
+            full_action = np.zeros(self.env.action_space.shape, dtype=self.env.action_space.dtype)
+            full_action[self.active_dims] = masked_action
+            return full_action
+
+
 class TimeLimitWrapper(gym.Wrapper):
     def __init__(self, env, control_time_s, fps):
         self.env = env
@@ -331,13 +400,12 @@ class TimeLimitWrapper(gym.Wrapper):
     def step(self, action):
         obs, reward, terminated, truncated, info = self.env.step(action)
         time_since_last_step = time.perf_counter() - self.last_timestamp
-        # logging.warning(f"Current timestep is lower than the expected fps {self.fps}")
         self.episode_time_in_s += time_since_last_step
         self.last_timestamp = time.perf_counter()
         self.current_step += 1
         # check if last timestep took more time than the expected fps
-        # if 1.0 / time_since_last_step < self.fps:
-        #     logging.warning(f"Current timestep exceeded expected fps {self.fps}")
+        if 1.0 / time_since_last_step < self.fps:
+            logging.debug(f"Current timestep exceeded expected fps {self.fps}")

         if self.episode_time_in_s > self.control_time_s:
             # if self.current_step >= self.max_episode_steps:
@@ -360,7 +428,7 @@ class ImageCropResizeWrapper(gym.Wrapper):
         print(f"obs_keys , {self.env.observation_space}")
         print(f"crop params dict {crop_params_dict.keys()}")
         for key_crop in crop_params_dict:
-            if key_crop not in self.env.observation_space.keys():
+            if key_crop not in self.env.observation_space.keys():  # noqa: SIM118
                 raise ValueError(f"Key {key_crop} not in observation space")
         for key in crop_params_dict:
             top, left, height, width = crop_params_dict[key]
@@ -375,14 +443,23 @@ class ImageCropResizeWrapper(gym.Wrapper):
         obs, reward, terminated, truncated, info = self.env.step(action)
         for k in self.crop_params_dict:
             device = obs[k].device
+
+            # Check for NaNs before processing
+            if torch.isnan(obs[k]).any():
+                logging.error(f"NaN values detected in observation {k} before crop and resize")
+
             if device == torch.device("mps:0"):
                 obs[k] = obs[k].cpu()
+
             obs[k] = F.crop(obs[k], *self.crop_params_dict[k])
             obs[k] = F.resize(obs[k], self.resize_size)
+
+            # Check for NaNs after processing
+            if torch.isnan(obs[k]).any():
+                logging.error(f"NaN values detected in observation {k} after crop and resize")
+
             obs[k] = obs[k].to(device)
-            # print(f"observation with key {k} with size {obs[k].size()}")
-            # cv2.imshow(k, cv2.cvtColor(obs[k].cpu().squeeze(0).permute(1, 2, 0).numpy(), cv2.COLOR_RGB2BGR))
-            # cv2.waitKey(1)
+
         return obs, reward, terminated, truncated, info

     def reset(self, seed=None, options=None):
@@ -400,12 +477,18 @@ class ImageCropResizeWrapper(gym.Wrapper):
 class ConvertToLeRobotObservation(gym.ObservationWrapper):
     def __init__(self, env, device):
         super().__init__(env)
+
+        if isinstance(device, str):
+            device = torch.device(device)
         self.device = device

     def observation(self, observation):
         observation = preprocess_observation(observation)

-        observation = {key: observation[key].to(self.device, non_blocking=True) for key in observation}
+        observation = {
+            key: observation[key].to(self.device, non_blocking=self.device.type == "cuda")
+            for key in observation
+        }
         observation = {k: torch.tensor(v, device=self.device) for k, v in observation.items()}
         return observation
@@ -440,7 +523,7 @@ class KeyboardInterfaceWrapper(gym.Wrapper):
                 if key == keyboard.Key.right or key == keyboard.Key.esc:
                     print("Right arrow key pressed. Exiting loop...")
Exiting loop...") self.events["exit_early"] = True - elif key == keyboard.Key.space: + elif key == keyboard.Key.space and not self.events["exit_early"]: if not self.events["pause_policy"]: print( "Space key pressed. Human intervention required.\n" @@ -587,6 +670,7 @@ def make_robot_env( env = TimeLimitWrapper(env=env, control_time_s=cfg.wrapper.control_time_s, fps=cfg.fps) env = KeyboardInterfaceWrapper(env=env) env = ResetWrapper(env=env, reset_fn=None, reset_time_s=cfg.wrapper.reset_time_s) + env = JointMaskingActionSpace(env=env, mask=cfg.wrapper.joint_masking_action_space) env = BatchCompitableWrapper(env=env) return env @@ -596,7 +680,7 @@ def make_robot_env( def get_classifier(pretrained_path, config_path, device="mps"): if pretrained_path is None or config_path is None: - return + return None from lerobot.common.policies.factory import _policy_cfg_from_hydra_cfg from lerobot.common.policies.hilserl.classifier.configuration_classifier import ClassifierConfig diff --git a/lerobot/scripts/server/learner_server.py b/lerobot/scripts/server/learner_server.py index bbd70598..1b54e3a9 100644 --- a/lerobot/scripts/server/learner_server.py +++ b/lerobot/scripts/server/learner_server.py @@ -278,12 +278,23 @@ def learner_push_parameters( torch.save(params_dict, buf) params_bytes = buf.getvalue() - # Push them to the Actor’s "SendParameters" method + # Push them to the Actor's "SendParameters" method logging.info("[LEARNER] Publishing parameters to the Actor") response = actor_stub.SendParameters(hilserl_pb2.Parameters(parameter_bytes=params_bytes)) # noqa: F841 time.sleep(seconds_between_pushes) +def check_nan_in_transition(observations: torch.Tensor, actions: torch.Tensor, next_state: torch.Tensor): + for k in observations: + if torch.isnan(observations[k]).any(): + logging.error(f"observations[{k}] contains NaN values") + for k in next_state: + if torch.isnan(next_state[k]).any(): + logging.error(f"next_state[{k}] contains NaN values") + if torch.isnan(actions).any(): + logging.error("actions contains NaN values") + + def add_actor_information_and_train( cfg, device: str, @@ -372,6 +383,7 @@ def add_actor_information_and_train( observations = batch["state"] next_observations = batch["next_state"] done = batch["done"] + check_nan_in_transition(observations=observations, actions=actions, next_state=next_observations) with policy_lock: loss_critic = policy.compute_loss_critic( @@ -399,6 +411,8 @@ def add_actor_information_and_train( next_observations = batch["next_state"] done = batch["done"] + assert_and_breakpoint(observations=observations, actions=actions, next_state=next_observations) + with policy_lock: loss_critic = policy.compute_loss_critic( observations=observations, @@ -497,8 +511,8 @@ def make_optimizers_and_scheduler(cfg, policy: nn.Module): It also initializes a learning rate scheduler, though currently, it is set to `None`. **NOTE:** - - If the encoder is shared, its parameters are excluded from the actor’s optimization process. - - The policy’s log temperature (`log_alpha`) is wrapped in a list to ensure proper optimization as a standalone tensor. + - If the encoder is shared, its parameters are excluded from the actor's optimization process. + - The policy's log temperature (`log_alpha`) is wrapped in a list to ensure proper optimization as a standalone tensor. Args: cfg: Configuration object containing hyperparameters.