diff --git a/lerobot/common/datasets/lerobot_dataset.py b/lerobot/common/datasets/lerobot_dataset.py
index 5c56d10c..5278987b 100644
--- a/lerobot/common/datasets/lerobot_dataset.py
+++ b/lerobot/common/datasets/lerobot_dataset.py
@@ -84,7 +84,7 @@ class LeRobotDatasetMetadata:

         # Load metadata
         (self.root / "meta").mkdir(exist_ok=True, parents=True)
-        # self.pull_from_repo(allow_patterns="meta/")
+        self.pull_from_repo(allow_patterns="meta/")
         self.info = load_info(self.root)
         self.stats = load_stats(self.root)
         self.tasks = load_tasks(self.root)
@@ -539,7 +539,7 @@ class LeRobotDataset(torch.utils.data.Dataset):

         # HACK: UNCOMMENT IF YOU REVIEW THAT, PLEASE SUGGEST TO UNCOMMENT
         logging.warning("HACK: WE COMMENT THIS LINE, IF SOMETHING IS WEIRD WITH DATASETS UNCOMMENT")
-        # self.pull_from_repo(allow_patterns=files, ignore_patterns=ignore_patterns)
+        self.pull_from_repo(allow_patterns=files, ignore_patterns=ignore_patterns)

     def load_hf_dataset(self) -> datasets.Dataset:
         """hf_dataset contains all the observations, states, actions, rewards, etc."""
diff --git a/lerobot/common/policies/sac/modeling_sac.py b/lerobot/common/policies/sac/modeling_sac.py
index 11830aa1..622919b9 100644
--- a/lerobot/common/policies/sac/modeling_sac.py
+++ b/lerobot/common/policies/sac/modeling_sac.py
@@ -137,7 +137,7 @@ class SACPolicy(
         """Override .to(device) method to involve moving the log_alpha fixed_std"""
         if self.actor.fixed_std is not None:
             self.actor.fixed_std = self.actor.fixed_std.to(*args, **kwargs)
-        self.log_alpha = self.log_alpha.to(*args, **kwargs)
+        # self.log_alpha = self.log_alpha.to(*args, **kwargs)
         super().to(*args, **kwargs)

     @torch.no_grad()
diff --git a/lerobot/configs/policy/sac_real.yaml b/lerobot/configs/policy/sac_real.yaml
index f5607867..9b78f593 100644
--- a/lerobot/configs/policy/sac_real.yaml
+++ b/lerobot/configs/policy/sac_real.yaml
@@ -31,7 +31,7 @@ training:
   online_env_seed: 10000
   online_buffer_capacity: 1000000
   online_buffer_seed_size: 0
-  online_step_before_learning: 1000 #5000
+  online_step_before_learning: 100 #5000
   do_online_rollout_async: false

   policy_update_freq: 1
@@ -61,7 +61,7 @@ policy:
     observation.images.side: [3, 128, 128]
     # observation.image: [3, 128, 128]
   output_shapes:
-    action: ["${env.action_dim}"]
+    action: [4] # ["${env.action_dim}"]

   # Normalization / Unnormalization
   input_normalization_modes:
@@ -84,9 +84,12 @@ policy:
   output_normalization_modes:
     action: min_max
   output_normalization_params:
+    # action:
+    #   min: [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0]
+    #   max: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
     action:
-      min: [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0]
-      max: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
+      min: [-145.283203125, -69.43359375, -78.75, -46.0546875]
+      max: [145.283203125, 69.43359375, 78.75, 46.0546875]

   # Architecture / modeling.
   # Neural networks.
diff --git a/lerobot/scripts/server/actor_server.py b/lerobot/scripts/server/actor_server.py
index 7b1866f9..2be6674c 100644
--- a/lerobot/scripts/server/actor_server.py
+++ b/lerobot/scripts/server/actor_server.py
@@ -201,6 +201,7 @@ def act_with_policy(cfg: DictConfig, robot: Robot, reward_classifier: nn.Module)
         "action": {"min": min_action_space, "max": max_action_space}
     }
     cfg.policy.output_normalization_params = output_normalization_params
+    cfg.policy.output_shapes["action"] = online_env.action_space.spaces[0].shape

     ### Instantiate the policy in both the actor and learner processes
     ### To avoid sending a SACPolicy object through the port, we create a policy intance
@@ -252,6 +253,8 @@ def act_with_policy(cfg: DictConfig, robot: Robot, reward_classifier: nn.Module)
             # NOTE: We overide the action if the intervention is True, because the action applied is the intervention action
             if info["is_intervention"]:
                 # TODO: Check the shape
+                # NOTE: The action space for demonstration before hand is with the full action space
+                # but sometimes for example we want to deactivate the gripper
                 action = info["action_intervention"]
                 episode_intervention = True

diff --git a/lerobot/scripts/server/buffer.py b/lerobot/scripts/server/buffer.py
index 6caa9df7..99f5c55b 100644
--- a/lerobot/scripts/server/buffer.py
+++ b/lerobot/scripts/server/buffer.py
@@ -195,6 +195,7 @@ class ReplayBuffer:
         device: str = "cuda:0",
         state_keys: Optional[Sequence[str]] = None,
         capacity: Optional[int] = None,
+        action_mask: Optional[Sequence[int]] = None,
     ) -> "ReplayBuffer":
         """
         Convert a LeRobotDataset into a ReplayBuffer.
@@ -229,6 +230,12 @@ class ReplayBuffer:
             elif isinstance(v, torch.Tensor):
                 data[k] = v.to(device)

+            if action_mask is not None:
+                if data["action"].dim() == 1:
+                    data["action"] = data["action"][action_mask]
+                else:
+                    data["action"] = data["action"][:, action_mask]
+
             replay_buffer.add(
                 state=data["state"],
                 action=data["action"],
diff --git a/lerobot/scripts/server/gym_manipulator.py b/lerobot/scripts/server/gym_manipulator.py
index baaa3da9..3ed1fdc9 100644
--- a/lerobot/scripts/server/gym_manipulator.py
+++ b/lerobot/scripts/server/gym_manipulator.py
@@ -328,7 +328,7 @@ class RewardWrapper(gym.Wrapper):
         return self.env.reset(seed=seed, options=options)


-class JointMaskingActionSpace(gym.ActionWrapper):
+class JointMaskingActionSpace(gym.Wrapper):
     def __init__(self, env, mask):
         """
         Wrapper to mask out dimensions of the action space.
@@ -388,6 +388,16 @@ class JointMaskingActionSpace(gym.ActionWrapper):
             full_action[self.active_dims] = masked_action
         return full_action

+    def step(self, action):
+        action = self.action(action)
+        obs, reward, terminated, truncated, info = self.env.step(action)
+        if "action_intervention" in info and info["action_intervention"] is not None:
+            if info["action_intervention"].dim() == 1:
+                info["action_intervention"] = info["action_intervention"][self.active_dims]
+            else:
+                info["action_intervention"] = info["action_intervention"][:, self.active_dims]
+        return obs, reward, terminated, truncated, info
+

 class TimeLimitWrapper(gym.Wrapper):
     def __init__(self, env, control_time_s, fps):
diff --git a/lerobot/scripts/server/learner_server.py b/lerobot/scripts/server/learner_server.py
index 460b845d..31976537 100644
--- a/lerobot/scripts/server/learner_server.py
+++ b/lerobot/scripts/server/learner_server.py
@@ -354,7 +354,7 @@ def add_actor_information_and_train(
                 transition = move_transition_to_device(transition, device=device)
                 replay_buffer.add(**transition)

-                if transition.get("complementary_info", {}).get("is_interaction"):
+                if transition.get("complementary_info", {}).get("is_intervention"):
                     offline_replay_buffer.add(**transition)

         while not interaction_message_queue.empty():
@@ -568,6 +568,7 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
     ### To avoid sending a SACPolicy object through the port, we create a policy intance
     ### on both sides, the learner sends the updated parameters every n steps to update the actor's parameters
     # TODO: At some point we should just need make sac policy
+
     policy_lock = Lock()
     policy: SACPolicy = make_policy(
         hydra_cfg=cfg,
@@ -593,8 +594,12 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
     logging.info("make_dataset offline buffer")
     offline_dataset = make_dataset(cfg)
     logging.info("Convertion to a offline replay buffer")
+    active_action_dims = [i for i, mask in enumerate(cfg.env.wrapper.joint_masking_action_space) if mask]
     offline_replay_buffer = ReplayBuffer.from_lerobot_dataset(
-        offline_dataset, device=device, state_keys=cfg.policy.input_shapes.keys()
+        offline_dataset,
+        device=device,
+        state_keys=cfg.policy.input_shapes.keys(),
+        action_mask=active_action_dims,
     )
     batch_size: int = batch_size // 2  # We will sample from both replay buffer
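The changes above have to stay in sync: JointMaskingActionSpace.step trims info["action_intervention"] to the active joints, ReplayBuffer.from_lerobot_dataset trims offline demonstration actions via the new action_mask argument, and sac_real.yaml declares the reduced 4-dimensional action shape and bounds. A minimal sketch of that index bookkeeping, assuming a 6-dimensional full action space with the last two dimensions masked out; the mask and the numbers are illustrative and not taken from the patch:

import torch

# Hypothetical mask: keep the first four joints, drop the last two.
joint_masking_action_space = [True, True, True, True, False, False]

# Same index derivation as the active_action_dims line in learner_server.py above.
active_action_dims = [i for i, m in enumerate(joint_masking_action_space) if m]  # [0, 1, 2, 3]

# A full 6-D demonstration action (made-up values) and a small batch of them.
single = torch.tensor([0.1, -0.2, 0.3, 0.05, 0.0, 1.0])
batch = single.unsqueeze(0).repeat(8, 1)

# Masking mirrors ReplayBuffer.from_lerobot_dataset and JointMaskingActionSpace.step:
# 1-D tensors are indexed directly, batched tensors along the last dimension.
masked_single = single[active_action_dims]   # shape: torch.Size([4])
masked_batch = batch[:, active_action_dims]  # shape: torch.Size([8, 4])

assert masked_single.shape[0] == 4  # matches output_shapes.action: [4] in sac_real.yaml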