From bbb5ba0adfd54d8a1a41b13501e754de4e4daee9 Mon Sep 17 00:00:00 2001
From: Michel Aractingi
Date: Mon, 13 Jan 2025 13:57:49 +0100
Subject: [PATCH] Extend reward classifier for multiple camera views (#626)

---
 lerobot/common/logger.py                      |   2 +-
 .../classifier/configuration_classifier.py    |   1 +
 .../hilserl/classifier/modeling_classifier.py |  16 ++-
 lerobot/common/robot_devices/control_utils.py |   9 ++
 .../configs/policy/hilserl_classifier.yaml    |   9 +-
 lerobot/scripts/control_robot.py              |  13 ++
 lerobot/scripts/eval_on_robot.py              | 123 +++++++++++++-----
 lerobot/scripts/train_hilserl_classifier.py   |   7 +-
 tests/test_train_hilserl_classifier.py        |  61 ++++++++-
 9 files changed, 192 insertions(+), 49 deletions(-)

diff --git a/lerobot/common/logger.py b/lerobot/common/logger.py
index dec8b465..4015492d 100644
--- a/lerobot/common/logger.py
+++ b/lerobot/common/logger.py
@@ -25,13 +25,13 @@ from glob import glob
 from pathlib import Path
 
 import torch
+import wandb
 from huggingface_hub.constants import SAFETENSORS_SINGLE_FILE
 from omegaconf import DictConfig, OmegaConf
 from termcolor import colored
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LRScheduler
 
-import wandb
 
 from lerobot.common.policies.policy_protocol import Policy
 from lerobot.common.utils.utils import get_global_random_state, set_global_random_state
diff --git a/lerobot/common/policies/hilserl/classifier/configuration_classifier.py b/lerobot/common/policies/hilserl/classifier/configuration_classifier.py
index f0b9352f..de3742ec 100644
--- a/lerobot/common/policies/hilserl/classifier/configuration_classifier.py
+++ b/lerobot/common/policies/hilserl/classifier/configuration_classifier.py
@@ -13,6 +13,7 @@ class ClassifierConfig:
     model_name: str = "microsoft/resnet-50"
     device: str = "cpu"
     model_type: str = "cnn"  # "transformer" or "cnn"
+    num_cameras: int = 2
 
     def save_pretrained(self, save_dir):
         """Save config to json file."""
diff --git a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py
index d7bd42cd..4a022335 100644
--- a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py
+++ b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py
@@ -97,7 +97,7 @@ class Classifier(
             raise ValueError("Unsupported transformer architecture since hidden_size is not found")
 
         self.classifier_head = nn.Sequential(
-            nn.Linear(input_dim, self.config.hidden_dim),
+            nn.Linear(input_dim * self.config.num_cameras, self.config.hidden_dim),
            nn.Dropout(self.config.dropout_rate),
             nn.LayerNorm(self.config.hidden_dim),
             nn.ReLU(),
@@ -130,11 +130,11 @@
             return outputs.pooler_output
         return outputs.last_hidden_state[:, 0, :]
 
-    def forward(self, x: torch.Tensor) -> ClassifierOutput:
+    def forward(self, xs: list[torch.Tensor]) -> ClassifierOutput:
         """Forward pass of the classifier."""
         # For training, we expect input to be a tensor directly from LeRobotDataset
-        encoder_output = self._get_encoder_output(x)
-        logits = self.classifier_head(encoder_output)
+        encoder_outputs = torch.hstack([self._get_encoder_output(x) for x in xs])
+        logits = self.classifier_head(encoder_outputs)
 
         if self.config.num_classes == 2:
             logits = logits.squeeze(-1)
@@ -142,4 +142,10 @@
         else:
             probabilities = torch.softmax(logits, dim=-1)
 
-        return ClassifierOutput(logits=logits, probabilities=probabilities, hidden_states=encoder_output)
+        return ClassifierOutput(logits=logits, probabilities=probabilities, hidden_states=encoder_outputs)
+
+    def predict_reward(self, xs):
+        if self.config.num_classes == 2:
+            return (self.forward(xs).probabilities > 0.5).float()
+        else:
+            return torch.argmax(self.forward(xs).probabilities, dim=1)
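A minimal usage sketch of the multi-camera classifier above (not part of the patch): one image batch per camera is passed as a list, each view is encoded separately, and the per-view embeddings are concatenated with `torch.hstack` before the classification head, so `num_cameras` must equal the number of tensors in the list. The tiny test backbone, batch size, and image resolution below are illustrative assumptions only.

```python
# Illustrative sketch, not part of the patch. Uses the tiny backbone from
# tests/test_train_hilserl_classifier.py; batch size and resolution are arbitrary.
import torch

from lerobot.common.policies.hilserl.classifier.configuration_classifier import ClassifierConfig
from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier

config = ClassifierConfig(
    num_classes=2,
    model_name="hf-tiny-model-private/tiny-random-ResNetModel",
    num_cameras=2,  # must equal the number of image tensors passed to forward()
)
model = Classifier(config=config)

top = torch.rand(4, 3, 224, 224)    # batch of images from the first camera
wrist = torch.rand(4, 3, 224, 224)  # batch of images from the second camera

out = model([top, wrist])                     # per-view embeddings are hstack-ed, then classified
rewards = model.predict_reward([top, wrist])  # 0./1. per sample in the binary case
print(out.probabilities.shape, rewards.shape)
```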
diff --git a/lerobot/common/robot_devices/control_utils.py b/lerobot/common/robot_devices/control_utils.py
index ad6f5632..10cb9f5c 100644
--- a/lerobot/common/robot_devices/control_utils.py
+++ b/lerobot/common/robot_devices/control_utils.py
@@ -11,6 +11,7 @@ from copy import copy
 from functools import cache
 
 import cv2
+import numpy as np
 import torch
 import tqdm
 from deepdiff import DeepDiff
@@ -332,6 +333,14 @@ def reset_environment(robot, events, reset_time_s):
         break
 
 
+def reset_follower_position(robot: Robot, target_position):
+    current_position = robot.follower_arms["main"].read("Present_Position")
+    trajectory = torch.from_numpy(np.linspace(current_position, target_position, 30))  # NOTE: 30 is just an arbitrary number
+    for pose in trajectory:
+        robot.send_action(pose)
+        busy_wait(0.015)
+
+
 def stop_recording(robot, listener, display_cameras):
     robot.disconnect()
 
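The NOTE above flags the 30-step trajectory (and the 0.015 s sleep) as arbitrary. One possible refinement, not part of the patch, is to derive the step count from an explicit duration; the function name and the `duration_s`/`fps` arguments below are hypothetical.

```python
# Sketch only. Same interpolation idea as reset_follower_position above, but the
# reset duration is an explicit argument instead of hard-coded constants.
import numpy as np
import torch

from lerobot.common.robot_devices.control_utils import busy_wait


def reset_follower_position_timed(robot, target_position, duration_s: float = 1.0, fps: int = 30):
    """Move the follower arm to `target_position` over roughly `duration_s` seconds."""
    num_steps = max(2, int(duration_s * fps))
    current_position = robot.follower_arms["main"].read("Present_Position")
    trajectory = torch.from_numpy(np.linspace(current_position, target_position, num_steps))
    for pose in trajectory:
        robot.send_action(pose)
        busy_wait(1 / fps)
```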
diff --git a/lerobot/configs/policy/hilserl_classifier.yaml b/lerobot/configs/policy/hilserl_classifier.yaml
index 498c9983..f8137b69 100644
--- a/lerobot/configs/policy/hilserl_classifier.yaml
+++ b/lerobot/configs/policy/hilserl_classifier.yaml
@@ -4,7 +4,7 @@ defaults:
   - _self_
 
 seed: 13
-dataset_repo_id: "dataset_repo_id"
+dataset_repo_id: aractingi/pick_place_lego_cube_1
 train_split_proportion: 0.8
 
 # Required by logger
@@ -24,7 +24,7 @@
   eval_freq: 1 # How often to run validation (in epochs)
   save_freq: 1 # How often to save checkpoints (in epochs)
   save_checkpoint: true
-  image_key: "observation.images.phone"
+  image_keys: ["observation.images.top", "observation.images.wrist"]
   label_key: "next.reward"
 
 eval:
@@ -32,9 +32,10 @@
   num_samples_to_log: 30 # Number of validation samples to log in the table
 
 policy:
-  name: "hilserl/classifier"
+  name: "hilserl/classifier/pick_place_lego_cube_1"
   model_name: "facebook/convnext-base-224"
   model_type: "cnn"
+  num_cameras: 2 # Has to be len(training.image_keys)
 
 wandb:
   enable: false
@@ -44,4 +45,4 @@
 device: "mps"
 resume: false
 
-output_dir: "output"
+output_dir: "outputs/classifier"
diff --git a/lerobot/scripts/control_robot.py b/lerobot/scripts/control_robot.py
index 22529653..9f266e2f 100644
--- a/lerobot/scripts/control_robot.py
+++ b/lerobot/scripts/control_robot.py
@@ -109,6 +109,7 @@ from lerobot.common.robot_devices.control_utils import (
     log_control_info,
     record_episode,
     reset_environment,
+    reset_follower_position,
     sanity_check_dataset_name,
     sanity_check_dataset_robot_compatibility,
     stop_recording,
@@ -205,6 +206,7 @@
     num_image_writer_threads_per_camera: int = 4,
     display_cameras: bool = True,
     play_sounds: bool = True,
+    reset_follower: bool = False,
     resume: bool = False,
     # TODO(rcadene, aliberts): remove local_files_only when refactor with dataset as argument
     local_files_only: bool = False,
@@ -265,6 +267,9 @@
     robot.connect()
     listener, events = init_keyboard_listener(assign_rewards=assign_rewards)
 
+    if reset_follower:
+        initial_position = robot.follower_arms["main"].read("Present_Position")
+
     # Execute a few seconds without recording to:
     # 1. teleoperate the robot to move it in starting position if no policy provided,
     # 2. give time to the robot devices to connect and start synchronizing,
@@ -307,6 +312,8 @@
             (recorded_episodes < num_episodes - 1) or events["rerecord_episode"]
         ):
             log_say("Reset the environment", play_sounds)
+            if reset_follower:
+                reset_follower_position(robot, initial_position)
             reset_environment(robot, events, reset_time_s)
 
         if events["rerecord_episode"]:
@@ -527,6 +534,12 @@
         default=0,
         help="Enables the assignment of rewards to frames (disabled by default). When enabled, frames receive a 0 reward until the space bar is pressed, which assigns a 1 reward. Press the space bar a second time to assign a 0 reward again. The assigned reward is reset to 0 when the episode ends.",
     )
+    parser_record.add_argument(
+        "--reset-follower",
+        type=int,
+        default=0,
+        help="Resets the follower arm to its initial position while the environment is being reset, so the follower does not start the next episode in an awkward position.",
+    )
 
     parser_replay = subparsers.add_parser("replay", parents=[base_parser])
     parser_replay.add_argument(
diff --git a/lerobot/scripts/eval_on_robot.py b/lerobot/scripts/eval_on_robot.py
index 92daa860..842c1a28 100644
--- a/lerobot/scripts/eval_on_robot.py
+++ b/lerobot/scripts/eval_on_robot.py
@@ -23,6 +23,15 @@ python lerobot/scripts/eval_on_robot.py \
     eval.n_episodes=10
 ```
 
+Test the reward classifier with teleoperation (you need to press the space bar to take over):
+```
+python lerobot/scripts/eval_on_robot.py \
+    --robot-path lerobot/configs/robot/so100.yaml \
+    --reward-classifier-pretrained-path outputs/classifier/checkpoints/best/pretrained_model \
+    --reward-classifier-config-file lerobot/configs/policy/hilserl_classifier.yaml \
+    --display-cameras 1
+```
+
 **NOTE** (michel-aractingi): This script is incomplete and it is being prepared
 for running training on the real robot.
 """
 
 import argparse
 import logging
 import time
-from copy import deepcopy
 
+import cv2
 import numpy as np
 import torch
 from tqdm import trange
 
 from lerobot.common.policies.policy_protocol import Policy
-from lerobot.common.robot_devices.control_utils import busy_wait, is_headless
+from lerobot.common.robot_devices.control_utils import busy_wait, is_headless, reset_follower_position
 from lerobot.common.robot_devices.robots.factory import Robot, make_robot
 from lerobot.common.utils.utils import (
     init_hydra_config,
@@ -46,7 +55,33 @@ from lerobot.common.utils.utils import (
 )
 
 
-def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, use_amp: bool = True) -> dict:
+def get_classifier(pretrained_path, config_path):
+    if pretrained_path is None or config_path is None:
+        return
+
+    from lerobot.common.policies.factory import _policy_cfg_from_hydra_cfg
+    from lerobot.common.policies.hilserl.classifier.configuration_classifier import ClassifierConfig
+    from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier
+
+    cfg = init_hydra_config(config_path)
+
+    classifier_config = _policy_cfg_from_hydra_cfg(ClassifierConfig, cfg)
+    classifier_config.num_cameras = len(cfg.training.image_keys)  # TODO automate these paths
+    model = Classifier(classifier_config)
+    model.load_state_dict(Classifier.from_pretrained(pretrained_path).state_dict())
+    model = model.to("mps")
+    return model
+
+
+def rollout(
+    robot: Robot,
+    policy: Policy,
+    reward_classifier,
+    fps: int,
+    control_time_s: float = 20,
+    use_amp: bool = True,
+    display_cameras: bool = False,
+) -> dict:
     """Run a batched policy rollout on the real robot.
 
     The return dictionary contains:
@@ -70,6 +105,7 @@ Returns:
         The dictionary described above.
     """
 
+    # TODO (michel-aractingi): Infer the device from policy parameters when policy is added
     # assert isinstance(policy, nn.Module), "Policy must be a PyTorch nn module."
     # device = get_device_from_parameters(policy)
 
@@ -79,25 +115,21 @@
     # Reset the policy. TODO (michel-aractingi) add real policy evaluation once the code is ready.
     # policy.reset()
 
-    # Get observation from real robot
+    # NOTE: sorting to make sure the key sequence is the same during training and testing.
     observation = robot.capture_observation()
+    image_keys = [key for key in observation if "image" in key]
+    image_keys.sort()
 
-    # Calculate reward. TODO (michel-aractingi)
-    # in HIL-SERL it will be with a reward classifier
-    reward = calculate_reward(observation)
-    all_observations = []
     all_actions = []
     all_rewards = []
     all_successes = []
 
     start_episode_t = time.perf_counter()
+    init_pos = robot.follower_arms["main"].read("Present_Position")
     timestamp = 0.0
     while timestamp < control_time_s:
         start_loop_t = time.perf_counter()
 
-        all_observations.append(deepcopy(observation))
-        # observation = {key: observation[key].to(device, non_blocking=True) for key in observation}
-
         # Apply the next action.
while events["pause_policy"] and not events["human_intervention_step"]: busy_wait(0.5) @@ -109,18 +141,26 @@ def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, else: # explore with policy with torch.inference_mode(): + # TODO (michel-aractingi) replace this part with policy (predict_action) action = robot.follower_arms["main"].read("Present_Position") action = torch.from_numpy(action) robot.send_action(action) # action = predict_action(observation, policy, device, use_amp) observation = robot.capture_observation() - # Calculate reward - # in HIL-SERL it will be with a reward classifier - reward = calculate_reward(observation) + images = [] + for key in image_keys: + if display_cameras: + cv2.imshow(key, cv2.cvtColor(observation[key].numpy(), cv2.COLOR_RGB2BGR)) + cv2.waitKey(1) + images.append(observation[key].to("mps")) + + reward = reward_classifier.predict_reward(images) if reward_classifier is not None else 0.0 + all_rewards.append(reward) + + # print("REWARD : ", reward) all_actions.append(action) - all_rewards.append(torch.from_numpy(reward)) all_successes.append(torch.tensor([False])) dt_s = time.perf_counter() - start_loop_t @@ -131,7 +171,8 @@ def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, events["human_intervention_step"] = False events["pause_policy"] = False break - all_observations.append(deepcopy(observation)) + + reset_follower_position(robot, target_position=init_pos) dones = torch.tensor([False] * len(all_actions)) dones[-1] = True @@ -142,10 +183,6 @@ def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, "next.success": torch.stack(all_successes, dim=1), "done": dones, } - stacked_observations = {} - for key in all_observations[0]: - stacked_observations[key] = torch.stack([obs[key] for obs in all_observations], dim=1) - ret["observation"] = stacked_observations listener.stop() @@ -159,6 +196,9 @@ def eval_policy( n_episodes: int, control_time_s: int = 20, use_amp: bool = True, + display_cameras: bool = False, + reward_classifier_pretrained_path: str | None = None, + reward_classifier_config_file: str | None = None, ) -> dict: """ Args: @@ -179,8 +219,12 @@ def eval_policy( start_eval = time.perf_counter() progbar = trange(n_episodes, desc="Evaluating policy on real robot") - for _batch_idx in progbar: - rollout_data = rollout(robot, policy, fps, control_time_s, use_amp) + reward_classifier = get_classifier(reward_classifier_pretrained_path, reward_classifier_config_file) + + for _ in progbar: + rollout_data = rollout( + robot, policy, reward_classifier, fps, control_time_s, use_amp, display_cameras + ) rollouts.append(rollout_data) sum_rewards.append(sum(rollout_data["next.reward"])) @@ -219,15 +263,6 @@ def eval_policy( return info -def calculate_reward(observation): - """ - Method to calculate reward function in some way. - In HIL-SERL this is done through defining a reward classifier - """ - # reward = reward_classifier(observation) - return np.array([0.0]) - - def init_keyboard_listener(): # Allow to exit early while recording an episode or resetting the environment, # by tapping the right arrow key '->'. 
@@ -324,6 +359,21 @@
             "outputs/eval/{timestamp}_{env_name}_{policy_name}"
         ),
     )
+    parser.add_argument(
+        "--display-cameras", help=("Whether to display the camera feed while the rollout is happening")
+    )
+    parser.add_argument(
+        "--reward-classifier-pretrained-path",
+        type=str,
+        default=None,
+        help="Path to the pretrained classifier weights.",
+    )
+    parser.add_argument(
+        "--reward-classifier-config-file",
+        type=str,
+        default=None,
+        help="Path to a yaml config file that is necessary to build the reward classifier model.",
+    )
 
     args = parser.parse_args()
 
@@ -332,4 +382,13 @@
     if not robot.is_connected:
         robot.connect()
 
-    eval_policy(robot, None, fps=40, n_episodes=2, control_time_s=100)
+    eval_policy(
+        robot,
+        None,
+        fps=40,
+        n_episodes=2,
+        control_time_s=100,
+        display_cameras=args.display_cameras,
+        reward_classifier_config_file=args.reward_classifier_config_file,
+        reward_classifier_pretrained_path=args.reward_classifier_pretrained_path,
+    )
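`get_classifier` above, the example yaml, and the per-frame `.to("mps")` calls all hard-code the Apple-GPU device. A possible device-agnostic variant, not part of the patch, would read the device from the `device` field that already exists in the yaml and in `ClassifierConfig`; the function name below is hypothetical.

```python
# Sketch only, not the patch's implementation: same loading logic as get_classifier(),
# but the device comes from the config instead of a hard-coded "mps".
import torch

from lerobot.common.policies.factory import _policy_cfg_from_hydra_cfg
from lerobot.common.policies.hilserl.classifier.configuration_classifier import ClassifierConfig
from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier
from lerobot.common.utils.utils import init_hydra_config


def get_classifier_on_device(pretrained_path, config_path):
    if pretrained_path is None or config_path is None:
        return None, None

    cfg = init_hydra_config(config_path)
    classifier_config = _policy_cfg_from_hydra_cfg(ClassifierConfig, cfg)
    classifier_config.num_cameras = len(cfg.training.image_keys)

    device = torch.device(cfg.device)  # "mps", "cuda" or "cpu", taken from the yaml
    model = Classifier(classifier_config)
    model.load_state_dict(Classifier.from_pretrained(pretrained_path).state_dict())
    model.eval()
    return model.to(device), device
```

The rollout loop would then move each `observation[key]` to the returned device instead of `"mps"`.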
diff --git a/lerobot/scripts/train_hilserl_classifier.py b/lerobot/scripts/train_hilserl_classifier.py
index 22ff2957..458e3ff1 100644
--- a/lerobot/scripts/train_hilserl_classifier.py
+++ b/lerobot/scripts/train_hilserl_classifier.py
@@ -22,6 +22,7 @@ from pprint import pformat
 import hydra
 import torch
 import torch.nn as nn
+import wandb
 from deepdiff import DeepDiff
 from omegaconf import DictConfig, OmegaConf
 from termcolor import colored
@@ -30,7 +31,6 @@ from torch.cuda.amp import GradScaler
 from torch.utils.data import DataLoader, WeightedRandomSampler, random_split
 from tqdm import tqdm
 
-import wandb
 from lerobot.common.datasets.factory import resolve_delta_timestamps
 from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
 from lerobot.common.logger import Logger
@@ -79,7 +79,7 @@ def train_epoch(model, train_loader, criterion, optimizer, grad_scaler, device,
     pbar = tqdm(train_loader, desc="Training")
     for batch_idx, batch in enumerate(pbar):
         start_time = time.perf_counter()
-        images = batch[cfg.training.image_key].to(device)
+        images = [batch[img_key].to(device) for img_key in cfg.training.image_keys]
         labels = batch[cfg.training.label_key].float().to(device)
 
         # Forward pass with optional AMP
@@ -130,7 +130,7 @@ def validate(model, val_loader, criterion, device, logger, cfg, num_samples_to_l
         torch.autocast(device_type=device.type) if support_amp(device, cfg) else nullcontext(),
     ):
         for batch in tqdm(val_loader, desc="Validation"):
-            images = batch[cfg.training.image_key].to(device)
+            images = [batch[img_key].to(device) for img_key in cfg.training.image_keys]
             labels = batch[cfg.training.label_key].float().to(device)
 
             outputs = model(images)
@@ -163,6 +163,7 @@ def validate(model, val_loader, criterion, device, logger, cfg, num_samples_to_l
     accuracy = 100 * correct / total
     avg_loss = running_loss / len(val_loader)
+    print(f"Average validation loss {avg_loss}, and accuracy {accuracy}")
 
     eval_info = {
         "loss": avg_loss,
diff --git a/tests/test_train_hilserl_classifier.py b/tests/test_train_hilserl_classifier.py
index c1d854ac..8c1ad453 100644
--- a/tests/test_train_hilserl_classifier.py
+++ b/tests/test_train_hilserl_classifier.py
@@ -33,7 +33,9 @@ class MockDataset(Dataset):
 
 
 def make_dummy_model():
-    model_config = ClassifierConfig(num_classes=2, model_name="hf-tiny-model-private/tiny-random-ResNetModel")
+    model_config = ClassifierConfig(
+        num_classes=2, model_name="hf-tiny-model-private/tiny-random-ResNetModel", num_cameras=1
+    )
     model = Classifier(config=model_config)
     return model
 
@@ -88,7 +90,7 @@ def test_train_epoch():
     logger = MagicMock()
     step = 0
     cfg = MagicMock()
-    cfg.training.image_key = "image"
+    cfg.training.image_keys = ["image"]
     cfg.training.label_key = "label"
     cfg.training.use_amp = False
 
@@ -130,7 +132,7 @@ def test_validate():
     device = torch.device("cpu")
     logger = MagicMock()
     cfg = MagicMock()
-    cfg.training.image_key = "image"
+    cfg.training.image_keys = ["image"]
     cfg.training.label_key = "label"
     cfg.training.use_amp = False
 
@@ -145,6 +147,57 @@
     assert isinstance(eval_info, dict)
 
 
+def test_train_epoch_multiple_cameras():
+    model_config = ClassifierConfig(
+        num_classes=2, model_name="hf-tiny-model-private/tiny-random-ResNetModel", num_cameras=2
+    )
+    model = Classifier(config=model_config)
+
+    # Mock components
+    model.train = MagicMock()
+
+    train_loader = [
+        {
+            "image_1": torch.rand(2, 3, 224, 224),
+            "image_2": torch.rand(2, 3, 224, 224),
+            "label": torch.tensor([0.0, 1.0]),
+        }
+    ]
+
+    criterion = nn.BCEWithLogitsLoss()
+    optimizer = MagicMock()
+    grad_scaler = MagicMock()
+    device = torch.device("cpu")
+    logger = MagicMock()
+    step = 0
+    cfg = MagicMock()
+    cfg.training.image_keys = ["image_1", "image_2"]
+    cfg.training.label_key = "label"
+    cfg.training.use_amp = False
+
+    # Call the function under test
+    train_epoch(
+        model,
+        train_loader,
+        criterion,
+        optimizer,
+        grad_scaler,
+        device,
+        logger,
+        step,
+        cfg,
+    )
+
+    # Check that model.train() was called
+    model.train.assert_called_once()
+
+    # Check that optimizer.zero_grad() was called
+    optimizer.zero_grad.assert_called()
+
+    # Check that logger.log_dict was called
+    logger.log_dict.assert_called()
+
+
 @pytest.mark.parametrize("resume", [True, False])
 @patch("lerobot.scripts.train_hilserl_classifier.init_hydra_config")
 @patch("lerobot.scripts.train_hilserl_classifier.Logger.get_last_checkpoint_dir")
@@ -179,7 +232,7 @@ def test_resume_function(
         "train_split_proportion=0.8",
         "training.num_workers=0",
         "training.batch_size=2",
-        "training.image_key=image",
+        "training.image_keys=[image]",
         "training.label_key=label",
         "training.use_amp=False",
         "training.num_epochs=1",