diff --git a/lerobot/configs/env/aloha2_real.yaml b/lerobot/configs/env/aloha_real.yaml
similarity index 86%
rename from lerobot/configs/env/aloha2_real.yaml
rename to lerobot/configs/env/aloha_real.yaml
index 3053fc01..088781d4 100644
--- a/lerobot/configs/env/aloha2_real.yaml
+++ b/lerobot/configs/env/aloha_real.yaml
@@ -4,7 +4,7 @@ fps: 30
 
 env:
   name: dora
-  task: DoraAloha2-v0
+  task: DoraAloha-v0
   state_dim: 14
   action_dim: 14
   fps: ${fps}
diff --git a/lerobot/configs/policy/act_real.yaml b/lerobot/configs/policy/act_real.yaml
index 684085ec..f39a03e9 100644
--- a/lerobot/configs/policy/act_real.yaml
+++ b/lerobot/configs/policy/act_real.yaml
@@ -1,7 +1,21 @@
 # @package _global_
 
+# Use `act_real.yaml` to train on real-world Aloha/Aloha2 datasets.
+# Compared to `act.yaml`, it contains 4 cameras (i.e. cam_right_wrist, cam_left_wrist, cam_high,
+# cam_low) instead of 1 camera (i.e. top). Also, `training.eval_freq` is set to -1. This parameter controls
+# how often checkpoints are evaluated during training; setting it to -1 deactivates evaluation.
+# This is because real-world evaluation is done through [dora-lerobot](https://github.com/dora-rs/dora-lerobot).
+# Look at its README for more information on how to evaluate a checkpoint in the real world.
+#
+# Example of usage for training:
+# ```bash
+# python lerobot/scripts/train.py \
+#   policy=act_real \
+#   env=aloha_real
+# ```
+
 seed: 1000
-dataset_repo_id: cadene/wrist_gripper
+dataset_repo_id: lerobot/aloha_static_vinh_cup
 
 override_dataset_stats:
   observation.images.cam_right_wrist:
diff --git a/lerobot/configs/policy/act_real_no_state.yaml b/lerobot/configs/policy/act_real_no_state.yaml
new file mode 100644
index 00000000..5adee434
--- /dev/null
+++ b/lerobot/configs/policy/act_real_no_state.yaml
@@ -0,0 +1,111 @@
+# @package _global_
+
+# Use `act_real_no_state.yaml` to train on real-world Aloha/Aloha2 datasets when cameras are moving (e.g. wrist cameras).
+# Compared to `act_real.yaml`, it is camera-only and does not use the state (a vector of robot joint positions) as input.
+# We validated experimentally that not using the state achieves a better success rate. Our hypothesis is that `act_real.yaml`
+# might overfit to the state, because the images are harder to learn from since the cameras are moving.
+#
+# Example of usage for training:
+# ```bash
+# python lerobot/scripts/train.py \
+#   policy=act_real_no_state \
+#   env=aloha_real
+# ```
+
+seed: 1000
+dataset_repo_id: lerobot/aloha_static_vinh_cup
+
+override_dataset_stats:
+  observation.images.cam_right_wrist:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
+  observation.images.cam_left_wrist:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
+  observation.images.cam_high:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
+  observation.images.cam_low:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
+
+training:
+  offline_steps: 80000
+  online_steps: 0
+  eval_freq: -1
+  save_freq: 10000
+  log_freq: 100
+  save_checkpoint: true
+
+  batch_size: 8
+  lr: 1e-5
+  lr_backbone: 1e-5
+  weight_decay: 1e-4
+  grad_clip_norm: 10
+  online_steps_between_rollouts: 1
+
+  delta_timestamps:
+    action: "[i / ${fps} for i in range(${policy.chunk_size})]"
+
+eval:
+  n_episodes: 50
+  batch_size: 50
+
+# See `configuration_act.py` for more details.
+policy:
+  name: act
+
+  # Input / output structure.
+  n_obs_steps: 1
+  chunk_size: 100 # chunk_size
+  n_action_steps: 100
+
+  input_shapes:
+    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
+    observation.images.cam_right_wrist: [3, 480, 640]
+    observation.images.cam_left_wrist: [3, 480, 640]
+    observation.images.cam_high: [3, 480, 640]
+    observation.images.cam_low: [3, 480, 640]
+  output_shapes:
+    action: ["${env.action_dim}"]
+
+  # Normalization / Unnormalization
+  input_normalization_modes:
+    observation.images.cam_right_wrist: mean_std
+    observation.images.cam_left_wrist: mean_std
+    observation.images.cam_high: mean_std
+    observation.images.cam_low: mean_std
+  output_normalization_modes:
+    action: mean_std
+
+  # Architecture.
+  # Vision backbone.
+  vision_backbone: resnet18
+  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
+  replace_final_stride_with_dilation: false
+  # Transformer layers.
+  pre_norm: false
+  dim_model: 512
+  n_heads: 8
+  dim_feedforward: 3200
+  feedforward_activation: relu
+  n_encoder_layers: 4
+  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
+  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
+  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
+  n_decoder_layers: 1
+  # VAE.
+  use_vae: true
+  latent_dim: 32
+  n_vae_encoder_layers: 4
+
+  # Inference.
+  temporal_ensemble_momentum: null
+
+  # Training and loss computation.
+  dropout: 0.1
+  kl_weight: 10.0
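Below is a short illustrative sketch (not part of the diff and not lerobot code) of how two values in `act_real_no_state.yaml` resolve at runtime: the `delta_timestamps.action` expression after Hydra interpolation, and the `mean_std` image normalization with the ImageNet stats from `override_dataset_stats`. Variable names are placeholders, and the sketch assumes `fps: 30` and `policy.chunk_size: 100` as set in the config.

```python
# Illustration only: what `delta_timestamps.action` and `mean_std` normalization
# amount to, assuming fps=30 and policy.chunk_size=100 from this config.
import torch

fps = 30
chunk_size = 100

# "[i / ${fps} for i in range(${policy.chunk_size})]" resolves to 100 action
# timestamps (in seconds) relative to the current observation: 0, 1/30, ..., 99/30.
action_delta_timestamps = [i / fps for i in range(chunk_size)]
assert len(action_delta_timestamps) == chunk_size
assert abs(action_delta_timestamps[-1] - 99 / 30) < 1e-9

# Per-channel mean_std normalization with the ImageNet stats, applied to a
# dummy (c, h, w) camera frame matching the config's [3, 480, 640] input shape.
mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
frame = torch.rand(3, 480, 640)  # pixel values in [0, 1]
normalized = (frame - mean) / std
```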