From 2b928eedd4bf88b2d84efe58c8cbdb5b7d1390bc Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Tue, 2 Apr 2024 19:11:53 +0100 Subject: [PATCH 01/25] backup wip --- lerobot/common/datasets/factory.py | 26 ++++ lerobot/common/policies/act/detr_vae.py | 81 ++++++---- lerobot/common/policies/act/policy.py | 40 ++++- lerobot/common/policies/factory.py | 2 +- lerobot/configs/policy/act.yaml | 3 +- lerobot/scripts/train.py | 1 + poetry.lock | 197 +++++++++++++++++++++++- pyproject.toml | 1 + 8 files changed, 314 insertions(+), 37 deletions(-) diff --git a/lerobot/common/datasets/factory.py b/lerobot/common/datasets/factory.py index 04077034..47a15ea4 100644 --- a/lerobot/common/datasets/factory.py +++ b/lerobot/common/datasets/factory.py @@ -125,6 +125,32 @@ def make_offline_buffer( # TODO(rcadene): remove this and put it in config. Ideally we want to reproduce SOTA results just with mean_std normalization_mode = "mean_std" if cfg.env.name == "aloha" else "min_max" + + # TODO(now): These stats are needed to use their pretrained model for sim_transfer_cube_human. + # (Pdb) stats['observation']['state']['mean'] + # tensor([-0.0071, -0.6293, 1.0351, -0.0517, -0.4642, -0.0754, 0.4751, -0.0373, + # -0.3324, 0.9034, -0.2258, -0.3127, -0.2412, 0.6866]) + stats['observation', 'state', 'mean'] = torch.tensor([-0.00740268, -0.63187766, 1.0356655 , -0.05027218, -0.46199223, + -0.07467502, 0.47467607, -0.03615446, -0.33203387, 0.9038929 , + -0.22060776, -0.31011587, -0.23484458, 0.6842416 ]) + # (Pdb) stats['observation']['state']['std'] + # tensor([0.0022, 0.0520, 0.0291, 0.0092, 0.0267, 0.0145, 0.0563, 0.0179, 0.0494, + # 0.0326, 0.0476, 0.0535, 0.0956, 0.0513]) + stats['observation', 'state', 'std'] = torch.tensor([0.01219023, 0.2975381 , 0.16728032, 0.04733803, 0.1486037 , + 0.08788499, 0.31752336, 0.1049916 , 0.27933604, 0.18094037, + 0.26604933, 0.30466506, 0.5298686 , 0.25505227]) + # (Pdb) stats['action']['mean'] + # tensor([-0.0075, -0.6346, 1.0353, -0.0465, -0.4686, -0.0738, 0.3723, -0.0396, + # -0.3184, 0.8991, -0.2065, -0.3182, -0.2338, 0.5593]) + stats['action']['mean'] = torch.tensor([-0.00756444, -0.6281845 , 1.0312834 , -0.04664314, -0.47211358, + -0.074527 , 0.37389806, -0.03718753, -0.3261143 , 0.8997205 , + -0.21371077, -0.31840396, -0.23360962, 0.551947]) + # (Pdb) stats['action']['std'] + # tensor([0.0023, 0.0514, 0.0290, 0.0086, 0.0263, 0.0143, 0.0593, 0.0185, 0.0510, + # 0.0328, 0.0478, 0.0531, 0.0945, 0.0794]) + stats['action']['std'] = torch.tensor([0.01252818, 0.2957442 , 0.16701928, 0.04584508, 0.14833844, + 0.08763024, 0.30665937, 0.10600077, 0.27572668, 0.1805853 , + 0.26304692, 0.30708534, 0.5305411 , 0.38381037]) transforms.append(NormalizeTransform(stats, in_keys, mode=normalization_mode)) offline_buffer.set_transform(transforms) diff --git a/lerobot/common/policies/act/detr_vae.py b/lerobot/common/policies/act/detr_vae.py index 0f2626f7..4d5525f2 100644 --- a/lerobot/common/policies/act/detr_vae.py +++ b/lerobot/common/policies/act/detr_vae.py @@ -2,6 +2,7 @@ import numpy as np import torch from torch import nn from torch.autograd import Variable +from transformers import DetrForObjectDetection from .backbone import build_backbone from .transformer import TransformerEncoder, TransformerEncoderLayer, build_transformer @@ -24,31 +25,57 @@ def get_sinusoid_encoding_table(n_position, d_hid): return torch.FloatTensor(sinusoid_table).unsqueeze(0) -class DETRVAE(nn.Module): - """This is the DETR module that performs object detection""" +class 
ActionChunkingTransformer(nn.Module): + """ + Action Chunking Transformer as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware + (https://arxiv.org/abs/2304.13705). + + Note: In this code we use the symbols `vae_encoder`, 'encoder', `decoder`. The meanings are as follows. + - The `vae_encoder` is, as per the literature around conditional variational auto-encoders (cVAE), the + part of the model that encodes the target data (here, a sequence of actions), and the condition + (here, we include the robot joint-space state as an input to the encoder). + - The `transformer` is the cVAE's decoder. But since we have an option to train this model without the + variational objective (in which case we drop the `vae_encoder` altogether), we don't call it the + `vae_decoder`. + # TODO(now): remove the following + - The `encoder` is actually a component of the cVAE's "decoder". But we refer to it as an "encoder" + because, in terms of the transformer with cross-attention that forms the cVAE's decoder, it is the + "encoder" part. We drop the `vae_` prefix because we have an option to train this model without the + variational objective (in which case we drop the `vae_encoder` altogether), and nothing about this + model has anything to do with a VAE). + - The `decoder` is a building block of the VAE decoder, and is just the "decoder" part of a + transformer with cross-attention. For the same reasoning behind the naming of `encoder`, we make + this term agnostic to the option to use a variational objective for training. + + """ def __init__( - self, backbones, transformer, encoder, state_dim, action_dim, num_queries, camera_names, vae + self, backbones, transformer, vae_encoder, state_dim, action_dim, horizon, camera_names, vae ): """Initializes the model. Parameters: backbones: torch module of the backbone to be used. See backbone.py transformer: torch module of the transformer architecture. See transformer.py state_dim: robot state dimension of the environment - num_queries: number of object queries, ie detection slot. This is the maximal number of objects + horizon: number of object queries, ie detection slot. This is the maximal number of objects DETR can detect in a single image. For COCO, we recommend 100 queries. - aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. + + Args: + state_dim: Robot positional state dimension. + action_dim: Action dimension. + horizon: The number of actions to generate in one forward pass. + vae: Whether to use the variational objective. TODO(now): Give more details. 
""" super().__init__() - self.num_queries = num_queries self.camera_names = camera_names self.transformer = transformer - self.encoder = encoder + self.vae_encoder = vae_encoder self.vae = vae hidden_dim = transformer.d_model self.action_head = nn.Linear(hidden_dim, action_dim) self.is_pad_head = nn.Linear(hidden_dim, 1) - self.query_embed = nn.Embedding(num_queries, hidden_dim) + # Positional embedding to be used as input to the latent vae_encoder (if applicable) and for the + self.pos_embed = nn.Embedding(horizon, hidden_dim) if backbones is not None: self.input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1) self.backbones = nn.ModuleList(backbones) @@ -61,16 +88,16 @@ class DETRVAE(nn.Module): self.pos = torch.nn.Embedding(2, hidden_dim) self.backbones = None - # encoder extra parameters + # vae_encoder extra parameters self.latent_dim = 32 # final size of latent z # TODO tune self.cls_embed = nn.Embedding(1, hidden_dim) # extra cls token embedding - self.encoder_action_proj = nn.Linear(14, hidden_dim) # project action to embedding - self.encoder_joint_proj = nn.Linear(14, hidden_dim) # project qpos to embedding + self.vae_encoder_action_proj = nn.Linear(14, hidden_dim) # project action to embedding + self.vae_encoder_joint_proj = nn.Linear(14, hidden_dim) # project qpos to embedding self.latent_proj = nn.Linear( hidden_dim, self.latent_dim * 2 ) # project hidden state to latent std, var self.register_buffer( - "pos_table", get_sinusoid_encoding_table(1 + 1 + num_queries, hidden_dim) + "pos_table", get_sinusoid_encoding_table(1 + 1 + horizon, hidden_dim) ) # [CLS], qpos, a_seq # decoder extra parameters @@ -91,15 +118,15 @@ class DETRVAE(nn.Module): ### Obtain latent z from action sequence if self.vae and is_training: # project action sequence to embedding dim, and concat with a CLS token - action_embed = self.encoder_action_proj(actions) # (bs, seq, hidden_dim) - qpos_embed = self.encoder_joint_proj(qpos) # (bs, hidden_dim) + action_embed = self.vae_encoder_action_proj(actions) # (bs, seq, hidden_dim) + qpos_embed = self.vae_encoder_joint_proj(qpos) # (bs, hidden_dim) qpos_embed = torch.unsqueeze(qpos_embed, axis=1) # (bs, 1, hidden_dim) cls_embed = self.cls_embed.weight # (1, hidden_dim) cls_embed = torch.unsqueeze(cls_embed, axis=0).repeat(bs, 1, 1) # (bs, 1, hidden_dim) - encoder_input = torch.cat( + vae_encoder_input = torch.cat( [cls_embed, qpos_embed, action_embed], axis=1 ) # (bs, seq+1, hidden_dim) - encoder_input = encoder_input.permute(1, 0, 2) # (seq+1, bs, hidden_dim) + vae_encoder_input = vae_encoder_input.permute(1, 0, 2) # (seq+1, bs, hidden_dim) # do not mask cls token # cls_joint_is_pad = torch.full((bs, 2), False).to(qpos.device) # False: not a padding # is_pad = torch.cat([cls_joint_is_pad, is_pad], axis=1) # (bs, seq+1) @@ -107,9 +134,9 @@ class DETRVAE(nn.Module): pos_embed = self.pos_table.clone().detach() pos_embed = pos_embed.permute(1, 0, 2) # (seq+1, 1, hidden_dim) # query model - encoder_output = self.encoder(encoder_input, pos=pos_embed) # , src_key_padding_mask=is_pad) - encoder_output = encoder_output[0] # take cls output only - latent_info = self.latent_proj(encoder_output) + vae_encoder_output = self.vae_encoder(vae_encoder_input, pos=pos_embed) # , src_key_padding_mask=is_pad) + vae_encoder_output = vae_encoder_output[0] # take cls output only + latent_info = self.latent_proj(vae_encoder_output) mu = latent_info[:, : self.latent_dim] logvar = latent_info[:, self.latent_dim :] latent_sample = reparametrize(mu, logvar) @@ -137,7 
+164,7 @@ class DETRVAE(nn.Module): hs = self.transformer( src, None, - self.query_embed.weight, + self.pos_embed.weight, pos, latent_input, proprio_input, @@ -147,7 +174,7 @@ class DETRVAE(nn.Module): qpos = self.input_proj_robot_state(qpos) env_state = self.input_proj_env_state(env_state) transformer_input = torch.cat([qpos, env_state], axis=1) # seq length = 2 - hs = self.transformer(transformer_input, None, self.query_embed.weight, self.pos.weight)[0] + hs = self.transformer(transformer_input, None, self.pos_embed.weight, self.pos.weight)[0] a_hat = self.action_head(hs) is_pad_hat = self.is_pad_head(hs) return a_hat, is_pad_hat, [mu, logvar] @@ -165,7 +192,7 @@ def mlp(input_dim, hidden_dim, output_dim, hidden_depth): return trunk -def build_encoder(args): +def build_vae_encoder(args): d_model = args.hidden_dim # 256 dropout = args.dropout # 0.1 nhead = args.nheads # 8 @@ -192,16 +219,16 @@ def build(args): backbones.append(backbone) transformer = build_transformer(args) + + vae_encoder = build_vae_encoder(args) - encoder = build_encoder(args) - - model = DETRVAE( + model = ActionChunkingTransformer( backbones, transformer, - encoder, + vae_encoder, state_dim=args.state_dim, action_dim=args.action_dim, - num_queries=args.num_queries, + horizon=args.num_queries, camera_names=args.camera_names, vae=args.vae, ) diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index ae4f7320..a88f7640 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -42,9 +42,28 @@ def kl_divergence(mu, logvar): class ActionChunkingTransformerPolicy(AbstractPolicy): + """ + Action Chunking Transformer as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware + (https://arxiv.org/abs/2304.13705). + """ + name = "act" def __init__(self, cfg, device, n_action_steps=1): + """ + Args: + vae: Whether to use the variational objective. TODO(now): Give more details. + temporal_agg: Whether to do temporal aggregation. For each timestep during rollout, the action + returned as an exponential moving average of previously generated actions for that timestep. + n_obs_steps: Number of time steps worth of observation to use as input. + horizon: The number of actions to generate in one forward pass. + kl_weight: Weight for KL divergence. Defaults to None. Only applicable when using the variational + objective. + batch_size: Training batch size. + grad_clip_norm: Optionally clip the gradients to have this value as the norm at most. Defaults to + None meaning gradient clipping is not applied. + lr: Learning rate. 
+ """ super().__init__(n_action_steps) self.cfg = cfg self.n_action_steps = n_action_steps @@ -57,8 +76,6 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): def update(self, replay_buffer, step): del step - start_time = time.time() - self.train() num_slices = self.cfg.batch_size @@ -103,11 +120,14 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): "action": action.to(self.device, non_blocking=True), } return out + + start_time = time.time() batch = replay_buffer.sample(batch_size) batch = process_batch(batch, self.cfg.horizon, num_slices) data_s = time.time() - start_time + print(data_s) loss = self.compute_loss(batch) loss.backward() @@ -151,9 +171,6 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): @torch.no_grad() def select_actions(self, observation, step_count): - if observation["image"].shape[0] != 1: - raise NotImplementedError("Batch size > 1 not handled") - # TODO(rcadene): remove unused step_count del step_count @@ -167,7 +184,17 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): "image": observation["image", "top"], "agent_pos": observation["state"], } - action = self._forward(qpos=obs_dict["agent_pos"], image=obs_dict["image"]) + # qpos = obs_dict["agent_pos"] + # img = obs_dict["image"] + # qpos_ = torch.load('/tmp/qpos.pth') + # img_ = torch.load('/tmp/curr_image.pth') + # out_ = torch.load('/tmp/out.pth') + # import cv2, numpy as np + # cv2.imwrite("ours.png", (obs_dict["image"][0, 0].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)) + # cv2.imwrite("theirs.png", (img_[0, 0].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)) + # out = self._forward(qpos_, img_) + # breakpoint() + action = self._forward(qpos=obs_dict["agent_pos"] * 0.182, image=obs_dict["image"]) if self.cfg.temporal_agg: # TODO(rcadene): implement temporal aggregation @@ -197,6 +224,7 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): if is_pad is not None: is_pad = is_pad[:, : self.model.num_queries] + breakpoint() a_hat, is_pad_hat, (mu, logvar) = self.model(qpos, image, env_state, actions, is_pad) all_l1 = F.l1_loss(actions, a_hat, reduction="none") diff --git a/lerobot/common/policies/factory.py b/lerobot/common/policies/factory.py index 934f0962..577ccf75 100644 --- a/lerobot/common/policies/factory.py +++ b/lerobot/common/policies/factory.py @@ -1,5 +1,5 @@ def make_policy(cfg): - if cfg.policy.name != "diffusion" and cfg.rollout_batch_size > 1: + if cfg.policy.name not in ["diffusion", "act"] and cfg.rollout_batch_size > 1: raise NotImplementedError("Only diffusion policy supports rollout_batch_size > 1 for the time being.") if cfg.policy.name == "tdmpc": diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml index a52c3f54..0244944b 100644 --- a/lerobot/configs/policy/act.yaml +++ b/lerobot/configs/policy/act.yaml @@ -1,6 +1,6 @@ # @package _global_ -offline_steps: 1344000 +offline_steps: 2000 online_steps: 0 eval_episodes: 1 @@ -24,7 +24,6 @@ policy: weight_decay: 1e-4 grad_clip_norm: 10 backbone: resnet18 - num_queries: ${horizon} # chunk_size horizon: ${horizon} # chunk_size kl_weight: 10 hidden_dim: 512 diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 18c3715b..454adf1a 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -151,6 +151,7 @@ def train(cfg: dict, out_dir=None, job_name=None): logging.info("make_policy") policy = make_policy(cfg) + policy.save("act.pt") num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad) num_total_params = sum(p.numel() for 
p in policy.parameters()) diff --git a/poetry.lock b/poetry.lock index 72397001..9766051c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3248,6 +3248,133 @@ numpy = "*" [package.extras] all = ["defusedxml", "fsspec", "imagecodecs (>=2023.8.12)", "lxml", "matplotlib", "zarr"] +[[package]] +name = "tokenizers" +version = "0.15.2" +description = "" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tokenizers-0.15.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:52f6130c9cbf70544287575a985bf44ae1bda2da7e8c24e97716080593638012"}, + {file = "tokenizers-0.15.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:054c1cc9c6d68f7ffa4e810b3d5131e0ba511b6e4be34157aa08ee54c2f8d9ee"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a9b9b070fdad06e347563b88c278995735292ded1132f8657084989a4c84a6d5"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea621a7eef4b70e1f7a4e84dd989ae3f0eeb50fc8690254eacc08acb623e82f1"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cf7fd9a5141634fa3aa8d6b7be362e6ae1b4cda60da81388fa533e0b552c98fd"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44f2a832cd0825295f7179eaf173381dc45230f9227ec4b44378322d900447c9"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b9ec69247a23747669ec4b0ca10f8e3dfb3545d550258129bd62291aabe8605"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40b6a4c78da863ff26dbd5ad9a8ecc33d8a8d97b535172601cf00aee9d7ce9ce"}, + {file = "tokenizers-0.15.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5ab2a4d21dcf76af60e05af8063138849eb1d6553a0d059f6534357bce8ba364"}, + {file = "tokenizers-0.15.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a47acfac7e511f6bbfcf2d3fb8c26979c780a91e06fb5b9a43831b2c0153d024"}, + {file = "tokenizers-0.15.2-cp310-none-win32.whl", hash = "sha256:064ff87bb6acdbd693666de9a4b692add41308a2c0ec0770d6385737117215f2"}, + {file = "tokenizers-0.15.2-cp310-none-win_amd64.whl", hash = "sha256:3b919afe4df7eb6ac7cafd2bd14fb507d3f408db7a68c43117f579c984a73843"}, + {file = "tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:89cd1cb93e4b12ff39bb2d626ad77e35209de9309a71e4d3d4672667b4b256e7"}, + {file = "tokenizers-0.15.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cfed5c64e5be23d7ee0f0e98081a25c2a46b0b77ce99a4f0605b1ec43dd481fa"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a907d76dcfda37023ba203ab4ceeb21bc5683436ebefbd895a0841fd52f6f6f2"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20ea60479de6fc7b8ae756b4b097572372d7e4032e2521c1bbf3d90c90a99ff0"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:48e2b9335be2bc0171df9281385c2ed06a15f5cf121c44094338306ab7b33f2c"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:112a1dd436d2cc06e6ffdc0b06d55ac019a35a63afd26475205cb4b1bf0bfbff"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4620cca5c2817177ee8706f860364cc3a8845bc1e291aaf661fb899e5d1c45b0"}, + {file = 
"tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccd73a82751c523b3fc31ff8194702e4af4db21dc20e55b30ecc2079c5d43cb7"}, + {file = "tokenizers-0.15.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:107089f135b4ae7817affe6264f8c7a5c5b4fd9a90f9439ed495f54fcea56fb4"}, + {file = "tokenizers-0.15.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0ff110ecc57b7aa4a594396525a3451ad70988e517237fe91c540997c4e50e29"}, + {file = "tokenizers-0.15.2-cp311-none-win32.whl", hash = "sha256:6d76f00f5c32da36c61f41c58346a4fa7f0a61be02f4301fd30ad59834977cc3"}, + {file = "tokenizers-0.15.2-cp311-none-win_amd64.whl", hash = "sha256:cc90102ed17271cf0a1262babe5939e0134b3890345d11a19c3145184b706055"}, + {file = "tokenizers-0.15.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f86593c18d2e6248e72fb91c77d413a815153b8ea4e31f7cd443bdf28e467670"}, + {file = "tokenizers-0.15.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0774bccc6608eca23eb9d620196687c8b2360624619623cf4ba9dc9bd53e8b51"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d0222c5b7c9b26c0b4822a82f6a7011de0a9d3060e1da176f66274b70f846b98"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3835738be1de66624fff2f4f6f6684775da4e9c00bde053be7564cbf3545cc66"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0143e7d9dcd811855c1ce1ab9bf5d96d29bf5e528fd6c7824d0465741e8c10fd"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db35825f6d54215f6b6009a7ff3eedee0848c99a6271c870d2826fbbedf31a38"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f5e64b0389a2be47091d8cc53c87859783b837ea1a06edd9d8e04004df55a5c"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e0480c452217edd35eca56fafe2029fb4d368b7c0475f8dfa3c5c9c400a7456"}, + {file = "tokenizers-0.15.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a33ab881c8fe70474980577e033d0bc9a27b7ab8272896e500708b212995d834"}, + {file = "tokenizers-0.15.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a308a607ca9de2c64c1b9ba79ec9a403969715a1b8ba5f998a676826f1a7039d"}, + {file = "tokenizers-0.15.2-cp312-none-win32.whl", hash = "sha256:b8fcfa81bcb9447df582c5bc96a031e6df4da2a774b8080d4f02c0c16b42be0b"}, + {file = "tokenizers-0.15.2-cp312-none-win_amd64.whl", hash = "sha256:38d7ab43c6825abfc0b661d95f39c7f8af2449364f01d331f3b51c94dcff7221"}, + {file = "tokenizers-0.15.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:38bfb0204ff3246ca4d5e726e8cc8403bfc931090151e6eede54d0e0cf162ef0"}, + {file = "tokenizers-0.15.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9c861d35e8286a53e06e9e28d030b5a05bcbf5ac9d7229e561e53c352a85b1fc"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:936bf3842db5b2048eaa53dade907b1160f318e7c90c74bfab86f1e47720bdd6"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:620beacc3373277700d0e27718aa8b25f7b383eb8001fba94ee00aeea1459d89"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2735ecbbf37e52db4ea970e539fd2d450d213517b77745114f92867f3fc246eb"}, + {file = 
"tokenizers-0.15.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:473c83c5e2359bb81b0b6fde870b41b2764fcdd36d997485e07e72cc3a62264a"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:968fa1fb3c27398b28a4eca1cbd1e19355c4d3a6007f7398d48826bbe3a0f728"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:865c60ae6eaebdde7da66191ee9b7db52e542ed8ee9d2c653b6d190a9351b980"}, + {file = "tokenizers-0.15.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7c0d8b52664ab2d4a8d6686eb5effc68b78608a9008f086a122a7b2996befbab"}, + {file = "tokenizers-0.15.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:f33dfbdec3784093a9aebb3680d1f91336c56d86cc70ddf88708251da1fe9064"}, + {file = "tokenizers-0.15.2-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:d44ba80988ff9424e33e0a49445072ac7029d8c0e1601ad25a0ca5f41ed0c1d6"}, + {file = "tokenizers-0.15.2-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:dce74266919b892f82b1b86025a613956ea0ea62a4843d4c4237be2c5498ed3a"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0ef06b9707baeb98b316577acb04f4852239d856b93e9ec3a299622f6084e4be"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c73e2e74bbb07910da0d37c326869f34113137b23eadad3fc00856e6b3d9930c"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4eeb12daf02a59e29f578a865f55d87cd103ce62bd8a3a5874f8fdeaa82e336b"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ba9f6895af58487ca4f54e8a664a322f16c26bbb442effd01087eba391a719e"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ccec77aa7150e38eec6878a493bf8c263ff1fa8a62404e16c6203c64c1f16a26"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3f40604f5042ff210ba82743dda2b6aa3e55aa12df4e9f2378ee01a17e2855e"}, + {file = "tokenizers-0.15.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:5645938a42d78c4885086767c70923abad047163d809c16da75d6b290cb30bbe"}, + {file = "tokenizers-0.15.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:05a77cbfebe28a61ab5c3891f9939cc24798b63fa236d84e5f29f3a85a200c00"}, + {file = "tokenizers-0.15.2-cp37-none-win32.whl", hash = "sha256:361abdc068e8afe9c5b818769a48624687fb6aaed49636ee39bec4e95e1a215b"}, + {file = "tokenizers-0.15.2-cp37-none-win_amd64.whl", hash = "sha256:7ef789f83eb0f9baeb4d09a86cd639c0a5518528f9992f38b28e819df397eb06"}, + {file = "tokenizers-0.15.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:4fe1f74a902bee74a3b25aff180fbfbf4f8b444ab37c4d496af7afd13a784ed2"}, + {file = "tokenizers-0.15.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4c4b89038a684f40a6b15d6b09f49650ac64d951ad0f2a3ea9169687bbf2a8ba"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d05a1b06f986d41aed5f2de464c003004b2df8aaf66f2b7628254bcbfb72a438"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:508711a108684111ec8af89d3a9e9e08755247eda27d0ba5e3c50e9da1600f6d"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:daa348f02d15160cb35439098ac96e3a53bacf35885072611cd9e5be7d333daa"}, + {file = 
"tokenizers-0.15.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:494fdbe5932d3416de2a85fc2470b797e6f3226c12845cadf054dd906afd0442"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c2d60f5246f4da9373f75ff18d64c69cbf60c3bca597290cea01059c336d2470"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93268e788825f52de4c7bdcb6ebc1fcd4a5442c02e730faa9b6b08f23ead0e24"}, + {file = "tokenizers-0.15.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6fc7083ab404019fc9acafe78662c192673c1e696bd598d16dc005bd663a5cf9"}, + {file = "tokenizers-0.15.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:41e39b41e5531d6b2122a77532dbea60e171ef87a3820b5a3888daa847df4153"}, + {file = "tokenizers-0.15.2-cp38-none-win32.whl", hash = "sha256:06cd0487b1cbfabefb2cc52fbd6b1f8d4c37799bd6c6e1641281adaa6b2504a7"}, + {file = "tokenizers-0.15.2-cp38-none-win_amd64.whl", hash = "sha256:5179c271aa5de9c71712e31cb5a79e436ecd0d7532a408fa42a8dbfa4bc23fd9"}, + {file = "tokenizers-0.15.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:82f8652a74cc107052328b87ea8b34291c0f55b96d8fb261b3880216a9f9e48e"}, + {file = "tokenizers-0.15.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:02458bee6f5f3139f1ebbb6d042b283af712c0981f5bc50edf771d6b762d5e4f"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c9a09cd26cca2e1c349f91aa665309ddb48d71636370749414fbf67bc83c5343"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:158be8ea8554e5ed69acc1ce3fbb23a06060bd4bbb09029431ad6b9a466a7121"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1ddba9a2b0c8c81633eca0bb2e1aa5b3a15362b1277f1ae64176d0f6eba78ab1"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3ef5dd1d39797044642dbe53eb2bc56435308432e9c7907728da74c69ee2adca"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:454c203164e07a860dbeb3b1f4a733be52b0edbb4dd2e5bd75023ffa8b49403a"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cf6b7f1d4dc59af960e6ffdc4faffe6460bbfa8dce27a58bf75755ffdb2526d"}, + {file = "tokenizers-0.15.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2ef09bbc16519f6c25d0c7fc0c6a33a6f62923e263c9d7cca4e58b8c61572afb"}, + {file = "tokenizers-0.15.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c9a2ebdd2ad4ec7a68e7615086e633857c85e2f18025bd05d2a4399e6c5f7169"}, + {file = "tokenizers-0.15.2-cp39-none-win32.whl", hash = "sha256:918fbb0eab96fe08e72a8c2b5461e9cce95585d82a58688e7f01c2bd546c79d0"}, + {file = "tokenizers-0.15.2-cp39-none-win_amd64.whl", hash = "sha256:524e60da0135e106b254bd71f0659be9f89d83f006ea9093ce4d1fab498c6d0d"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6a9b648a58281c4672212fab04e60648fde574877d0139cd4b4f93fe28ca8944"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:7c7d18b733be6bbca8a55084027f7be428c947ddf871c500ee603e375013ffba"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:13ca3611de8d9ddfbc4dc39ef54ab1d2d4aaa114ac8727dfdc6a6ec4be017378"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash 
= "sha256:237d1bf3361cf2e6463e6c140628e6406766e8b27274f5fcc62c747ae3c6f094"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67a0fe1e49e60c664915e9fb6b0cb19bac082ab1f309188230e4b2920230edb3"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4e022fe65e99230b8fd89ebdfea138c24421f91c1a4f4781a8f5016fd5cdfb4d"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d857be2df69763362ac699f8b251a8cd3fac9d21893de129bc788f8baaef2693"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:708bb3e4283177236309e698da5fcd0879ce8fd37457d7c266d16b550bcbbd18"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:64c35e09e9899b72a76e762f9854e8750213f67567787d45f37ce06daf57ca78"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1257f4394be0d3b00de8c9e840ca5601d0a4a8438361ce9c2b05c7d25f6057b"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02272fe48280e0293a04245ca5d919b2c94a48b408b55e858feae9618138aeda"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:dc3ad9ebc76eabe8b1d7c04d38be884b8f9d60c0cdc09b0aa4e3bcf746de0388"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:32e16bdeffa7c4f46bf2152172ca511808b952701d13e7c18833c0b73cb5c23f"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:fb16ba563d59003028b678d2361a27f7e4ae0ab29c7a80690efa20d829c81fdb"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:2277c36d2d6cdb7876c274547921a42425b6810d38354327dd65a8009acf870c"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:1cf75d32e8d250781940d07f7eece253f2fe9ecdb1dc7ba6e3833fa17b82fcbc"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1b3b31884dc8e9b21508bb76da80ebf7308fdb947a17affce815665d5c4d028"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b10122d8d8e30afb43bb1fe21a3619f62c3e2574bff2699cf8af8b0b6c5dc4a3"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d88b96ff0fe8e91f6ef01ba50b0d71db5017fa4e3b1d99681cec89a85faf7bf7"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:37aaec5a52e959892870a7c47cef80c53797c0db9149d458460f4f31e2fb250e"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e2ea752f2b0fe96eb6e2f3adbbf4d72aaa1272079b0dfa1145507bd6a5d537e6"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:4b19a808d8799fda23504a5cd31d2f58e6f52f140380082b352f877017d6342b"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:64c86e5e068ac8b19204419ed8ca90f9d25db20578f5881e337d203b314f4104"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de19c4dc503c612847edf833c82e9f73cd79926a384af9d801dcf93f110cea4e"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ea09acd2fe3324174063d61ad620dec3bcf042b495515f27f638270a7d466e8b"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cf27fd43472e07b57cf420eee1e814549203d56de00b5af8659cb99885472f1f"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:7ca22bd897537a0080521445d91a58886c8c04084a6a19e6c78c586e0cfa92a5"}, + {file = "tokenizers-0.15.2.tar.gz", hash = "sha256:e6e9c6e019dd5484be5beafc775ae6c925f4c69a3487040ed09b45e13df2cb91"}, +] + +[package.dependencies] +huggingface_hub = ">=0.16.4,<1.0" + +[package.extras] +dev = ["tokenizers[testing]"] +docs = ["setuptools_rust", "sphinx", "sphinx_rtd_theme"] +testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] + [[package]] name = "tomli" version = "2.0.1" @@ -3413,6 +3540,74 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] +[[package]] +name = "transformers" +version = "4.39.3" +description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "transformers-4.39.3-py3-none-any.whl", hash = "sha256:7838034a12cca3168247f9d2d1dba6724c9de3ae0f73a108258c6b8fc5912601"}, + {file = "transformers-4.39.3.tar.gz", hash = "sha256:2586e5ff4150f122716fc40f5530e92871befc051848fbe82600969c535b762d"}, +] + +[package.dependencies] +filelock = "*" +huggingface-hub = ">=0.19.3,<1.0" +numpy = ">=1.17" +packaging = ">=20.0" +pyyaml = ">=5.1" +regex = "!=2019.12.17" +requests = "*" +safetensors = ">=0.4.1" +tokenizers = ">=0.14,<0.19" +tqdm = ">=4.27" + +[package.extras] +accelerate = ["accelerate (>=0.21.0)"] +agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch"] +all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm", "tokenizers (>=0.14,<0.19)", "torch", "torchaudio", "torchvision"] +audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +codecarbon = ["codecarbon (==1.2.0)"] +deepspeed = ["accelerate (>=0.21.0)", "deepspeed (>=0.9.3)"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.21.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", 
"jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.14,<0.19)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.14,<0.19)", "urllib3 (<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm", "tokenizers (>=0.14,<0.19)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +docs = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "hf-doc-builder", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm", "tokenizers (>=0.14,<0.19)", "torch", "torchaudio", "torchvision"] +docs-specific = ["hf-doc-builder"] +flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)"] +flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +ftfy 
= ["ftfy"] +integrations = ["optuna", "ray[tune] (>=2.7.0)", "sigopt"] +ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] +modelcreation = ["cookiecutter (==1.7.3)"] +natten = ["natten (>=0.14.6,<0.15.0)"] +onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"] +onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"] +optuna = ["optuna"] +quality = ["GitPython (<3.1.19)", "datasets (!=2.5.0)", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (==0.1.5)", "urllib3 (<2.0.0)"] +ray = ["ray[tune] (>=2.7.0)"] +retrieval = ["datasets (!=2.5.0)", "faiss-cpu"] +sagemaker = ["sagemaker (>=2.31.0)"] +sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] +serving = ["fastapi", "pydantic", "starlette", "uvicorn"] +sigopt = ["sigopt"] +sklearn = ["scikit-learn"] +speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] +testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "tensorboard", "timeout-decorator"] +tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] +tf-cpu = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] +tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +timm = ["timm"] +tokenizers = ["tokenizers (>=0.14,<0.19)"] +torch = ["accelerate (>=0.21.0)", "torch"] +torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] +torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] +torchhub = ["filelock", "huggingface-hub (>=0.19.3,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.14,<0.19)", "torch", "tqdm (>=4.27)"] +video = ["av (==9.2.0)", "decord (==0.6.0)"] +vision = ["Pillow (>=10.0.1,<=15.0)"] + [[package]] name = "triton" version = "2.2.0" @@ -3589,4 +3784,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "174c7d42f8039eedd2c447a4e6cae5169782cbd94346b5606572a0010194ca05" +content-hash = "5ebd02dac0322efe1236eb9fec84c471edd0c5373cc8967b1982314164b3bf50" diff --git a/pyproject.toml b/pyproject.toml index 972c1b61..b2526e5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ robomimic = "0.2.0" gymnasium-robotics = "^1.2.4" gymnasium = "^0.29.1" cmake = "^3.29.0.1" +transformers = "^4.39.3" [tool.poetry.group.dev.dependencies] From 65ef8c30d03fd5c8904f2f914870447c712387a9 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Tue, 2 Apr 2024 19:13:49 +0100 Subject: [PATCH 02/25] backup wip --- lerobot/common/datasets/factory.py | 86 +++++++++++++++++++++---- lerobot/common/policies/act/detr_vae.py | 9 +-- lerobot/common/policies/act/policy.py | 4 +- 3 files changed, 80 insertions(+), 19 deletions(-) diff --git a/lerobot/common/datasets/factory.py 
b/lerobot/common/datasets/factory.py index 47a15ea4..b394e830 100644 --- a/lerobot/common/datasets/factory.py +++ b/lerobot/common/datasets/factory.py @@ -125,32 +125,92 @@ def make_offline_buffer( # TODO(rcadene): remove this and put it in config. Ideally we want to reproduce SOTA results just with mean_std normalization_mode = "mean_std" if cfg.env.name == "aloha" else "min_max" - + # TODO(now): These stats are needed to use their pretrained model for sim_transfer_cube_human. # (Pdb) stats['observation']['state']['mean'] # tensor([-0.0071, -0.6293, 1.0351, -0.0517, -0.4642, -0.0754, 0.4751, -0.0373, # -0.3324, 0.9034, -0.2258, -0.3127, -0.2412, 0.6866]) - stats['observation', 'state', 'mean'] = torch.tensor([-0.00740268, -0.63187766, 1.0356655 , -0.05027218, -0.46199223, - -0.07467502, 0.47467607, -0.03615446, -0.33203387, 0.9038929 , - -0.22060776, -0.31011587, -0.23484458, 0.6842416 ]) + stats["observation", "state", "mean"] = torch.tensor( + [ + -0.00740268, + -0.63187766, + 1.0356655, + -0.05027218, + -0.46199223, + -0.07467502, + 0.47467607, + -0.03615446, + -0.33203387, + 0.9038929, + -0.22060776, + -0.31011587, + -0.23484458, + 0.6842416, + ] + ) # (Pdb) stats['observation']['state']['std'] # tensor([0.0022, 0.0520, 0.0291, 0.0092, 0.0267, 0.0145, 0.0563, 0.0179, 0.0494, # 0.0326, 0.0476, 0.0535, 0.0956, 0.0513]) - stats['observation', 'state', 'std'] = torch.tensor([0.01219023, 0.2975381 , 0.16728032, 0.04733803, 0.1486037 , - 0.08788499, 0.31752336, 0.1049916 , 0.27933604, 0.18094037, - 0.26604933, 0.30466506, 0.5298686 , 0.25505227]) + stats["observation", "state", "std"] = torch.tensor( + [ + 0.01219023, + 0.2975381, + 0.16728032, + 0.04733803, + 0.1486037, + 0.08788499, + 0.31752336, + 0.1049916, + 0.27933604, + 0.18094037, + 0.26604933, + 0.30466506, + 0.5298686, + 0.25505227, + ] + ) # (Pdb) stats['action']['mean'] # tensor([-0.0075, -0.6346, 1.0353, -0.0465, -0.4686, -0.0738, 0.3723, -0.0396, # -0.3184, 0.8991, -0.2065, -0.3182, -0.2338, 0.5593]) - stats['action']['mean'] = torch.tensor([-0.00756444, -0.6281845 , 1.0312834 , -0.04664314, -0.47211358, - -0.074527 , 0.37389806, -0.03718753, -0.3261143 , 0.8997205 , - -0.21371077, -0.31840396, -0.23360962, 0.551947]) + stats["action"]["mean"] = torch.tensor( + [ + -0.00756444, + -0.6281845, + 1.0312834, + -0.04664314, + -0.47211358, + -0.074527, + 0.37389806, + -0.03718753, + -0.3261143, + 0.8997205, + -0.21371077, + -0.31840396, + -0.23360962, + 0.551947, + ] + ) # (Pdb) stats['action']['std'] # tensor([0.0023, 0.0514, 0.0290, 0.0086, 0.0263, 0.0143, 0.0593, 0.0185, 0.0510, # 0.0328, 0.0478, 0.0531, 0.0945, 0.0794]) - stats['action']['std'] = torch.tensor([0.01252818, 0.2957442 , 0.16701928, 0.04584508, 0.14833844, - 0.08763024, 0.30665937, 0.10600077, 0.27572668, 0.1805853 , - 0.26304692, 0.30708534, 0.5305411 , 0.38381037]) + stats["action"]["std"] = torch.tensor( + [ + 0.01252818, + 0.2957442, + 0.16701928, + 0.04584508, + 0.14833844, + 0.08763024, + 0.30665937, + 0.10600077, + 0.27572668, + 0.1805853, + 0.26304692, + 0.30708534, + 0.5305411, + 0.38381037, + ] + ) transforms.append(NormalizeTransform(stats, in_keys, mode=normalization_mode)) offline_buffer.set_transform(transforms) diff --git a/lerobot/common/policies/act/detr_vae.py b/lerobot/common/policies/act/detr_vae.py index 4d5525f2..f21308ad 100644 --- a/lerobot/common/policies/act/detr_vae.py +++ b/lerobot/common/policies/act/detr_vae.py @@ -2,7 +2,6 @@ import numpy as np import torch from torch import nn from torch.autograd import Variable -from transformers 
import DetrForObjectDetection from .backbone import build_backbone from .transformer import TransformerEncoder, TransformerEncoderLayer, build_transformer @@ -74,7 +73,7 @@ class ActionChunkingTransformer(nn.Module): hidden_dim = transformer.d_model self.action_head = nn.Linear(hidden_dim, action_dim) self.is_pad_head = nn.Linear(hidden_dim, 1) - # Positional embedding to be used as input to the latent vae_encoder (if applicable) and for the + # Positional embedding to be used as input to the latent vae_encoder (if applicable) and for the self.pos_embed = nn.Embedding(horizon, hidden_dim) if backbones is not None: self.input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1) @@ -134,7 +133,9 @@ class ActionChunkingTransformer(nn.Module): pos_embed = self.pos_table.clone().detach() pos_embed = pos_embed.permute(1, 0, 2) # (seq+1, 1, hidden_dim) # query model - vae_encoder_output = self.vae_encoder(vae_encoder_input, pos=pos_embed) # , src_key_padding_mask=is_pad) + vae_encoder_output = self.vae_encoder( + vae_encoder_input, pos=pos_embed + ) # , src_key_padding_mask=is_pad) vae_encoder_output = vae_encoder_output[0] # take cls output only latent_info = self.latent_proj(vae_encoder_output) mu = latent_info[:, : self.latent_dim] @@ -219,7 +220,7 @@ def build(args): backbones.append(backbone) transformer = build_transformer(args) - + vae_encoder = build_vae_encoder(args) model = ActionChunkingTransformer( diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index a88f7640..5cf74ae5 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -54,7 +54,7 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): Args: vae: Whether to use the variational objective. TODO(now): Give more details. temporal_agg: Whether to do temporal aggregation. For each timestep during rollout, the action - returned as an exponential moving average of previously generated actions for that timestep. + returned as an exponential moving average of previously generated actions for that timestep. n_obs_steps: Number of time steps worth of observation to use as input. horizon: The number of actions to generate in one forward pass. kl_weight: Weight for KL divergence. Defaults to None. 
Only applicable when using the variational @@ -120,7 +120,7 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): "action": action.to(self.device, non_blocking=True), } return out - + start_time = time.time() batch = replay_buffer.sample(batch_size) From 110ac5ffa123c64eb61a313eb08638ed6efe84ee Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Wed, 3 Apr 2024 14:21:07 +0100 Subject: [PATCH 03/25] backup wip --- lerobot/common/envs/aloha/env.py | 1 - lerobot/common/policies/act/detr_vae.py | 216 ++++++++++----------- lerobot/common/policies/act/policy.py | 5 +- lerobot/common/policies/act/transformer.py | 85 ++------ lerobot/configs/policy/act.yaml | 2 +- scripts/convert_act_weights.py | 64 ++++++ 6 files changed, 182 insertions(+), 191 deletions(-) create mode 100644 scripts/convert_act_weights.py diff --git a/lerobot/common/envs/aloha/env.py b/lerobot/common/envs/aloha/env.py index 8f907650..ad8087d0 100644 --- a/lerobot/common/envs/aloha/env.py +++ b/lerobot/common/envs/aloha/env.py @@ -191,7 +191,6 @@ class AlohaEnv(AbstractEnv): { "observation": TensorDict(obs, batch_size=[]), "reward": torch.tensor([reward], dtype=torch.float32), - # success and done are true when coverage > self.success_threshold in env "done": torch.tensor([done], dtype=torch.bool), "success": torch.tensor([success], dtype=torch.bool), }, diff --git a/lerobot/common/policies/act/detr_vae.py b/lerobot/common/policies/act/detr_vae.py index f21308ad..ff137a34 100644 --- a/lerobot/common/policies/act/detr_vae.py +++ b/lerobot/common/policies/act/detr_vae.py @@ -1,18 +1,12 @@ +import einops import numpy as np import torch from torch import nn -from torch.autograd import Variable from .backbone import build_backbone from .transformer import TransformerEncoder, TransformerEncoderLayer, build_transformer -def reparametrize(mu, logvar): - std = logvar.div(2).exp() - eps = Variable(std.data.new(std.size()).normal_()) - return mu + std * eps - - def get_sinusoid_encoding_table(n_position, d_hid): def get_position_angle_vec(position): return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] @@ -27,7 +21,7 @@ def get_sinusoid_encoding_table(n_position, d_hid): class ActionChunkingTransformer(nn.Module): """ Action Chunking Transformer as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware - (https://arxiv.org/abs/2304.13705). + (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act) Note: In this code we use the symbols `vae_encoder`, 'encoder', `decoder`. The meanings are as follows. - The `vae_encoder` is, as per the literature around conditional variational auto-encoders (cVAE), the @@ -49,7 +43,7 @@ class ActionChunkingTransformer(nn.Module): """ def __init__( - self, backbones, transformer, vae_encoder, state_dim, action_dim, horizon, camera_names, vae + self, backbones, transformer, vae_encoder, state_dim, action_dim, horizon, camera_names, use_vae ): """Initializes the model. Parameters: @@ -63,134 +57,124 @@ class ActionChunkingTransformer(nn.Module): state_dim: Robot positional state dimension. action_dim: Action dimension. horizon: The number of actions to generate in one forward pass. - vae: Whether to use the variational objective. TODO(now): Give more details. + use_vae: Whether to use the variational objective. TODO(now): Give more details. 
""" super().__init__() + self.camera_names = camera_names self.transformer = transformer self.vae_encoder = vae_encoder - self.vae = vae + self.use_vae = use_vae hidden_dim = transformer.d_model - self.action_head = nn.Linear(hidden_dim, action_dim) - self.is_pad_head = nn.Linear(hidden_dim, 1) - # Positional embedding to be used as input to the latent vae_encoder (if applicable) and for the - self.pos_embed = nn.Embedding(horizon, hidden_dim) - if backbones is not None: - self.input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1) - self.backbones = nn.ModuleList(backbones) - self.input_proj_robot_state = nn.Linear(state_dim, hidden_dim) - else: - # input_dim = 14 + 7 # robot_state + env_state - self.input_proj_robot_state = nn.Linear(state_dim, hidden_dim) - # TODO(rcadene): understand what is env_state, and why it needs to be 7 - self.input_proj_env_state = nn.Linear(state_dim // 2, hidden_dim) - self.pos = torch.nn.Embedding(2, hidden_dim) - self.backbones = None - # vae_encoder extra parameters - self.latent_dim = 32 # final size of latent z # TODO tune - self.cls_embed = nn.Embedding(1, hidden_dim) # extra cls token embedding - self.vae_encoder_action_proj = nn.Linear(14, hidden_dim) # project action to embedding - self.vae_encoder_joint_proj = nn.Linear(14, hidden_dim) # project qpos to embedding - self.latent_proj = nn.Linear( - hidden_dim, self.latent_dim * 2 - ) # project hidden state to latent std, var - self.register_buffer( - "pos_table", get_sinusoid_encoding_table(1 + 1 + horizon, hidden_dim) - ) # [CLS], qpos, a_seq + # BERT style VAE encoder with input [cls, *joint_space_configuration, *action_sequence]. + # The cls token forms parameters of the latent's distribution (like this [*means, *log_variances]). + if use_vae: + self.cls_embed = nn.Embedding(1, hidden_dim) + # Projection layer for joint-space configuration to hidden dimension. + self.vae_encoder_robot_state_input_proj = nn.Linear(state_dim, hidden_dim) + # Projection layer for action (joint-space target) to hidden dimension. + self.vae_encoder_action_input_proj = nn.Linear(state_dim, hidden_dim) + # Final size of latent z. TODO(now): Add to hyperparams. + self.latent_dim = 32 + # Projection layer from the VAE encoder's output to the latent distribution's parameter space. + self.vae_encoder_latent_output_proj = nn.Linear(hidden_dim, self.latent_dim * 2) + # Fixed sinusoidal positional embedding the whole input to the VAE encoder. + self.register_buffer( + "vae_encoder_pos_enc", get_sinusoid_encoding_table(1 + 1 + horizon, hidden_dim) + ) - # decoder extra parameters - self.latent_out_proj = nn.Linear(self.latent_dim, hidden_dim) # project latent sample to embedding + # Transformer encoder input projections. The tokens will be structured like + # [latent, robot_state, image_feature_map_pixels]. + self.backbones = nn.ModuleList(backbones) + self.encoder_img_feat_input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1) + self.encoder_robot_state_input_proj = nn.Linear(state_dim, hidden_dim) + self.encoder_latent_input_proj = nn.Linear(self.latent_dim, hidden_dim) + # TODO(now): Fix this nonsense. One positional embedding is needed. We should extract the image + # feature dimension with a dry run. self.additional_pos_embed = nn.Embedding( 2, hidden_dim ) # learned position embedding for proprio and latent - def forward(self, qpos, image, env_state, actions=None, is_pad=None): + # Transformer decoder. 
+ # Learnable positional embedding for the transformer's decoder (in the style of DETR object queries). + self.decoder_pos_embed = nn.Embedding(horizon, hidden_dim) + # Final action regression head on the output of the transformer's decoder. + self.action_head = nn.Linear(hidden_dim, action_dim) + + def forward(self, robot_state, image, actions=None): """ - qpos: batch, qpos_dim - image: batch, num_cam, channel, height, width - env_state: None - actions: batch, seq, action_dim + Args: + robot_state: (B, J) batch of robot joint configurations. + image: (B, N, C, H, W) batch of N camera frames. + actions: (B, S, A) batch of actions from the target dataset which must be provided if the + VAE is enabled and the model is in training mode. """ - is_training = actions is not None # train or val - bs, _ = qpos.shape - ### Obtain latent z from action sequence - if self.vae and is_training: - # project action sequence to embedding dim, and concat with a CLS token - action_embed = self.vae_encoder_action_proj(actions) # (bs, seq, hidden_dim) - qpos_embed = self.vae_encoder_joint_proj(qpos) # (bs, hidden_dim) - qpos_embed = torch.unsqueeze(qpos_embed, axis=1) # (bs, 1, hidden_dim) - cls_embed = self.cls_embed.weight # (1, hidden_dim) - cls_embed = torch.unsqueeze(cls_embed, axis=0).repeat(bs, 1, 1) # (bs, 1, hidden_dim) - vae_encoder_input = torch.cat( - [cls_embed, qpos_embed, action_embed], axis=1 - ) # (bs, seq+1, hidden_dim) - vae_encoder_input = vae_encoder_input.permute(1, 0, 2) # (seq+1, bs, hidden_dim) - # do not mask cls token - # cls_joint_is_pad = torch.full((bs, 2), False).to(qpos.device) # False: not a padding - # is_pad = torch.cat([cls_joint_is_pad, is_pad], axis=1) # (bs, seq+1) - # obtain position embedding - pos_embed = self.pos_table.clone().detach() - pos_embed = pos_embed.permute(1, 0, 2) # (seq+1, 1, hidden_dim) - # query model + if self.use_vae and self.training: + assert ( + actions is not None + ), "actions must be provided when using the variational objective in training mode." + + batch_size, _ = robot_state.shape + + # Prepare the latent for input to the transformer. + if self.use_vae and actions is not None: + # Prepare the input to the VAE encoder: [cls, *joint_space_configuration, *action_sequence]. + cls_embed = einops.repeat(self.cls_embed.weight, "1 d -> b 1 d", b=batch_size) # (B, 1, D) + robot_state_embed = self.vae_encoder_robot_state_input_proj(robot_state).unsqueeze(1) # (B, 1, D) + action_embed = self.vae_encoder_action_input_proj(actions) # (B, S, D) + vae_encoder_input = torch.cat([cls_embed, robot_state_embed, action_embed], axis=1) # (B, S+2, D) + vae_encoder_input = vae_encoder_input.permute(1, 0, 2) # (S+2, B, D) + # Note: detach() shouldn't be necessary but leaving it the same as the original code just in case. + # Prepare fixed positional embedding. + pos_embed = self.vae_encoder_pos_enc.clone().detach().permute(1, 0, 2) # (S+2, 1, D) + # Forward pass through VAE encoder and sample the latent with the reparameterization trick. 
vae_encoder_output = self.vae_encoder( vae_encoder_input, pos=pos_embed - ) # , src_key_padding_mask=is_pad) + ) # , src_key_padding_mask=is_pad) # TODO(now) vae_encoder_output = vae_encoder_output[0] # take cls output only - latent_info = self.latent_proj(vae_encoder_output) - mu = latent_info[:, : self.latent_dim] - logvar = latent_info[:, self.latent_dim :] - latent_sample = reparametrize(mu, logvar) - latent_input = self.latent_out_proj(latent_sample) + latent_pdf_params = self.vae_encoder_latent_output_proj(vae_encoder_output) + mu = latent_pdf_params[:, : self.latent_dim] + logvar = latent_pdf_params[:, self.latent_dim :] + # Use reparameterization trick to sample from the latent's PDF. + latent_sample = mu + logvar.div(2).exp() * torch.randn_like(mu) else: + # When not using the VAE encoder, we set the latent to be all zeros. mu = logvar = None - latent_sample = torch.zeros([bs, self.latent_dim], dtype=torch.float32).to(qpos.device) - latent_input = self.latent_out_proj(latent_sample) + latent_sample = torch.zeros([batch_size, self.latent_dim], dtype=robot_state.dtype).to( + robot_state.device + ) - if self.backbones is not None: - # Image observation features and position embeddings - all_cam_features = [] - all_cam_pos = [] - for cam_id, _ in enumerate(self.camera_names): - features, pos = self.backbones[0](image[:, cam_id]) # HARDCODED - features = features[0] # take the last layer feature - pos = pos[0] - all_cam_features.append(self.input_proj(features)) - all_cam_pos.append(pos) - # proprioception features - proprio_input = self.input_proj_robot_state(qpos) - # fold camera dimension into width dimension - src = torch.cat(all_cam_features, axis=3) - pos = torch.cat(all_cam_pos, axis=3) - hs = self.transformer( - src, - None, - self.pos_embed.weight, - pos, - latent_input, - proprio_input, - self.additional_pos_embed.weight, - )[0] - else: - qpos = self.input_proj_robot_state(qpos) - env_state = self.input_proj_env_state(env_state) - transformer_input = torch.cat([qpos, env_state], axis=1) # seq length = 2 - hs = self.transformer(transformer_input, None, self.pos_embed.weight, self.pos.weight)[0] - a_hat = self.action_head(hs) - is_pad_hat = self.is_pad_head(hs) - return a_hat, is_pad_hat, [mu, logvar] + # Prepare all other transformer inputs. + # Image observation features and position embeddings. + all_cam_features = [] + all_cam_pos = [] + for cam_id, _ in enumerate(self.camera_names): + # TODO(now): remove the positional embedding from the backbones. + features, pos = self.backbones[0](image[:, cam_id]) # HARDCODED + features = features[0] # take the last layer feature + pos = pos[0] + all_cam_features.append(self.encoder_img_feat_input_proj(features)) + all_cam_pos.append(pos) + # Concatenate image observation feature maps along the width dimension. + transformer_input = torch.cat(all_cam_features, axis=3) + # TODO(now): remove the positional embedding from the backbones. + pos = torch.cat(all_cam_pos, axis=3) + robot_state_embed = self.encoder_robot_state_input_proj(robot_state) + latent_embed = self.encoder_latent_input_proj(latent_sample) + # Run the transformer and project the outputs to the action space. 
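        # Rough data-flow sketch (illustrative, not part of the patch) for the call below:
        #   encoder tokens:  [latent, robot_state, one token per image feature map "pixel"], each of dim D
        #   decoder:         `horizon` zero-initialized tokens plus the learned decoder positional queries
        #   output:          (B, horizon, D) -> action_head -> (B, horizon, action_dim), i.e. one action chunk.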
+ transformer_output = self.transformer( + transformer_input, + query_embed=self.decoder_pos_embed.weight, + pos_embed=pos, + latent_input=latent_embed, + proprio_input=robot_state_embed, + additional_pos_embed=self.additional_pos_embed.weight, + ) + a_hat = self.action_head(transformer_output) -def mlp(input_dim, hidden_dim, output_dim, hidden_depth): - if hidden_depth == 0: - mods = [nn.Linear(input_dim, output_dim)] - else: - mods = [nn.Linear(input_dim, hidden_dim), nn.ReLU(inplace=True)] - for _ in range(hidden_depth - 1): - mods += [nn.Linear(hidden_dim, hidden_dim), nn.ReLU(inplace=True)] - mods.append(nn.Linear(hidden_dim, output_dim)) - trunk = nn.Sequential(*mods) - return trunk + return a_hat, [mu, logvar] def build_vae_encoder(args): @@ -231,7 +215,7 @@ def build(args): action_dim=args.action_dim, horizon=args.num_queries, camera_names=args.camera_names, - vae=args.vae, + use_vae=args.vae, ) n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index 5cf74ae5..7d24620a 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -224,8 +224,7 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): if is_pad is not None: is_pad = is_pad[:, : self.model.num_queries] - breakpoint() - a_hat, is_pad_hat, (mu, logvar) = self.model(qpos, image, env_state, actions, is_pad) + a_hat, (mu, logvar) = self.model(qpos, image, env_state, actions, is_pad) all_l1 = F.l1_loss(actions, a_hat, reduction="none") l1 = all_l1.mean() if is_pad is None else (all_l1 * ~is_pad.unsqueeze(-1)).mean() @@ -240,5 +239,5 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): loss_dict["loss"] = loss_dict["l1"] return loss_dict else: - action, _, (_, _) = self.model(qpos, image, env_state) # no action, sample from prior + action, _ = self.model(qpos, image, env_state) # no action, sample from prior return action diff --git a/lerobot/common/policies/act/transformer.py b/lerobot/common/policies/act/transformer.py index 20cfc815..11d5a013 100644 --- a/lerobot/common/policies/act/transformer.py +++ b/lerobot/common/policies/act/transformer.py @@ -26,10 +26,8 @@ class Transformer(nn.Module): dropout=0.1, activation="relu", normalize_before=False, - return_intermediate_dec=False, ): super().__init__() - encoder_layer = TransformerEncoderLayer( d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) @@ -40,9 +38,7 @@ class Transformer(nn.Module): d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) decoder_norm = nn.LayerNorm(d_model) - self.decoder = TransformerDecoder( - decoder_layer, num_decoder_layers, decoder_norm, return_intermediate=return_intermediate_dec - ) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm) self._reset_parameters() @@ -57,7 +53,6 @@ class Transformer(nn.Module): def forward( self, src, - mask, query_embed, pos_embed, latent_input=None, @@ -68,10 +63,10 @@ class Transformer(nn.Module): if len(src.shape) == 4: # has H and W # flatten NxCxHxW to HWxNxC bs, c, h, w = src.shape + # Each "pixel" on the feature maps will form a token. 
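            # Shape sketch for the flatten/permute below (hypothetical sizes, not part of the patch):
            #   src: (B, C, H, W) --flatten(2)--> (B, C, H*W) --permute(2, 0, 1)--> (H*W, B, C)
            # e.g. torch.zeros(8, 256, 15, 20).flatten(2).permute(2, 0, 1).shape == (300, 8, 256),
            # so the spatial grid becomes a sequence of H*W tokens in the (sequence, batch, channel)
            # layout expected by nn.MultiheadAttention with batch_first=False.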
src = src.flatten(2).permute(2, 0, 1) pos_embed = pos_embed.flatten(2).permute(2, 0, 1).repeat(1, bs, 1) query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) - # mask = mask.flatten(1) additional_pos_embed = additional_pos_embed.unsqueeze(1).repeat(1, bs, 1) # seq, bs, dim pos_embed = torch.cat([additional_pos_embed, pos_embed], axis=0) @@ -87,9 +82,9 @@ class Transformer(nn.Module): query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) tgt = torch.zeros_like(query_embed) - memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) - hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, pos=pos_embed, query_pos=query_embed) - hs = hs.transpose(1, 2) + memory = self.encoder(src, pos=pos_embed) + hs = self.decoder(tgt, memory, pos=pos_embed, query_pos=query_embed) + hs = hs.transpose(0, 1) return hs @@ -103,14 +98,12 @@ class TransformerEncoder(nn.Module): def forward( self, src, - mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): output = src for layer in self.layers: - output = layer(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask, pos=pos) + output = layer(output, pos=pos) if self.norm is not None: output = self.norm(output) @@ -119,52 +112,33 @@ class TransformerEncoder(nn.Module): class TransformerDecoder(nn.Module): - def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): + def __init__(self, decoder_layer, num_layers, norm=None): super().__init__() self.layers = _get_clones(decoder_layer, num_layers) self.num_layers = num_layers self.norm = norm - self.return_intermediate = return_intermediate def forward( self, tgt, memory, - tgt_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): output = tgt - intermediate = [] - for layer in self.layers: output = layer( output, memory, - tgt_mask=tgt_mask, - memory_mask=memory_mask, - tgt_key_padding_mask=tgt_key_padding_mask, - memory_key_padding_mask=memory_key_padding_mask, pos=pos, query_pos=query_pos, ) - if self.return_intermediate: - intermediate.append(self.norm(output)) if self.norm is not None: output = self.norm(output) - if self.return_intermediate: - intermediate.pop() - intermediate.append(output) - if self.return_intermediate: - return torch.stack(intermediate) - - return output.unsqueeze(0) + return output class TransformerEncoderLayer(nn.Module): @@ -192,12 +166,10 @@ class TransformerEncoderLayer(nn.Module): def forward_post( self, src, - src_mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): q = k = self.with_pos_embed(src, pos) - src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0] + src2 = self.self_attn(q, k, value=src)[0] src = src + self.dropout1(src2) src = self.norm1(src) src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) @@ -208,13 +180,11 @@ class TransformerEncoderLayer(nn.Module): def forward_pre( self, src, - src_mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): src2 = self.norm1(src) q = k = self.with_pos_embed(src2, pos) - src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0] + src2 = self.self_attn(q, k, value=src2)[0] src = src + self.dropout1(src2) src2 = 
self.norm2(src) src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) @@ -224,13 +194,11 @@ class TransformerEncoderLayer(nn.Module): def forward( self, src, - src_mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): if self.normalize_before: - return self.forward_pre(src, src_mask, src_key_padding_mask, pos) - return self.forward_post(src, src_mask, src_key_padding_mask, pos) + return self.forward_pre(src, pos) + return self.forward_post(src, pos) class TransformerDecoderLayer(nn.Module): @@ -262,23 +230,17 @@ class TransformerDecoderLayer(nn.Module): self, tgt, memory, - tgt_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): q = k = self.with_pos_embed(tgt, query_pos) - tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] + tgt2 = self.self_attn(q, k, value=tgt)[0] tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) tgt2 = self.multihead_attn( query=self.with_pos_embed(tgt, query_pos), key=self.with_pos_embed(memory, pos), value=memory, - attn_mask=memory_mask, - key_padding_mask=memory_key_padding_mask, )[0] tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) @@ -291,24 +253,18 @@ class TransformerDecoderLayer(nn.Module): self, tgt, memory, - tgt_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): tgt2 = self.norm1(tgt) q = k = self.with_pos_embed(tgt2, query_pos) - tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] + tgt2 = self.self_attn(q, k, value=tgt2)[0] tgt = tgt + self.dropout1(tgt2) tgt2 = self.norm2(tgt) tgt2 = self.multihead_attn( query=self.with_pos_embed(tgt2, query_pos), key=self.with_pos_embed(memory, pos), value=memory, - attn_mask=memory_mask, - key_padding_mask=memory_key_padding_mask, )[0] tgt = tgt + self.dropout2(tgt2) tgt2 = self.norm3(tgt) @@ -320,10 +276,6 @@ class TransformerDecoderLayer(nn.Module): self, tgt, memory, - tgt_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): @@ -331,16 +283,10 @@ class TransformerDecoderLayer(nn.Module): return self.forward_pre( tgt, memory, - tgt_mask, - memory_mask, - tgt_key_padding_mask, - memory_key_padding_mask, pos, query_pos, ) - return self.forward_post( - tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos - ) + return self.forward_post(tgt, memory, pos, query_pos) def _get_clones(module, n): @@ -356,7 +302,6 @@ def build_transformer(args): num_encoder_layers=args.enc_layers, num_decoder_layers=args.dec_layers, normalize_before=args.pre_norm, - return_intermediate_dec=True, ) diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml index 0244944b..1086b595 100644 --- a/lerobot/configs/policy/act.yaml +++ b/lerobot/configs/policy/act.yaml @@ -29,7 +29,7 @@ policy: hidden_dim: 512 dim_feedforward: 3200 enc_layers: 4 - dec_layers: 7 + dec_layers: 1 nheads: 8 #camera_names: [top, front_close, left_pillar, right_pillar] 
camera_names: [top] diff --git a/scripts/convert_act_weights.py b/scripts/convert_act_weights.py new file mode 100644 index 00000000..d0c0c3e7 --- /dev/null +++ b/scripts/convert_act_weights.py @@ -0,0 +1,64 @@ +import torch + +from lerobot.common.policies.factory import make_policy +from lerobot.common.utils import init_hydra_config + +cfg = init_hydra_config( + "/home/alexander/Projects/lerobot/outputs/train/act_aloha_sim_transfer_cube_human/.hydra/config.yaml" +) + +policy = make_policy(cfg) + +state_dict = torch.load("/home/alexander/Projects/act/outputs/sim_transfer_cube_human_vae/policy_last.ckpt") + + +# Replace keys based on what they start with. + +start_replacements = [ + ("model.query_embed.weight", "model.pos_embed.weight"), + ("model.pos_table", "model.vae_encoder_pos_enc"), + ("model.pos_embed.weight", "model.decoder_pos_embed.weight"), + ("model.encoder.", "model.vae_encoder."), + ("model.encoder_action_proj.", "model.vae_encoder_action_input_proj."), + ("model.encoder_joint_proj.", "model.vae_encoder_robot_state_input_proj."), + ("model.latent_proj.", "model.vae_encoder_latent_output_proj."), + ("model.latent_proj.", "model.vae_encoder_latent_output_proj."), + ("model.input_proj.", "model.encoder_img_feat_input_proj."), + ("model.input_proj_robot_state", "model.encoder_robot_state_input_proj"), + ("model.latent_out_proj.", "model.encoder_latent_input_proj."), +] + +for to_replace, replace_with in start_replacements: + for k in list(state_dict.keys()): + if k.startswith(to_replace): + k_ = replace_with + k.removeprefix(to_replace) + state_dict[k_] = state_dict[k] + del state_dict[k] + +# Remove keys based on what they start with. + +start_removals = [ + # There is a bug that means the pretrained model doesn't even use the final decoder layers. 
+ *[f"model.transformer.decoder.layers.{i}" for i in range(1, 7)], + "model.is_pad_head.", +] + +for to_remove in start_removals: + for k in list(state_dict.keys()): + if k.startswith(to_remove): + del state_dict[k] + +missing_keys, unexpected_keys = policy.load_state_dict(state_dict, strict=False) + +if len(missing_keys) != 0: + print("MISSING KEYS") + print(missing_keys) +if len(unexpected_keys) != 0: + print("UNEXPECTED KEYS") + print(unexpected_keys) + +# if len(missing_keys) != 0 or len(unexpected_keys) != 0: +# print("Failed due to mismatch in state dicts.") +# exit() + +policy.save("/tmp/weights.pth") From 278336a39a32ec0a7f7af87dac5b65c21368e488 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Wed, 3 Apr 2024 19:23:22 +0100 Subject: [PATCH 04/25] backup wip --- lerobot/common/policies/act/detr_vae.py | 85 ++--- lerobot/common/policies/act/transformer.py | 350 ++++++++------------- pyproject.toml | 3 + 3 files changed, 185 insertions(+), 253 deletions(-) diff --git a/lerobot/common/policies/act/detr_vae.py b/lerobot/common/policies/act/detr_vae.py index ff137a34..aaf4d098 100644 --- a/lerobot/common/policies/act/detr_vae.py +++ b/lerobot/common/policies/act/detr_vae.py @@ -4,7 +4,7 @@ import torch from torch import nn from .backbone import build_backbone -from .transformer import TransformerEncoder, TransformerEncoderLayer, build_transformer +from .transformer import Transformer, TransformerEncoder def get_sinusoid_encoding_table(n_position, d_hid): @@ -124,16 +124,14 @@ class ActionChunkingTransformer(nn.Module): robot_state_embed = self.vae_encoder_robot_state_input_proj(robot_state).unsqueeze(1) # (B, 1, D) action_embed = self.vae_encoder_action_input_proj(actions) # (B, S, D) vae_encoder_input = torch.cat([cls_embed, robot_state_embed, action_embed], axis=1) # (B, S+2, D) - vae_encoder_input = vae_encoder_input.permute(1, 0, 2) # (S+2, B, D) # Note: detach() shouldn't be necessary but leaving it the same as the original code just in case. # Prepare fixed positional embedding. - pos_embed = self.vae_encoder_pos_enc.clone().detach().permute(1, 0, 2) # (S+2, 1, D) + pos_embed = self.vae_encoder_pos_enc.clone().detach() # (1, S+2, D) # Forward pass through VAE encoder and sample the latent with the reparameterization trick. - vae_encoder_output = self.vae_encoder( - vae_encoder_input, pos=pos_embed - ) # , src_key_padding_mask=is_pad) # TODO(now) - vae_encoder_output = vae_encoder_output[0] # take cls output only - latent_pdf_params = self.vae_encoder_latent_output_proj(vae_encoder_output) + cls_token_out = self.vae_encoder( + vae_encoder_input.permute(1, 0, 2), pos=pos_embed.permute(1, 0, 2) + )[0] # (B, D) + latent_pdf_params = self.vae_encoder_latent_output_proj(cls_token_out) mu = latent_pdf_params[:, : self.latent_dim] logvar = latent_pdf_params[:, self.latent_dim :] # Use reparameterization trick to sample from the latent's PDF. @@ -151,10 +149,11 @@ class ActionChunkingTransformer(nn.Module): all_cam_pos = [] for cam_id, _ in enumerate(self.camera_names): # TODO(now): remove the positional embedding from the backbones. 
- features, pos = self.backbones[0](image[:, cam_id]) # HARDCODED - features = features[0] # take the last layer feature + cam_features, pos = self.backbones[0](image[:, cam_id]) # HARDCODED + cam_features = cam_features[0] # take the last layer feature pos = pos[0] - all_cam_features.append(self.encoder_img_feat_input_proj(features)) + cam_features = self.encoder_img_feat_input_proj(cam_features) # (B, C, h, w) + all_cam_features.append(cam_features) all_cam_pos.append(pos) # Concatenate image observation feature maps along the width dimension. transformer_input = torch.cat(all_cam_features, axis=3) @@ -163,36 +162,25 @@ class ActionChunkingTransformer(nn.Module): robot_state_embed = self.encoder_robot_state_input_proj(robot_state) latent_embed = self.encoder_latent_input_proj(latent_sample) + # TODO(now): Explain all of this madness. + transformer_input = torch.cat( + [ + torch.stack([latent_embed, robot_state_embed], axis=0), + transformer_input.flatten(2).permute(2, 0, 1), + ] + ) + pos_embed = torch.cat( + [self.additional_pos_embed.weight.unsqueeze(1), pos.flatten(2).permute(2, 0, 1)], axis=0 + ) + # Run the transformer and project the outputs to the action space. transformer_output = self.transformer( transformer_input, - query_embed=self.decoder_pos_embed.weight, - pos_embed=pos, - latent_input=latent_embed, - proprio_input=robot_state_embed, - additional_pos_embed=self.additional_pos_embed.weight, - ) - a_hat = self.action_head(transformer_output) - - return a_hat, [mu, logvar] - - -def build_vae_encoder(args): - d_model = args.hidden_dim # 256 - dropout = args.dropout # 0.1 - nhead = args.nheads # 8 - dim_feedforward = args.dim_feedforward # 2048 - num_encoder_layers = args.enc_layers # 4 # TODO shared with VAE decoder - normalize_before = args.pre_norm # False - activation = "relu" - - encoder_layer = TransformerEncoderLayer( - d_model, nhead, dim_feedforward, dropout, activation, normalize_before - ) - encoder_norm = nn.LayerNorm(d_model) if normalize_before else None - encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) - - return encoder + encoder_pos=pos_embed, + decoder_pos=self.decoder_pos_embed.weight.unsqueeze(1), + ).transpose(0, 1) # back to (B, S, C) + actions = self.action_head(transformer_output) + return actions, [mu, logvar] def build(args): @@ -203,9 +191,26 @@ def build(args): backbone = build_backbone(args) backbones.append(backbone) - transformer = build_transformer(args) + transformer = Transformer( + d_model=args.hidden_dim, + dropout=args.dropout, + nhead=args.nheads, + dim_feedforward=args.dim_feedforward, + num_encoder_layers=args.enc_layers, + num_decoder_layers=args.dec_layers, + normalize_before=args.pre_norm, + ) - vae_encoder = build_vae_encoder(args) + # TODO(now): args.enc_layers shouldn't be shared with the transformer decoder + vae_encoder = TransformerEncoder( + num_layers=args.enc_layers, + d_model=args.hidden_dim, + nhead=args.nheads, + dim_feedforward=args.dim_feedforward, + dropout=args.dropout, + activation="relu", + normalize_before=args.pre_norm, + ) model = ActionChunkingTransformer( backbones, diff --git a/lerobot/common/policies/act/transformer.py b/lerobot/common/policies/act/transformer.py index 11d5a013..7e71f3ea 100644 --- a/lerobot/common/policies/act/transformer.py +++ b/lerobot/common/policies/act/transformer.py @@ -1,13 +1,7 @@ """ -DETR Transformer class. 
- -Copy-paste from torch.nn.Transformer with modifications: - * positional encodings are passed in MHattention - * extra LN at the end of encoder is removed - * decoder returns a stack of activations from all decoding layers +TODO(now) """ -import copy from typing import Optional import torch @@ -28,117 +22,68 @@ class Transformer(nn.Module): normalize_before=False, ): super().__init__() - encoder_layer = TransformerEncoderLayer( - d_model, nhead, dim_feedforward, dropout, activation, normalize_before + self.encoder = TransformerEncoder( + num_encoder_layers, d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) - encoder_norm = nn.LayerNorm(d_model) if normalize_before else None - self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) - - decoder_layer = TransformerDecoderLayer( - d_model, nhead, dim_feedforward, dropout, activation, normalize_before + self.decoder = TransformerDecoder( + num_decoder_layers, d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) - decoder_norm = nn.LayerNorm(d_model) - self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm) - - self._reset_parameters() - self.d_model = d_model self.nhead = nhead + self._init_params() # TODO(now): move to somewhere common - def _reset_parameters(self): + def _init_params(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) - def forward( - self, - src, - query_embed, - pos_embed, - latent_input=None, - proprio_input=None, - additional_pos_embed=None, - ): + def forward(self, x, encoder_pos, decoder_pos): + """ + Args: + x: ((E)ncoder (S)equence, (B)atch, (C)hannels) + decoder_pos: (Decoder Sequence, C) tensor for the decoder's positional embedding. + encoder_pos: (ES, C) tenso + """ # TODO flatten only when input has H and W - if len(src.shape) == 4: # has H and W - # flatten NxCxHxW to HWxNxC - bs, c, h, w = src.shape - # Each "pixel" on the feature maps will form a token. 
- src = src.flatten(2).permute(2, 0, 1) - pos_embed = pos_embed.flatten(2).permute(2, 0, 1).repeat(1, bs, 1) - query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) + bs = x.shape[1] - additional_pos_embed = additional_pos_embed.unsqueeze(1).repeat(1, bs, 1) # seq, bs, dim - pos_embed = torch.cat([additional_pos_embed, pos_embed], axis=0) - - addition_input = torch.stack([latent_input, proprio_input], axis=0) - src = torch.cat([addition_input, src], axis=0) - else: - assert len(src.shape) == 3 - # flatten NxHWxC to HWxNxC - bs, hw, c = src.shape - src = src.permute(1, 0, 2) - pos_embed = pos_embed.unsqueeze(1).repeat(1, bs, 1) - query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) - - tgt = torch.zeros_like(query_embed) - memory = self.encoder(src, pos=pos_embed) - hs = self.decoder(tgt, memory, pos=pos_embed, query_pos=query_embed) - hs = hs.transpose(0, 1) - return hs + encoder_out = self.encoder(x, pos=encoder_pos) + decoder_in = torch.zeros( + (decoder_pos.shape[0], bs, decoder_pos.shape[2]), + dtype=decoder_pos.dtype, + device=decoder_pos.device, + ) + decoder_out = self.decoder(decoder_in, encoder_out, encoder_pos=encoder_pos, decoder_pos=decoder_pos) + return decoder_out class TransformerEncoder(nn.Module): - def __init__(self, encoder_layer, num_layers, norm=None): - super().__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - - def forward( + def __init__( self, - src, - pos: Optional[Tensor] = None, + num_layers, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, ): - output = src - - for layer in self.layers: - output = layer(output, pos=pos) - - if self.norm is not None: - output = self.norm(output) - - return output - - -class TransformerDecoder(nn.Module): - def __init__(self, decoder_layer, num_layers, norm=None): super().__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - - def forward( - self, - tgt, - memory, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None, - ): - output = tgt + self.layers = nn.ModuleList( + [ + TransformerEncoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + for _ in range(num_layers) + ] + ) + self.norm = nn.LayerNorm(d_model) if normalize_before else nn.Identity() + def forward(self, x, pos: Optional[Tensor] = None): for layer in self.layers: - output = layer( - output, - memory, - pos=pos, - query_pos=query_pos, - ) - - if self.norm is not None: - output = self.norm(output) - - return output + x = layer(x, pos=pos) + x = self.norm(x) + return x class TransformerEncoderLayer(nn.Module): @@ -160,45 +105,55 @@ class TransformerEncoderLayer(nn.Module): self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before - def with_pos_embed(self, tensor, pos: Optional[Tensor]): - return tensor if pos is None else tensor + pos - - def forward_post( - self, - src, - pos: Optional[Tensor] = None, - ): - q = k = self.with_pos_embed(src, pos) - src2 = self.self_attn(q, k, value=src)[0] - src = src + self.dropout1(src2) - src = self.norm1(src) - src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = src + self.dropout2(src2) - src = self.norm2(src) - return src - - def forward_pre( - self, - src, - pos: Optional[Tensor] = None, - ): - src2 = self.norm1(src) - q = k = self.with_pos_embed(src2, pos) - src2 = self.self_attn(q, k, value=src2)[0] - src = src + 
self.dropout1(src2) - src2 = self.norm2(src) - src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) - src = src + self.dropout2(src2) - return src - - def forward( - self, - src, - pos: Optional[Tensor] = None, - ): + def forward(self, x, pos: Optional[Tensor] = None): + skip = x if self.normalize_before: - return self.forward_pre(src, pos) - return self.forward_post(src, pos) + x = self.norm1(x) + q = k = x if pos is None else x + pos + x = self.self_attn(q, k, value=x)[0] + x = skip + self.dropout1(x) + if self.normalize_before: + skip = x + x = self.norm2(x) + else: + x = self.norm1(x) + skip = x + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + x = skip + self.dropout2(x) + if not self.normalize_before: + x = self.norm2(x) + return x + + +class TransformerDecoder(nn.Module): + def __init__( + self, + num_layers, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.layers = nn.ModuleList( + [ + TransformerDecoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + for _ in range(num_layers) + ] + ) + self.num_layers = num_layers + self.norm = nn.LayerNorm(d_model) + + def forward(self, x, encoder_out, decoder_pos: Tensor | None = None, encoder_pos: Tensor | None = None): + for layer in self.layers: + x = layer(x, encoder_out, decoder_pos=decoder_pos, encoder_pos=encoder_pos) + if self.norm is not None: + x = self.norm(x) + return x class TransformerDecoderLayer(nn.Module): @@ -223,86 +178,55 @@ class TransformerDecoderLayer(nn.Module): self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before - def with_pos_embed(self, tensor, pos: Optional[Tensor]): + def maybe_add_pos_embed(self, tensor: Tensor, pos: Tensor | None) -> Tensor: return tensor if pos is None else tensor + pos - def forward_post( - self, - tgt, - memory, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None, - ): - q = k = self.with_pos_embed(tgt, query_pos) - tgt2 = self.self_attn(q, k, value=tgt)[0] - tgt = tgt + self.dropout1(tgt2) - tgt = self.norm1(tgt) - tgt2 = self.multihead_attn( - query=self.with_pos_embed(tgt, query_pos), - key=self.with_pos_embed(memory, pos), - value=memory, - )[0] - tgt = tgt + self.dropout2(tgt2) - tgt = self.norm2(tgt) - tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) - tgt = tgt + self.dropout3(tgt2) - tgt = self.norm3(tgt) - return tgt - - def forward_pre( - self, - tgt, - memory, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None, - ): - tgt2 = self.norm1(tgt) - q = k = self.with_pos_embed(tgt2, query_pos) - tgt2 = self.self_attn(q, k, value=tgt2)[0] - tgt = tgt + self.dropout1(tgt2) - tgt2 = self.norm2(tgt) - tgt2 = self.multihead_attn( - query=self.with_pos_embed(tgt2, query_pos), - key=self.with_pos_embed(memory, pos), - value=memory, - )[0] - tgt = tgt + self.dropout2(tgt2) - tgt2 = self.norm3(tgt) - tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) - tgt = tgt + self.dropout3(tgt2) - return tgt - def forward( self, - tgt, - memory, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None, - ): + x: Tensor, + encoder_out: Tensor, + decoder_pos: Tensor | None = None, + encoder_pos: Tensor | None = None, + ) -> Tensor: + """ + Args: + x: (Decoder Sequence, Batch, Channel) tensor of input tokens. + encoder_out: (Encoder Sequence, B, C) output features from the last layer of the encoder we are + cross-attending with. 
+ decoder_pos: (ES, 1, C) positional embedding for keys (from the encoder). + encoder_pos: (DS, 1, C) Positional_embedding for the queries (from the decoder). + Returns: + (DS, B, C) tensor of decoder output features. + """ + skip = x if self.normalize_before: - return self.forward_pre( - tgt, - memory, - pos, - query_pos, - ) - return self.forward_post(tgt, memory, pos, query_pos) - - -def _get_clones(module, n): - return nn.ModuleList([copy.deepcopy(module) for _ in range(n)]) - - -def build_transformer(args): - return Transformer( - d_model=args.hidden_dim, - dropout=args.dropout, - nhead=args.nheads, - dim_feedforward=args.dim_feedforward, - num_encoder_layers=args.enc_layers, - num_decoder_layers=args.dec_layers, - normalize_before=args.pre_norm, - ) + x = self.norm1(x) + q = k = self.maybe_add_pos_embed(x, decoder_pos) + x = self.self_attn(q, k, value=x)[0] + x = skip + self.dropout1(x) + if self.normalize_before: + skip = x + x = self.norm2(x) + else: + x = self.norm1(x) + skip = x + x = self.multihead_attn( + query=self.maybe_add_pos_embed(x, decoder_pos), + key=self.maybe_add_pos_embed(encoder_out, encoder_pos), + value=encoder_out, + )[0] + x = skip + self.dropout2(x) + if self.normalize_before: + skip = x + x = self.norm3(x) + else: + x = self.norm2(x) + skip = x + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + x = skip + self.dropout3(x) + if not self.normalize_before: + x = self.norm3(x) + return x def _get_activation_fn(activation): @@ -313,4 +237,4 @@ def _get_activation_fn(activation): return F.gelu if activation == "glu": return F.glu - raise RuntimeError(f"activation should be relu/gelu, not {activation}.") + raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.") diff --git a/pyproject.toml b/pyproject.toml index b2526e5c..6d76cffc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,3 +101,6 @@ enable = true [build-system] requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"] build-backend = "poetry_dynamic_versioning.backend" + +[tool.black] +line-length = 110 From 3a4dfa82fe8393e0a401f28d97efa3fd2cac9a05 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Thu, 4 Apr 2024 18:34:41 +0100 Subject: [PATCH 05/25] backup wip --- lerobot/common/policies/act/backbone.py | 115 ---- lerobot/common/policies/act/detr_vae.py | 229 ------- lerobot/common/policies/act/policy.py | 570 ++++++++++++++++-- .../common/policies/act/position_encoding.py | 102 ---- lerobot/common/policies/act/transformer.py | 240 -------- lerobot/common/policies/act/utils.py | 478 --------------- lerobot/configs/policy/act.yaml | 3 +- scripts/convert_act_weights.py | 28 +- 8 files changed, 538 insertions(+), 1227 deletions(-) delete mode 100644 lerobot/common/policies/act/backbone.py delete mode 100644 lerobot/common/policies/act/detr_vae.py delete mode 100644 lerobot/common/policies/act/position_encoding.py delete mode 100644 lerobot/common/policies/act/transformer.py delete mode 100644 lerobot/common/policies/act/utils.py diff --git a/lerobot/common/policies/act/backbone.py b/lerobot/common/policies/act/backbone.py deleted file mode 100644 index 6399d339..00000000 --- a/lerobot/common/policies/act/backbone.py +++ /dev/null @@ -1,115 +0,0 @@ -from typing import List - -import torch -import torchvision -from torch import nn -from torchvision.models._utils import IntermediateLayerGetter - -from .position_encoding import build_position_encoding -from .utils import NestedTensor, is_main_process - - -class FrozenBatchNorm2d(torch.nn.Module): - 
""" - BatchNorm2d where the batch statistics and the affine parameters are fixed. - - Copy-paste from torchvision.misc.ops with added eps before rqsrt, - without which any other policy_models than torchvision.policy_models.resnet[18,34,50,101] - produce nans. - """ - - def __init__(self, n): - super().__init__() - self.register_buffer("weight", torch.ones(n)) - self.register_buffer("bias", torch.zeros(n)) - self.register_buffer("running_mean", torch.zeros(n)) - self.register_buffer("running_var", torch.ones(n)) - - def _load_from_state_dict( - self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs - ): - num_batches_tracked_key = prefix + "num_batches_tracked" - if num_batches_tracked_key in state_dict: - del state_dict[num_batches_tracked_key] - - super()._load_from_state_dict( - state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs - ) - - def forward(self, x): - # move reshapes to the beginning - # to make it fuser-friendly - w = self.weight.reshape(1, -1, 1, 1) - b = self.bias.reshape(1, -1, 1, 1) - rv = self.running_var.reshape(1, -1, 1, 1) - rm = self.running_mean.reshape(1, -1, 1, 1) - eps = 1e-5 - scale = w * (rv + eps).rsqrt() - bias = b - rm * scale - return x * scale + bias - - -class BackboneBase(nn.Module): - def __init__( - self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool - ): - super().__init__() - # for name, parameter in backbone.named_parameters(): # only train later layers # TODO do we want this? - # if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: - # parameter.requires_grad_(False) - if return_interm_layers: - return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} - else: - return_layers = {"layer4": "0"} - self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) - self.num_channels = num_channels - - def forward(self, tensor): - xs = self.body(tensor) - return xs - # out: Dict[str, NestedTensor] = {} - # for name, x in xs.items(): - # m = tensor_list.mask - # assert m is not None - # mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] - # out[name] = NestedTensor(x, mask) - # return out - - -class Backbone(BackboneBase): - """ResNet backbone with frozen BatchNorm.""" - - def __init__(self, name: str, train_backbone: bool, return_interm_layers: bool, dilation: bool): - backbone = getattr(torchvision.models, name)( - replace_stride_with_dilation=[False, False, dilation], - pretrained=is_main_process(), - norm_layer=FrozenBatchNorm2d, - ) # pretrained # TODO do we want frozen batch_norm?? 
- num_channels = 512 if name in ("resnet18", "resnet34") else 2048 - super().__init__(backbone, train_backbone, num_channels, return_interm_layers) - - -class Joiner(nn.Sequential): - def __init__(self, backbone, position_embedding): - super().__init__(backbone, position_embedding) - - def forward(self, tensor_list: NestedTensor): - xs = self[0](tensor_list) - out: List[NestedTensor] = [] - pos = [] - for _, x in xs.items(): - out.append(x) - # position encoding - pos.append(self[1](x).to(x.dtype)) - - return out, pos - - -def build_backbone(args): - position_embedding = build_position_encoding(args) - train_backbone = args.lr_backbone > 0 - return_interm_layers = args.masks - backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation) - model = Joiner(backbone, position_embedding) - model.num_channels = backbone.num_channels - return model diff --git a/lerobot/common/policies/act/detr_vae.py b/lerobot/common/policies/act/detr_vae.py deleted file mode 100644 index aaf4d098..00000000 --- a/lerobot/common/policies/act/detr_vae.py +++ /dev/null @@ -1,229 +0,0 @@ -import einops -import numpy as np -import torch -from torch import nn - -from .backbone import build_backbone -from .transformer import Transformer, TransformerEncoder - - -def get_sinusoid_encoding_table(n_position, d_hid): - def get_position_angle_vec(position): - return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] - - sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) - sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i - sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 - - return torch.FloatTensor(sinusoid_table).unsqueeze(0) - - -class ActionChunkingTransformer(nn.Module): - """ - Action Chunking Transformer as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware - (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act) - - Note: In this code we use the symbols `vae_encoder`, 'encoder', `decoder`. The meanings are as follows. - - The `vae_encoder` is, as per the literature around conditional variational auto-encoders (cVAE), the - part of the model that encodes the target data (here, a sequence of actions), and the condition - (here, we include the robot joint-space state as an input to the encoder). - - The `transformer` is the cVAE's decoder. But since we have an option to train this model without the - variational objective (in which case we drop the `vae_encoder` altogether), we don't call it the - `vae_decoder`. - # TODO(now): remove the following - - The `encoder` is actually a component of the cVAE's "decoder". But we refer to it as an "encoder" - because, in terms of the transformer with cross-attention that forms the cVAE's decoder, it is the - "encoder" part. We drop the `vae_` prefix because we have an option to train this model without the - variational objective (in which case we drop the `vae_encoder` altogether), and nothing about this - model has anything to do with a VAE). - - The `decoder` is a building block of the VAE decoder, and is just the "decoder" part of a - transformer with cross-attention. For the same reasoning behind the naming of `encoder`, we make - this term agnostic to the option to use a variational objective for training. - - """ - - def __init__( - self, backbones, transformer, vae_encoder, state_dim, action_dim, horizon, camera_names, use_vae - ): - """Initializes the model. 
- Parameters: - backbones: torch module of the backbone to be used. See backbone.py - transformer: torch module of the transformer architecture. See transformer.py - state_dim: robot state dimension of the environment - horizon: number of object queries, ie detection slot. This is the maximal number of objects - DETR can detect in a single image. For COCO, we recommend 100 queries. - - Args: - state_dim: Robot positional state dimension. - action_dim: Action dimension. - horizon: The number of actions to generate in one forward pass. - use_vae: Whether to use the variational objective. TODO(now): Give more details. - """ - super().__init__() - - self.camera_names = camera_names - self.transformer = transformer - self.vae_encoder = vae_encoder - self.use_vae = use_vae - hidden_dim = transformer.d_model - - # BERT style VAE encoder with input [cls, *joint_space_configuration, *action_sequence]. - # The cls token forms parameters of the latent's distribution (like this [*means, *log_variances]). - if use_vae: - self.cls_embed = nn.Embedding(1, hidden_dim) - # Projection layer for joint-space configuration to hidden dimension. - self.vae_encoder_robot_state_input_proj = nn.Linear(state_dim, hidden_dim) - # Projection layer for action (joint-space target) to hidden dimension. - self.vae_encoder_action_input_proj = nn.Linear(state_dim, hidden_dim) - # Final size of latent z. TODO(now): Add to hyperparams. - self.latent_dim = 32 - # Projection layer from the VAE encoder's output to the latent distribution's parameter space. - self.vae_encoder_latent_output_proj = nn.Linear(hidden_dim, self.latent_dim * 2) - # Fixed sinusoidal positional embedding the whole input to the VAE encoder. - self.register_buffer( - "vae_encoder_pos_enc", get_sinusoid_encoding_table(1 + 1 + horizon, hidden_dim) - ) - - # Transformer encoder input projections. The tokens will be structured like - # [latent, robot_state, image_feature_map_pixels]. - self.backbones = nn.ModuleList(backbones) - self.encoder_img_feat_input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1) - self.encoder_robot_state_input_proj = nn.Linear(state_dim, hidden_dim) - self.encoder_latent_input_proj = nn.Linear(self.latent_dim, hidden_dim) - # TODO(now): Fix this nonsense. One positional embedding is needed. We should extract the image - # feature dimension with a dry run. - self.additional_pos_embed = nn.Embedding( - 2, hidden_dim - ) # learned position embedding for proprio and latent - - # Transformer decoder. - # Learnable positional embedding for the transformer's decoder (in the style of DETR object queries). - self.decoder_pos_embed = nn.Embedding(horizon, hidden_dim) - # Final action regression head on the output of the transformer's decoder. - self.action_head = nn.Linear(hidden_dim, action_dim) - - def forward(self, robot_state, image, actions=None): - """ - Args: - robot_state: (B, J) batch of robot joint configurations. - image: (B, N, C, H, W) batch of N camera frames. - actions: (B, S, A) batch of actions from the target dataset which must be provided if the - VAE is enabled and the model is in training mode. - """ - if self.use_vae and self.training: - assert ( - actions is not None - ), "actions must be provided when using the variational objective in training mode." - - batch_size, _ = robot_state.shape - - # Prepare the latent for input to the transformer. - if self.use_vae and actions is not None: - # Prepare the input to the VAE encoder: [cls, *joint_space_configuration, *action_sequence]. 
- cls_embed = einops.repeat(self.cls_embed.weight, "1 d -> b 1 d", b=batch_size) # (B, 1, D) - robot_state_embed = self.vae_encoder_robot_state_input_proj(robot_state).unsqueeze(1) # (B, 1, D) - action_embed = self.vae_encoder_action_input_proj(actions) # (B, S, D) - vae_encoder_input = torch.cat([cls_embed, robot_state_embed, action_embed], axis=1) # (B, S+2, D) - # Note: detach() shouldn't be necessary but leaving it the same as the original code just in case. - # Prepare fixed positional embedding. - pos_embed = self.vae_encoder_pos_enc.clone().detach() # (1, S+2, D) - # Forward pass through VAE encoder and sample the latent with the reparameterization trick. - cls_token_out = self.vae_encoder( - vae_encoder_input.permute(1, 0, 2), pos=pos_embed.permute(1, 0, 2) - )[0] # (B, D) - latent_pdf_params = self.vae_encoder_latent_output_proj(cls_token_out) - mu = latent_pdf_params[:, : self.latent_dim] - logvar = latent_pdf_params[:, self.latent_dim :] - # Use reparameterization trick to sample from the latent's PDF. - latent_sample = mu + logvar.div(2).exp() * torch.randn_like(mu) - else: - # When not using the VAE encoder, we set the latent to be all zeros. - mu = logvar = None - latent_sample = torch.zeros([batch_size, self.latent_dim], dtype=robot_state.dtype).to( - robot_state.device - ) - - # Prepare all other transformer inputs. - # Image observation features and position embeddings. - all_cam_features = [] - all_cam_pos = [] - for cam_id, _ in enumerate(self.camera_names): - # TODO(now): remove the positional embedding from the backbones. - cam_features, pos = self.backbones[0](image[:, cam_id]) # HARDCODED - cam_features = cam_features[0] # take the last layer feature - pos = pos[0] - cam_features = self.encoder_img_feat_input_proj(cam_features) # (B, C, h, w) - all_cam_features.append(cam_features) - all_cam_pos.append(pos) - # Concatenate image observation feature maps along the width dimension. - transformer_input = torch.cat(all_cam_features, axis=3) - # TODO(now): remove the positional embedding from the backbones. - pos = torch.cat(all_cam_pos, axis=3) - robot_state_embed = self.encoder_robot_state_input_proj(robot_state) - latent_embed = self.encoder_latent_input_proj(latent_sample) - - # TODO(now): Explain all of this madness. - transformer_input = torch.cat( - [ - torch.stack([latent_embed, robot_state_embed], axis=0), - transformer_input.flatten(2).permute(2, 0, 1), - ] - ) - pos_embed = torch.cat( - [self.additional_pos_embed.weight.unsqueeze(1), pos.flatten(2).permute(2, 0, 1)], axis=0 - ) - - # Run the transformer and project the outputs to the action space. 
- transformer_output = self.transformer( - transformer_input, - encoder_pos=pos_embed, - decoder_pos=self.decoder_pos_embed.weight.unsqueeze(1), - ).transpose(0, 1) # back to (B, S, C) - actions = self.action_head(transformer_output) - return actions, [mu, logvar] - - -def build(args): - # From state - # backbone = None # from state for now, no need for conv nets - # From image - backbones = [] - backbone = build_backbone(args) - backbones.append(backbone) - - transformer = Transformer( - d_model=args.hidden_dim, - dropout=args.dropout, - nhead=args.nheads, - dim_feedforward=args.dim_feedforward, - num_encoder_layers=args.enc_layers, - num_decoder_layers=args.dec_layers, - normalize_before=args.pre_norm, - ) - - # TODO(now): args.enc_layers shouldn't be shared with the transformer decoder - vae_encoder = TransformerEncoder( - num_layers=args.enc_layers, - d_model=args.hidden_dim, - nhead=args.nheads, - dim_feedforward=args.dim_feedforward, - dropout=args.dropout, - activation="relu", - normalize_before=args.pre_norm, - ) - - model = ActionChunkingTransformer( - backbones, - transformer, - vae_encoder, - state_dim=args.state_dim, - action_dim=args.action_dim, - horizon=args.num_queries, - camera_names=args.camera_names, - use_vae=args.vae, - ) - - n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) - print("number of parameters: {:.2f}M".format(n_parameters / 1e6)) - - return model diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index 7d24620a..906ea0cd 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -1,50 +1,32 @@ -import logging -import time +"""Action Chunking Transformer Policy +As per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (https://arxiv.org/abs/2304.13705). +""" + +import logging +import math +import time +from itertools import chain +from typing import Callable, Optional + +import einops +import numpy as np import torch import torch.nn.functional as F # noqa: N812 +import torchvision import torchvision.transforms as transforms +from torch import Tensor, nn +from torchvision.models._utils import IntermediateLayerGetter +from torchvision.ops.misc import FrozenBatchNorm2d from lerobot.common.policies.abstract import AbstractPolicy -from lerobot.common.policies.act.detr_vae import build from lerobot.common.utils import get_safe_torch_device -def build_act_model_and_optimizer(cfg): - model = build(cfg) - - param_dicts = [ - {"params": [p for n, p in model.named_parameters() if "backbone" not in n and p.requires_grad]}, - { - "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad], - "lr": cfg.lr_backbone, - }, - ] - optimizer = torch.optim.AdamW(param_dicts, lr=cfg.lr, weight_decay=cfg.weight_decay) - - return model, optimizer - - -def kl_divergence(mu, logvar): - batch_size = mu.size(0) - assert batch_size != 0 - if mu.data.ndimension() == 4: - mu = mu.view(mu.size(0), mu.size(1)) - if logvar.data.ndimension() == 4: - logvar = logvar.view(logvar.size(0), logvar.size(1)) - - klds = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()) - total_kld = klds.sum(1).mean(0, True) - dimension_wise_kld = klds.mean(0) - mean_kld = klds.mean(1).mean(0, True) - - return total_kld, dimension_wise_kld, mean_kld - - class ActionChunkingTransformerPolicy(AbstractPolicy): """ - Action Chunking Transformer as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware - (https://arxiv.org/abs/2304.13705). 
+ Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost + Hardware (https://arxiv.org/abs/2304.13705). """ name = "act" @@ -68,7 +50,35 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): self.cfg = cfg self.n_action_steps = n_action_steps self.device = get_safe_torch_device(device) - self.model, self.optimizer = build_act_model_and_optimizer(cfg) + + self.model = ActionChunkingTransformer( + cfg, + state_dim=cfg.state_dim, + action_dim=cfg.action_dim, + horizon=cfg.horizon, + camera_names=cfg.camera_names, + use_vae=cfg.vae, + ) + + optimizer_params_dicts = [ + { + "params": [ + p + for n, p in self.model.named_parameters() + if not n.startswith("backbone") and p.requires_grad + ] + }, + { + "params": [ + p + for n, p in self.model.named_parameters() + if n.startswith("backbone") and p.requires_grad + ], + "lr": cfg.lr_backbone, + }, + ] + self.optimizer = torch.optim.AdamW(optimizer_params_dicts, lr=cfg.lr, weight_decay=cfg.weight_decay) + self.kl_weight = self.cfg.kl_weight logging.info(f"KL Weight {self.kl_weight}") self.to(self.device) @@ -140,12 +150,10 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): self.optimizer.step() self.optimizer.zero_grad() - # self.lr_scheduler.step() info = { "loss": loss.item(), "grad_norm": float(grad_norm), - # "lr": self.lr_scheduler.get_last_lr()[0], "lr": self.cfg.lr, "data_s": data_s, "update_s": time.time() - start_time, @@ -213,31 +221,495 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): action = action[: self.n_action_steps] return action - def _forward(self, qpos, image, actions=None, is_pad=None): - env_state = None + def _forward(self, qpos, image, actions=None): normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) image = normalize(image) is_training = actions is not None if is_training: # training time - actions = actions[:, : self.model.num_queries] - if is_pad is not None: - is_pad = is_pad[:, : self.model.num_queries] + actions = actions[:, : self.model.horizon] - a_hat, (mu, logvar) = self.model(qpos, image, env_state, actions, is_pad) + a_hat, (mu, log_sigma_x2) = self.model(qpos, image, actions) all_l1 = F.l1_loss(actions, a_hat, reduction="none") - l1 = all_l1.mean() if is_pad is None else (all_l1 * ~is_pad.unsqueeze(-1)).mean() + l1 = all_l1.mean() loss_dict = {} loss_dict["l1"] = l1 if self.cfg.vae: - total_kld, dim_wise_kld, mean_kld = kl_divergence(mu, logvar) - loss_dict["kl"] = total_kld[0] + # Calculate Dₖₗ(latent_pdf || standard_normal). Note: After computing the KL-divergence for + # each dimension independently, we sum over the latent dimension to get the total + # KL-divergence per batch element, then take the mean over the batch. + # (See App. B of https://arxiv.org/abs/1312.6114 for more details). 
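            # Closed form being computed below (sketch, not part of the patch): for a diagonal
            # Gaussian q = N(mu, sigma^2) against the standard normal p = N(0, I), per latent
            # dimension,
            #     KL(q || p) = -0.5 * (1 + log(sigma^2) - mu^2 - sigma^2)
            # and with log_sigma_x2 = log(sigma^2) this is exactly the expression on the next line,
            # summed over the latent dimension and averaged over the batch.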
+ mean_kld = (-0.5 * (1 + log_sigma_x2 - mu.pow(2) - (log_sigma_x2).exp())).sum(-1).mean() + loss_dict["kl"] = mean_kld loss_dict["loss"] = loss_dict["l1"] + loss_dict["kl"] * self.kl_weight else: loss_dict["loss"] = loss_dict["l1"] return loss_dict else: - action, _ = self.model(qpos, image, env_state) # no action, sample from prior + action, _ = self.model(qpos, image) # no action, sample from prior return action + + +def create_sinusoidal_position_embedding(n_position, d_hid): + def get_position_angle_vec(position): + return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] + + sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + + return torch.FloatTensor(sinusoid_table).unsqueeze(0) + + +# TODO(alexander-soare) move all this code into the policy when we have the policy API established. +class ActionChunkingTransformer(nn.Module): + """ + Action Chunking Transformer as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware + (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act) + + Note: In this code we use the terms `vae_encoder`, 'encoder', `decoder`. The meanings are as follows. + - The `vae_encoder` is, as per the literature around variational auto-encoders (VAE), the part of the + model that encodes the target data (a sequence of actions), and the condition (the robot + joint-space). + - A transformer with an `encoder` (not the VAE encoder) and `decoder` (not the VAE decoder) with + cross-attention is used as the VAE decoder. For these terms, we drop the `vae_` prefix because we + have an option to train this model without the variational objective (in which case we drop the + `vae_encoder` altogether, and nothing about this model has anything to do with a VAE). + + Transformer + Used alone for inference + (acts as VAE decoder + during training) + ┌───────────────────────┐ + │ Outputs │ + │ ▲ │ + │ ┌─────►┌───────┐ │ + ┌──────┐ │ │ │Transf.│ │ + │ │ │ ├─────►│decoder│ │ + ┌────┴────┐ │ │ │ │ │ │ + │ │ │ │ ┌───┴───┬─►│ │ │ + │ VAE │ │ │ │ │ └───────┘ │ + │ encoder │ │ │ │Transf.│ │ + │ │ │ │ │encoder│ │ + └───▲─────┘ │ │ │ │ │ + │ │ │ └───▲───┘ │ + │ │ │ │ │ + inputs └─────┼─────┘ │ + │ │ + └───────────────────────┘ + """ + + def __init__(self, args, state_dim, action_dim, horizon, camera_names, use_vae): + """Initializes the model. + Parameters: + state_dim: robot state dimension of the environment + horizon: number of object queries, ie detection slot. This is the maximal number of objects + DETR can detect in a single image. For COCO, we recommend 100 queries. + + Args: + state_dim: Robot positional state dimension. + action_dim: Action dimension. + horizon: The number of actions to generate in one forward pass. + use_vae: Whether to use the variational objective. TODO(now): Give more details. + """ + super().__init__() + + self.camera_names = camera_names + self.use_vae = use_vae + self.horizon = horizon + self.hidden_dim = args.hidden_dim + + transformer_common_kwargs = dict( # noqa: C408 + d_model=self.hidden_dim, + nhead=args.nheads, + dim_feedforward=args.dim_feedforward, + dropout=args.dropout, + activation=args.activation, + normalize_before=args.pre_norm, + ) + + # BERT style VAE encoder with input [cls, *joint_space_configuration, *action_sequence]. 
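        # For example (illustrative only): with horizon=100 the VAE encoder sequence is
        # 1 (cls) + 1 (robot state) + 100 (actions) = 102 tokens of dim hidden_dim, matching the
        # (1 + 1 + horizon)-long sinusoidal table registered below as `vae_encoder_pos_enc`.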
+ # The cls token forms parameters of the latent's distribution (like this [*means, *log_variances]). + if use_vae: + # TODO(now): args.enc_layers shouldn't be shared with the transformer decoder + self.vae_encoder = TransformerEncoder(num_layers=args.enc_layers, **transformer_common_kwargs) + self.cls_embed = nn.Embedding(1, self.hidden_dim) + # Projection layer for joint-space configuration to hidden dimension. + self.vae_encoder_robot_state_input_proj = nn.Linear(state_dim, self.hidden_dim) + # Projection layer for action (joint-space target) to hidden dimension. + self.vae_encoder_action_input_proj = nn.Linear(state_dim, self.hidden_dim) + # Final size of latent z. TODO(now): Add to hyperparams. + self.latent_dim = 32 + # Projection layer from the VAE encoder's output to the latent distribution's parameter space. + self.vae_encoder_latent_output_proj = nn.Linear(self.hidden_dim, self.latent_dim * 2) + # Fixed sinusoidal positional embedding the whole input to the VAE encoder. + self.register_buffer( + "vae_encoder_pos_enc", create_sinusoidal_position_embedding(1 + 1 + horizon, self.hidden_dim) + ) + + # Backbone for image feature extraction. + self.backbone_position_embedding = SinusoidalPositionEmbedding2D(self.hidden_dim // 2) + backbone_model = getattr(torchvision.models, args.backbone)( + replace_stride_with_dilation=[False, False, args.dilation], + pretrained=True, # TODO(now): Add pretrained option + norm_layer=FrozenBatchNorm2d, + ) + # Note: The forward method of this returns a dict: {"feature_map": output}. + self.backbone = IntermediateLayerGetter(backbone_model, return_layers={"layer4": "feature_map"}) + + # Transformer (acts as VAE decoder when training with the variational objective). + self.encoder = TransformerEncoder(num_layers=args.enc_layers, **transformer_common_kwargs) + self.decoder = TransformerDecoder(num_layers=args.dec_layers, **transformer_common_kwargs) + + # Transformer encoder input projections. The tokens will be structured like + # [latent, robot_state, image_feature_map_pixels]. + self.encoder_img_feat_input_proj = nn.Conv2d( + backbone_model.fc.in_features, self.hidden_dim, kernel_size=1 + ) + self.encoder_robot_state_input_proj = nn.Linear(state_dim, self.hidden_dim) + self.encoder_latent_input_proj = nn.Linear(self.latent_dim, self.hidden_dim) + # TODO(now): Fix this nonsense. One positional embedding is needed. We should extract the image + # feature dimension with a dry run. + self.additional_pos_embed = nn.Embedding( + 2, self.hidden_dim + ) # learned position embedding for proprio and latent + + # Transformer decoder. + # Learnable positional embedding for the transformer's decoder (in the style of DETR object queries). + self.decoder_pos_embed_embed = nn.Embedding(horizon, self.hidden_dim) + # Final action regression head on the output of the transformer's decoder. + self.action_head = nn.Linear(self.hidden_dim, action_dim) + + self._reset_parameters() + + def _reset_parameters(self): + """Xavier-uniform initialization of the transformer parameters as in the original code.""" + for p in chain(self.encoder.parameters(), self.decoder.parameters()): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, robot_state, image, actions=None): + """ + Args: + robot_state: (B, J) batch of robot joint configurations. + image: (B, N, C, H, W) batch of N camera frames. + actions: (B, S, A) batch of actions from the target dataset which must be provided if the + VAE is enabled and the model is in training mode. 
+ """ + if self.use_vae and self.training: + assert ( + actions is not None + ), "actions must be provided when using the variational objective in training mode." + + batch_size, _ = robot_state.shape + + # Prepare the latent for input to the transformer. + if self.use_vae and actions is not None: + # Prepare the input to the VAE encoder: [cls, *joint_space_configuration, *action_sequence]. + cls_embed = einops.repeat(self.cls_embed.weight, "1 d -> b 1 d", b=batch_size) # (B, 1, D) + robot_state_embed = self.vae_encoder_robot_state_input_proj(robot_state).unsqueeze(1) # (B, 1, D) + action_embed = self.vae_encoder_action_input_proj(actions) # (B, S, D) + vae_encoder_input = torch.cat([cls_embed, robot_state_embed, action_embed], axis=1) # (B, S+2, D) + # Note: detach() shouldn't be necessary but leaving it the same as the original code just in case. + # Prepare fixed positional embedding. + pos_embed = self.vae_encoder_pos_enc.clone().detach() # (1, S+2, D) + # Forward pass through VAE encoder and sample the latent with the reparameterization trick. + cls_token_out = self.vae_encoder( + vae_encoder_input.permute(1, 0, 2), pos=pos_embed.permute(1, 0, 2) + )[0] # (B, D) + latent_pdf_params = self.vae_encoder_latent_output_proj(cls_token_out) + mu = latent_pdf_params[:, : self.latent_dim] + # This is 2log(sigma). Done this way to match the original implementation. + log_sigma_x2 = latent_pdf_params[:, self.latent_dim :] + # Use reparameterization trick to sample from the latent's PDF. + latent_sample = mu + log_sigma_x2.div(2).exp() * torch.randn_like(mu) + else: + # When not using the VAE encoder, we set the latent to be all zeros. + mu = log_sigma_x2 = None + latent_sample = torch.zeros([batch_size, self.latent_dim], dtype=torch.float32).to( + robot_state.device + ) + + # Prepare all other transformer inputs. + # Image observation features and position embeddings. + all_cam_features = [] + all_cam_pos = [] + for cam_id, _ in enumerate(self.camera_names): + cam_features = self.backbone(image[:, cam_id])["feature_map"] + pos = self.backbone_position_embedding(cam_features).to(dtype=cam_features.dtype) + cam_features = self.encoder_img_feat_input_proj(cam_features) # (B, C, h, w) + all_cam_features.append(cam_features) + all_cam_pos.append(pos) + # Concatenate image observation feature maps along the width dimension. + encoder_in = torch.cat(all_cam_features, axis=3) + pos = torch.cat(all_cam_pos, axis=3) + robot_state_embed = self.encoder_robot_state_input_proj(robot_state) + latent_embed = self.encoder_latent_input_proj(latent_sample) + + # TODO(now): Explain all of this madness. 
+ encoder_in = torch.cat( + [ + torch.stack([latent_embed, robot_state_embed], axis=0), + encoder_in.flatten(2).permute(2, 0, 1), + ] + ) + pos_embed = torch.cat( + [self.additional_pos_embed.weight.unsqueeze(1), pos.flatten(2).permute(2, 0, 1)], axis=0 + ) + + encoder_out = self.encoder(encoder_in, pos=pos_embed) + decoder_in = torch.zeros( + (self.horizon, batch_size, self.hidden_dim), dtype=pos_embed.dtype, device=pos_embed.device + ) + decoder_out = self.decoder( + decoder_in, + encoder_out, + encoder_pos_embed=pos_embed, + decoder_pos_embed=self.decoder_pos_embed_embed.weight.unsqueeze(1), + ).transpose(0, 1) # back to (B, S, C) + + actions = self.action_head(decoder_out) + return actions, [mu, log_sigma_x2] + + +class TransformerEncoder(nn.Module): + def __init__( + self, + num_layers, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.layers = nn.ModuleList( + [ + TransformerEncoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + for _ in range(num_layers) + ] + ) + self.norm = nn.LayerNorm(d_model) if normalize_before else nn.Identity() + + def forward(self, x, pos: Optional[Tensor] = None): + for layer in self.layers: + x = layer(x, pos=pos) + x = self.norm(x) + return x + + +class TransformerEncoderLayer(nn.Module): + def __init__( + self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def forward(self, x, pos_embed: Optional[Tensor] = None): + skip = x + if self.normalize_before: + x = self.norm1(x) + q = k = x if pos_embed is None else x + pos_embed + x = self.self_attn(q, k, value=x)[0] + x = skip + self.dropout1(x) + if self.normalize_before: + skip = x + x = self.norm2(x) + else: + x = self.norm1(x) + skip = x + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + x = skip + self.dropout2(x) + if not self.normalize_before: + x = self.norm2(x) + return x + + +class TransformerDecoder(nn.Module): + def __init__( + self, + num_layers, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.layers = nn.ModuleList( + [ + TransformerDecoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + for _ in range(num_layers) + ] + ) + self.num_layers = num_layers + self.norm = nn.LayerNorm(d_model) + + def forward( + self, x, encoder_out, decoder_pos_embed: Tensor | None = None, encoder_pos_embed: Tensor | None = None + ): + for layer in self.layers: + x = layer( + x, encoder_out, decoder_pos_embed=decoder_pos_embed, encoder_pos_embed=encoder_pos_embed + ) + if self.norm is not None: + x = self.norm(x) + return x + + +class TransformerDecoderLayer(nn.Module): + def __init__( + self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, 
dropout=dropout)
+        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.dropout3 = nn.Dropout(dropout)
+
+        self.activation = _get_activation_fn(activation)
+        self.normalize_before = normalize_before
+
+    def maybe_add_pos_embed(self, tensor: Tensor, pos_embed: Tensor | None) -> Tensor:
+        return tensor if pos_embed is None else tensor + pos_embed
+
+    def forward(
+        self,
+        x: Tensor,
+        encoder_out: Tensor,
+        decoder_pos_embed: Tensor | None = None,
+        encoder_pos_embed: Tensor | None = None,
+    ) -> Tensor:
+        """
+        Args:
+            x: (Decoder Sequence, Batch, Channel) tensor of input tokens.
+            encoder_out: (Encoder Sequence, B, C) output features from the last layer of the encoder we are
+                cross-attending with.
+            decoder_pos_embed: (DS, 1, C) positional embedding for the queries (from the decoder).
+            encoder_pos_embed: (ES, 1, C) positional embedding for the keys (from the encoder).
+        Returns:
+            (DS, B, C) tensor of decoder output features.
+        """
+        skip = x
+        if self.normalize_before:
+            x = self.norm1(x)
+        q = k = self.maybe_add_pos_embed(x, decoder_pos_embed)
+        x = self.self_attn(q, k, value=x)[0]
+        x = skip + self.dropout1(x)
+        if self.normalize_before:
+            skip = x
+            x = self.norm2(x)
+        else:
+            x = self.norm1(x)
+            skip = x
+        x = self.multihead_attn(
+            query=self.maybe_add_pos_embed(x, decoder_pos_embed),
+            key=self.maybe_add_pos_embed(encoder_out, encoder_pos_embed),
+            value=encoder_out,
+        )[0]
+        x = skip + self.dropout2(x)
+        if self.normalize_before:
+            skip = x
+            x = self.norm3(x)
+        else:
+            x = self.norm2(x)
+            skip = x
+        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
+        x = skip + self.dropout3(x)
+        if not self.normalize_before:
+            x = self.norm3(x)
+        return x
+
+
+class SinusoidalPositionEmbedding2D(nn.Module):
+    """Sinusoidal positional embeddings similar to what's presented in Attention Is All You Need.
+
+    The variation is that the position indices are normalized in [0, 2π] (not quite: the lower bound is 1/H
+    for the vertical direction, and 1/W for the horizontal direction.
+    """
+
+    def __init__(self, dimension: int):
+        """
+        Args:
+            dimension: The desired dimension of the embeddings.
+        """
+        super().__init__()
+        self.dimension = dimension
+        self._two_pi = 2 * math.pi
+        self._eps = 1e-6
+        # Inverse "common ratio" for the geometric progression in sinusoid frequencies.
+        self._temperature = 10000
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x: A (B, C, H, W) batch of 2D feature maps to generate the embeddings for.
+        Returns:
+            A (1, C, H, W) batch of corresponding sinusoidal positional embeddings.
+        """
+        not_mask = torch.ones_like(x[0, [0]])  # (1, H, W)
+        # Note: These are like range(1, H+1) and range(1, W+1) respectively, but in most implementations
+        # they would be range(0, H) and range(0, W). Keeping it this way to match the original code.
+        y_range = not_mask.cumsum(1, dtype=torch.float32)
+        x_range = not_mask.cumsum(2, dtype=torch.float32)
+
+        # "Normalize" the position index such that it ranges in [0, 2π].
+        # Note: Adding epsilon on the denominator should not be needed as all values of y_range and x_range
+        # are non-zero by construction. 
This is an artifact of the original code. + y_range = y_range / (y_range[:, -1:, :] + self._eps) * self._two_pi + x_range = x_range / (x_range[:, :, -1:] + self._eps) * self._two_pi + + inverse_frequency = self._temperature ** ( + 2 * (torch.arange(self.dimension, dtype=torch.float32, device=x.device) // 2) / self.dimension + ) + + x_range = x_range.unsqueeze(-1) / inverse_frequency # (1, H, W, 1) + y_range = y_range.unsqueeze(-1) / inverse_frequency # (1, H, W, 1) + + # Note: this stack then flatten operation results in interleaved sine and cosine terms. + # pos_embed_x and pos_embed are (1, H, W, C // 2). + pos_embed_x = torch.stack((x_range[..., 0::2].sin(), x_range[..., 1::2].cos()), dim=-1).flatten(3) + pos_embed_y = torch.stack((y_range[..., 0::2].sin(), y_range[..., 1::2].cos()), dim=-1).flatten(3) + pos_embed = torch.cat((pos_embed_y, pos_embed_x), dim=3).permute(0, 3, 1, 2) # (1, C, H, W) + + return pos_embed + + +def _get_activation_fn(activation: str) -> Callable: + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.") diff --git a/lerobot/common/policies/act/position_encoding.py b/lerobot/common/policies/act/position_encoding.py deleted file mode 100644 index 63bb4840..00000000 --- a/lerobot/common/policies/act/position_encoding.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -Various positional encodings for the transformer. -""" - -import math - -import torch -from torch import nn - -from .utils import NestedTensor - - -class PositionEmbeddingSine(nn.Module): - """ - This is a more standard version of the position embedding, very similar to the one - used by the Attention is all you need paper, generalized to work on images. - """ - - def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): - super().__init__() - self.num_pos_feats = num_pos_feats - self.temperature = temperature - self.normalize = normalize - if scale is not None and normalize is False: - raise ValueError("normalize should be True if scale is passed") - if scale is None: - scale = 2 * math.pi - self.scale = scale - - def forward(self, tensor): - x = tensor - # mask = tensor_list.mask - # assert mask is not None - # not_mask = ~mask - - not_mask = torch.ones_like(x[0, [0]]) - y_embed = not_mask.cumsum(1, dtype=torch.float32) - x_embed = not_mask.cumsum(2, dtype=torch.float32) - if self.normalize: - eps = 1e-6 - y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale - - dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) - dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) - - pos_x = x_embed[:, :, :, None] / dim_t - pos_y = y_embed[:, :, :, None] / dim_t - pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) - pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) - pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) - return pos - - -class PositionEmbeddingLearned(nn.Module): - """ - Absolute pos embedding, learned. 
- """ - - def __init__(self, num_pos_feats=256): - super().__init__() - self.row_embed = nn.Embedding(50, num_pos_feats) - self.col_embed = nn.Embedding(50, num_pos_feats) - self.reset_parameters() - - def reset_parameters(self): - nn.init.uniform_(self.row_embed.weight) - nn.init.uniform_(self.col_embed.weight) - - def forward(self, tensor_list: NestedTensor): - x = tensor_list.tensors - h, w = x.shape[-2:] - i = torch.arange(w, device=x.device) - j = torch.arange(h, device=x.device) - x_emb = self.col_embed(i) - y_emb = self.row_embed(j) - pos = ( - torch.cat( - [ - x_emb.unsqueeze(0).repeat(h, 1, 1), - y_emb.unsqueeze(1).repeat(1, w, 1), - ], - dim=-1, - ) - .permute(2, 0, 1) - .unsqueeze(0) - .repeat(x.shape[0], 1, 1, 1) - ) - return pos - - -def build_position_encoding(args): - n_steps = args.hidden_dim // 2 - if args.position_embedding in ("v2", "sine"): - # TODO find a better way of exposing other arguments - position_embedding = PositionEmbeddingSine(n_steps, normalize=True) - elif args.position_embedding in ("v3", "learned"): - position_embedding = PositionEmbeddingLearned(n_steps) - else: - raise ValueError(f"not supported {args.position_embedding}") - - return position_embedding diff --git a/lerobot/common/policies/act/transformer.py b/lerobot/common/policies/act/transformer.py deleted file mode 100644 index 7e71f3ea..00000000 --- a/lerobot/common/policies/act/transformer.py +++ /dev/null @@ -1,240 +0,0 @@ -""" -TODO(now) -""" - -from typing import Optional - -import torch -import torch.nn.functional as F # noqa: N812 -from torch import Tensor, nn - - -class Transformer(nn.Module): - def __init__( - self, - d_model=512, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=6, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - normalize_before=False, - ): - super().__init__() - self.encoder = TransformerEncoder( - num_encoder_layers, d_model, nhead, dim_feedforward, dropout, activation, normalize_before - ) - self.decoder = TransformerDecoder( - num_decoder_layers, d_model, nhead, dim_feedforward, dropout, activation, normalize_before - ) - self.d_model = d_model - self.nhead = nhead - self._init_params() # TODO(now): move to somewhere common - - def _init_params(self): - for p in self.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - - def forward(self, x, encoder_pos, decoder_pos): - """ - Args: - x: ((E)ncoder (S)equence, (B)atch, (C)hannels) - decoder_pos: (Decoder Sequence, C) tensor for the decoder's positional embedding. 
- encoder_pos: (ES, C) tenso - """ - # TODO flatten only when input has H and W - bs = x.shape[1] - - encoder_out = self.encoder(x, pos=encoder_pos) - decoder_in = torch.zeros( - (decoder_pos.shape[0], bs, decoder_pos.shape[2]), - dtype=decoder_pos.dtype, - device=decoder_pos.device, - ) - decoder_out = self.decoder(decoder_in, encoder_out, encoder_pos=encoder_pos, decoder_pos=decoder_pos) - return decoder_out - - -class TransformerEncoder(nn.Module): - def __init__( - self, - num_layers, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - normalize_before=False, - ): - super().__init__() - self.layers = nn.ModuleList( - [ - TransformerEncoderLayer( - d_model, nhead, dim_feedforward, dropout, activation, normalize_before - ) - for _ in range(num_layers) - ] - ) - self.norm = nn.LayerNorm(d_model) if normalize_before else nn.Identity() - - def forward(self, x, pos: Optional[Tensor] = None): - for layer in self.layers: - x = layer(x, pos=pos) - x = self.norm(x) - return x - - -class TransformerEncoderLayer(nn.Module): - def __init__( - self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False - ): - super().__init__() - self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) - - self.activation = _get_activation_fn(activation) - self.normalize_before = normalize_before - - def forward(self, x, pos: Optional[Tensor] = None): - skip = x - if self.normalize_before: - x = self.norm1(x) - q = k = x if pos is None else x + pos - x = self.self_attn(q, k, value=x)[0] - x = skip + self.dropout1(x) - if self.normalize_before: - skip = x - x = self.norm2(x) - else: - x = self.norm1(x) - skip = x - x = self.linear2(self.dropout(self.activation(self.linear1(x)))) - x = skip + self.dropout2(x) - if not self.normalize_before: - x = self.norm2(x) - return x - - -class TransformerDecoder(nn.Module): - def __init__( - self, - num_layers, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - normalize_before=False, - ): - super().__init__() - self.layers = nn.ModuleList( - [ - TransformerDecoderLayer( - d_model, nhead, dim_feedforward, dropout, activation, normalize_before - ) - for _ in range(num_layers) - ] - ) - self.num_layers = num_layers - self.norm = nn.LayerNorm(d_model) - - def forward(self, x, encoder_out, decoder_pos: Tensor | None = None, encoder_pos: Tensor | None = None): - for layer in self.layers: - x = layer(x, encoder_out, decoder_pos=decoder_pos, encoder_pos=encoder_pos) - if self.norm is not None: - x = self.norm(x) - return x - - -class TransformerDecoderLayer(nn.Module): - def __init__( - self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False - ): - super().__init__() - self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.norm3 = 
nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) - self.dropout3 = nn.Dropout(dropout) - - self.activation = _get_activation_fn(activation) - self.normalize_before = normalize_before - - def maybe_add_pos_embed(self, tensor: Tensor, pos: Tensor | None) -> Tensor: - return tensor if pos is None else tensor + pos - - def forward( - self, - x: Tensor, - encoder_out: Tensor, - decoder_pos: Tensor | None = None, - encoder_pos: Tensor | None = None, - ) -> Tensor: - """ - Args: - x: (Decoder Sequence, Batch, Channel) tensor of input tokens. - encoder_out: (Encoder Sequence, B, C) output features from the last layer of the encoder we are - cross-attending with. - decoder_pos: (ES, 1, C) positional embedding for keys (from the encoder). - encoder_pos: (DS, 1, C) Positional_embedding for the queries (from the decoder). - Returns: - (DS, B, C) tensor of decoder output features. - """ - skip = x - if self.normalize_before: - x = self.norm1(x) - q = k = self.maybe_add_pos_embed(x, decoder_pos) - x = self.self_attn(q, k, value=x)[0] - x = skip + self.dropout1(x) - if self.normalize_before: - skip = x - x = self.norm2(x) - else: - x = self.norm1(x) - skip = x - x = self.multihead_attn( - query=self.maybe_add_pos_embed(x, decoder_pos), - key=self.maybe_add_pos_embed(encoder_out, encoder_pos), - value=encoder_out, - )[0] - x = skip + self.dropout2(x) - if self.normalize_before: - skip = x - x = self.norm3(x) - else: - x = self.norm2(x) - skip = x - x = self.linear2(self.dropout(self.activation(self.linear1(x)))) - x = skip + self.dropout3(x) - if not self.normalize_before: - x = self.norm3(x) - return x - - -def _get_activation_fn(activation): - """Return an activation function given a string""" - if activation == "relu": - return F.relu - if activation == "gelu": - return F.gelu - if activation == "glu": - return F.glu - raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.") diff --git a/lerobot/common/policies/act/utils.py b/lerobot/common/policies/act/utils.py deleted file mode 100644 index 0d935839..00000000 --- a/lerobot/common/policies/act/utils.py +++ /dev/null @@ -1,478 +0,0 @@ -""" -Misc functions, including distributed helpers. - -Mostly copy-paste from torchvision references. -""" - -import datetime -import os -import pickle -import subprocess -import time -from collections import defaultdict, deque -from typing import List, Optional - -import torch -import torch.distributed as dist - -# needed due to empty tensor bug in pytorch and torchvision 0.5 -import torchvision -from packaging import version -from torch import Tensor - -if version.parse(torchvision.__version__) < version.parse("0.7"): - from torchvision.ops import _new_empty_tensor - from torchvision.ops.misc import _output_size - - -class SmoothedValue: - """Track a series of values and provide access to smoothed values over a - window or the global series average. - """ - - def __init__(self, window_size=20, fmt=None): - if fmt is None: - fmt = "{median:.4f} ({global_avg:.4f})" - self.deque = deque(maxlen=window_size) - self.total = 0.0 - self.count = 0 - self.fmt = fmt - - def update(self, value, n=1): - self.deque.append(value) - self.count += n - self.total += value * n - - def synchronize_between_processes(self): - """ - Warning: does not synchronize the deque! 
- """ - if not is_dist_avail_and_initialized(): - return - t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda") - dist.barrier() - dist.all_reduce(t) - t = t.tolist() - self.count = int(t[0]) - self.total = t[1] - - @property - def median(self): - d = torch.tensor(list(self.deque)) - return d.median().item() - - @property - def avg(self): - d = torch.tensor(list(self.deque), dtype=torch.float32) - return d.mean().item() - - @property - def global_avg(self): - return self.total / self.count - - @property - def max(self): - return max(self.deque) - - @property - def value(self): - return self.deque[-1] - - def __str__(self): - return self.fmt.format( - median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value - ) - - -def all_gather(data): - """ - Run all_gather on arbitrary picklable data (not necessarily tensors) - Args: - data: any picklable object - Returns: - list[data]: list of data gathered from each rank - """ - world_size = get_world_size() - if world_size == 1: - return [data] - - # serialized to a Tensor - buffer = pickle.dumps(data) - storage = torch.ByteStorage.from_buffer(buffer) - tensor = torch.ByteTensor(storage).to("cuda") - - # obtain Tensor size of each rank - local_size = torch.tensor([tensor.numel()], device="cuda") - size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] - dist.all_gather(size_list, local_size) - size_list = [int(size.item()) for size in size_list] - max_size = max(size_list) - - # receiving Tensor from all ranks - # we pad the tensor because torch all_gather does not support - # gathering tensors of different shapes - tensor_list = [] - for _ in size_list: - tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) - if local_size != max_size: - padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") - tensor = torch.cat((tensor, padding), dim=0) - dist.all_gather(tensor_list, tensor) - - data_list = [] - for size, tensor in zip(size_list, tensor_list, strict=False): - buffer = tensor.cpu().numpy().tobytes()[:size] - data_list.append(pickle.loads(buffer)) - - return data_list - - -def reduce_dict(input_dict, average=True): - """ - Args: - input_dict (dict): all the values will be reduced - average (bool): whether to do average or sum - Reduce the values in the dictionary from all processes so that all processes - have the averaged results. Returns a dict with the same fields as - input_dict, after reduction. 
- """ - world_size = get_world_size() - if world_size < 2: - return input_dict - with torch.no_grad(): - names = [] - values = [] - # sort the keys so that they are consistent across processes - for k in sorted(input_dict.keys()): - names.append(k) - values.append(input_dict[k]) - values = torch.stack(values, dim=0) - dist.all_reduce(values) - if average: - values /= world_size - reduced_dict = {k: v for k, v in zip(names, values, strict=False)} # noqa: C416 - return reduced_dict - - -class MetricLogger: - def __init__(self, delimiter="\t"): - self.meters = defaultdict(SmoothedValue) - self.delimiter = delimiter - - def update(self, **kwargs): - for k, v in kwargs.items(): - if isinstance(v, torch.Tensor): - v = v.item() - assert isinstance(v, (float, int)) - self.meters[k].update(v) - - def __getattr__(self, attr): - if attr in self.meters: - return self.meters[attr] - if attr in self.__dict__: - return self.__dict__[attr] - raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr)) - - def __str__(self): - loss_str = [] - for name, meter in self.meters.items(): - loss_str.append("{}: {}".format(name, str(meter))) - return self.delimiter.join(loss_str) - - def synchronize_between_processes(self): - for meter in self.meters.values(): - meter.synchronize_between_processes() - - def add_meter(self, name, meter): - self.meters[name] = meter - - def log_every(self, iterable, print_freq, header=None): - if not header: - header = "" - start_time = time.time() - end = time.time() - iter_time = SmoothedValue(fmt="{avg:.4f}") - data_time = SmoothedValue(fmt="{avg:.4f}") - space_fmt = ":" + str(len(str(len(iterable)))) + "d" - if torch.cuda.is_available(): - log_msg = self.delimiter.join( - [ - header, - "[{0" + space_fmt + "}/{1}]", - "eta: {eta}", - "{meters}", - "time: {time}", - "data: {data}", - "max mem: {memory:.0f}", - ] - ) - else: - log_msg = self.delimiter.join( - [ - header, - "[{0" + space_fmt + "}/{1}]", - "eta: {eta}", - "{meters}", - "time: {time}", - "data: {data}", - ] - ) - mega_b = 1024.0 * 1024.0 - for i, obj in enumerate(iterable): - data_time.update(time.time() - end) - yield obj - iter_time.update(time.time() - end) - if i % print_freq == 0 or i == len(iterable) - 1: - eta_seconds = iter_time.global_avg * (len(iterable) - i) - eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) - if torch.cuda.is_available(): - print( - log_msg.format( - i, - len(iterable), - eta=eta_string, - meters=str(self), - time=str(iter_time), - data=str(data_time), - memory=torch.cuda.max_memory_allocated() / mega_b, - ) - ) - else: - print( - log_msg.format( - i, - len(iterable), - eta=eta_string, - meters=str(self), - time=str(iter_time), - data=str(data_time), - ) - ) - end = time.time() - total_time = time.time() - start_time - total_time_str = str(datetime.timedelta(seconds=int(total_time))) - print("{} Total time: {} ({:.4f} s / it)".format(header, total_time_str, total_time / len(iterable))) - - -def get_sha(): - cwd = os.path.dirname(os.path.abspath(__file__)) - - def _run(command): - return subprocess.check_output(command, cwd=cwd).decode("ascii").strip() - - sha = "N/A" - diff = "clean" - branch = "N/A" - try: - sha = _run(["git", "rev-parse", "HEAD"]) - subprocess.check_output(["git", "diff"], cwd=cwd) - diff = _run(["git", "diff-index", "HEAD"]) - diff = "has uncommited changes" if diff else "clean" - branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"]) - except Exception: - pass - message = f"sha: {sha}, status: {diff}, branch: {branch}" 
- return message - - -def collate_fn(batch): - batch = list(zip(*batch, strict=False)) - batch[0] = nested_tensor_from_tensor_list(batch[0]) - return tuple(batch) - - -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes - - -class NestedTensor: - def __init__(self, tensors, mask: Optional[Tensor]): - self.tensors = tensors - self.mask = mask - - def to(self, device): - # type: (Device) -> NestedTensor # noqa - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - assert mask is not None - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) - - def decompose(self): - return self.tensors, self.mask - - def __repr__(self): - return str(self.tensors) - - -def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): - # TODO make this more general - if tensor_list[0].ndim == 3: - if torchvision._is_tracing(): - # nested_tensor_from_tensor_list() does not export well to ONNX - # call _onnx_nested_tensor_from_tensor_list() instead - return _onnx_nested_tensor_from_tensor_list(tensor_list) - - # TODO make it support different-sized images - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) - batch_shape = [len(tensor_list)] + max_size - b, c, h, w = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((b, h, w), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask, strict=False): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], : img.shape[2]] = False - else: - raise ValueError("not supported") - return NestedTensor(tensor, mask) - - -# _onnx_nested_tensor_from_tensor_list() is an implementation of -# nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
-@torch.jit.unused -def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: - max_size = [] - for i in range(tensor_list[0].dim()): - max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to( - torch.int64 - ) - max_size.append(max_size_i) - max_size = tuple(max_size) - - # work around for - # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - # m[: img.shape[1], :img.shape[2]] = False - # which is not yet supported in onnx - padded_imgs = [] - padded_masks = [] - for img in tensor_list: - padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape), strict=False)] - padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) - padded_imgs.append(padded_img) - - m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) - padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) - padded_masks.append(padded_mask.to(torch.bool)) - - tensor = torch.stack(padded_imgs) - mask = torch.stack(padded_masks) - - return NestedTensor(tensor, mask=mask) - - -def setup_for_distributed(is_master): - """ - This function disables printing when not in master process - """ - import builtins as __builtin__ - - builtin_print = __builtin__.print - - def print(*args, **kwargs): - force = kwargs.pop("force", False) - if is_master or force: - builtin_print(*args, **kwargs) - - __builtin__.print = print - - -def is_dist_avail_and_initialized(): - if not dist.is_available(): - return False - if not dist.is_initialized(): - return False - return True - - -def get_world_size(): - if not is_dist_avail_and_initialized(): - return 1 - return dist.get_world_size() - - -def get_rank(): - if not is_dist_avail_and_initialized(): - return 0 - return dist.get_rank() - - -def is_main_process(): - return get_rank() == 0 - - -def save_on_master(*args, **kwargs): - if is_main_process(): - torch.save(*args, **kwargs) - - -def init_distributed_mode(args): - if "RANK" in os.environ and "WORLD_SIZE" in os.environ: - args.rank = int(os.environ["RANK"]) - args.world_size = int(os.environ["WORLD_SIZE"]) - args.gpu = int(os.environ["LOCAL_RANK"]) - elif "SLURM_PROCID" in os.environ: - args.rank = int(os.environ["SLURM_PROCID"]) - args.gpu = args.rank % torch.cuda.device_count() - else: - print("Not using distributed mode") - args.distributed = False - return - - args.distributed = True - - torch.cuda.set_device(args.gpu) - args.dist_backend = "nccl" - print("| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True) - torch.distributed.init_process_group( - backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank - ) - torch.distributed.barrier() - setup_for_distributed(args.rank == 0) - - -@torch.no_grad() -def accuracy(output, target, topk=(1,)): - """Computes the precision@k for the specified values of k""" - if target.numel() == 0: - return [torch.zeros([], device=output.device)] - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].view(-1).float().sum(0) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - -def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): - # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor - """ - Equivalent to nn.functional.interpolate, but 
with support for empty batch sizes. - This will eventually be supported natively by PyTorch, and this - class can go away. - """ - if version.parse(torchvision.__version__) < version.parse("0.7"): - if input.numel() > 0: - return torch.nn.functional.interpolate(input, size, scale_factor, mode, align_corners) - - output_shape = _output_size(2, input, size, scale_factor) - output_shape = list(input.shape[:-2]) + list(output_shape) - return _new_empty_tensor(input, output_shape) - else: - return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml index 1086b595..22b6cd6f 100644 --- a/lerobot/configs/policy/act.yaml +++ b/lerobot/configs/policy/act.yaml @@ -33,11 +33,10 @@ policy: nheads: 8 #camera_names: [top, front_close, left_pillar, right_pillar] camera_names: [top] - position_embedding: sine - masks: false dilation: false dropout: 0.1 pre_norm: false + activation: relu vae: true diff --git a/scripts/convert_act_weights.py b/scripts/convert_act_weights.py index d0c0c3e7..c8f83422 100644 --- a/scripts/convert_act_weights.py +++ b/scripts/convert_act_weights.py @@ -11,6 +11,19 @@ policy = make_policy(cfg) state_dict = torch.load("/home/alexander/Projects/act/outputs/sim_transfer_cube_human_vae/policy_last.ckpt") +# Remove keys based on what they start with. + +start_removals = [ + # There is a bug that means the pretrained model doesn't even use the final decoder layers. + *[f"model.transformer.decoder.layers.{i}" for i in range(1, 7)], + "model.is_pad_head.", +] + +for to_remove in start_removals: + for k in list(state_dict.keys()): + if k.startswith(to_remove): + del state_dict[k] + # Replace keys based on what they start with. @@ -26,6 +39,9 @@ start_replacements = [ ("model.input_proj.", "model.encoder_img_feat_input_proj."), ("model.input_proj_robot_state", "model.encoder_robot_state_input_proj"), ("model.latent_out_proj.", "model.encoder_latent_input_proj."), + ("model.transformer.encoder.", "model.encoder."), + ("model.transformer.decoder.", "model.decoder."), + ("model.backbones.0.0.body.", "model.backbone."), ] for to_replace, replace_with in start_replacements: @@ -35,18 +51,6 @@ for to_replace, replace_with in start_replacements: state_dict[k_] = state_dict[k] del state_dict[k] -# Remove keys based on what they start with. - -start_removals = [ - # There is a bug that means the pretrained model doesn't even use the final decoder layers. - *[f"model.transformer.decoder.layers.{i}" for i in range(1, 7)], - "model.is_pad_head.", -] - -for to_remove in start_removals: - for k in list(state_dict.keys()): - if k.startswith(to_remove): - del state_dict[k] missing_keys, unexpected_keys = policy.load_state_dict(state_dict, strict=False) From edb125b35116a044574f7d406de19ee368d63583 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Fri, 5 Apr 2024 11:03:28 +0100 Subject: [PATCH 06/25] backup wip --- lerobot/common/policies/act/policy.py | 390 ++++++++++++-------------- lerobot/configs/policy/act.yaml | 9 +- scripts/convert_act_weights.py | 2 + 3 files changed, 188 insertions(+), 213 deletions(-) diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index 906ea0cd..5071c09a 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -1,13 +1,13 @@ """Action Chunking Transformer Policy As per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (https://arxiv.org/abs/2304.13705). 
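+The original implementation is available at https://github.com/tonyzhaozh/act.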
+The majority of changes here involve removing unused code, unifying naming, and adding helpful comments. """ -import logging import math import time from itertools import chain -from typing import Callable, Optional +from typing import Callable import einops import numpy as np @@ -26,40 +26,56 @@ from lerobot.common.utils import get_safe_torch_device class ActionChunkingTransformerPolicy(AbstractPolicy): """ Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost - Hardware (https://arxiv.org/abs/2304.13705). + Hardware (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act) + + Note: In this code we use the terms `vae_encoder`, 'encoder', `decoder`. The meanings are as follows. + - The `vae_encoder` is, as per the literature around variational auto-encoders (VAE), the part of the + model that encodes the target data (a sequence of actions), and the condition (the robot + joint-space). + - A transformer with an `encoder` (not the VAE encoder) and `decoder` (not the VAE decoder) with + cross-attention is used as the VAE decoder. For these terms, we drop the `vae_` prefix because we + have an option to train this model without the variational objective (in which case we drop the + `vae_encoder` altogether, and nothing about this model has anything to do with a VAE). + + Transformer + Used alone for inference + (acts as VAE decoder + during training) + ┌───────────────────────┐ + │ Outputs │ + │ ▲ │ + │ ┌─────►┌───────┐ │ + ┌──────┐ │ │ │Transf.│ │ + │ │ │ ├─────►│decoder│ │ + ┌────┴────┐ │ │ │ │ │ │ + │ │ │ │ ┌───┴───┬─►│ │ │ + │ VAE │ │ │ │ │ └───────┘ │ + │ encoder │ │ │ │Transf.│ │ + │ │ │ │ │encoder│ │ + └───▲─────┘ │ │ │ │ │ + │ │ │ └───▲───┘ │ + │ │ │ │ │ + inputs └─────┼─────┘ │ + │ │ + └───────────────────────┘ """ name = "act" def __init__(self, cfg, device, n_action_steps=1): """ - Args: - vae: Whether to use the variational objective. TODO(now): Give more details. - temporal_agg: Whether to do temporal aggregation. For each timestep during rollout, the action - returned as an exponential moving average of previously generated actions for that timestep. - n_obs_steps: Number of time steps worth of observation to use as input. - horizon: The number of actions to generate in one forward pass. - kl_weight: Weight for KL divergence. Defaults to None. Only applicable when using the variational - objective. - batch_size: Training batch size. - grad_clip_norm: Optionally clip the gradients to have this value as the norm at most. Defaults to - None meaning gradient clipping is not applied. - lr: Learning rate. + TODO(alexander-soare): Add documentation for all parameters. 
""" super().__init__(n_action_steps) self.cfg = cfg self.n_action_steps = n_action_steps self.device = get_safe_torch_device(device) - self.model = ActionChunkingTransformer( - cfg, - state_dim=cfg.state_dim, - action_dim=cfg.action_dim, - horizon=cfg.horizon, - camera_names=cfg.camera_names, - use_vae=cfg.vae, - ) + self.model = _ActionChunkingTransformer(cfg) + self._create_optimizer() + self.to(self.device) + def _create_optimizer(self): optimizer_params_dicts = [ { "params": [ @@ -74,14 +90,12 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): for n, p in self.model.named_parameters() if n.startswith("backbone") and p.requires_grad ], - "lr": cfg.lr_backbone, + "lr": self.cfg.lr_backbone, }, ] - self.optimizer = torch.optim.AdamW(optimizer_params_dicts, lr=cfg.lr, weight_decay=cfg.weight_decay) - - self.kl_weight = self.cfg.kl_weight - logging.info(f"KL Weight {self.kl_weight}") - self.to(self.device) + self.optimizer = torch.optim.AdamW( + optimizer_params_dicts, lr=self.cfg.lr, weight_decay=self.cfg.weight_decay + ) def update(self, replay_buffer, step): del step @@ -137,7 +151,6 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): batch = process_batch(batch, self.cfg.horizon, num_slices) data_s = time.time() - start_time - print(data_s) loss = self.compute_loss(batch) loss.backward() @@ -192,16 +205,6 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): "image": observation["image", "top"], "agent_pos": observation["state"], } - # qpos = obs_dict["agent_pos"] - # img = obs_dict["image"] - # qpos_ = torch.load('/tmp/qpos.pth') - # img_ = torch.load('/tmp/curr_image.pth') - # out_ = torch.load('/tmp/out.pth') - # import cv2, numpy as np - # cv2.imwrite("ours.png", (obs_dict["image"][0, 0].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)) - # cv2.imwrite("theirs.png", (img_[0, 0].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)) - # out = self._forward(qpos_, img_) - # breakpoint() action = self._forward(qpos=obs_dict["agent_pos"] * 0.182, image=obs_dict["image"]) if self.cfg.temporal_agg: @@ -236,14 +239,14 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): loss_dict = {} loss_dict["l1"] = l1 - if self.cfg.vae: + if self.cfg.use_vae: # Calculate Dₖₗ(latent_pdf || standard_normal). Note: After computing the KL-divergence for # each dimension independently, we sum over the latent dimension to get the total # KL-divergence per batch element, then take the mean over the batch. # (See App. B of https://arxiv.org/abs/1312.6114 for more details). mean_kld = (-0.5 * (1 + log_sigma_x2 - mu.pow(2) - (log_sigma_x2).exp())).sum(-1).mean() loss_dict["kl"] = mean_kld - loss_dict["loss"] = loss_dict["l1"] + loss_dict["kl"] * self.kl_weight + loss_dict["loss"] = loss_dict["l1"] + loss_dict["kl"] * self.cfg.kl_weight else: loss_dict["loss"] = loss_dict["l1"] return loss_dict @@ -252,135 +255,74 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): return action -def create_sinusoidal_position_embedding(n_position, d_hid): - def get_position_angle_vec(position): - return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] - - sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) - sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i - sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 - - return torch.FloatTensor(sinusoid_table).unsqueeze(0) - - # TODO(alexander-soare) move all this code into the policy when we have the policy API established. 
-class ActionChunkingTransformer(nn.Module): - """ - Action Chunking Transformer as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware - (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act) - - Note: In this code we use the terms `vae_encoder`, 'encoder', `decoder`. The meanings are as follows. - - The `vae_encoder` is, as per the literature around variational auto-encoders (VAE), the part of the - model that encodes the target data (a sequence of actions), and the condition (the robot - joint-space). - - A transformer with an `encoder` (not the VAE encoder) and `decoder` (not the VAE decoder) with - cross-attention is used as the VAE decoder. For these terms, we drop the `vae_` prefix because we - have an option to train this model without the variational objective (in which case we drop the - `vae_encoder` altogether, and nothing about this model has anything to do with a VAE). - - Transformer - Used alone for inference - (acts as VAE decoder - during training) - ┌───────────────────────┐ - │ Outputs │ - │ ▲ │ - │ ┌─────►┌───────┐ │ - ┌──────┐ │ │ │Transf.│ │ - │ │ │ ├─────►│decoder│ │ - ┌────┴────┐ │ │ │ │ │ │ - │ │ │ │ ┌───┴───┬─►│ │ │ - │ VAE │ │ │ │ │ └───────┘ │ - │ encoder │ │ │ │Transf.│ │ - │ │ │ │ │encoder│ │ - └───▲─────┘ │ │ │ │ │ - │ │ │ └───▲───┘ │ - │ │ │ │ │ - inputs └─────┼─────┘ │ - │ │ - └───────────────────────┘ - """ - - def __init__(self, args, state_dim, action_dim, horizon, camera_names, use_vae): - """Initializes the model. - Parameters: - state_dim: robot state dimension of the environment - horizon: number of object queries, ie detection slot. This is the maximal number of objects - DETR can detect in a single image. For COCO, we recommend 100 queries. - - Args: - state_dim: Robot positional state dimension. - action_dim: Action dimension. - horizon: The number of actions to generate in one forward pass. - use_vae: Whether to use the variational objective. TODO(now): Give more details. - """ +class _ActionChunkingTransformer(nn.Module): + def __init__(self, cfg): super().__init__() - self.camera_names = camera_names - self.use_vae = use_vae - self.horizon = horizon - self.hidden_dim = args.hidden_dim + self.camera_names = cfg.camera_names + self.use_vae = cfg.use_vae + self.horizon = cfg.horizon + self.d_model = cfg.d_model transformer_common_kwargs = dict( # noqa: C408 - d_model=self.hidden_dim, - nhead=args.nheads, - dim_feedforward=args.dim_feedforward, - dropout=args.dropout, - activation=args.activation, - normalize_before=args.pre_norm, + d_model=self.d_model, + num_heads=cfg.num_heads, + dim_feedforward=cfg.dim_feedforward, + dropout=cfg.dropout, + activation=cfg.activation, + normalize_before=cfg.pre_norm, ) # BERT style VAE encoder with input [cls, *joint_space_configuration, *action_sequence]. # The cls token forms parameters of the latent's distribution (like this [*means, *log_variances]). - if use_vae: - # TODO(now): args.enc_layers shouldn't be shared with the transformer decoder - self.vae_encoder = TransformerEncoder(num_layers=args.enc_layers, **transformer_common_kwargs) - self.cls_embed = nn.Embedding(1, self.hidden_dim) + if self.use_vae: + self.vae_encoder = _TransformerEncoder(num_layers=cfg.vae_enc_layers, **transformer_common_kwargs) + self.vae_encoder_cls_embed = nn.Embedding(1, self.d_model) # Projection layer for joint-space configuration to hidden dimension. 
- self.vae_encoder_robot_state_input_proj = nn.Linear(state_dim, self.hidden_dim) + self.vae_encoder_robot_state_input_proj = nn.Linear(cfg.state_dim, self.d_model) # Projection layer for action (joint-space target) to hidden dimension. - self.vae_encoder_action_input_proj = nn.Linear(state_dim, self.hidden_dim) - # Final size of latent z. TODO(now): Add to hyperparams. - self.latent_dim = 32 + self.vae_encoder_action_input_proj = nn.Linear(cfg.state_dim, self.d_model) + self.latent_dim = cfg.latent_dim # Projection layer from the VAE encoder's output to the latent distribution's parameter space. - self.vae_encoder_latent_output_proj = nn.Linear(self.hidden_dim, self.latent_dim * 2) - # Fixed sinusoidal positional embedding the whole input to the VAE encoder. + self.vae_encoder_latent_output_proj = nn.Linear(self.d_model, self.latent_dim * 2) + # Fixed sinusoidal positional embedding the whole input to the VAE encoder. Unsqueeze for batch + # dimension. self.register_buffer( - "vae_encoder_pos_enc", create_sinusoidal_position_embedding(1 + 1 + horizon, self.hidden_dim) + "vae_encoder_pos_enc", + _create_sinusoidal_position_embedding(1 + 1 + self.horizon, self.d_model).unsqueeze(0), ) # Backbone for image feature extraction. - self.backbone_position_embedding = SinusoidalPositionEmbedding2D(self.hidden_dim // 2) - backbone_model = getattr(torchvision.models, args.backbone)( - replace_stride_with_dilation=[False, False, args.dilation], - pretrained=True, # TODO(now): Add pretrained option + backbone_model = getattr(torchvision.models, cfg.backbone)( + replace_stride_with_dilation=[False, False, cfg.dilation], + pretrained=cfg.pretrained_backbone, norm_layer=FrozenBatchNorm2d, ) # Note: The forward method of this returns a dict: {"feature_map": output}. self.backbone = IntermediateLayerGetter(backbone_model, return_layers={"layer4": "feature_map"}) # Transformer (acts as VAE decoder when training with the variational objective). - self.encoder = TransformerEncoder(num_layers=args.enc_layers, **transformer_common_kwargs) - self.decoder = TransformerDecoder(num_layers=args.dec_layers, **transformer_common_kwargs) + self.encoder = _TransformerEncoder(num_layers=cfg.enc_layers, **transformer_common_kwargs) + self.decoder = _TransformerDecoder(num_layers=cfg.dec_layers, **transformer_common_kwargs) # Transformer encoder input projections. The tokens will be structured like # [latent, robot_state, image_feature_map_pixels]. + self.encoder_robot_state_input_proj = nn.Linear(cfg.state_dim, self.d_model) + self.encoder_latent_input_proj = nn.Linear(self.latent_dim, self.d_model) self.encoder_img_feat_input_proj = nn.Conv2d( - backbone_model.fc.in_features, self.hidden_dim, kernel_size=1 + backbone_model.fc.in_features, self.d_model, kernel_size=1 ) - self.encoder_robot_state_input_proj = nn.Linear(state_dim, self.hidden_dim) - self.encoder_latent_input_proj = nn.Linear(self.latent_dim, self.hidden_dim) - # TODO(now): Fix this nonsense. One positional embedding is needed. We should extract the image - # feature dimension with a dry run. - self.additional_pos_embed = nn.Embedding( - 2, self.hidden_dim - ) # learned position embedding for proprio and latent + # Transformer encoder positional embeddings. + self.encoder_robot_and_latent_pos_embed = nn.Embedding(2, self.d_model) + self.encoder_cam_feat_pos_embed = _SinusoidalPositionEmbedding2D(self.d_model // 2) # Transformer decoder. # Learnable positional embedding for the transformer's decoder (in the style of DETR object queries). 
- self.decoder_pos_embed_embed = nn.Embedding(horizon, self.hidden_dim) + self.decoder_pos_embed = nn.Embedding(self.horizon, self.d_model) + # Final action regression head on the output of the transformer's decoder. - self.action_head = nn.Linear(self.hidden_dim, action_dim) + self.action_head = nn.Linear(self.d_model, cfg.action_dim) self._reset_parameters() @@ -390,7 +332,7 @@ class ActionChunkingTransformer(nn.Module): if p.dim() > 1: nn.init.xavier_uniform_(p) - def forward(self, robot_state, image, actions=None): + def forward(self, robot_state: Tensor, image: Tensor, actions: Tensor | None = None): """ Args: robot_state: (B, J) batch of robot joint configurations. @@ -405,10 +347,12 @@ class ActionChunkingTransformer(nn.Module): batch_size, _ = robot_state.shape - # Prepare the latent for input to the transformer. + # Prepare the latent for input to the transformer encoder. if self.use_vae and actions is not None: # Prepare the input to the VAE encoder: [cls, *joint_space_configuration, *action_sequence]. - cls_embed = einops.repeat(self.cls_embed.weight, "1 d -> b 1 d", b=batch_size) # (B, 1, D) + cls_embed = einops.repeat( + self.vae_encoder_cls_embed.weight, "1 d -> b 1 d", b=batch_size + ) # (B, 1, D) robot_state_embed = self.vae_encoder_robot_state_input_proj(robot_state).unsqueeze(1) # (B, 1, D) action_embed = self.vae_encoder_action_input_proj(actions) # (B, S, D) vae_encoder_input = torch.cat([cls_embed, robot_state_embed, action_embed], axis=1) # (B, S+2, D) @@ -417,7 +361,7 @@ class ActionChunkingTransformer(nn.Module): pos_embed = self.vae_encoder_pos_enc.clone().detach() # (1, S+2, D) # Forward pass through VAE encoder and sample the latent with the reparameterization trick. cls_token_out = self.vae_encoder( - vae_encoder_input.permute(1, 0, 2), pos=pos_embed.permute(1, 0, 2) + vae_encoder_input.permute(1, 0, 2), pos_embed=pos_embed.permute(1, 0, 2) )[0] # (B, D) latent_pdf_params = self.vae_encoder_latent_output_proj(cls_token_out) mu = latent_pdf_params[:, : self.latent_dim] @@ -432,23 +376,25 @@ class ActionChunkingTransformer(nn.Module): robot_state.device ) - # Prepare all other transformer inputs. - # Image observation features and position embeddings. + # Prepare all other transformer encoder inputs. + # Camera observation features and positional embeddings. all_cam_features = [] - all_cam_pos = [] + all_cam_pos_embeds = [] for cam_id, _ in enumerate(self.camera_names): cam_features = self.backbone(image[:, cam_id])["feature_map"] - pos = self.backbone_position_embedding(cam_features).to(dtype=cam_features.dtype) + cam_pos_embed = self.encoder_cam_feat_pos_embed(cam_features).to(dtype=cam_features.dtype) cam_features = self.encoder_img_feat_input_proj(cam_features) # (B, C, h, w) all_cam_features.append(cam_features) - all_cam_pos.append(pos) - # Concatenate image observation feature maps along the width dimension. + all_cam_pos_embeds.append(cam_pos_embed) + # Concatenate camera observation feature maps and positional embeddings along the width dimension. encoder_in = torch.cat(all_cam_features, axis=3) - pos = torch.cat(all_cam_pos, axis=3) + cam_pos_embed = torch.cat(all_cam_pos_embeds, axis=3) + + # Get positional embeddings for robot state and latent. robot_state_embed = self.encoder_robot_state_input_proj(robot_state) latent_embed = self.encoder_latent_input_proj(latent_sample) - # TODO(now): Explain all of this madness. + # Stack encoder input and positional embeddings moving to (S, B, C). 
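+        # The token sequence is [latent, robot_state, *image_feature_map_pixels], so its length is
+        # 2 + (num_cameras * feature_map_height * feature_map_width).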
encoder_in = torch.cat( [ torch.stack([latent_embed, robot_state_embed], axis=0), @@ -456,60 +402,68 @@ class ActionChunkingTransformer(nn.Module): ] ) pos_embed = torch.cat( - [self.additional_pos_embed.weight.unsqueeze(1), pos.flatten(2).permute(2, 0, 1)], axis=0 + [ + self.encoder_robot_and_latent_pos_embed.weight.unsqueeze(1), + cam_pos_embed.flatten(2).permute(2, 0, 1), + ], + axis=0, ) - encoder_out = self.encoder(encoder_in, pos=pos_embed) + # Forward pass through the transformer modules. + encoder_out = self.encoder(encoder_in, pos_embed=pos_embed) decoder_in = torch.zeros( - (self.horizon, batch_size, self.hidden_dim), dtype=pos_embed.dtype, device=pos_embed.device + (self.horizon, batch_size, self.d_model), dtype=pos_embed.dtype, device=pos_embed.device ) decoder_out = self.decoder( decoder_in, encoder_out, encoder_pos_embed=pos_embed, - decoder_pos_embed=self.decoder_pos_embed_embed.weight.unsqueeze(1), - ).transpose(0, 1) # back to (B, S, C) + decoder_pos_embed=self.decoder_pos_embed.weight.unsqueeze(1), + ) + + # Move back to (B, S, C). + decoder_out = decoder_out.transpose(0, 1) actions = self.action_head(decoder_out) + return actions, [mu, log_sigma_x2] -class TransformerEncoder(nn.Module): - def __init__( - self, - num_layers, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - normalize_before=False, - ): +class _TransformerEncoder(nn.Module): + """Convenience module for running multiple encoder layers, maybe followed by normalization.""" + + def __init__(self, num_layers: int, **encoder_layer_kwargs: dict): super().__init__() self.layers = nn.ModuleList( - [ - TransformerEncoderLayer( - d_model, nhead, dim_feedforward, dropout, activation, normalize_before - ) - for _ in range(num_layers) - ] + [_TransformerEncoderLayer(**encoder_layer_kwargs) for _ in range(num_layers)] + ) + self.norm = ( + nn.LayerNorm(encoder_layer_kwargs["d_model"]) + if encoder_layer_kwargs["normalize_before"] + else nn.Identity() ) - self.norm = nn.LayerNorm(d_model) if normalize_before else nn.Identity() - def forward(self, x, pos: Optional[Tensor] = None): + def forward(self, x: Tensor, pos_embed: Tensor | None = None) -> Tensor: for layer in self.layers: - x = layer(x, pos=pos) + x = layer(x, pos_embed=pos_embed) x = self.norm(x) return x -class TransformerEncoderLayer(nn.Module): +class _TransformerEncoderLayer(nn.Module): def __init__( - self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False + self, + d_model: int, + num_heads: int, + dim_feedforward: int, + dropout: float, + activation: str, + normalize_before: bool, ): super().__init__() - self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - # Implementation of Feedforward model + self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout) + + # Feed forward layers. 
self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) @@ -522,7 +476,7 @@ class TransformerEncoderLayer(nn.Module): self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before - def forward(self, x, pos_embed: Optional[Tensor] = None): + def forward(self, x, pos_embed: Tensor | None = None) -> Tensor: skip = x if self.normalize_before: x = self.norm1(x) @@ -542,32 +496,23 @@ class TransformerEncoderLayer(nn.Module): return x -class TransformerDecoder(nn.Module): - def __init__( - self, - num_layers, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - normalize_before=False, - ): +class _TransformerDecoder(nn.Module): + def __init__(self, num_layers: int, **decoder_layer_kwargs): + """Convenience module for running multiple decoder layers followed by normalization.""" super().__init__() self.layers = nn.ModuleList( - [ - TransformerDecoderLayer( - d_model, nhead, dim_feedforward, dropout, activation, normalize_before - ) - for _ in range(num_layers) - ] + [_TransformerDecoderLayer(**decoder_layer_kwargs) for _ in range(num_layers)] ) self.num_layers = num_layers - self.norm = nn.LayerNorm(d_model) + self.norm = nn.LayerNorm(decoder_layer_kwargs["d_model"]) def forward( - self, x, encoder_out, decoder_pos_embed: Tensor | None = None, encoder_pos_embed: Tensor | None = None - ): + self, + x: Tensor, + encoder_out: Tensor, + decoder_pos_embed: Tensor | None = None, + encoder_pos_embed: Tensor | None = None, + ) -> Tensor: for layer in self.layers: x = layer( x, encoder_out, decoder_pos_embed=decoder_pos_embed, encoder_pos_embed=encoder_pos_embed @@ -577,14 +522,21 @@ class TransformerDecoder(nn.Module): return x -class TransformerDecoderLayer(nn.Module): +class _TransformerDecoderLayer(nn.Module): def __init__( - self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False + self, + d_model: int, + num_heads: int, + dim_feedforward: int, + dropout: float, + activation: str, + normalize_before: bool, ): super().__init__() - self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - # Implementation of Feedforward model + self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout) + + # Feed forward layers. self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) @@ -650,8 +602,26 @@ class TransformerDecoderLayer(nn.Module): return x -class SinusoidalPositionEmbedding2D(nn.Module): - """Sinusoidal positional embeddings similar to what's presented in Attention Is All You Need. +def _create_sinusoidal_position_embedding(num_positions: int, dimension: int) -> Tensor: + """1D sinusoidal positional embeddings as in Attention is All You Need. + + Args: + num_positions: Number of token positions required. + Returns: (num_positions, dimension) position embeddings (the first dimension is the batch dimension). 
+ + """ + + def get_position_angle_vec(position): + return [position / np.power(10000, 2 * (hid_j // 2) / dimension) for hid_j in range(dimension)] + + sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(num_positions)]) + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + return torch.from_numpy(sinusoid_table).float() + + +class _SinusoidalPositionEmbedding2D(nn.Module): + """2D sinusoidal positional embeddings similar to what's presented in Attention Is All You Need. The variation is that the position indices are normalized in [0, 2π] (not quite: the lower bound is 1/H for the vertical direction, and 1/W for the horizontal direction. @@ -705,7 +675,7 @@ class SinusoidalPositionEmbedding2D(nn.Module): def _get_activation_fn(activation: str) -> Callable: - """Return an activation function given a string""" + """Return an activation function given a string.""" if activation == "relu": return F.relu if activation == "gelu": diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml index 22b6cd6f..3551768c 100644 --- a/lerobot/configs/policy/act.yaml +++ b/lerobot/configs/policy/act.yaml @@ -21,24 +21,27 @@ policy: lr: 1e-5 lr_backbone: 1e-5 + pretrained_backbone: true weight_decay: 1e-4 grad_clip_norm: 10 backbone: resnet18 horizon: ${horizon} # chunk_size kl_weight: 10 - hidden_dim: 512 + d_model: 512 dim_feedforward: 3200 + vae_enc_layers: 4 enc_layers: 4 dec_layers: 1 - nheads: 8 + num_heads: 8 #camera_names: [top, front_close, left_pillar, right_pillar] camera_names: [top] dilation: false dropout: 0.1 pre_norm: false activation: relu + latent_dim: 32 - vae: true + use_vae: true batch_size: 8 diff --git a/scripts/convert_act_weights.py b/scripts/convert_act_weights.py index c8f83422..b1492009 100644 --- a/scripts/convert_act_weights.py +++ b/scripts/convert_act_weights.py @@ -42,6 +42,8 @@ start_replacements = [ ("model.transformer.encoder.", "model.encoder."), ("model.transformer.decoder.", "model.decoder."), ("model.backbones.0.0.body.", "model.backbone."), + ("model.additional_pos_embed.weight", "model.encoder_robot_and_latent_pos_embed.weight"), + ("model.cls_embed.weight", "model.vae_encoder_cls_embed.weight"), ] for to_replace, replace_with in start_replacements: From 9c28ac8aa424d5e3b51004883e88cd35954329f4 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Fri, 5 Apr 2024 15:25:11 +0100 Subject: [PATCH 07/25] re-add pre-commit check --- .pre-commit-config.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index da78b677..765b678a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,3 +23,11 @@ repos: - id: ruff args: [--fix] - id: ruff-format + - repo: https://github.com/python-poetry/poetry + rev: 1.8.0 + hooks: + - id: poetry-check + - id: poetry-lock + args: + - "--check" + - "--no-update" From 1e71196fe3d45ab973d0f612f6b9aa3800af40fb Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Fri, 5 Apr 2024 17:38:29 +0100 Subject: [PATCH 08/25] backup wip --- lerobot/common/datasets/aloha.py | 8 +- lerobot/common/datasets/factory.py | 179 +++++----- lerobot/common/policies/act/policy.py | 362 +++++++++----------- lerobot/common/policies/diffusion/policy.py | 1 - lerobot/scripts/train.py | 4 +- poetry.lock | 49 ++- pyproject.toml | 1 + 7 files changed, 306 insertions(+), 298 deletions(-) diff --git a/lerobot/common/datasets/aloha.py b/lerobot/common/datasets/aloha.py 
index 102de08e..4c0795dd 100644 --- a/lerobot/common/datasets/aloha.py +++ b/lerobot/common/datasets/aloha.py @@ -158,7 +158,7 @@ class AlohaDataset(torch.utils.data.Dataset): self.data_ids_per_episode = {} ep_dicts = [] - logging.info("Initialize and feed offline buffer") + frame_idx = 0 for ep_id in tqdm.tqdm(range(NUM_EPISODES[self.dataset_id])): ep_path = raw_dir / f"episode_{ep_id}.hdf5" with h5py.File(ep_path, "r") as ep: @@ -190,8 +190,14 @@ class AlohaDataset(torch.utils.data.Dataset): ep_dict[f"observation.images.{cam}"] = image[:-1] # ep_dict[f"next.observation.images.{cam}"] = image[1:] + assert isinstance(ep_id, int) + self.data_ids_per_episode[ep_id] = torch.arange(frame_idx, frame_idx + num_frames, 1) + assert len(self.data_ids_per_episode[ep_id]) == num_frames + ep_dicts.append(ep_dict) + frame_idx += num_frames + self.data_dict = {} keys = ep_dicts[0].keys() diff --git a/lerobot/common/datasets/factory.py b/lerobot/common/datasets/factory.py index 49170098..0217583a 100644 --- a/lerobot/common/datasets/factory.py +++ b/lerobot/common/datasets/factory.py @@ -59,96 +59,95 @@ def make_dataset( transform=Prod(in_keys=clsfunc.image_keys, prod=1 / 255.0), ) stats = compute_or_load_stats(stats_dataset) - # TODO(rcadene): remove this and put it in config. Ideally we want to reproduce SOTA results just with mean_std normalization_mode = "mean_std" if cfg.env.name == "aloha" else "min_max" - # TODO(now): These stats are needed to use their pretrained model for sim_transfer_cube_human. - # (Pdb) stats['observation']['state']['mean'] - # tensor([-0.0071, -0.6293, 1.0351, -0.0517, -0.4642, -0.0754, 0.4751, -0.0373, - # -0.3324, 0.9034, -0.2258, -0.3127, -0.2412, 0.6866]) - stats["observation", "state", "mean"] = torch.tensor( - [ - -0.00740268, - -0.63187766, - 1.0356655, - -0.05027218, - -0.46199223, - -0.07467502, - 0.47467607, - -0.03615446, - -0.33203387, - 0.9038929, - -0.22060776, - -0.31011587, - -0.23484458, - 0.6842416, - ] - ) - # (Pdb) stats['observation']['state']['std'] - # tensor([0.0022, 0.0520, 0.0291, 0.0092, 0.0267, 0.0145, 0.0563, 0.0179, 0.0494, - # 0.0326, 0.0476, 0.0535, 0.0956, 0.0513]) - stats["observation", "state", "std"] = torch.tensor( - [ - 0.01219023, - 0.2975381, - 0.16728032, - 0.04733803, - 0.1486037, - 0.08788499, - 0.31752336, - 0.1049916, - 0.27933604, - 0.18094037, - 0.26604933, - 0.30466506, - 0.5298686, - 0.25505227, - ] - ) - # (Pdb) stats['action']['mean'] - # tensor([-0.0075, -0.6346, 1.0353, -0.0465, -0.4686, -0.0738, 0.3723, -0.0396, - # -0.3184, 0.8991, -0.2065, -0.3182, -0.2338, 0.5593]) - stats["action"]["mean"] = torch.tensor( - [ - -0.00756444, - -0.6281845, - 1.0312834, - -0.04664314, - -0.47211358, - -0.074527, - 0.37389806, - -0.03718753, - -0.3261143, - 0.8997205, - -0.21371077, - -0.31840396, - -0.23360962, - 0.551947, - ] - ) - # (Pdb) stats['action']['std'] - # tensor([0.0023, 0.0514, 0.0290, 0.0086, 0.0263, 0.0143, 0.0593, 0.0185, 0.0510, - # 0.0328, 0.0478, 0.0531, 0.0945, 0.0794]) - stats["action"]["std"] = torch.tensor( - [ - 0.01252818, - 0.2957442, - 0.16701928, - 0.04584508, - 0.14833844, - 0.08763024, - 0.30665937, - 0.10600077, - 0.27572668, - 0.1805853, - 0.26304692, - 0.30708534, - 0.5305411, - 0.38381037, - ] - ) - transforms.append(NormalizeTransform(stats, in_keys, mode=normalization_mode)) # noqa: F821 + # # TODO(now): These stats are needed to use their pretrained model for sim_transfer_cube_human. 
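A toy illustration of the data_ids_per_episode bookkeeping added to AlohaDataset above: consecutive episodes get contiguous global frame indices. The episode lengths here are made up.

import torch

data_ids_per_episode = {}
frame_idx = 0
for ep_id, num_frames in enumerate([3, 2]):
    data_ids_per_episode[ep_id] = torch.arange(frame_idx, frame_idx + num_frames)
    frame_idx += num_frames
# -> {0: tensor([0, 1, 2]), 1: tensor([3, 4])}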
+ # # (Pdb) stats['observation']['state']['mean'] + # # tensor([-0.0071, -0.6293, 1.0351, -0.0517, -0.4642, -0.0754, 0.4751, -0.0373, + # # -0.3324, 0.9034, -0.2258, -0.3127, -0.2412, 0.6866]) + # stats["observation", "state", "mean"] = torch.tensor( + # [ + # -0.00740268, + # -0.63187766, + # 1.0356655, + # -0.05027218, + # -0.46199223, + # -0.07467502, + # 0.47467607, + # -0.03615446, + # -0.33203387, + # 0.9038929, + # -0.22060776, + # -0.31011587, + # -0.23484458, + # 0.6842416, + # ] + # ) + # # (Pdb) stats['observation']['state']['std'] + # # tensor([0.0022, 0.0520, 0.0291, 0.0092, 0.0267, 0.0145, 0.0563, 0.0179, 0.0494, + # # 0.0326, 0.0476, 0.0535, 0.0956, 0.0513]) + # stats["observation", "state", "std"] = torch.tensor( + # [ + # 0.01219023, + # 0.2975381, + # 0.16728032, + # 0.04733803, + # 0.1486037, + # 0.08788499, + # 0.31752336, + # 0.1049916, + # 0.27933604, + # 0.18094037, + # 0.26604933, + # 0.30466506, + # 0.5298686, + # 0.25505227, + # ] + # ) + # # (Pdb) stats['action']['mean'] + # # tensor([-0.0075, -0.6346, 1.0353, -0.0465, -0.4686, -0.0738, 0.3723, -0.0396, + # # -0.3184, 0.8991, -0.2065, -0.3182, -0.2338, 0.5593]) + # stats["action"]["mean"] = torch.tensor( + # [ + # -0.00756444, + # -0.6281845, + # 1.0312834, + # -0.04664314, + # -0.47211358, + # -0.074527, + # 0.37389806, + # -0.03718753, + # -0.3261143, + # 0.8997205, + # -0.21371077, + # -0.31840396, + # -0.23360962, + # 0.551947, + # ] + # ) + # # (Pdb) stats['action']['std'] + # # tensor([0.0023, 0.0514, 0.0290, 0.0086, 0.0263, 0.0143, 0.0593, 0.0185, 0.0510, + # # 0.0328, 0.0478, 0.0531, 0.0945, 0.0794]) + # stats["action"]["std"] = torch.tensor( + # [ + # 0.01252818, + # 0.2957442, + # 0.16701928, + # 0.04584508, + # 0.14833844, + # 0.08763024, + # 0.30665937, + # 0.10600077, + # 0.27572668, + # 0.1805853, + # 0.26304692, + # 0.30708534, + # 0.5305411, + # 0.38381037, + # ] + # ) + # transforms.append(NormalizeTransform(stats, in_keys, mode=normalization_mode)) # noqa: F821 transforms = v2.Compose( [ @@ -173,7 +172,11 @@ def make_dataset( "action": [-0.1] + [i / clsfunc.fps for i in range(15)], } else: - delta_timestamps = None + delta_timestamps = { + "observation.images.top": [0], + "observation.state": [0], + "action": [i / clsfunc.fps for i in range(cfg.policy.horizon)], + } dataset = clsfunc( dataset_id=cfg.dataset_id, diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index 5071c09a..1aacc41d 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -19,11 +19,10 @@ from torch import Tensor, nn from torchvision.models._utils import IntermediateLayerGetter from torchvision.ops.misc import FrozenBatchNorm2d -from lerobot.common.policies.abstract import AbstractPolicy from lerobot.common.utils import get_safe_torch_device -class ActionChunkingTransformerPolicy(AbstractPolicy): +class ActionChunkingTransformerPolicy(nn.Module): """ Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act) @@ -61,205 +60,20 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): """ name = "act" + _multiple_obs_steps_not_handled_msg = ( + "ActionChunkingTransformerPolicy does not handle multiple observation steps." + ) def __init__(self, cfg, device, n_action_steps=1): """ TODO(alexander-soare): Add documentation for all parameters. 
""" - super().__init__(n_action_steps) + super().__init__() + if getattr(cfg, "n_obs_steps", 1) != 1: + raise ValueError(self._multiple_obs_steps_not_handled_msg) self.cfg = cfg self.n_action_steps = n_action_steps self.device = get_safe_torch_device(device) - - self.model = _ActionChunkingTransformer(cfg) - self._create_optimizer() - self.to(self.device) - - def _create_optimizer(self): - optimizer_params_dicts = [ - { - "params": [ - p - for n, p in self.model.named_parameters() - if not n.startswith("backbone") and p.requires_grad - ] - }, - { - "params": [ - p - for n, p in self.model.named_parameters() - if n.startswith("backbone") and p.requires_grad - ], - "lr": self.cfg.lr_backbone, - }, - ] - self.optimizer = torch.optim.AdamW( - optimizer_params_dicts, lr=self.cfg.lr, weight_decay=self.cfg.weight_decay - ) - - def update(self, replay_buffer, step): - del step - - self.train() - - num_slices = self.cfg.batch_size - batch_size = self.cfg.horizon * num_slices - - assert batch_size % self.cfg.horizon == 0 - assert batch_size % num_slices == 0 - - def process_batch(batch, horizon, num_slices): - # trajectory t = 64, horizon h = 16 - # (t h) ... -> t h ... - batch = batch.reshape(num_slices, horizon) - - image = batch["observation", "image", "top"] - image = image[:, 0] # first observation t=0 - # batch, num_cam, channel, height, width - image = image.unsqueeze(1) - assert image.ndim == 5 - image = image.float() - - state = batch["observation", "state"] - state = state[:, 0] # first observation t=0 - # batch, qpos_dim - assert state.ndim == 2 - - action = batch["action"] - # batch, seq, action_dim - assert action.ndim == 3 - assert action.shape[1] == horizon - - if self.cfg.n_obs_steps > 1: - raise NotImplementedError() - # # keep first n observations of the slice corresponding to t=[-1,0] - # image = image[:, : self.cfg.n_obs_steps] - # state = state[:, : self.cfg.n_obs_steps] - - out = { - "obs": { - "image": image.to(self.device, non_blocking=True), - "agent_pos": state.to(self.device, non_blocking=True), - }, - "action": action.to(self.device, non_blocking=True), - } - return out - - start_time = time.time() - - batch = replay_buffer.sample(batch_size) - batch = process_batch(batch, self.cfg.horizon, num_slices) - - data_s = time.time() - start_time - - loss = self.compute_loss(batch) - loss.backward() - - grad_norm = torch.nn.utils.clip_grad_norm_( - self.model.parameters(), - self.cfg.grad_clip_norm, - error_if_nonfinite=False, - ) - - self.optimizer.step() - self.optimizer.zero_grad() - - info = { - "loss": loss.item(), - "grad_norm": float(grad_norm), - "lr": self.cfg.lr, - "data_s": data_s, - "update_s": time.time() - start_time, - } - - return info - - def save(self, fp): - torch.save(self.state_dict(), fp) - - def load(self, fp): - d = torch.load(fp) - self.load_state_dict(d) - - def compute_loss(self, batch): - loss_dict = self._forward( - qpos=batch["obs"]["agent_pos"], - image=batch["obs"]["image"], - actions=batch["action"], - ) - loss = loss_dict["loss"] - return loss - - @torch.no_grad() - def select_actions(self, observation, step_count): - # TODO(rcadene): remove unused step_count - del step_count - - self.eval() - - # TODO(rcadene): remove hack - # add 1 camera dimension - observation["image", "top"] = observation["image", "top"].unsqueeze(1) - - obs_dict = { - "image": observation["image", "top"], - "agent_pos": observation["state"], - } - action = self._forward(qpos=obs_dict["agent_pos"] * 0.182, image=obs_dict["image"]) - - if self.cfg.temporal_agg: - # 
TODO(rcadene): implement temporal aggregation - raise NotImplementedError() - # all_time_actions[[t], t:t+num_queries] = action - # actions_for_curr_step = all_time_actions[:, t] - # actions_populated = torch.all(actions_for_curr_step != 0, axis=1) - # actions_for_curr_step = actions_for_curr_step[actions_populated] - # k = 0.01 - # exp_weights = np.exp(-k * np.arange(len(actions_for_curr_step))) - # exp_weights = exp_weights / exp_weights.sum() - # exp_weights = torch.from_numpy(exp_weights).cuda().unsqueeze(dim=1) - # raw_action = (actions_for_curr_step * exp_weights).sum(dim=0, keepdim=True) - - # take first predicted action or n first actions - action = action[: self.n_action_steps] - return action - - def _forward(self, qpos, image, actions=None): - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - image = normalize(image) - - is_training = actions is not None - if is_training: # training time - actions = actions[:, : self.model.horizon] - - a_hat, (mu, log_sigma_x2) = self.model(qpos, image, actions) - - all_l1 = F.l1_loss(actions, a_hat, reduction="none") - l1 = all_l1.mean() - - loss_dict = {} - loss_dict["l1"] = l1 - if self.cfg.use_vae: - # Calculate Dₖₗ(latent_pdf || standard_normal). Note: After computing the KL-divergence for - # each dimension independently, we sum over the latent dimension to get the total - # KL-divergence per batch element, then take the mean over the batch. - # (See App. B of https://arxiv.org/abs/1312.6114 for more details). - mean_kld = (-0.5 * (1 + log_sigma_x2 - mu.pow(2) - (log_sigma_x2).exp())).sum(-1).mean() - loss_dict["kl"] = mean_kld - loss_dict["loss"] = loss_dict["l1"] + loss_dict["kl"] * self.cfg.kl_weight - else: - loss_dict["loss"] = loss_dict["l1"] - return loss_dict - else: - action, _ = self.model(qpos, image) # no action, sample from prior - return action - - -# TODO(alexander-soare) move all this code into the policy when we have the policy API established. 
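For reference, a self-contained sketch of what the commented-out temporal aggregation above appears to compute: an exponentially weighted average over the stack of chunk predictions available for the current timestep. The standalone function and its name are hypothetical.

import torch

def temporal_ensemble(actions_for_curr_step: torch.Tensor, k: float = 0.01) -> torch.Tensor:
    """actions_for_curr_step: (num_predictions, action_dim) stack of predictions for one timestep."""
    exp_weights = torch.exp(-k * torch.arange(len(actions_for_curr_step), dtype=torch.float32))
    exp_weights = exp_weights / exp_weights.sum()
    return (actions_for_curr_step * exp_weights.unsqueeze(-1)).sum(dim=0)

As in the pseudocode's w_i proportional to exp(-k * i), index 0 of the stack receives the largest weight.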
-class _ActionChunkingTransformer(nn.Module): - def __init__(self, cfg): - super().__init__() - self.camera_names = cfg.camera_names self.use_vae = cfg.use_vae self.horizon = cfg.horizon @@ -326,26 +140,179 @@ class _ActionChunkingTransformer(nn.Module): self._reset_parameters() + self._create_optimizer() + self.to(self.device) + + def _create_optimizer(self): + optimizer_params_dicts = [ + { + "params": [ + p for n, p in self.named_parameters() if not n.startswith("backbone") and p.requires_grad + ] + }, + { + "params": [ + p for n, p in self.named_parameters() if n.startswith("backbone") and p.requires_grad + ], + "lr": self.cfg.lr_backbone, + }, + ] + self.optimizer = torch.optim.AdamW( + optimizer_params_dicts, lr=self.cfg.lr, weight_decay=self.cfg.weight_decay + ) + def _reset_parameters(self): """Xavier-uniform initialization of the transformer parameters as in the original code.""" for p in chain(self.encoder.parameters(), self.decoder.parameters()): if p.dim() > 1: nn.init.xavier_uniform_(p) + @torch.no_grad() + def select_actions(self, observation, step_count): + # TODO(rcadene): remove unused step_count + del step_count + + self.eval() + + # TODO(rcadene): remove hack + # add 1 camera dimension + observation["image", "top"] = observation["image", "top"].unsqueeze(1) + + obs_dict = { + "image": observation["image", "top"], + "agent_pos": observation["state"], + } + action = self._forward(qpos=obs_dict["agent_pos"] * 0.182, image=obs_dict["image"]) + + if self.cfg.temporal_agg: + # TODO(rcadene): implement temporal aggregation + raise NotImplementedError() + # all_time_actions[[t], t:t+num_queries] = action + # actions_for_curr_step = all_time_actions[:, t] + # actions_populated = torch.all(actions_for_curr_step != 0, axis=1) + # actions_for_curr_step = actions_for_curr_step[actions_populated] + # k = 0.01 + # exp_weights = np.exp(-k * np.arange(len(actions_for_curr_step))) + # exp_weights = exp_weights / exp_weights.sum() + # exp_weights = torch.from_numpy(exp_weights).cuda().unsqueeze(dim=1) + # raw_action = (actions_for_curr_step * exp_weights).sum(dim=0, keepdim=True) + + # take first predicted action or n first actions + action = action[: self.n_action_steps] + return action + + def __call__(self, *args, **kwargs): + # TODO(now): Temporary bridge. + return self.update(*args, **kwargs) + + def _preprocess_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: + """ + Expects batch to have (at least): + { + "observation.state": (B, 1, J) tensor of robot states (joint configuration) + + "observation.images.top": (B, 1, C, H, W) tensor of images. + "action": (B, H, J) tensor of actions (positional target for robot joint configuration) + "action_is_pad": (B, H) mask for whether the actions are padding outside of the episode bounds. + } + """ + if batch["observation.state"].shape[1] != 1: + raise ValueError(self._multiple_obs_steps_not_handled_msg) + batch["observation.state"] = batch["observation.state"].squeeze(1) + # TODO(alexander-soare): generalize this to multiple images. Note: no squeeze is required for + # "observation.images.top" because then we'd have to unsqueeze to get get the image index dimension. 
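To make the batch contract documented in _preprocess_batch above concrete, here is a dummy batch matching those keys and shapes. The sizes (batch of 2, horizon 100, 14 joints, 480x640 images) are illustrative only.

import torch

batch = {
    "observation.state": torch.zeros(2, 1, 14),                 # (B, 1, J)
    "observation.images.top": torch.zeros(2, 1, 3, 480, 640),   # (B, 1, C, H, W)
    "action": torch.zeros(2, 100, 14),                          # (B, H, J)
    "action_is_pad": torch.zeros(2, 100, dtype=torch.bool),     # (B, H)
}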
+ + def update(self, batch, *_): + start_time = time.time() + self._preprocess_batch(batch) + + self.train() + + num_slices = self.cfg.batch_size + batch_size = self.cfg.horizon * num_slices + + assert batch_size % self.cfg.horizon == 0 + assert batch_size % num_slices == 0 + + loss = self.compute_loss(batch) + loss.backward() + + grad_norm = torch.nn.utils.clip_grad_norm_( + self.parameters(), + self.cfg.grad_clip_norm, + error_if_nonfinite=False, + ) + + self.optimizer.step() + self.optimizer.zero_grad() + + info = { + "loss": loss.item(), + "grad_norm": float(grad_norm), + "lr": self.cfg.lr, + "update_s": time.time() - start_time, + } + + return info + + def compute_loss(self, batch): + loss_dict = self.forward( + robot_state=batch["observation.state"], + image=batch["observation.images.top"], + actions=batch["action"], + ) + loss = loss_dict["loss"] + return loss + def forward(self, robot_state: Tensor, image: Tensor, actions: Tensor | None = None): + # TODO(now): Maybe this shouldn't be here? + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + image = normalize(image) + + is_training = actions is not None + if is_training: # training time + actions = actions[:, : self.horizon] + + a_hat, (mu, log_sigma_x2) = self._forward(robot_state, image, actions) + + all_l1 = F.l1_loss(actions, a_hat, reduction="none") + l1 = all_l1.mean() + + loss_dict = {} + loss_dict["l1"] = l1 + if self.cfg.use_vae: + # Calculate Dₖₗ(latent_pdf || standard_normal). Note: After computing the KL-divergence for + # each dimension independently, we sum over the latent dimension to get the total + # KL-divergence per batch element, then take the mean over the batch. + # (See App. B of https://arxiv.org/abs/1312.6114 for more details). + mean_kld = (-0.5 * (1 + log_sigma_x2 - mu.pow(2) - (log_sigma_x2).exp())).sum(-1).mean() + loss_dict["kl"] = mean_kld + loss_dict["loss"] = loss_dict["l1"] + loss_dict["kl"] * self.cfg.kl_weight + else: + loss_dict["loss"] = loss_dict["l1"] + return loss_dict + else: + action, _ = self._forward(robot_state, image) # no action, sample from prior + return action + + def _forward(self, robot_state: Tensor, image: Tensor, actions: Tensor | None = None): """ Args: robot_state: (B, J) batch of robot joint configurations. image: (B, N, C, H, W) batch of N camera frames. actions: (B, S, A) batch of actions from the target dataset which must be provided if the VAE is enabled and the model is in training mode. + Returns: + (B, S, A) batch of action sequences + Tuple containing the latent PDF's parameters (mean, log(σ²)) both as (B, L) tensors where L is the + latent dimension. """ if self.use_vae and self.training: assert ( actions is not None ), "actions must be provided when using the variational objective in training mode." - batch_size, _ = robot_state.shape + batch_size = robot_state.shape[0] # Prepare the latent for input to the transformer encoder. 
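As a reminder of what "sample the latent with the reparameterization trick" means in the VAE branch that follows, here is a generic sketch of the standard trick. It is illustrative only and not necessarily the exact line used in this file; the latent dimension of 32 mirrors act.yaml.

import torch

mu = torch.zeros(8, 32)            # (B, latent_dim), from the VAE encoder output head
log_sigma_x2 = torch.zeros(8, 32)  # log(sigma^2), same head
eps = torch.randn_like(mu)
# z = mu + sigma * eps keeps the sample differentiable w.r.t. mu and log_sigma_x2.
latent_sample = mu + (0.5 * log_sigma_x2).exp() * eps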
if self.use_vae and actions is not None: @@ -428,6 +395,13 @@ class _ActionChunkingTransformer(nn.Module): return actions, [mu, log_sigma_x2] + def save(self, fp): + torch.save(self.state_dict(), fp) + + def load(self, fp): + d = torch.load(fp) + self.load_state_dict(d) + class _TransformerEncoder(nn.Module): """Convenience module for running multiple encoder layers, maybe followed by normalization.""" diff --git a/lerobot/common/policies/diffusion/policy.py b/lerobot/common/policies/diffusion/policy.py index a4f4a450..93e5ba5d 100644 --- a/lerobot/common/policies/diffusion/policy.py +++ b/lerobot/common/policies/diffusion/policy.py @@ -152,7 +152,6 @@ class DiffusionPolicy(nn.Module): self.diffusion.train() data_s = time.time() - start_time - loss = self.diffusion.compute_loss(batch) loss.backward() diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 631ecc93..d49dfff8 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -41,7 +41,6 @@ def log_train_info(logger, info, step, cfg, dataset, is_offline): loss = info["loss"] grad_norm = info["grad_norm"] lr = info["lr"] - data_s = info["data_s"] update_s = info["update_s"] # A sample is an (observation,action) pair, where observation and action @@ -62,7 +61,6 @@ def log_train_info(logger, info, step, cfg, dataset, is_offline): f"grdn:{grad_norm:.3f}", f"lr:{lr:0.1e}", # in seconds - f"data_s:{data_s:.3f}", f"updt_s:{update_s:.3f}", ] logging.info(" ".join(log_items)) @@ -200,7 +198,7 @@ def train(cfg: dict, out_dir=None, job_name=None): is_offline = True dataloader = torch.utils.data.DataLoader( dataset, - num_workers=4, + num_workers=0, batch_size=cfg.policy.batch_size, shuffle=True, pin_memory=cfg.device != "cpu", diff --git a/poetry.lock b/poetry.lock index 0cbf9318..b8c6c638 100644 --- a/poetry.lock +++ b/poetry.lock @@ -880,6 +880,29 @@ files = [ [package.extras] protobuf = ["grpcio-tools (>=1.62.1)"] +[[package]] +name = "gym-pusht" +version = "0.1.0" +description = "PushT environment for LeRobot" +optional = true +python-versions = "^3.10" +files = [] +develop = false + +[package.dependencies] +gymnasium = "^0.29.1" +opencv-python = "^4.9.0.80" +pygame = "^2.5.2" +pymunk = "^6.6.0" +scikit-image = "^0.22.0" +shapely = "^2.0.3" + +[package.source] +type = "git" +url = "git@github.com:huggingface/gym-pusht.git" +reference = "HEAD" +resolved_reference = "0fe4449cca5a2b08f529f7a07fbf5b9df24962ec" + [[package]] name = "gymnasium" version = "0.29.1" @@ -1261,17 +1284,21 @@ setuptools = "!=50.0.0" [[package]] name = "lazy-loader" -version = "0.3" -description = "lazy_loader" +version = "0.4" +description = "Makes it easy to load subpackages and functions on demand." 
optional = false python-versions = ">=3.7" files = [ - {file = "lazy_loader-0.3-py3-none-any.whl", hash = "sha256:1e9e76ee8631e264c62ce10006718e80b2cfc74340d17d1031e0f84af7478554"}, - {file = "lazy_loader-0.3.tar.gz", hash = "sha256:3b68898e34f5b2a29daaaac172c6555512d0f32074f147e2254e4a6d9d838f37"}, + {file = "lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc"}, + {file = "lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1"}, ] +[package.dependencies] +packaging = "*" + [package.extras] -lint = ["pre-commit (>=3.3)"] +dev = ["changelist (==0.5)"] +lint = ["pre-commit (==3.7.0)"] test = ["pytest (>=7.4)", "pytest-cov (>=4.1)"] [[package]] @@ -3274,7 +3301,7 @@ protobuf = ">=3.20" [[package]] name = "tensordict" -version = "0.4.0+b4c91e8" +version = "0.4.0+f622b2f" description = "" optional = false python-versions = "*" @@ -3518,13 +3545,13 @@ tutorials = ["matplotlib", "pandas", "tabulate", "torch"] [[package]] name = "typing-extensions" -version = "4.10.0" +version = "4.11.0" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.10.0-py3-none-any.whl", hash = "sha256:69b1a937c3a517342112fb4c6df7e72fc39a38e7891a5730ed4985b5214b5475"}, - {file = "typing_extensions-4.10.0.tar.gz", hash = "sha256:b0abd7c89e8fb96f98db18d86106ff1d90ab692004eb746cf6eda2682f91b3cb"}, + {file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"}, + {file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"}, ] [[package]] @@ -3667,9 +3694,9 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.link testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [extras] -pusht = [] +pusht = ["gym_pusht"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "04b17fa57f189ad63181611d2e724d7fbdfb3485bc1a587b259d0a3751db918d" +content-hash = "3eee17e4bf2b7a570f41ef9c400ec5a24a3113f62a13162229cf43504ca0d005" diff --git a/pyproject.toml b/pyproject.toml index f0869158..a7d2dd65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ robomimic = "0.2.0" gymnasium-robotics = "^1.2.4" gymnasium = "^0.29.1" cmake = "^3.29.0.1" +gym_pusht = { git = "git@github.com:huggingface/gym-pusht.git", optional = true} [tool.poetry.extras] pusht = ["gym_pusht"] From 8d2463f45b4cd22f5ce6e38b7beade9231e52f37 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Fri, 5 Apr 2024 18:46:30 +0100 Subject: [PATCH 09/25] backup wip --- lerobot/common/policies/abstract.py | 76 ------------------ lerobot/common/policies/act/policy.py | 111 ++++++++++++++++++++------ lerobot/scripts/eval.py | 2 +- lerobot/scripts/train.py | 3 +- scripts/convert_act_weights.py | 33 ++++---- 5 files changed, 105 insertions(+), 120 deletions(-) diff --git a/lerobot/common/policies/abstract.py b/lerobot/common/policies/abstract.py index 6dc72bef..beebd8ac 100644 --- a/lerobot/common/policies/abstract.py +++ b/lerobot/common/policies/abstract.py @@ -4,79 +4,3 @@ import torch from torch import Tensor, nn -class AbstractPolicy(nn.Module): - """Base policy which all policies should be derived from. 
- - The forward method should generally not be overriden as it plays the role of handling multi-step policies. See its - documentation for more information. - - Note: - When implementing a concrete class (e.g. `AlohaDataset`, `PushtEnv`, `DiffusionPolicy`), you need to: - 1. set the required class attributes: - - for classes inheriting from `AbstractDataset`: `available_datasets` - - for classes inheriting from `AbstractEnv`: `name`, `available_tasks` - - for classes inheriting from `AbstractPolicy`: `name` - 2. update variables in `lerobot/__init__.py` (e.g. `available_envs`, `available_datasets_per_envs`, `available_policies`) - 3. update variables in `tests/test_available.py` by importing your new class - """ - - name: str | None = None # same name should be used to instantiate the policy in factory.py - - def __init__(self, n_action_steps: int | None): - """ - n_action_steps: Sets the cache size for storing action trajectories. If None, it is assumed that a single - action is returned by `select_actions` and that doesn't have a horizon dimension. The `forward` method then - adds that dimension. - """ - super().__init__() - assert self.name is not None, "Subclasses of `AbstractPolicy` should set the `name` class attribute." - self.n_action_steps = n_action_steps - self.clear_action_queue() - - def update(self, replay_buffer, step): - """One step of the policy's learning algorithm.""" - raise NotImplementedError("Abstract method") - - def save(self, fp): - torch.save(self.state_dict(), fp) - - def load(self, fp): - d = torch.load(fp) - self.load_state_dict(d) - - def select_actions(self, observation) -> Tensor: - """Select an action (or trajectory of actions) based on an observation during rollout. - - If n_action_steps was provided at initialization, this should return a (batch_size, n_action_steps, *) tensor of - actions. Otherwise if n_actions_steps is None, this should return a (batch_size, *) tensor of actions. - """ - raise NotImplementedError("Abstract method") - - def clear_action_queue(self): - """This should be called whenever the environment is reset.""" - if self.n_action_steps is not None: - self._action_queue = deque([], maxlen=self.n_action_steps) - - def forward(self, *args, **kwargs) -> Tensor: - """Inference step that makes multi-step policies compatible with their single-step environments. - - WARNING: In general, this should not be overriden. - - Consider a "policy" that observes the environment then charts a course of N actions to take. To make this fit - into the formalism of a TorchRL environment, we view it as being effectively a policy that (1) makes an - observation and prepares a queue of actions, (2) consumes that queue when queried, regardless of the environment - observation, (3) repopulates the action queue when empty. This method handles the aforementioned logic so that - the subclass doesn't have to. - - This method effectively wraps the `select_actions` method of the subclass. The following assumptions are made: - 1. The `select_actions` method returns a Tensor of actions with shape (B, H, *) where B is the batch size, H is - the action trajectory horizon and * is the action dimensions. - 2. Prior to the `select_actions` method being called, theres is an `n_action_steps` instance attribute defined. 
- """ - if self.n_action_steps is None: - return self.select_actions(*args, **kwargs) - if len(self._action_queue) == 0: - # `select_actions` returns a (batch_size, n_action_steps, *) tensor, but the queue effectively has shape - # (n_action_steps, batch_size, *), hence the transpose. - self._action_queue.extend(self.select_actions(*args, **kwargs).transpose(0, 1)) - return self._action_queue.popleft() diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index f42c6a3c..a9a5ac06 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -3,7 +3,7 @@ As per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (https://arxiv.org/abs/2304.13705). The majority of changes here involve removing unused code, unifying naming, and adding helpful comments. """ - +from collections import deque import math import time from itertools import chain @@ -22,6 +22,67 @@ from torchvision.ops.misc import FrozenBatchNorm2d from lerobot.common.utils import get_safe_torch_device +# class AbstractPolicy(nn.Module): +# """Base policy which all policies should be derived from. + +# The forward method should generally not be overriden as it plays the role of handling multi-step policies. See its +# documentation for more information. + +# Note: +# When implementing a concrete class (e.g. `AlohaDataset`, `PushtEnv`, `DiffusionPolicy`), you need to: +# 1. set the required class attributes: +# - for classes inheriting from `AbstractDataset`: `available_datasets` +# - for classes inheriting from `AbstractEnv`: `name`, `available_tasks` +# - for classes inheriting from `AbstractPolicy`: `name` +# 2. update variables in `lerobot/__init__.py` (e.g. `available_envs`, `available_datasets_per_envs`, `available_policies`) +# 3. update variables in `tests/test_available.py` by importing your new class +# """ + +# name: str | None = None # same name should be used to instantiate the policy in factory.py + +# def __init__(self, n_action_steps: int | None): +# """ +# n_action_steps: Sets the cache size for storing action trajectories. If None, it is assumed that a single +# action is returned by `select_actions` and that doesn't have a horizon dimension. The `forward` method then +# adds that dimension. +# """ +# super().__init__() +# assert self.name is not None, "Subclasses of `AbstractPolicy` should set the `name` class attribute." +# self.n_action_steps = n_action_steps +# self.clear_action_queue() + +# def clear_action_queue(self): +# """This should be called whenever the environment is reset.""" +# if self.n_action_steps is not None: +# self._action_queue = deque([], maxlen=self.n_action_steps) + +# def forward(self, fn) -> Tensor: +# """Inference step that makes multi-step policies compatible with their single-step environments. + +# WARNING: In general, this should not be overriden. + +# Consider a "policy" that observes the environment then charts a course of N actions to take. To make this fit +# into the formalism of a TorchRL environment, we view it as being effectively a policy that (1) makes an +# observation and prepares a queue of actions, (2) consumes that queue when queried, regardless of the environment +# observation, (3) repopulates the action queue when empty. This method handles the aforementioned logic so that +# the subclass doesn't have to. + +# This method effectively wraps the `select_actions` method of the subclass. The following assumptions are made: +# 1. 
The `select_actions` method returns a Tensor of actions with shape (B, H, *) where B is the batch size, H is +# the action trajectory horizon and * is the action dimensions. +# 2. Prior to the `select_actions` method being called, theres is an `n_action_steps` instance attribute defined. +# """ +# if self.n_action_steps is None: +# return self.select_actions(*args, **kwargs) +# if len(self._action_queue) == 0: +# # `select_actions` returns a (batch_size, n_action_steps, *) tensor, but the queue effectively has shape +# # (n_action_steps, batch_size, *), hence the transpose. +# self._action_queue.extend(self.select_actions(*args, **kwargs).transpose(0, 1)) +# return self._action_queue.popleft() + + + + class ActionChunkingTransformerPolicy(nn.Module): """ Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost @@ -168,14 +229,16 @@ class ActionChunkingTransformerPolicy(nn.Module): nn.init.xavier_uniform_(p) @torch.no_grad() - def select_actions(self, batch, *_): + def select_action(self, batch, *_): # TODO(now): Implement queueing mechanism. self.eval() self._preprocess_batch(batch) # TODO(now): What's up with this 0.182? action = self.forward( - robot_state=batch["observation.state"] * 0.182, image=batch["observation.images.top"] + robot_state=batch["observation.state"] * 0.182, + image=batch["observation.images.top"], + return_loss=False, ) if self.cfg.temporal_agg: @@ -226,7 +289,7 @@ class ActionChunkingTransformerPolicy(nn.Module): assert batch_size % self.cfg.horizon == 0 assert batch_size % num_slices == 0 - loss = self.compute_loss(batch) + loss = self.forward(batch, return_loss=True)["loss"] loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( @@ -247,44 +310,38 @@ class ActionChunkingTransformerPolicy(nn.Module): return info - def compute_loss(self, batch): - loss_dict = self.forward( - robot_state=batch["observation.state"], - image=batch["observation.images.top"], - actions=batch["action"], - ) - loss = loss_dict["loss"] - return loss - - def forward(self, robot_state: Tensor, image: Tensor, actions: Tensor | None = None): + def forward(self, batch: dict[str, Tensor], return_loss: bool = False): # TODO(now): Maybe this shouldn't be here? normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - image = normalize(image) + images = normalize(batch["observation.images.top"]) - is_training = actions is not None - if is_training: # training time - actions = actions[:, : self.horizon] + if return_loss: # training time + actions_hat, (mu_hat, log_sigma_x2_hat) = self._forward( + batch["observation.state"], images, batch["action"] + ) - a_hat, (mu, log_sigma_x2) = self._forward(robot_state, image, actions) - - all_l1 = F.l1_loss(actions, a_hat, reduction="none") - l1 = all_l1.mean() + l1_loss = ( + F.l1_loss(batch["action"], actions_hat, reduction="none") + * ~batch["action_is_pad"].unsqueeze(-1) + ).mean() loss_dict = {} - loss_dict["l1"] = l1 + loss_dict["l1"] = l1_loss if self.cfg.use_vae: # Calculate Dₖₗ(latent_pdf || standard_normal). Note: After computing the KL-divergence for # each dimension independently, we sum over the latent dimension to get the total # KL-divergence per batch element, then take the mean over the batch. # (See App. B of https://arxiv.org/abs/1312.6114 for more details). 
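A quick numerical sanity check of the closed-form KL term used below against torch.distributions. The shapes are illustrative; this snippet is an aside, not part of the patch.

import torch
from torch.distributions import Normal, kl_divergence

mu = torch.randn(8, 32)
log_sigma_x2 = torch.randn(8, 32)  # log(sigma^2)
closed_form = (-0.5 * (1 + log_sigma_x2 - mu.pow(2) - log_sigma_x2.exp())).sum(-1)
reference = kl_divergence(Normal(mu, (0.5 * log_sigma_x2).exp()), Normal(0.0, 1.0)).sum(-1)
assert torch.allclose(closed_form, reference, atol=1e-4)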
- mean_kld = (-0.5 * (1 + log_sigma_x2 - mu.pow(2) - (log_sigma_x2).exp())).sum(-1).mean() + mean_kld = ( + (-0.5 * (1 + log_sigma_x2_hat - mu_hat.pow(2) - (log_sigma_x2_hat).exp())).sum(-1).mean() + ) loss_dict["kl"] = mean_kld loss_dict["loss"] = loss_dict["l1"] + loss_dict["kl"] * self.cfg.kl_weight else: loss_dict["loss"] = loss_dict["l1"] return loss_dict else: - action, _ = self._forward(robot_state, image) # no action, sample from prior + action, _ = self._forward(batch["observation.state"], images) return action def _forward(self, robot_state: Tensor, image: Tensor, actions: Tensor | None = None): @@ -321,7 +378,9 @@ class ActionChunkingTransformerPolicy(nn.Module): # Forward pass through VAE encoder and sample the latent with the reparameterization trick. cls_token_out = self.vae_encoder( vae_encoder_input.permute(1, 0, 2), pos_embed=pos_embed.permute(1, 0, 2) - )[0] # (B, D) + )[ + 0 + ] # (B, D) latent_pdf_params = self.vae_encoder_latent_output_proj(cls_token_out) mu = latent_pdf_params[:, : self.latent_dim] # This is 2log(sigma). Done this way to match the original implementation. diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py index e7ba53fc..b05f9704 100644 --- a/lerobot/scripts/eval.py +++ b/lerobot/scripts/eval.py @@ -251,7 +251,7 @@ def eval(cfg: dict, out_dir=None, stats_path=None): dataset = make_dataset(cfg, stats_path=stats_path) logging.info("Making environment.") - env = make_env(cfg, num_parallel_envs=cfg.eval_episodes) + env = make_env(cfg, num_parallel_envs=cfg.rollout_batch_size) # when policy is None, rollout a random policy policy = make_policy(cfg) if cfg.policy.pretrained_model_path else None diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index d49dfff8..81f3cdbc 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -148,7 +148,8 @@ def train(cfg: dict, out_dir=None, job_name=None): # ) logging.info("make_env") - env = make_env(cfg, num_parallel_envs=cfg.eval_episodes) + # TODO(now): uncomment + #env = make_env(cfg, num_parallel_envs=cfg.eval_episodes) logging.info("make_policy") policy = make_policy(cfg) diff --git a/scripts/convert_act_weights.py b/scripts/convert_act_weights.py index b1492009..d5e38796 100644 --- a/scripts/convert_act_weights.py +++ b/scripts/convert_act_weights.py @@ -28,22 +28,23 @@ for to_remove in start_removals: # Replace keys based on what they start with. 
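The list that follows maps old checkpoint key prefixes to their new names. The loop body that applies them is not shown in this excerpt, so the sketch below is an assumption about how such a prefix rename is typically done (rules applied in order, so a key stripped of "model." can be renamed again by a later rule).

def rename_state_dict_keys(state_dict: dict, start_replacements: list[tuple[str, str]]) -> dict:
    """Sketch only: rename keys by replacing matching prefixes, applying the rules in order."""
    out = {}
    for key, value in state_dict.items():
        for to_replace, replace_with in start_replacements:
            if key.startswith(to_replace):
                key = replace_with + key[len(to_replace):]
        out[key] = value
    return out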
start_replacements = [ - ("model.query_embed.weight", "model.pos_embed.weight"), - ("model.pos_table", "model.vae_encoder_pos_enc"), - ("model.pos_embed.weight", "model.decoder_pos_embed.weight"), - ("model.encoder.", "model.vae_encoder."), - ("model.encoder_action_proj.", "model.vae_encoder_action_input_proj."), - ("model.encoder_joint_proj.", "model.vae_encoder_robot_state_input_proj."), - ("model.latent_proj.", "model.vae_encoder_latent_output_proj."), - ("model.latent_proj.", "model.vae_encoder_latent_output_proj."), - ("model.input_proj.", "model.encoder_img_feat_input_proj."), - ("model.input_proj_robot_state", "model.encoder_robot_state_input_proj"), - ("model.latent_out_proj.", "model.encoder_latent_input_proj."), - ("model.transformer.encoder.", "model.encoder."), - ("model.transformer.decoder.", "model.decoder."), - ("model.backbones.0.0.body.", "model.backbone."), - ("model.additional_pos_embed.weight", "model.encoder_robot_and_latent_pos_embed.weight"), - ("model.cls_embed.weight", "model.vae_encoder_cls_embed.weight"), + ("model.", ""), + ("query_embed.weight", "pos_embed.weight"), + ("pos_table", "vae_encoder_pos_enc"), + ("pos_embed.weight", "decoder_pos_embed.weight"), + ("encoder.", "vae_encoder."), + ("encoder_action_proj.", "vae_encoder_action_input_proj."), + ("encoder_joint_proj.", "vae_encoder_robot_state_input_proj."), + ("latent_proj.", "vae_encoder_latent_output_proj."), + ("latent_proj.", "vae_encoder_latent_output_proj."), + ("input_proj.", "encoder_img_feat_input_proj."), + ("input_proj_robot_state", "encoder_robot_state_input_proj"), + ("latent_out_proj.", "encoder_latent_input_proj."), + ("transformer.encoder.", "encoder."), + ("transformer.decoder.", "decoder."), + ("backbones.0.0.body.", "backbone."), + ("additional_pos_embed.weight", "encoder_robot_and_latent_pos_embed.weight"), + ("cls_embed.weight", "vae_encoder_cls_embed.weight"), ] for to_replace, replace_with in start_replacements: From 1bab4a1dd5fab56f18306496077e7a9db9c9b2fc Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 8 Apr 2024 10:23:26 +0100 Subject: [PATCH 10/25] Eval reproduction works with gym_aloha --- lerobot/common/envs/factory.py | 2 +- lerobot/common/policies/act/policy.py | 130 +++++++++----------------- lerobot/common/policies/factory.py | 1 - lerobot/configs/policy/act.yaml | 2 +- lerobot/scripts/eval.py | 8 +- poetry.lock | 26 +++--- 6 files changed, 66 insertions(+), 103 deletions(-) diff --git a/lerobot/common/envs/factory.py b/lerobot/common/envs/factory.py index 971f4b63..749bb533 100644 --- a/lerobot/common/envs/factory.py +++ b/lerobot/common/envs/factory.py @@ -35,7 +35,7 @@ def make_env(cfg, num_parallel_envs=0) -> gym.Env | gym.vector.SyncVectorEnv: kwargs["task"] = cfg.env.task env_fn = lambda: gym.make( # noqa: E731 - "gym_aloha/AlohaInsertion-v0", + "gym_aloha/AlohaTransferCube-v0", **kwargs, ) else: diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index a9a5ac06..75d5ca0e 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -3,9 +3,10 @@ As per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (https://arxiv.org/abs/2304.13705). The majority of changes here involve removing unused code, unifying naming, and adding helpful comments. 
""" -from collections import deque + import math import time +from collections import deque from itertools import chain from typing import Callable @@ -22,67 +23,6 @@ from torchvision.ops.misc import FrozenBatchNorm2d from lerobot.common.utils import get_safe_torch_device -# class AbstractPolicy(nn.Module): -# """Base policy which all policies should be derived from. - -# The forward method should generally not be overriden as it plays the role of handling multi-step policies. See its -# documentation for more information. - -# Note: -# When implementing a concrete class (e.g. `AlohaDataset`, `PushtEnv`, `DiffusionPolicy`), you need to: -# 1. set the required class attributes: -# - for classes inheriting from `AbstractDataset`: `available_datasets` -# - for classes inheriting from `AbstractEnv`: `name`, `available_tasks` -# - for classes inheriting from `AbstractPolicy`: `name` -# 2. update variables in `lerobot/__init__.py` (e.g. `available_envs`, `available_datasets_per_envs`, `available_policies`) -# 3. update variables in `tests/test_available.py` by importing your new class -# """ - -# name: str | None = None # same name should be used to instantiate the policy in factory.py - -# def __init__(self, n_action_steps: int | None): -# """ -# n_action_steps: Sets the cache size for storing action trajectories. If None, it is assumed that a single -# action is returned by `select_actions` and that doesn't have a horizon dimension. The `forward` method then -# adds that dimension. -# """ -# super().__init__() -# assert self.name is not None, "Subclasses of `AbstractPolicy` should set the `name` class attribute." -# self.n_action_steps = n_action_steps -# self.clear_action_queue() - -# def clear_action_queue(self): -# """This should be called whenever the environment is reset.""" -# if self.n_action_steps is not None: -# self._action_queue = deque([], maxlen=self.n_action_steps) - -# def forward(self, fn) -> Tensor: -# """Inference step that makes multi-step policies compatible with their single-step environments. - -# WARNING: In general, this should not be overriden. - -# Consider a "policy" that observes the environment then charts a course of N actions to take. To make this fit -# into the formalism of a TorchRL environment, we view it as being effectively a policy that (1) makes an -# observation and prepares a queue of actions, (2) consumes that queue when queried, regardless of the environment -# observation, (3) repopulates the action queue when empty. This method handles the aforementioned logic so that -# the subclass doesn't have to. - -# This method effectively wraps the `select_actions` method of the subclass. The following assumptions are made: -# 1. The `select_actions` method returns a Tensor of actions with shape (B, H, *) where B is the batch size, H is -# the action trajectory horizon and * is the action dimensions. -# 2. Prior to the `select_actions` method being called, theres is an `n_action_steps` instance attribute defined. -# """ -# if self.n_action_steps is None: -# return self.select_actions(*args, **kwargs) -# if len(self._action_queue) == 0: -# # `select_actions` returns a (batch_size, n_action_steps, *) tensor, but the queue effectively has shape -# # (n_action_steps, batch_size, *), hence the transpose. 
-# self._action_queue.extend(self.select_actions(*args, **kwargs).transpose(0, 1)) -# return self._action_queue.popleft() - - - - class ActionChunkingTransformerPolicy(nn.Module): """ Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost @@ -228,18 +168,30 @@ class ActionChunkingTransformerPolicy(nn.Module): if p.dim() > 1: nn.init.xavier_uniform_(p) - @torch.no_grad() - def select_action(self, batch, *_): - # TODO(now): Implement queueing mechanism. - self.eval() - self._preprocess_batch(batch) + def reset(self): + """This should be called whenever the environment is reset.""" + if self.n_action_steps is not None: + self._action_queue = deque([], maxlen=self.n_action_steps) - # TODO(now): What's up with this 0.182? - action = self.forward( - robot_state=batch["observation.state"] * 0.182, - image=batch["observation.images.top"], - return_loss=False, - ) + def select_action(self, batch: dict[str, Tensor], *_): + """ + This method wraps `select_actions` in order to return one action at a time for execution in the + environment. It works by managing the actions in a queue and only calling `select_actions` when the + queue is empty. + """ + if len(self._action_queue) == 0: + # `select_actions` returns a (batch_size, n_action_steps, *) tensor, but the queue effectively has shape + # (n_action_steps, batch_size, *), hence the transpose. + self._action_queue.extend(self.select_actions(batch).transpose(0, 1)) + return self._action_queue.popleft() + + @torch.no_grad() + def select_actions(self, batch: dict[str, Tensor]): + """Use the action chunking transformer to generate a sequence of actions.""" + self.eval() + self._preprocess_batch(batch, add_obs_steps_dim=True) + + action = self.forward(batch, return_loss=False) if self.cfg.temporal_agg: # TODO(rcadene): implement temporal aggregation @@ -257,25 +209,37 @@ class ActionChunkingTransformerPolicy(nn.Module): return action[: self.n_action_steps] def __call__(self, *args, **kwargs): - # TODO(now): Temporary bridge. + # TODO(now): Temporary bridge until we know what to do about the `update` method. return self.update(*args, **kwargs) - def _preprocess_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: + def _preprocess_batch( + self, batch: dict[str, Tensor], add_obs_steps_dim: bool = False + ) -> dict[str, Tensor]: """ - Expects batch to have (at least): + This function expects `batch` to have (at least): { - "observation.state": (B, 1, J) tensor of robot states (joint configuration) - - "observation.images.top": (B, 1, C, H, W) tensor of images. + "observation.state": (B, 1, J) OR (B, J) tensor of robot states (joint configuration). + "observation.images.top": (B, 1, C, H, W) OR (B, C, H, W) tensor of images. "action": (B, H, J) tensor of actions (positional target for robot joint configuration) "action_is_pad": (B, H) mask for whether the actions are padding outside of the episode bounds. } """ + if add_obs_steps_dim: + # Add a dimension for the observations steps. Since n_obs_steps > 1 is not supported right now, + # this just amounts to an unsqueeze. + for k in batch: + if k.startswith("observation."): + batch[k] = batch[k].unsqueeze(1) + if batch["observation.state"].shape[1] != 1: raise ValueError(self._multiple_obs_steps_not_handled_msg) batch["observation.state"] = batch["observation.state"].squeeze(1) - # TODO(alexander-soare): generalize this to multiple images. 
Note: no squeeze is required for - # "observation.images.top" because then we'd have to unsqueeze to get get the image index dimension. + # TODO(alexander-soare): generalize this to multiple images. + assert ( + sum(k.startswith("observation.images.") and not k.endswith("is_pad") for k in batch) == 1 + ), "ACT only handles one image for now." + # Note: no squeeze is required for "observation.images.top" because then we'd have to unsqueeze to get + # the image index dimension. def update(self, batch, *_): start_time = time.time() @@ -378,9 +342,7 @@ class ActionChunkingTransformerPolicy(nn.Module): # Forward pass through VAE encoder and sample the latent with the reparameterization trick. cls_token_out = self.vae_encoder( vae_encoder_input.permute(1, 0, 2), pos_embed=pos_embed.permute(1, 0, 2) - )[ - 0 - ] # (B, D) + )[0] # (B, D) latent_pdf_params = self.vae_encoder_latent_output_proj(cls_token_out) mu = latent_pdf_params[:, : self.latent_dim] # This is 2log(sigma). Done this way to match the original implementation. diff --git a/lerobot/common/policies/factory.py b/lerobot/common/policies/factory.py index 90e7ecc1..cc956014 100644 --- a/lerobot/common/policies/factory.py +++ b/lerobot/common/policies/factory.py @@ -26,7 +26,6 @@ def make_policy(cfg): policy = ActionChunkingTransformerPolicy( cfg.policy, cfg.device, - n_obs_steps=cfg.n_obs_steps, n_action_steps=cfg.n_action_steps, ) else: diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml index c1d1801f..80f50003 100644 --- a/lerobot/configs/policy/act.yaml +++ b/lerobot/configs/policy/act.yaml @@ -58,6 +58,6 @@ policy: action_dim: ??? delta_timestamps: - observation.image: [0.0] + observation.images.top: [0.0] observation.state: [0.0] action: [0.0, 0.02, 0.04, 0.06, 0.08, 0.1, 0.12, 0.14, 0.16, 0.18, 0.2, 0.22, 0.24, 0.26, 0.28, 0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.70, 0.72, 0.74, 0.76, 0.78, 0.8, 0.82, 0.84, 0.86, 0.88, 0.9, 0.92, 0.94, 0.96, 0.98, 1.0, 1.02, 1.04, 1.06, 1.08, 1.1, 1.12, 1.14, 1.16, 1.18, 1.2, 1.22, 1.24, 1.26, 1.28, 1.3, 1.32, 1.34, 1.36, 1.38, 1.40, 1.42, 1.44, 1.46, 1.48, 1.5, 1.52, 1.54, 1.56, 1.58, 1.6, 1.62, 1.64, 1.66, 1.68, 1.7, 1.72, 1.74, 1.76, 1.78, 1.8, 1.82, 1.84, 1.86, 1.88, 1.90, 1.92, 1.94, 1.96, 1.98] diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py index b05f9704..b43f4ed1 100644 --- a/lerobot/scripts/eval.py +++ b/lerobot/scripts/eval.py @@ -89,7 +89,9 @@ def eval_policy( visu = env.envs[0].render(mode="visualization") visu = visu[None, ...] # add batch dim else: - visu = np.stack([env.render(mode="visualization") for env in env.envs]) + # TODO(now): Put mode back in. + visu = np.stack([env.render() for env in env.envs]) + # visu = np.stack([env.render(mode="visualization") for env in env.envs]) ep_frames.append(visu) # noqa: B023 for _ in range(num_episodes): @@ -248,7 +250,7 @@ def eval(cfg: dict, out_dir=None, stats_path=None): logging.info("Making transforms.") # TODO(alexander-soare): Completely decouple datasets from evaluation. - dataset = make_dataset(cfg, stats_path=stats_path) + transform = make_dataset(cfg, stats_path=stats_path).transform logging.info("Making environment.") env = make_env(cfg, num_parallel_envs=cfg.rollout_batch_size) @@ -263,7 +265,7 @@ def eval(cfg: dict, out_dir=None, stats_path=None): video_dir=Path(out_dir) / "eval", fps=cfg.env.fps, # TODO(rcadene): what should we do with the transform? 
- transform=dataset.transform, + transform=transform, seed=cfg.seed, ) print(info["aggregated"]) diff --git a/poetry.lock b/poetry.lock index f96f66bc..60354b8a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -941,7 +941,7 @@ mujoco = "^2.3.7" type = "git" url = "git@github.com:huggingface/gym-xarm.git" reference = "HEAD" -resolved_reference = "2eb83fc4fc871b9d271c946d169e42f226ac3a7c" +resolved_reference = "08ddd5a9400783a6898bbf3c3014fc5da3961b9d" [[package]] name = "gymnasium" @@ -1709,20 +1709,20 @@ pyopengl = "*" [[package]] name = "networkx" -version = "3.2.1" +version = "3.3" description = "Python package for creating and manipulating graphs and networks" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" files = [ - {file = "networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2"}, - {file = "networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6"}, + {file = "networkx-3.3-py3-none-any.whl", hash = "sha256:28575580c6ebdaf4505b22c6256a2b9de86b316dc63ba9e93abde3d78dfdbcf2"}, + {file = "networkx-3.3.tar.gz", hash = "sha256:0c127d8b2f4865f59ae9cb8aafcd60b5c70f3241ebd66f7defad7c4ab90126c9"}, ] [package.extras] -default = ["matplotlib (>=3.5)", "numpy (>=1.22)", "pandas (>=1.4)", "scipy (>=1.9,!=1.11.0,!=1.11.1)"] -developer = ["changelist (==0.4)", "mypy (>=1.1)", "pre-commit (>=3.2)", "rtoml"] -doc = ["nb2plots (>=0.7)", "nbconvert (<7.9)", "numpydoc (>=1.6)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.14)", "sphinx (>=7)", "sphinx-gallery (>=0.14)", "texext (>=0.6.7)"] -extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"] +default = ["matplotlib (>=3.6)", "numpy (>=1.23)", "pandas (>=1.4)", "scipy (>=1.9,!=1.11.0,!=1.11.1)"] +developer = ["changelist (==0.5)", "mypy (>=1.1)", "pre-commit (>=3.2)", "rtoml"] +doc = ["myst-nb (>=1.0)", "numpydoc (>=1.7)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.14)", "sphinx (>=7)", "sphinx-gallery (>=0.14)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=2.0)", "pygraphviz (>=1.12)", "sympy (>=1.10)"] test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] [[package]] @@ -3699,20 +3699,20 @@ watchdog = ["watchdog (>=2.3)"] [[package]] name = "zarr" -version = "2.17.1" +version = "2.17.2" description = "An implementation of chunked, compressed, N-dimensional arrays for Python" optional = false python-versions = ">=3.9" files = [ - {file = "zarr-2.17.1-py3-none-any.whl", hash = "sha256:e25df2741a6e92645f3890f30f3136d5b57a0f8f831094b024bbcab5f2797bc7"}, - {file = "zarr-2.17.1.tar.gz", hash = "sha256:564b3aa072122546fe69a0fa21736f466b20fad41754334b62619f088ce46261"}, + {file = "zarr-2.17.2-py3-none-any.whl", hash = "sha256:70d7cc07c24280c380ef80644151d136b7503b0d83c9f214e8000ddc0f57f69b"}, + {file = "zarr-2.17.2.tar.gz", hash = "sha256:2cbaa6cb4e342d45152d4a7a4b2013c337fcd3a8e7bc98253560180de60552ce"}, ] [package.dependencies] asciitree = "*" fasteners = {version = "*", markers = "sys_platform != \"emscripten\""} numcodecs = ">=0.10.0" -numpy = ">=1.21.1" +numpy = ">=1.23" [package.extras] docs = ["numcodecs[msgpack]", "numpydoc", "pydata-sphinx-theme", "sphinx", "sphinx-automodapi", "sphinx-copybutton", "sphinx-design", "sphinx-issues"] From 863f28ffd8883cf0b21ebc4bd4f57c327ecb0cd2 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 8 Apr 2024 13:10:19 +0100 Subject: [PATCH 11/25] ready for review --- lerobot/common/datasets/factory.py | 88 +-------------------- 
lerobot/common/policies/abstract.py | 76 ++++++++++++++++++ lerobot/common/policies/act/policy.py | 9 ++- lerobot/common/policies/diffusion/policy.py | 2 - lerobot/configs/policy/act.yaml | 10 ++- lerobot/scripts/eval.py | 4 +- scripts/convert_act_weights.py | 71 ----------------- 7 files changed, 92 insertions(+), 168 deletions(-) delete mode 100644 scripts/convert_act_weights.py diff --git a/lerobot/common/datasets/factory.py b/lerobot/common/datasets/factory.py index ed7854ff..c22ae698 100644 --- a/lerobot/common/datasets/factory.py +++ b/lerobot/common/datasets/factory.py @@ -59,96 +59,10 @@ def make_dataset( transform=Prod(in_keys=clsfunc.image_keys, prod=1 / 255.0), ) stats = compute_or_load_stats(stats_dataset) + # TODO(rcadene): remove this and put it in config. Ideally we want to reproduce SOTA results just with mean_std normalization_mode = "mean_std" if cfg.env.name == "aloha" else "min_max" - # # TODO(now): These stats are needed to use their pretrained model for sim_transfer_cube_human. - # # (Pdb) stats['observation']['state']['mean'] - # # tensor([-0.0071, -0.6293, 1.0351, -0.0517, -0.4642, -0.0754, 0.4751, -0.0373, - # # -0.3324, 0.9034, -0.2258, -0.3127, -0.2412, 0.6866]) - # stats["observation", "state", "mean"] = torch.tensor( - # [ - # -0.00740268, - # -0.63187766, - # 1.0356655, - # -0.05027218, - # -0.46199223, - # -0.07467502, - # 0.47467607, - # -0.03615446, - # -0.33203387, - # 0.9038929, - # -0.22060776, - # -0.31011587, - # -0.23484458, - # 0.6842416, - # ] - # ) - # # (Pdb) stats['observation']['state']['std'] - # # tensor([0.0022, 0.0520, 0.0291, 0.0092, 0.0267, 0.0145, 0.0563, 0.0179, 0.0494, - # # 0.0326, 0.0476, 0.0535, 0.0956, 0.0513]) - # stats["observation", "state", "std"] = torch.tensor( - # [ - # 0.01219023, - # 0.2975381, - # 0.16728032, - # 0.04733803, - # 0.1486037, - # 0.08788499, - # 0.31752336, - # 0.1049916, - # 0.27933604, - # 0.18094037, - # 0.26604933, - # 0.30466506, - # 0.5298686, - # 0.25505227, - # ] - # ) - # # (Pdb) stats['action']['mean'] - # # tensor([-0.0075, -0.6346, 1.0353, -0.0465, -0.4686, -0.0738, 0.3723, -0.0396, - # # -0.3184, 0.8991, -0.2065, -0.3182, -0.2338, 0.5593]) - # stats["action"]["mean"] = torch.tensor( - # [ - # -0.00756444, - # -0.6281845, - # 1.0312834, - # -0.04664314, - # -0.47211358, - # -0.074527, - # 0.37389806, - # -0.03718753, - # -0.3261143, - # 0.8997205, - # -0.21371077, - # -0.31840396, - # -0.23360962, - # 0.551947, - # ] - # ) - # # (Pdb) stats['action']['std'] - # # tensor([0.0023, 0.0514, 0.0290, 0.0086, 0.0263, 0.0143, 0.0593, 0.0185, 0.0510, - # # 0.0328, 0.0478, 0.0531, 0.0945, 0.0794]) - # stats["action"]["std"] = torch.tensor( - # [ - # 0.01252818, - # 0.2957442, - # 0.16701928, - # 0.04584508, - # 0.14833844, - # 0.08763024, - # 0.30665937, - # 0.10600077, - # 0.27572668, - # 0.1805853, - # 0.26304692, - # 0.30708534, - # 0.5305411, - # 0.38381037, - # ] - # ) - # transforms.append(NormalizeTransform(stats, in_keys, mode=normalization_mode)) # noqa: F821 - transforms = v2.Compose( [ # TODO(rcadene): we need to do something about image_keys diff --git a/lerobot/common/policies/abstract.py b/lerobot/common/policies/abstract.py index beebd8ac..6dc72bef 100644 --- a/lerobot/common/policies/abstract.py +++ b/lerobot/common/policies/abstract.py @@ -4,3 +4,79 @@ import torch from torch import Tensor, nn +class AbstractPolicy(nn.Module): + """Base policy which all policies should be derived from. 
+ + The forward method should generally not be overriden as it plays the role of handling multi-step policies. See its + documentation for more information. + + Note: + When implementing a concrete class (e.g. `AlohaDataset`, `PushtEnv`, `DiffusionPolicy`), you need to: + 1. set the required class attributes: + - for classes inheriting from `AbstractDataset`: `available_datasets` + - for classes inheriting from `AbstractEnv`: `name`, `available_tasks` + - for classes inheriting from `AbstractPolicy`: `name` + 2. update variables in `lerobot/__init__.py` (e.g. `available_envs`, `available_datasets_per_envs`, `available_policies`) + 3. update variables in `tests/test_available.py` by importing your new class + """ + + name: str | None = None # same name should be used to instantiate the policy in factory.py + + def __init__(self, n_action_steps: int | None): + """ + n_action_steps: Sets the cache size for storing action trajectories. If None, it is assumed that a single + action is returned by `select_actions` and that doesn't have a horizon dimension. The `forward` method then + adds that dimension. + """ + super().__init__() + assert self.name is not None, "Subclasses of `AbstractPolicy` should set the `name` class attribute." + self.n_action_steps = n_action_steps + self.clear_action_queue() + + def update(self, replay_buffer, step): + """One step of the policy's learning algorithm.""" + raise NotImplementedError("Abstract method") + + def save(self, fp): + torch.save(self.state_dict(), fp) + + def load(self, fp): + d = torch.load(fp) + self.load_state_dict(d) + + def select_actions(self, observation) -> Tensor: + """Select an action (or trajectory of actions) based on an observation during rollout. + + If n_action_steps was provided at initialization, this should return a (batch_size, n_action_steps, *) tensor of + actions. Otherwise if n_actions_steps is None, this should return a (batch_size, *) tensor of actions. + """ + raise NotImplementedError("Abstract method") + + def clear_action_queue(self): + """This should be called whenever the environment is reset.""" + if self.n_action_steps is not None: + self._action_queue = deque([], maxlen=self.n_action_steps) + + def forward(self, *args, **kwargs) -> Tensor: + """Inference step that makes multi-step policies compatible with their single-step environments. + + WARNING: In general, this should not be overriden. + + Consider a "policy" that observes the environment then charts a course of N actions to take. To make this fit + into the formalism of a TorchRL environment, we view it as being effectively a policy that (1) makes an + observation and prepares a queue of actions, (2) consumes that queue when queried, regardless of the environment + observation, (3) repopulates the action queue when empty. This method handles the aforementioned logic so that + the subclass doesn't have to. + + This method effectively wraps the `select_actions` method of the subclass. The following assumptions are made: + 1. The `select_actions` method returns a Tensor of actions with shape (B, H, *) where B is the batch size, H is + the action trajectory horizon and * is the action dimensions. + 2. Prior to the `select_actions` method being called, theres is an `n_action_steps` instance attribute defined. 
+ """ + if self.n_action_steps is None: + return self.select_actions(*args, **kwargs) + if len(self._action_queue) == 0: + # `select_actions` returns a (batch_size, n_action_steps, *) tensor, but the queue effectively has shape + # (n_action_steps, batch_size, *), hence the transpose. + self._action_queue.extend(self.select_actions(*args, **kwargs).transpose(0, 1)) + return self._action_queue.popleft() diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index 75d5ca0e..834dd9b2 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -67,7 +67,7 @@ class ActionChunkingTransformerPolicy(nn.Module): def __init__(self, cfg, device, n_action_steps=1): """ - TODO(alexander-soare): Add documentation for all parameters. + TODO(alexander-soare): Add documentation for all parameters once we have model configs established. """ super().__init__() if getattr(cfg, "n_obs_steps", 1) != 1: @@ -109,6 +109,9 @@ class ActionChunkingTransformerPolicy(nn.Module): ) # Backbone for image feature extraction. + self.image_normalizer = transforms.Normalize( + mean=cfg.image_normalization.mean, std=cfg.image_normalization.std + ) backbone_model = getattr(torchvision.models, cfg.backbone)( replace_stride_with_dilation=[False, False, cfg.dilation], pretrained=cfg.pretrained_backbone, @@ -275,9 +278,7 @@ class ActionChunkingTransformerPolicy(nn.Module): return info def forward(self, batch: dict[str, Tensor], return_loss: bool = False): - # TODO(now): Maybe this shouldn't be here? - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - images = normalize(batch["observation.images.top"]) + images = self.image_normalizer(batch["observation.images.top"]) if return_loss: # training time actions_hat, (mu_hat, log_sigma_x2_hat) = self._forward( diff --git a/lerobot/common/policies/diffusion/policy.py b/lerobot/common/policies/diffusion/policy.py index 93e5ba5d..9785358b 100644 --- a/lerobot/common/policies/diffusion/policy.py +++ b/lerobot/common/policies/diffusion/policy.py @@ -151,7 +151,6 @@ class DiffusionPolicy(nn.Module): self.diffusion.train() - data_s = time.time() - start_time loss = self.diffusion.compute_loss(batch) loss.backward() @@ -172,7 +171,6 @@ class DiffusionPolicy(nn.Module): "loss": loss.item(), "grad_norm": float(grad_norm), "lr": self.lr_scheduler.get_last_lr()[0], - "data_s": data_s, "update_s": time.time() - start_time, } diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml index 80f50003..cd34d115 100644 --- a/lerobot/configs/policy/act.yaml +++ b/lerobot/configs/policy/act.yaml @@ -1,6 +1,6 @@ # @package _global_ -offline_steps: 2000 +offline_steps: 80000 online_steps: 0 eval_episodes: 1 @@ -54,8 +54,12 @@ policy: temporal_agg: false - state_dim: ??? - action_dim: ??? + state_dim: 14 + action_dim: 14 + + image_normalization: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] delta_timestamps: observation.images.top: [0.0] diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py index b43f4ed1..72966211 100644 --- a/lerobot/scripts/eval.py +++ b/lerobot/scripts/eval.py @@ -86,7 +86,9 @@ def eval_policy( def maybe_render_frame(env): if save_video: # noqa: B023 if return_first_video: - visu = env.envs[0].render(mode="visualization") + # TODO(now): Put mode back in. + visu = env.envs[0].render() + # visu = env.envs[0].render(mode="visualization") visu = visu[None, ...] # add batch dim else: # TODO(now): Put mode back in. 
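As a standalone illustration (not part of any patch in this series) of the queueing behaviour documented in `AbstractPolicy.forward` above: the sketch below mirrors that logic with made-up names, so `ToyChunkPolicy`, its dimensions, and the zero-valued actions are illustrative stand-ins rather than code from this repository.

from collections import deque

import torch
from torch import Tensor, nn


class ToyChunkPolicy(nn.Module):
    """Illustrative policy that plans a chunk of n_action_steps actions and serves them one call at a time."""

    def __init__(self, n_action_steps: int, action_dim: int = 2):
        super().__init__()
        self.n_action_steps = n_action_steps
        self.action_dim = action_dim
        self._action_queue = deque([], maxlen=n_action_steps)

    def select_actions(self, observation: Tensor) -> Tensor:
        # Return a (batch_size, n_action_steps, action_dim) chunk; a real policy would condition on `observation`.
        return torch.zeros(observation.shape[0], self.n_action_steps, self.action_dim)

    def forward(self, observation: Tensor) -> Tensor:
        # Repopulate the queue only when it is empty, then pop a single (batch_size, action_dim) action.
        if len(self._action_queue) == 0:
            # The queue stores per-step batches, hence the transpose to (n_action_steps, batch_size, action_dim).
            self._action_queue.extend(self.select_actions(observation).transpose(0, 1))
        return self._action_queue.popleft()


obs = torch.zeros(4, 8)  # (batch_size, obs_dim)
policy = ToyChunkPolicy(n_action_steps=3)
actions = [policy(obs) for _ in range(5)]  # select_actions only runs on calls 1 and 4
assert all(a.shape == (4, 2) for a in actions)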
diff --git a/scripts/convert_act_weights.py b/scripts/convert_act_weights.py deleted file mode 100644 index d5e38796..00000000 --- a/scripts/convert_act_weights.py +++ /dev/null @@ -1,71 +0,0 @@ -import torch - -from lerobot.common.policies.factory import make_policy -from lerobot.common.utils import init_hydra_config - -cfg = init_hydra_config( - "/home/alexander/Projects/lerobot/outputs/train/act_aloha_sim_transfer_cube_human/.hydra/config.yaml" -) - -policy = make_policy(cfg) - -state_dict = torch.load("/home/alexander/Projects/act/outputs/sim_transfer_cube_human_vae/policy_last.ckpt") - -# Remove keys based on what they start with. - -start_removals = [ - # There is a bug that means the pretrained model doesn't even use the final decoder layers. - *[f"model.transformer.decoder.layers.{i}" for i in range(1, 7)], - "model.is_pad_head.", -] - -for to_remove in start_removals: - for k in list(state_dict.keys()): - if k.startswith(to_remove): - del state_dict[k] - - -# Replace keys based on what they start with. - -start_replacements = [ - ("model.", ""), - ("query_embed.weight", "pos_embed.weight"), - ("pos_table", "vae_encoder_pos_enc"), - ("pos_embed.weight", "decoder_pos_embed.weight"), - ("encoder.", "vae_encoder."), - ("encoder_action_proj.", "vae_encoder_action_input_proj."), - ("encoder_joint_proj.", "vae_encoder_robot_state_input_proj."), - ("latent_proj.", "vae_encoder_latent_output_proj."), - ("latent_proj.", "vae_encoder_latent_output_proj."), - ("input_proj.", "encoder_img_feat_input_proj."), - ("input_proj_robot_state", "encoder_robot_state_input_proj"), - ("latent_out_proj.", "encoder_latent_input_proj."), - ("transformer.encoder.", "encoder."), - ("transformer.decoder.", "decoder."), - ("backbones.0.0.body.", "backbone."), - ("additional_pos_embed.weight", "encoder_robot_and_latent_pos_embed.weight"), - ("cls_embed.weight", "vae_encoder_cls_embed.weight"), -] - -for to_replace, replace_with in start_replacements: - for k in list(state_dict.keys()): - if k.startswith(to_replace): - k_ = replace_with + k.removeprefix(to_replace) - state_dict[k_] = state_dict[k] - del state_dict[k] - - -missing_keys, unexpected_keys = policy.load_state_dict(state_dict, strict=False) - -if len(missing_keys) != 0: - print("MISSING KEYS") - print(missing_keys) -if len(unexpected_keys) != 0: - print("UNEXPECTED KEYS") - print(unexpected_keys) - -# if len(missing_keys) != 0 or len(unexpected_keys) != 0: -# print("Failed due to mismatch in state dicts.") -# exit() - -policy.save("/tmp/weights.pth") From 86365adf9fd909c5037f0a3a00a0e1d706a44c61 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 8 Apr 2024 14:44:10 +0100 Subject: [PATCH 13/25] revision --- lerobot/common/envs/factory.py | 12 +++++++----- lerobot/common/policies/act/policy.py | 19 +++++++++++-------- lerobot/configs/policy/act.yaml | 2 +- lerobot/scripts/train.py | 3 +-- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/lerobot/common/envs/factory.py b/lerobot/common/envs/factory.py index 749bb533..bcbdb95d 100644 --- a/lerobot/common/envs/factory.py +++ b/lerobot/common/envs/factory.py @@ -32,12 +32,14 @@ def make_env(cfg, num_parallel_envs=0) -> gym.Env | gym.vector.SyncVectorEnv: elif cfg.env.name == "aloha": import gym_aloha # noqa: F401 - kwargs["task"] = cfg.env.task + if cfg.env.task == "sim_transfer_cube": + env_name = "gym_aloha/AlohaTransferCube-v0" + elif cfg.env.task == "sim_insertion": + env_name = "gym_aloha/AlohaInsertion-v0" + else: + raise ValueError(f"`{cfg.env.task}` has no environment 
implementation.") - env_fn = lambda: gym.make( # noqa: E731 - "gym_aloha/AlohaTransferCube-v0", - **kwargs, - ) + env_fn = lambda: gym.make(env_name, **kwargs) # noqa: E731 else: raise ValueError(cfg.env.name) diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index 834dd9b2..7fb03576 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -337,18 +337,21 @@ class ActionChunkingTransformerPolicy(nn.Module): robot_state_embed = self.vae_encoder_robot_state_input_proj(robot_state).unsqueeze(1) # (B, 1, D) action_embed = self.vae_encoder_action_input_proj(actions) # (B, S, D) vae_encoder_input = torch.cat([cls_embed, robot_state_embed, action_embed], axis=1) # (B, S+2, D) - # Note: detach() shouldn't be necessary but leaving it the same as the original code just in case. + # Prepare fixed positional embedding. + # Note: detach() shouldn't be necessary but leaving it the same as the original code just in case. pos_embed = self.vae_encoder_pos_enc.clone().detach() # (1, S+2, D) - # Forward pass through VAE encoder and sample the latent with the reparameterization trick. + + # Forward pass through VAE encoder. cls_token_out = self.vae_encoder( vae_encoder_input.permute(1, 0, 2), pos_embed=pos_embed.permute(1, 0, 2) - )[0] # (B, D) + )[0] # select the class token, with shape (B, D) latent_pdf_params = self.vae_encoder_latent_output_proj(cls_token_out) + + # Sample the latent with the reparameterization trick. mu = latent_pdf_params[:, : self.latent_dim] # This is 2log(sigma). Done this way to match the original implementation. log_sigma_x2 = latent_pdf_params[:, self.latent_dim :] - # Use reparameterization trick to sample from the latent's PDF. latent_sample = mu + log_sigma_x2.div(2).exp() * torch.randn_like(mu) else: # When not using the VAE encoder, we set the latent to be all zeros. @@ -469,7 +472,7 @@ class _TransformerEncoderLayer(nn.Module): if self.normalize_before: x = self.norm1(x) q = k = x if pos_embed is None else x + pos_embed - x = self.self_attn(q, k, value=x)[0] + x = self.self_attn(q, k, value=x)[0] # select just the output, not the attention weights x = skip + self.dropout1(x) if self.normalize_before: skip = x @@ -563,7 +566,7 @@ class _TransformerDecoderLayer(nn.Module): if self.normalize_before: x = self.norm1(x) q = k = self.maybe_add_pos_embed(x, decoder_pos_embed) - x = self.self_attn(q, k, value=x)[0] + x = self.self_attn(q, k, value=x)[0] # select just the output, not the attention weights x = skip + self.dropout1(x) if self.normalize_before: skip = x @@ -575,7 +578,7 @@ class _TransformerDecoderLayer(nn.Module): query=self.maybe_add_pos_embed(x, decoder_pos_embed), key=self.maybe_add_pos_embed(encoder_out, encoder_pos_embed), value=encoder_out, - )[0] + )[0] # select just the output, not the attention weights x = skip + self.dropout2(x) if self.normalize_before: skip = x @@ -634,7 +637,7 @@ class _SinusoidalPositionEmbedding2D(nn.Module): Returns: A (1, C, H, W) batch of corresponding sinusoidal positional embeddings. """ - not_mask = torch.ones_like(x[0, [0]]) # (1, H, W) + not_mask = torch.ones_like(x[0, :1]) # (1, H, W) # Note: These are like range(1, H+1) and range(1, W+1) respectively, but in most implementations # they would be range(0, H) and range(0, W). Keeping it at as to match the original code. 
y_range = not_mask.cumsum(1, dtype=torch.float32) diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml index cd34d115..79729a02 100644 --- a/lerobot/configs/policy/act.yaml +++ b/lerobot/configs/policy/act.yaml @@ -64,4 +64,4 @@ policy: delta_timestamps: observation.images.top: [0.0] observation.state: [0.0] - action: [0.0, 0.02, 0.04, 0.06, 0.08, 0.1, 0.12, 0.14, 0.16, 0.18, 0.2, 0.22, 0.24, 0.26, 0.28, 0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.70, 0.72, 0.74, 0.76, 0.78, 0.8, 0.82, 0.84, 0.86, 0.88, 0.9, 0.92, 0.94, 0.96, 0.98, 1.0, 1.02, 1.04, 1.06, 1.08, 1.1, 1.12, 1.14, 1.16, 1.18, 1.2, 1.22, 1.24, 1.26, 1.28, 1.3, 1.32, 1.34, 1.36, 1.38, 1.40, 1.42, 1.44, 1.46, 1.48, 1.5, 1.52, 1.54, 1.56, 1.58, 1.6, 1.62, 1.64, 1.66, 1.68, 1.7, 1.72, 1.74, 1.76, 1.78, 1.8, 1.82, 1.84, 1.86, 1.88, 1.90, 1.92, 1.94, 1.96, 1.98] + action: "[i / ${fps} for i in range(${horizon})]" diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index d49dfff8..caaf5182 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -152,7 +152,6 @@ def train(cfg: dict, out_dir=None, job_name=None): logging.info("make_policy") policy = make_policy(cfg) - policy.save("act.pt") num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad) num_total_params = sum(p.numel() for p in policy.parameters()) @@ -198,7 +197,7 @@ def train(cfg: dict, out_dir=None, job_name=None): is_offline = True dataloader = torch.utils.data.DataLoader( dataset, - num_workers=0, + num_workers=4, batch_size=cfg.policy.batch_size, shuffle=True, pin_memory=cfg.device != "cpu", From 62b18a7607d955eed60ba7eff70b71162f5acaf2 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 8 Apr 2024 14:51:45 +0100 Subject: [PATCH 14/25] Add type hints --- lerobot/common/policies/act/policy.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index 7fb03576..e14a1e88 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -176,7 +176,7 @@ class ActionChunkingTransformerPolicy(nn.Module): if self.n_action_steps is not None: self._action_queue = deque([], maxlen=self.n_action_steps) - def select_action(self, batch: dict[str, Tensor], *_): + def select_action(self, batch: dict[str, Tensor], *_) -> Tensor: """ This method wraps `select_actions` in order to return one action at a time for execution in the environment. It works by managing the actions in a queue and only calling `select_actions` when the @@ -189,7 +189,7 @@ class ActionChunkingTransformerPolicy(nn.Module): return self._action_queue.popleft() @torch.no_grad() - def select_actions(self, batch: dict[str, Tensor]): + def select_actions(self, batch: dict[str, Tensor]) -> Tensor: """Use the action chunking transformer to generate a sequence of actions.""" self.eval() self._preprocess_batch(batch, add_obs_steps_dim=True) @@ -211,7 +211,7 @@ class ActionChunkingTransformerPolicy(nn.Module): return action[: self.n_action_steps] - def __call__(self, *args, **kwargs): + def __call__(self, *args, **kwargs) -> dict: # TODO(now): Temporary bridge until we know what to do about the `update` method. 
return self.update(*args, **kwargs) @@ -244,7 +244,7 @@ class ActionChunkingTransformerPolicy(nn.Module): # Note: no squeeze is required for "observation.images.top" because then we'd have to unsqueeze to get # the image index dimension. - def update(self, batch, *_): + def update(self, batch, *_) -> dict: start_time = time.time() self._preprocess_batch(batch) @@ -277,7 +277,7 @@ class ActionChunkingTransformerPolicy(nn.Module): return info - def forward(self, batch: dict[str, Tensor], return_loss: bool = False): + def forward(self, batch: dict[str, Tensor], return_loss: bool = False) -> dict | Tensor: images = self.image_normalizer(batch["observation.images.top"]) if return_loss: # training time @@ -309,7 +309,9 @@ class ActionChunkingTransformerPolicy(nn.Module): action, _ = self._forward(batch["observation.state"], images) return action - def _forward(self, robot_state: Tensor, image: Tensor, actions: Tensor | None = None): + def _forward( + self, robot_state: Tensor, image: Tensor, actions: Tensor | None = None + ) -> tuple[Tensor, tuple[Tensor, Tensor]]: """ Args: robot_state: (B, J) batch of robot joint configurations. @@ -410,7 +412,7 @@ class ActionChunkingTransformerPolicy(nn.Module): actions = self.action_head(decoder_out) - return actions, [mu, log_sigma_x2] + return actions, (mu, log_sigma_x2) def save(self, fp): torch.save(self.state_dict(), fp) From 0b4c42f4ffa6c0efcaf30f8b407789150bc001d2 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 8 Apr 2024 14:59:37 +0100 Subject: [PATCH 15/25] typos --- lerobot/common/policies/act/policy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index e14a1e88..b8276214 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -641,7 +641,7 @@ class _SinusoidalPositionEmbedding2D(nn.Module): """ not_mask = torch.ones_like(x[0, :1]) # (1, H, W) # Note: These are like range(1, H+1) and range(1, W+1) respectively, but in most implementations - # they would be range(0, H) and range(0, W). Keeping it at as to match the original code. + # they would be range(0, H) and range(0, W). Keeping it at as is to match the original code. y_range = not_mask.cumsum(1, dtype=torch.float32) x_range = not_mask.cumsum(2, dtype=torch.float32) @@ -659,7 +659,7 @@ class _SinusoidalPositionEmbedding2D(nn.Module): y_range = y_range.unsqueeze(-1) / inverse_frequency # (1, H, W, 1) # Note: this stack then flatten operation results in interleaved sine and cosine terms. - # pos_embed_x and pos_embed are (1, H, W, C // 2). + # pos_embed_x and pos_embed_y are (1, H, W, C // 2). 
pos_embed_x = torch.stack((x_range[..., 0::2].sin(), x_range[..., 1::2].cos()), dim=-1).flatten(3) pos_embed_y = torch.stack((y_range[..., 0::2].sin(), y_range[..., 1::2].cos()), dim=-1).flatten(3) pos_embed = torch.cat((pos_embed_y, pos_embed_x), dim=3).permute(0, 3, 1, 2) # (1, C, H, W) From 91e0e4e175236b859cdc463d8d3418b22d9c2ef8 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 8 Apr 2024 15:05:40 +0100 Subject: [PATCH 16/25] rever change --- lerobot/scripts/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py index 72966211..802a2eb6 100644 --- a/lerobot/scripts/eval.py +++ b/lerobot/scripts/eval.py @@ -255,7 +255,7 @@ def eval(cfg: dict, out_dir=None, stats_path=None): transform = make_dataset(cfg, stats_path=stats_path).transform logging.info("Making environment.") - env = make_env(cfg, num_parallel_envs=cfg.rollout_batch_size) + env = make_env(cfg, num_parallel_envs=cfg.eval_episodes) # when policy is None, rollout a random policy policy = make_policy(cfg) if cfg.policy.pretrained_model_path else None From d9019d9e7eae22b3b250ae445f35cae458c82464 Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 10:24:28 +0200 Subject: [PATCH 17/25] disable env_checker in factory --- lerobot/common/envs/factory.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lerobot/common/envs/factory.py b/lerobot/common/envs/factory.py index c8d10851..d5571935 100644 --- a/lerobot/common/envs/factory.py +++ b/lerobot/common/envs/factory.py @@ -30,10 +30,13 @@ def make_env(cfg, num_parallel_envs=0) -> gym.Env | gym.vector.SyncVectorEnv: if num_parallel_envs == 0: # non-batched version of the env that returns an observation of shape (c) - env = gym.make(gym_handle, **kwargs) + env = gym.make(gym_handle, disable_env_checker=True, **kwargs) else: # batched version of the env that returns an observation of shape (b, c) env = gym.vector.SyncVectorEnv( - [lambda: gym.make(gym_handle, **kwargs) for _ in range(num_parallel_envs)] + [ + lambda: gym.make(gym_handle, disable_env_checker=True, **kwargs) + for _ in range(num_parallel_envs) + ] ) return env From 274f20b49d018251e1414f9dab98c59ce5a2d23b Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 10:25:41 +0200 Subject: [PATCH 18/25] Update gym-pusht --- poetry.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 95c9f31e..f712289e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -920,7 +920,7 @@ shapely = "^2.0.3" type = "git" url = "git@github.com:huggingface/gym-pusht.git" reference = "HEAD" -resolved_reference = "6c9893504f670ff069d0f759a733e971ea1efdbf" +resolved_reference = "824b22832cc8d71a4b4e96a57563510cf47e30c1" [[package]] name = "gym-xarm" From 2573e89e1df6136142e883ec23cd066e3a75c657 Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 10:38:08 +0200 Subject: [PATCH 19/25] Remove direct dependencies --- .github/poetry/cpu/poetry.lock | 51 ++++++++++++++----------------- .github/poetry/cpu/pyproject.toml | 14 ++++----- poetry.lock | 36 +++++++++++----------- pyproject.toml | 7 +---- 4 files changed, 49 insertions(+), 59 deletions(-) diff --git a/.github/poetry/cpu/poetry.lock b/.github/poetry/cpu/poetry.lock index 15b27c76..edc1d503 100644 --- a/.github/poetry/cpu/poetry.lock +++ b/.github/poetry/cpu/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand. 
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "absl-py" @@ -517,21 +517,11 @@ files = [ {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"}, ] -[[package]] -name = "dm" -version = "1.3" -description = "Dict to Data mapper" -optional = false -python-versions = "*" -files = [ - {file = "dm-1.3.tar.gz", hash = "sha256:ce77537bf346b5d8c0dc0b5d679cfc4a946faadcd5315e6c80ef6f3af824130d"}, -] - [[package]] name = "dm-control" version = "1.0.14" description = "Continuous control environments and MuJoCo Python bindings." -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "dm_control-1.0.14-py3-none-any.whl", hash = "sha256:883c63244a7ebf598700a97564ed19fffd3479ca79efd090aed881609cdb9fc6"}, @@ -562,7 +552,7 @@ hdf5 = ["h5py"] name = "dm-env" version = "1.6" description = "A Python interface for Reinforcement Learning environments." -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "dm-env-1.6.tar.gz", hash = "sha256:a436eb1c654c39e0c986a516cee218bea7140b510fceff63f97eb4fcff3d93de"}, @@ -578,7 +568,7 @@ numpy = "*" name = "dm-tree" version = "0.1.8" description = "Tree is a library for working with nested data structures." -optional = false +optional = true python-versions = "*" files = [ {file = "dm-tree-0.1.8.tar.gz", hash = "sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430"}, @@ -806,7 +796,7 @@ test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre name = "glfw" version = "2.7.0" description = "A ctypes-based wrapper for GLFW3." -optional = false +optional = true python-versions = "*" files = [ {file = "glfw-2.7.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-macosx_10_6_intel.whl", hash = "sha256:bd82849edcceda4e262bd1227afaa74b94f9f0731c1197863cd25c15bfc613fc"}, @@ -986,7 +976,7 @@ toy-text = ["pygame (>=2.1.3)", "pygame (>=2.1.3)"] name = "gymnasium-robotics" version = "1.2.4" description = "Robotics environments for the Gymnasium repo." -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "gymnasium-robotics-1.2.4.tar.gz", hash = "sha256:d304192b066f8b800599dfbe3d9d90bba9b761ee884472bdc4d05968a8bc61cb"}, @@ -1218,7 +1208,7 @@ i18n = ["Babel (>=2.7)"] name = "labmaze" version = "1.0.6" description = "LabMaze: DeepMind Lab's text maze generator." -optional = false +optional = true python-versions = "*" files = [ {file = "labmaze-1.0.6-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:b2ddef976dfd8d992b19cfa6c633f2eba7576d759c2082da534e3f727479a84a"}, @@ -1262,7 +1252,7 @@ setuptools = "!=50.0.0" name = "lazy-loader" version = "0.3" description = "lazy_loader" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "lazy_loader-0.3-py3-none-any.whl", hash = "sha256:1e9e76ee8631e264c62ce10006718e80b2cfc74340d17d1031e0f84af7478554"}, @@ -1307,7 +1297,7 @@ files = [ name = "lxml" version = "5.1.0" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
-optional = false +optional = true python-versions = ">=3.6" files = [ {file = "lxml-5.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:704f5572ff473a5f897745abebc6df40f22d4133c1e0a1f124e4f2bd3330ff7e"}, @@ -1525,7 +1515,7 @@ tests = ["pytest (>=4.6)"] name = "mujoco" version = "2.3.7" description = "MuJoCo Physics Simulator" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "mujoco-2.3.7-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:e8714a5ff6a1561b364b7b4648d4c0c8d13e751874cf7401c309b9d23fa9598b"}, @@ -1839,7 +1829,7 @@ xml = ["lxml (>=4.9.2)"] name = "pettingzoo" version = "1.24.3" description = "Gymnasium for multi-agent reinforcement learning." -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "pettingzoo-1.24.3-py3-none-any.whl", hash = "sha256:23ed90517d2e8a7098bdaf5e31234b3a7f7b73ca578d70d1ca7b9d0cb0e37982"}, @@ -2207,7 +2197,7 @@ dev = ["aafigure", "matplotlib", "pygame", "pyglet (<2.0.0)", "sphinx", "wheel"] name = "pyopengl" version = "3.1.7" description = "Standard OpenGL bindings for Python" -optional = false +optional = true python-versions = "*" files = [ {file = "PyOpenGL-3.1.7-py3-none-any.whl", hash = "sha256:a6ab19cf290df6101aaf7470843a9c46207789855746399d0af92521a0a92b7a"}, @@ -2218,7 +2208,7 @@ files = [ name = "pyparsing" version = "3.1.2" description = "pyparsing module - Classes and methods to define and execute parsing grammars" -optional = false +optional = true python-versions = ">=3.6.8" files = [ {file = "pyparsing-3.1.2-py3-none-any.whl", hash = "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"}, @@ -2649,7 +2639,7 @@ torch = ["safetensors[numpy]", "torch (>=1.10)"] name = "scikit-image" version = "0.22.0" description = "Image processing in Python" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "scikit_image-0.22.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:74ec5c1d4693506842cc7c9487c89d8fc32aed064e9363def7af08b8f8cbb31d"}, @@ -2697,7 +2687,7 @@ test = ["asv", "matplotlib (>=3.5)", "numpydoc (>=1.5)", "pooch (>=1.6.0)", "pyt name = "scipy" version = "1.12.0" description = "Fundamental algorithms for scientific computing in Python" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "scipy-1.12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:78e4402e140879387187f7f25d91cc592b3501a2e51dfb320f48dfb73565f10b"}, @@ -2902,7 +2892,7 @@ testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jar name = "shapely" version = "2.0.3" description = "Manipulation and analysis of geometric objects" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "shapely-2.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:af7e9abe180b189431b0f490638281b43b84a33a960620e6b2e8d3e3458b61a1"}, @@ -3069,7 +3059,7 @@ tests = ["pytest", "pytest-cov"] name = "tifffile" version = "2024.2.12" description = "Read and write TIFF files" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "tifffile-2024.2.12-py3-none-any.whl", hash = "sha256:870998f82fbc94ff7c3528884c1b0ae54863504ff51dbebea431ac3fa8fb7c21"}, @@ -3331,7 +3321,12 @@ files = [ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", 
"pytest-mypy", "pytest-ruff (>=0.2.1)"] +[extras] +aloha = ["gym-aloha"] +pusht = ["gym-pusht"] +xarm = ["gym-xarm"] + [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "32cd6caa01276a90b37cb177204e5b1511e92838f3f0268391034042d56f3bd6" +content-hash = "ba2b64b1a683450b097a1ccbed3410cc3bee97ba8b41f409c5e379f95d8b1c6f" diff --git a/.github/poetry/cpu/pyproject.toml b/.github/poetry/cpu/pyproject.toml index d310da47..741e3b37 100644 --- a/.github/poetry/cpu/pyproject.toml +++ b/.github/poetry/cpu/pyproject.toml @@ -23,7 +23,6 @@ packages = [{include = "lerobot"}] python = "^3.10" termcolor = "^2.4.0" omegaconf = "^2.3.0" -dm-env = "^1.6" pandas = "^2.2.1" wandb = "^0.16.3" moviepy = "^1.0.3" @@ -34,21 +33,15 @@ einops = "^0.7.0" pygame = "^2.5.2" pymunk = "^6.6.0" zarr = "^2.17.0" -shapely = "^2.0.3" -scikit-image = "^0.22.0" numba = "^0.59.0" mpmath = "^1.3.0" torch = {version = "^2.2.1", source = "torch-cpu"} -mujoco = "^2.3.7" opencv-python = "^4.9.0.80" diffusers = "^0.26.3" torchvision = {version = "^0.17.1", source = "torch-cpu"} h5py = "^3.10.0" -dm = "^1.3" -dm-control = "1.0.14" robomimic = "0.2.0" huggingface-hub = "^0.21.4" -gymnasium-robotics = "^1.2.4" gymnasium = "^0.29.1" cmake = "^3.29.0.1" gym-pusht = { git = "git@github.com:huggingface/gym-pusht.git", optional = true} @@ -58,6 +51,13 @@ gym-aloha = { git = "git@github.com:huggingface/gym-aloha.git", optional = true} # gym-xarm = { path = "../gym-xarm", develop = true, optional = true} # gym-aloha = { path = "../gym-aloha", develop = true, optional = true} + +[tool.poetry.extras] +pusht = ["gym-pusht"] +xarm = ["gym-xarm"] +aloha = ["gym-aloha"] + + [tool.poetry.group.dev.dependencies] pre-commit = "^3.6.2" debugpy = "^1.8.1" diff --git a/poetry.lock b/poetry.lock index f712289e..b5e97cb7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "absl-py" @@ -521,7 +521,7 @@ files = [ name = "dm-control" version = "1.0.14" description = "Continuous control environments and MuJoCo Python bindings." -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "dm_control-1.0.14-py3-none-any.whl", hash = "sha256:883c63244a7ebf598700a97564ed19fffd3479ca79efd090aed881609cdb9fc6"}, @@ -552,7 +552,7 @@ hdf5 = ["h5py"] name = "dm-env" version = "1.6" description = "A Python interface for Reinforcement Learning environments." -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "dm-env-1.6.tar.gz", hash = "sha256:a436eb1c654c39e0c986a516cee218bea7140b510fceff63f97eb4fcff3d93de"}, @@ -568,7 +568,7 @@ numpy = "*" name = "dm-tree" version = "0.1.8" description = "Tree is a library for working with nested data structures." -optional = false +optional = true python-versions = "*" files = [ {file = "dm-tree-0.1.8.tar.gz", hash = "sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430"}, @@ -796,7 +796,7 @@ test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre name = "glfw" version = "2.7.0" description = "A ctypes-based wrapper for GLFW3." 
-optional = false +optional = true python-versions = "*" files = [ {file = "glfw-2.7.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-macosx_10_6_intel.whl", hash = "sha256:bd82849edcceda4e262bd1227afaa74b94f9f0731c1197863cd25c15bfc613fc"}, @@ -976,7 +976,7 @@ toy-text = ["pygame (>=2.1.3)", "pygame (>=2.1.3)"] name = "gymnasium-robotics" version = "1.2.4" description = "Robotics environments for the Gymnasium repo." -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "gymnasium-robotics-1.2.4.tar.gz", hash = "sha256:d304192b066f8b800599dfbe3d9d90bba9b761ee884472bdc4d05968a8bc61cb"}, @@ -1281,7 +1281,7 @@ i18n = ["Babel (>=2.7)"] name = "labmaze" version = "1.0.6" description = "LabMaze: DeepMind Lab's text maze generator." -optional = false +optional = true python-versions = "*" files = [ {file = "labmaze-1.0.6-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:b2ddef976dfd8d992b19cfa6c633f2eba7576d759c2082da534e3f727479a84a"}, @@ -1325,7 +1325,7 @@ setuptools = "!=50.0.0" name = "lazy-loader" version = "0.3" description = "lazy_loader" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "lazy_loader-0.3-py3-none-any.whl", hash = "sha256:1e9e76ee8631e264c62ce10006718e80b2cfc74340d17d1031e0f84af7478554"}, @@ -1370,7 +1370,7 @@ files = [ name = "lxml" version = "5.1.0" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." -optional = false +optional = true python-versions = ">=3.6" files = [ {file = "lxml-5.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:704f5572ff473a5f897745abebc6df40f22d4133c1e0a1f124e4f2bd3330ff7e"}, @@ -1588,7 +1588,7 @@ tests = ["pytest (>=4.6)"] name = "mujoco" version = "2.3.7" description = "MuJoCo Physics Simulator" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "mujoco-2.3.7-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:e8714a5ff6a1561b364b7b4648d4c0c8d13e751874cf7401c309b9d23fa9598b"}, @@ -2043,7 +2043,7 @@ xml = ["lxml (>=4.9.2)"] name = "pettingzoo" version = "1.24.3" description = "Gymnasium for multi-agent reinforcement learning." 
-optional = false +optional = true python-versions = ">=3.8" files = [ {file = "pettingzoo-1.24.3-py3-none-any.whl", hash = "sha256:23ed90517d2e8a7098bdaf5e31234b3a7f7b73ca578d70d1ca7b9d0cb0e37982"}, @@ -2411,7 +2411,7 @@ dev = ["aafigure", "matplotlib", "pygame", "pyglet (<2.0.0)", "sphinx", "wheel"] name = "pyopengl" version = "3.1.7" description = "Standard OpenGL bindings for Python" -optional = false +optional = true python-versions = "*" files = [ {file = "PyOpenGL-3.1.7-py3-none-any.whl", hash = "sha256:a6ab19cf290df6101aaf7470843a9c46207789855746399d0af92521a0a92b7a"}, @@ -2422,7 +2422,7 @@ files = [ name = "pyparsing" version = "3.1.2" description = "pyparsing module - Classes and methods to define and execute parsing grammars" -optional = false +optional = true python-versions = ">=3.6.8" files = [ {file = "pyparsing-3.1.2-py3-none-any.whl", hash = "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"}, @@ -2853,7 +2853,7 @@ torch = ["safetensors[numpy]", "torch (>=1.10)"] name = "scikit-image" version = "0.22.0" description = "Image processing in Python" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "scikit_image-0.22.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:74ec5c1d4693506842cc7c9487c89d8fc32aed064e9363def7af08b8f8cbb31d"}, @@ -2901,7 +2901,7 @@ test = ["asv", "matplotlib (>=3.5)", "numpydoc (>=1.5)", "pooch (>=1.6.0)", "pyt name = "scipy" version = "1.12.0" description = "Fundamental algorithms for scientific computing in Python" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "scipy-1.12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:78e4402e140879387187f7f25d91cc592b3501a2e51dfb320f48dfb73565f10b"}, @@ -3106,7 +3106,7 @@ testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jar name = "shapely" version = "2.0.3" description = "Manipulation and analysis of geometric objects" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "shapely-2.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:af7e9abe180b189431b0f490638281b43b84a33a960620e6b2e8d3e3458b61a1"}, @@ -3273,7 +3273,7 @@ tests = ["pytest", "pytest-cov"] name = "tifffile" version = "2024.2.12" description = "Read and write TIFF files" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "tifffile-2024.2.12-py3-none-any.whl", hash = "sha256:870998f82fbc94ff7c3528884c1b0ae54863504ff51dbebea431ac3fa8fb7c21"}, @@ -3598,4 +3598,4 @@ xarm = ["gym-xarm"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "bf4627c62a45764931729ce373f1038fe289b6caebb01e66d878f6f278c54518" +content-hash = "d444fab7fed5e3c5c9cde69c8f19a286126615ab4a9de11c23730b5286cac77b" diff --git a/pyproject.toml b/pyproject.toml index a549e66f..75342c80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,6 @@ packages = [{include = "lerobot"}] python = "^3.10" termcolor = "^2.4.0" omegaconf = "^2.3.0" -dm-env = "^1.6" pandas = "^2.2.1" wandb = "^0.16.3" moviepy = "^1.0.3" @@ -34,20 +33,15 @@ einops = "^0.7.0" pygame = "^2.5.2" pymunk = "^6.6.0" zarr = "^2.17.0" -shapely = "^2.0.3" -scikit-image = "^0.22.0" numba = "^0.59.0" mpmath = "^1.3.0" torch = "^2.2.1" -mujoco = "^2.3.7" opencv-python = "^4.9.0.80" diffusers = "^0.26.3" torchvision = "^0.17.1" h5py = "^3.10.0" -dm-control = "1.0.14" huggingface-hub = {extras = ["hf-transfer"], version = "^0.21.4"} robomimic = "0.2.0" -gymnasium-robotics = "^1.2.4" gymnasium = "^0.29.1" cmake = "^3.29.0.1" 
gym-pusht = { git = "git@github.com:huggingface/gym-pusht.git", optional = true} @@ -62,6 +56,7 @@ pusht = ["gym-pusht"] xarm = ["gym-xarm"] aloha = ["gym-aloha"] + [tool.poetry.group.dev.dependencies] pre-commit = "^3.6.2" debugpy = "^1.8.1" From dfaacbcf5a7bf4d75a39d6ad8bac8a75291e8cb5 Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 10:40:11 +0200 Subject: [PATCH 20/25] Split dev/test dependencies --- .github/poetry/cpu/poetry.lock | 2 +- .github/poetry/cpu/pyproject.toml | 7 +++++++ poetry.lock | 2 +- pyproject.toml | 7 +++++++ 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/poetry/cpu/poetry.lock b/.github/poetry/cpu/poetry.lock index edc1d503..fe4ed7a0 100644 --- a/.github/poetry/cpu/poetry.lock +++ b/.github/poetry/cpu/poetry.lock @@ -3329,4 +3329,4 @@ xarm = ["gym-xarm"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "ba2b64b1a683450b097a1ccbed3410cc3bee97ba8b41f409c5e379f95d8b1c6f" +content-hash = "8fa6dfc30e605741c24f5de58b89125d5b02153f550e5af7a44356956d6bb167" diff --git a/.github/poetry/cpu/pyproject.toml b/.github/poetry/cpu/pyproject.toml index 741e3b37..f5c439dc 100644 --- a/.github/poetry/cpu/pyproject.toml +++ b/.github/poetry/cpu/pyproject.toml @@ -58,9 +58,16 @@ xarm = ["gym-xarm"] aloha = ["gym-aloha"] +[tool.poetry.group.dev] +optional = true + + [tool.poetry.group.dev.dependencies] pre-commit = "^3.6.2" debugpy = "^1.8.1" + + +[tool.poetry.group.test.dependencies] pytest = "^8.1.0" pytest-cov = "^5.0.0" diff --git a/poetry.lock b/poetry.lock index b5e97cb7..387366b8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3598,4 +3598,4 @@ xarm = ["gym-xarm"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "d444fab7fed5e3c5c9cde69c8f19a286126615ab4a9de11c23730b5286cac77b" +content-hash = "7ec0310f8dd0ffa4d92fa78e06513bce98c3657692b3753ff34aadd297a3766c" diff --git a/pyproject.toml b/pyproject.toml index 75342c80..a0fc7d44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,9 +57,16 @@ xarm = ["gym-xarm"] aloha = ["gym-aloha"] +[tool.poetry.group.dev] +optional = true + + [tool.poetry.group.dev.dependencies] pre-commit = "^3.6.2" debugpy = "^1.8.1" + + +[tool.poetry.group.test.dependencies] pytest = "^8.1.0" pytest-cov = "^5.0.0" From d21543eb4fcced5bcf717dca62155d53a4c2bc87 Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 10:41:20 +0200 Subject: [PATCH 21/25] Add env.close() --- tests/test_envs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_envs.py b/tests/test_envs.py index 72bc93c4..c49461a0 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -34,7 +34,7 @@ def test_env(env_name, task, obs_type): importlib.import_module(package_name) env = gym.make(f"{package_name}/{task}", obs_type=obs_type) check_env(env.unwrapped) - + env.close() @pytest.mark.parametrize( "env_name", @@ -61,3 +61,5 @@ def test_factory(env_name): # TODO(rcadene): we assume for now that image normalization takes place in the model assert img.max() <= 1.0 assert img.min() >= 0.0 + + env.close() From dba037508997dae6fe3ba16a7886714ca1759bef Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 10:45:58 +0200 Subject: [PATCH 22/25] Fix CI --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c1b14780..34d76827 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -87,7 +87,7 @@ jobs: TMP: ~/tmp run: | mkdir 
~/tmp - poetry install --no-interaction --no-root + poetry install --no-interaction --no-root --all-extras - name: Save cached venv if: | @@ -106,7 +106,7 @@ jobs: # install project #---------------------------------------------- - name: Install project - run: poetry install --no-interaction + run: poetry install --no-interaction --all-extras #---------------------------------------------- # run tests & coverage From d44950e020c7e658fa6da19063c40379801080e8 Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 11:44:55 +0200 Subject: [PATCH 23/25] Add ssh key --- .github/workflows/test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 34d76827..afdcc41f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -34,6 +34,11 @@ jobs: with: python-version: '3.10' + - name: Add SSH key for installing envs + uses: webfactory/ssh-agent@v0.9.0 + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + #---------------------------------------------- # install & configure poetry #---------------------------------------------- From 7f4ff0b170091288bae65281df093768de562f13 Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 11:58:59 +0200 Subject: [PATCH 24/25] CI fix attempt --- tests/test_envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_envs.py b/tests/test_envs.py index c49461a0..d25231b0 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -33,7 +33,7 @@ def test_env(env_name, task, obs_type): package_name = f"gym_{env_name}" importlib.import_module(package_name) env = gym.make(f"{package_name}/{task}", obs_type=obs_type) - check_env(env.unwrapped) + check_env(env.unwrapped, skip_render_check=True) env.close() @pytest.mark.parametrize( From 91ff69d64c9d91f072b2c5fd33999b9056e2e466 Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 17:08:36 +0200 Subject: [PATCH 25/25] Update gym_xarm --- poetry.lock | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/poetry.lock b/poetry.lock index f0e77c33..faeb70f1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -941,7 +941,7 @@ mujoco = "^2.3.7" type = "git" url = "git@github.com:huggingface/gym-xarm.git" reference = "HEAD" -resolved_reference = "08ddd5a9400783a6898bbf3c3014fc5da3961b9d" +resolved_reference = "ce294c0d30def08414d9237e2bf9f373d448ca07" [[package]] name = "gymnasium" @@ -1329,16 +1329,12 @@ description = "lazy_loader" optional = true python-versions = ">=3.7" files = [ - {file = "lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc"}, - {file = "lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1"}, + {file = "lazy_loader-0.3-py3-none-any.whl", hash = "sha256:1e9e76ee8631e264c62ce10006718e80b2cfc74340d17d1031e0f84af7478554"}, + {file = "lazy_loader-0.3.tar.gz", hash = "sha256:3b68898e34f5b2a29daaaac172c6555512d0f32074f147e2254e4a6d9d838f37"}, ] -[package.dependencies] -packaging = "*" - [package.extras] -dev = ["changelist (==0.5)"] -lint = ["pre-commit (==3.7.0)"] +lint = ["pre-commit (>=3.3)"] test = ["pytest (>=7.4)", "pytest-cov (>=4.1)"] [[package]]
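The environment-related patches above make three coordinated changes that are easiest to see side by side: the factory passes `disable_env_checker=True` to `gym.make`, the tests invoke gymnasium's checker explicitly with `skip_render_check=True`, and each test now calls `env.close()`. A minimal usage sketch, assuming the `aloha` extra (gym_aloha) is installed, looks like this:

from importlib import import_module

import gymnasium as gym
from gymnasium.utils.env_checker import check_env

import_module("gym_aloha")  # importing the package registers the gym_aloha/* environments

# Skip gymnasium's automatic checker at construction time, as the factory now does...
env = gym.make("gym_aloha/AlohaInsertion-v0", disable_env_checker=True)
# ...and run the check explicitly, skipping the render check as in tests/test_envs.py.
check_env(env.unwrapped, skip_render_check=True)

obs, info = env.reset(seed=0)
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
env.close()  # release simulator resources, as the updated tests do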