Merge remote-tracking branch 'upstream/main' into refactor_dp
commit 14f3ffb412
@@ -146,7 +146,8 @@ jobs:
           device=cpu \
           save_model=true \
           save_freq=2 \
-          horizon=20 \
+          policy.n_action_steps=20 \
+          policy.chunk_size=20 \
           policy.batch_size=2 \
           hydra.run.dir=tests/outputs/act/
@@ -0,0 +1,114 @@
+from dataclasses import dataclass, field
+
+
+@dataclass
+class ActionChunkingTransformerConfig:
+    """Configuration class for the Action Chunking Transformers policy.
+
+    Defaults are configured for training on bimanual Aloha tasks like "insertion" or "transfer".
+
+    The parameters you will most likely need to change are the ones which depend on the environment / sensors.
+    Those are: `state_dim`, `action_dim` and `camera_names`.
+
+    Args:
+        state_dim: Dimensionality of the observation state space (excluding images).
+        action_dim: Dimensionality of the action space.
+        n_obs_steps: Number of environment steps worth of observations to pass to the policy (takes the
+            current step and additional steps going back).
+        camera_names: The (unique) set of names for the cameras.
+        chunk_size: The size of the action prediction "chunks" in units of environment steps.
+        n_action_steps: The number of action steps to run in the environment for one invocation of the policy.
+            This should be no greater than the chunk size. For example, if the chunk size is 100, you may
+            set this to 50. This would mean that the model predicts 100 steps worth of actions, runs 50 in the
+            environment, and throws the other 50 out.
+        image_normalization_mean: Value to subtract from the input image pixels (inputs are assumed to be in
+            [0, 1]) for normalization.
+        image_normalization_std: Value by which to divide the input image pixels (after the mean has been
+            subtracted).
+        vision_backbone: Name of the torchvision resnet backbone to use for encoding images.
+        use_pretrained_backbone: Whether the backbone should be initialized with ImageNet pretrained weights
+            from torchvision.
+        replace_final_stride_with_dilation: Whether to replace the ResNet's final 2x2 stride with a dilated
+            convolution.
+        pre_norm: Whether to use "pre-norm" in the transformer blocks.
+        d_model: The transformer blocks' main hidden dimension.
+        n_heads: The number of heads to use in the transformer blocks' multi-head attention.
+        dim_feedforward: The dimension to expand the transformer's hidden dimension to in the feed-forward
+            layers.
+        feedforward_activation: The activation to use in the transformer block's feed-forward layers.
+        n_encoder_layers: The number of transformer layers to use for the transformer encoder.
+        n_decoder_layers: The number of transformer layers to use for the transformer decoder.
+        use_vae: Whether to use a variational objective during training. This introduces another transformer
+            which is used as the VAE's encoder (not to be confused with the transformer encoder - see
+            documentation in the policy class).
+        latent_dim: The VAE's latent dimension.
+        n_vae_encoder_layers: The number of transformer layers to use for the VAE's encoder.
+        use_temporal_aggregation: Whether to blend the actions of multiple policy invocations for any given
+            environment step.
+        dropout: Dropout to use in the transformer layers (see code for details).
+        kl_weight: The weight to use for the KL-divergence component of the loss if the variational objective
+            is enabled. Loss is then calculated as: `reconstruction_loss + kl_weight * kld_loss`.
+    """
+
+    # Environment.
+    state_dim: int = 14
+    action_dim: int = 14
+
+    # Inputs / output structure.
+    n_obs_steps: int = 1
+    camera_names: list[str] = field(default_factory=lambda: ["top"])
+    chunk_size: int = 100
+    n_action_steps: int = 100
+
+    # Vision preprocessing.
+    image_normalization_mean: tuple[float, float, float] = field(
+        default_factory=lambda: [0.485, 0.456, 0.406]
+    )
+    image_normalization_std: tuple[float, float, float] = field(default_factory=lambda: [0.229, 0.224, 0.225])
+
+    # Architecture.
+    # Vision backbone.
+    vision_backbone: str = "resnet18"
+    use_pretrained_backbone: bool = True
+    replace_final_stride_with_dilation: int = False
+    # Transformer layers.
+    pre_norm: bool = False
+    d_model: int = 512
+    n_heads: int = 8
+    dim_feedforward: int = 3200
+    feedforward_activation: str = "relu"
+    n_encoder_layers: int = 4
+    n_decoder_layers: int = 1
+    # VAE.
+    use_vae: bool = True
+    latent_dim: int = 32
+    n_vae_encoder_layers: int = 4
+
+    # Inference.
+    use_temporal_aggregation: bool = False
+
+    # Training and loss computation.
+    dropout: float = 0.1
+    kl_weight: float = 10.0
+
+    # ---
+    # TODO(alexander-soare): Remove these from the policy config.
+    batch_size: int = 8
+    lr: float = 1e-5
+    lr_backbone: float = 1e-5
+    weight_decay: float = 1e-4
+    grad_clip_norm: float = 10
+    utd: int = 1
+
+    def __post_init__(self):
+        """Input validation."""
+        if not self.vision_backbone.startswith("resnet"):
+            raise ValueError("`vision_backbone` must be one of the ResNet variants.")
+        if self.use_temporal_aggregation:
+            raise NotImplementedError("Temporal aggregation is not yet implemented.")
+        if self.n_action_steps > self.chunk_size:
+            raise ValueError(
+                "The chunk size is the upper bound for the number of action steps per model invocation."
+            )
+        if self.camera_names != ["top"]:
+            raise ValueError("For now, `camera_names` can only be ['top']")
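A quick usage sketch of the new config class (illustration only, not part of the diff): it assumes the `lerobot` package from this commit is importable, and shows how `__post_init__` enforces that `n_action_steps` never exceeds `chunk_size`, as described in the docstring.

# Sketch only: assumes the lerobot package from this commit is installed.
from lerobot.common.policies.act.configuration_act import ActionChunkingTransformerConfig

# Defaults target bimanual Aloha: 14-dim state/action and a single "top" camera.
cfg = ActionChunkingTransformerConfig()
assert cfg.chunk_size == 100 and cfg.n_action_steps == 100

# Predict 100 steps per model invocation but only execute the first 50 in the environment.
cfg = ActionChunkingTransformerConfig(chunk_size=100, n_action_steps=50)

# Inconsistent settings are rejected at construction time.
try:
    ActionChunkingTransformerConfig(chunk_size=20, n_action_steps=50)
except ValueError as err:
    print(err)  # the chunk size is the upper bound for the number of action steps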
@@ -20,7 +20,7 @@ from torch import Tensor, nn
 from torchvision.models._utils import IntermediateLayerGetter
 from torchvision.ops.misc import FrozenBatchNorm2d
 
-from lerobot.common.utils import get_safe_torch_device
+from lerobot.common.policies.act.configuration_act import ActionChunkingTransformerConfig
 
 
 class ActionChunkingTransformerPolicy(nn.Module):
@@ -65,7 +65,7 @@ class ActionChunkingTransformerPolicy(nn.Module):
         "ActionChunkingTransformerPolicy does not handle multiple observation steps."
     )
 
-    def __init__(self, cfg, device):
+    def __init__(self, cfg: ActionChunkingTransformerConfig):
         """
         TODO(alexander-soare): Add documentation for all parameters once we have model configs established.
         """
@@ -73,79 +73,64 @@ class ActionChunkingTransformerPolicy(nn.Module):
         if getattr(cfg, "n_obs_steps", 1) != 1:
             raise ValueError(self._multiple_obs_steps_not_handled_msg)
         self.cfg = cfg
-        self.n_action_steps = cfg.n_action_steps
-        self.device = get_safe_torch_device(device)
-        self.camera_names = cfg.camera_names
-        self.use_vae = cfg.use_vae
-        self.horizon = cfg.horizon
-        self.d_model = cfg.d_model
-
-        transformer_common_kwargs = dict(  # noqa: C408
-            d_model=self.d_model,
-            num_heads=cfg.num_heads,
-            dim_feedforward=cfg.dim_feedforward,
-            dropout=cfg.dropout,
-            activation=cfg.activation,
-            normalize_before=cfg.pre_norm,
-        )
 
         # BERT style VAE encoder with input [cls, *joint_space_configuration, *action_sequence].
         # The cls token forms parameters of the latent's distribution (like this [*means, *log_variances]).
-        if self.use_vae:
-            self.vae_encoder = _TransformerEncoder(num_layers=cfg.vae_enc_layers, **transformer_common_kwargs)
-            self.vae_encoder_cls_embed = nn.Embedding(1, self.d_model)
+        if self.cfg.use_vae:
+            self.vae_encoder = _TransformerEncoder(cfg)
+            self.vae_encoder_cls_embed = nn.Embedding(1, cfg.d_model)
             # Projection layer for joint-space configuration to hidden dimension.
-            self.vae_encoder_robot_state_input_proj = nn.Linear(cfg.state_dim, self.d_model)
+            self.vae_encoder_robot_state_input_proj = nn.Linear(cfg.state_dim, cfg.d_model)
             # Projection layer for action (joint-space target) to hidden dimension.
-            self.vae_encoder_action_input_proj = nn.Linear(cfg.state_dim, self.d_model)
+            self.vae_encoder_action_input_proj = nn.Linear(cfg.state_dim, cfg.d_model)
             self.latent_dim = cfg.latent_dim
             # Projection layer from the VAE encoder's output to the latent distribution's parameter space.
-            self.vae_encoder_latent_output_proj = nn.Linear(self.d_model, self.latent_dim * 2)
+            self.vae_encoder_latent_output_proj = nn.Linear(cfg.d_model, self.latent_dim * 2)
             # Fixed sinusoidal positional embedding the whole input to the VAE encoder. Unsqueeze for batch
             # dimension.
             self.register_buffer(
                 "vae_encoder_pos_enc",
-                _create_sinusoidal_position_embedding(1 + 1 + self.horizon, self.d_model).unsqueeze(0),
+                _create_sinusoidal_position_embedding(1 + 1 + cfg.chunk_size, cfg.d_model).unsqueeze(0),
             )
 
         # Backbone for image feature extraction.
         self.image_normalizer = transforms.Normalize(
-            mean=cfg.image_normalization.mean, std=cfg.image_normalization.std
+            mean=cfg.image_normalization_mean, std=cfg.image_normalization_std
         )
-        backbone_model = getattr(torchvision.models, cfg.backbone)(
-            replace_stride_with_dilation=[False, False, cfg.dilation],
-            pretrained=cfg.pretrained_backbone,
+        backbone_model = getattr(torchvision.models, cfg.vision_backbone)(
+            replace_stride_with_dilation=[False, False, cfg.replace_final_stride_with_dilation],
+            pretrained=cfg.use_pretrained_backbone,
             norm_layer=FrozenBatchNorm2d,
         )
+        # Note: The assumption here is that we are using a ResNet model (and hence layer4 is the final feature
+        # map).
         # Note: The forward method of this returns a dict: {"feature_map": output}.
         self.backbone = IntermediateLayerGetter(backbone_model, return_layers={"layer4": "feature_map"})
 
         # Transformer (acts as VAE decoder when training with the variational objective).
-        self.encoder = _TransformerEncoder(num_layers=cfg.enc_layers, **transformer_common_kwargs)
-        self.decoder = _TransformerDecoder(num_layers=cfg.dec_layers, **transformer_common_kwargs)
+        self.encoder = _TransformerEncoder(cfg)
+        self.decoder = _TransformerDecoder(cfg)
 
         # Transformer encoder input projections. The tokens will be structured like
         # [latent, robot_state, image_feature_map_pixels].
-        self.encoder_robot_state_input_proj = nn.Linear(cfg.state_dim, self.d_model)
-        self.encoder_latent_input_proj = nn.Linear(self.latent_dim, self.d_model)
+        self.encoder_robot_state_input_proj = nn.Linear(cfg.state_dim, cfg.d_model)
+        self.encoder_latent_input_proj = nn.Linear(self.latent_dim, cfg.d_model)
         self.encoder_img_feat_input_proj = nn.Conv2d(
-            backbone_model.fc.in_features, self.d_model, kernel_size=1
+            backbone_model.fc.in_features, cfg.d_model, kernel_size=1
         )
         # Transformer encoder positional embeddings.
-        self.encoder_robot_and_latent_pos_embed = nn.Embedding(2, self.d_model)
-        self.encoder_cam_feat_pos_embed = _SinusoidalPositionEmbedding2D(self.d_model // 2)
+        self.encoder_robot_and_latent_pos_embed = nn.Embedding(2, cfg.d_model)
+        self.encoder_cam_feat_pos_embed = _SinusoidalPositionEmbedding2D(cfg.d_model // 2)
 
         # Transformer decoder.
         # Learnable positional embedding for the transformer's decoder (in the style of DETR object queries).
-        self.decoder_pos_embed = nn.Embedding(self.horizon, self.d_model)
+        self.decoder_pos_embed = nn.Embedding(cfg.chunk_size, cfg.d_model)
 
         # Final action regression head on the output of the transformer's decoder.
-        self.action_head = nn.Linear(self.d_model, cfg.action_dim)
+        self.action_head = nn.Linear(cfg.d_model, cfg.action_dim)
 
         self._reset_parameters()
 
         self._create_optimizer()
-        self.to(self.device)
 
     def _create_optimizer(self):
         optimizer_params_dicts = [
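As a side note on the VAE pieces wired up above, here is a standalone sketch (illustrative names and shapes, not code copied from modeling_act.py) of what the `latent_dim * 2` output of `vae_encoder_latent_output_proj` represents: a mean and a log-variance that are then sampled with the reparameterization trick.

# Standalone sketch of the latent parameterization implied by
# `vae_encoder_latent_output_proj = nn.Linear(cfg.d_model, latent_dim * 2)`.
import torch
from torch import nn

d_model, latent_dim, batch_size = 512, 32, 4
latent_proj = nn.Linear(d_model, latent_dim * 2)

cls_token_out = torch.randn(batch_size, d_model)  # VAE encoder output at the [cls] position
pdf_params = latent_proj(cls_token_out)           # (B, 2 * latent_dim)
mu = pdf_params[:, :latent_dim]                   # mean
log_sigma_x2 = pdf_params[:, latent_dim:]         # log(σ²)

# Reparameterization trick: sample a latent while keeping gradients w.r.t. mu and log_sigma_x2.
latent = mu + (log_sigma_x2 / 2).exp() * torch.randn_like(mu)
print(latent.shape)  # torch.Size([4, 32])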
@@ -173,8 +158,8 @@ class ActionChunkingTransformerPolicy(nn.Module):
 
     def reset(self):
         """This should be called whenever the environment is reset."""
-        if self.n_action_steps is not None:
-            self._action_queue = deque([], maxlen=self.n_action_steps)
+        if self.cfg.n_action_steps is not None:
+            self._action_queue = deque([], maxlen=self.cfg.n_action_steps)
 
     @torch.no_grad
     def select_action(self, batch: dict[str, Tensor], **_) -> Tensor:
@@ -184,8 +169,8 @@ class ActionChunkingTransformerPolicy(nn.Module):
         queue is empty.
         """
         if len(self._action_queue) == 0:
-            # `select_actions` returns a (batch_size, n_action_steps, *) tensor, but the queue effectively has shape
-            # (n_action_steps, batch_size, *), hence the transpose.
+            # `select_actions` returns a (batch_size, n_action_steps, *) tensor, but the queue effectively has
+            # shape (n_action_steps, batch_size, *), hence the transpose.
             self._action_queue.extend(self.select_actions(batch).transpose(0, 1))
         return self._action_queue.popleft()
 
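The `reset` / `select_action` pair above implements a simple action queue: one model call predicts a chunk of actions, and subsequent environment steps pop from the queue until it is empty. A toy sketch of that pattern follows (the `FakePolicy` class and its fixed predictions are hypothetical, purely for illustration):

# Toy illustration of the action-queue pattern used in `reset` / `select_action` above.
from collections import deque

import torch


class FakePolicy:
    def __init__(self, n_action_steps: int = 3):
        self.n_action_steps = n_action_steps
        self._action_queue = deque([], maxlen=n_action_steps)

    def select_actions(self, batch):
        # Pretend the model predicted a (batch_size, n_action_steps, action_dim) tensor.
        return torch.arange(self.n_action_steps).reshape(1, -1, 1).float()

    def select_action(self, batch):
        if len(self._action_queue) == 0:
            # The queue stores per-step actions, so transpose to (n_action_steps, batch_size, ...).
            self._action_queue.extend(self.select_actions(batch).transpose(0, 1))
        return self._action_queue.popleft()


policy = FakePolicy()
for _ in range(4):  # the 4th call triggers a fresh model invocation
    print(policy.select_action(batch={}).squeeze().item())  # 0.0, 1.0, 2.0, 0.0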
@@ -197,20 +182,7 @@ class ActionChunkingTransformerPolicy(nn.Module):
 
         action = self.forward(batch, return_loss=False)
 
-        if self.cfg.temporal_agg:
-            # TODO(rcadene): implement temporal aggregation
-            raise NotImplementedError()
-            # all_time_actions[[t], t:t+num_queries] = action
-            # actions_for_curr_step = all_time_actions[:, t]
-            # actions_populated = torch.all(actions_for_curr_step != 0, axis=1)
-            # actions_for_curr_step = actions_for_curr_step[actions_populated]
-            # k = 0.01
-            # exp_weights = np.exp(-k * np.arange(len(actions_for_curr_step)))
-            # exp_weights = exp_weights / exp_weights.sum()
-            # exp_weights = torch.from_numpy(exp_weights).cuda().unsqueeze(dim=1)
-            # raw_action = (actions_for_curr_step * exp_weights).sum(dim=0, keepdim=True)
-
-        return action[: self.n_action_steps]
+        return action[: self.cfg.n_action_steps]
 
     def __call__(self, *args, **kwargs) -> dict:
         # TODO(alexander-soare): Temporary bridge until we know what to do about the `update` method.
@@ -251,9 +223,9 @@ class ActionChunkingTransformerPolicy(nn.Module):
         self.train()
 
         num_slices = self.cfg.batch_size
-        batch_size = self.cfg.horizon * num_slices
+        batch_size = self.cfg.chunk_size * num_slices
 
-        assert batch_size % self.cfg.horizon == 0
+        assert batch_size % self.cfg.chunk_size == 0
         assert batch_size % num_slices == 0
 
         loss = self.forward(batch, return_loss=True)["loss"]
@@ -324,7 +296,7 @@ class ActionChunkingTransformerPolicy(nn.Module):
             Tuple containing the latent PDF's parameters (mean, log(σ²)) both as (B, L) tensors where L is the
             latent dimension.
         """
-        if self.use_vae and self.training:
+        if self.cfg.use_vae and self.training:
             assert (
                 actions is not None
             ), "actions must be provided when using the variational objective in training mode."
@@ -332,7 +304,7 @@ class ActionChunkingTransformerPolicy(nn.Module):
         batch_size = robot_state.shape[0]
 
         # Prepare the latent for input to the transformer encoder.
-        if self.use_vae and actions is not None:
+        if self.cfg.use_vae and actions is not None:
             # Prepare the input to the VAE encoder: [cls, *joint_space_configuration, *action_sequence].
             cls_embed = einops.repeat(
                 self.vae_encoder_cls_embed.weight, "1 d -> b 1 d", b=batch_size
@@ -367,7 +339,7 @@ class ActionChunkingTransformerPolicy(nn.Module):
         # Camera observation features and positional embeddings.
         all_cam_features = []
         all_cam_pos_embeds = []
-        for cam_id, _ in enumerate(self.camera_names):
+        for cam_id, _ in enumerate(self.cfg.camera_names):
             cam_features = self.backbone(image[:, cam_id])["feature_map"]
             cam_pos_embed = self.encoder_cam_feat_pos_embed(cam_features).to(dtype=cam_features.dtype)
             cam_features = self.encoder_img_feat_input_proj(cam_features)  # (B, C, h, w)
@@ -399,7 +371,9 @@ class ActionChunkingTransformerPolicy(nn.Module):
         # Forward pass through the transformer modules.
         encoder_out = self.encoder(encoder_in, pos_embed=pos_embed)
         decoder_in = torch.zeros(
-            (self.horizon, batch_size, self.d_model), dtype=pos_embed.dtype, device=pos_embed.device
+            (self.cfg.chunk_size, batch_size, self.cfg.d_model),
+            dtype=pos_embed.dtype,
+            device=pos_embed.device,
         )
         decoder_out = self.decoder(
             decoder_in,
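The `decoder_in` change above keeps the DETR-style decoder setup: the decoder input is all zeros with shape (chunk_size, batch, d_model), and the learned `decoder_pos_embed` plays the role of per-timestep object queries. A standalone sketch with assumed example shapes:

# Standalone sketch of the DETR-style decoder queries set up above, using the
# (sequence, batch, channel) layout followed by the code in this diff.
import torch
from torch import nn

chunk_size, batch_size, d_model = 100, 2, 512
decoder_pos_embed = nn.Embedding(chunk_size, d_model)

decoder_in = torch.zeros((chunk_size, batch_size, d_model))
# Broadcast the learned embedding over the batch: (chunk_size, 1, d_model) -> (chunk_size, B, d_model).
queries = decoder_in + decoder_pos_embed.weight.unsqueeze(1)
print(queries.shape)  # torch.Size([100, 2, 512])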
@@ -426,16 +400,10 @@ class ActionChunkingTransformerPolicy(nn.Module):
 class _TransformerEncoder(nn.Module):
     """Convenience module for running multiple encoder layers, maybe followed by normalization."""
 
-    def __init__(self, num_layers: int, **encoder_layer_kwargs: dict):
+    def __init__(self, cfg: ActionChunkingTransformerConfig):
         super().__init__()
-        self.layers = nn.ModuleList(
-            [_TransformerEncoderLayer(**encoder_layer_kwargs) for _ in range(num_layers)]
-        )
-        self.norm = (
-            nn.LayerNorm(encoder_layer_kwargs["d_model"])
-            if encoder_layer_kwargs["normalize_before"]
-            else nn.Identity()
-        )
+        self.layers = nn.ModuleList([_TransformerEncoderLayer(cfg) for _ in range(cfg.n_encoder_layers)])
+        self.norm = nn.LayerNorm(cfg.d_model) if cfg.pre_norm else nn.Identity()
 
     def forward(self, x: Tensor, pos_embed: Tensor | None = None) -> Tensor:
         for layer in self.layers:
@@ -445,39 +413,31 @@ class _TransformerEncoder(nn.Module):
 
 
 class _TransformerEncoderLayer(nn.Module):
-    def __init__(
-        self,
-        d_model: int,
-        num_heads: int,
-        dim_feedforward: int,
-        dropout: float,
-        activation: str,
-        normalize_before: bool,
-    ):
+    def __init__(self, cfg: ActionChunkingTransformerConfig):
         super().__init__()
-        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout)
+        self.self_attn = nn.MultiheadAttention(cfg.d_model, cfg.n_heads, dropout=cfg.dropout)
 
         # Feed forward layers.
-        self.linear1 = nn.Linear(d_model, dim_feedforward)
-        self.dropout = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.linear1 = nn.Linear(cfg.d_model, cfg.dim_feedforward)
+        self.dropout = nn.Dropout(cfg.dropout)
+        self.linear2 = nn.Linear(cfg.dim_feedforward, cfg.d_model)
 
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(cfg.d_model)
+        self.norm2 = nn.LayerNorm(cfg.d_model)
+        self.dropout1 = nn.Dropout(cfg.dropout)
+        self.dropout2 = nn.Dropout(cfg.dropout)
 
-        self.activation = _get_activation_fn(activation)
-        self.normalize_before = normalize_before
+        self.activation = _get_activation_fn(cfg.feedforward_activation)
+        self.pre_norm = cfg.pre_norm
 
     def forward(self, x, pos_embed: Tensor | None = None) -> Tensor:
         skip = x
-        if self.normalize_before:
+        if self.pre_norm:
             x = self.norm1(x)
         q = k = x if pos_embed is None else x + pos_embed
         x = self.self_attn(q, k, value=x)[0]  # select just the output, not the attention weights
         x = skip + self.dropout1(x)
-        if self.normalize_before:
+        if self.pre_norm:
             skip = x
             x = self.norm2(x)
         else:
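The `pre_norm` flag renamed above only changes where LayerNorm sits relative to the residual connection. A minimal single-sublayer sketch of the two orderings (simplified; the real layer applies this to both the attention and feed-forward sublayers):

# Minimal sketch of pre-norm vs post-norm residual blocks, not the full encoder layer.
import torch
from torch import nn


def sublayer_post_norm(x: torch.Tensor, f: nn.Module, norm: nn.Module) -> torch.Tensor:
    # Post-norm (pre_norm=False): residual add first, then normalize.
    return norm(x + f(x))


def sublayer_pre_norm(x: torch.Tensor, f: nn.Module, norm: nn.Module) -> torch.Tensor:
    # Pre-norm (pre_norm=True): normalize the sublayer input, then residual add.
    return x + f(norm(x))


d_model = 512
x = torch.randn(10, 2, d_model)
f, norm = nn.Linear(d_model, d_model), nn.LayerNorm(d_model)
print(sublayer_post_norm(x, f, norm).shape, sublayer_pre_norm(x, f, norm).shape)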
@@ -485,20 +445,17 @@ class _TransformerEncoderLayer(nn.Module):
             skip = x
         x = self.linear2(self.dropout(self.activation(self.linear1(x))))
         x = skip + self.dropout2(x)
-        if not self.normalize_before:
+        if not self.pre_norm:
             x = self.norm2(x)
         return x
 
 
 class _TransformerDecoder(nn.Module):
-    def __init__(self, num_layers: int, **decoder_layer_kwargs):
+    def __init__(self, cfg: ActionChunkingTransformerConfig):
         """Convenience module for running multiple decoder layers followed by normalization."""
         super().__init__()
-        self.layers = nn.ModuleList(
-            [_TransformerDecoderLayer(**decoder_layer_kwargs) for _ in range(num_layers)]
-        )
-        self.num_layers = num_layers
-        self.norm = nn.LayerNorm(decoder_layer_kwargs["d_model"])
+        self.layers = nn.ModuleList([_TransformerDecoderLayer(cfg) for _ in range(cfg.n_decoder_layers)])
+        self.norm = nn.LayerNorm(cfg.d_model)
 
     def forward(
         self,
@@ -517,33 +474,25 @@ class _TransformerDecoder(nn.Module):
 
 
 class _TransformerDecoderLayer(nn.Module):
-    def __init__(
-        self,
-        d_model: int,
-        num_heads: int,
-        dim_feedforward: int,
-        dropout: float,
-        activation: str,
-        normalize_before: bool,
-    ):
+    def __init__(self, cfg: ActionChunkingTransformerConfig):
         super().__init__()
-        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout)
-        self.multihead_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout)
+        self.self_attn = nn.MultiheadAttention(cfg.d_model, cfg.n_heads, dropout=cfg.dropout)
+        self.multihead_attn = nn.MultiheadAttention(cfg.d_model, cfg.n_heads, dropout=cfg.dropout)
 
         # Feed forward layers.
-        self.linear1 = nn.Linear(d_model, dim_feedforward)
-        self.dropout = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.linear1 = nn.Linear(cfg.d_model, cfg.dim_feedforward)
+        self.dropout = nn.Dropout(cfg.dropout)
+        self.linear2 = nn.Linear(cfg.dim_feedforward, cfg.d_model)
 
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.norm3 = nn.LayerNorm(d_model)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
-        self.dropout3 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(cfg.d_model)
+        self.norm2 = nn.LayerNorm(cfg.d_model)
+        self.norm3 = nn.LayerNorm(cfg.d_model)
+        self.dropout1 = nn.Dropout(cfg.dropout)
+        self.dropout2 = nn.Dropout(cfg.dropout)
+        self.dropout3 = nn.Dropout(cfg.dropout)
 
-        self.activation = _get_activation_fn(activation)
-        self.normalize_before = normalize_before
+        self.activation = _get_activation_fn(cfg.feedforward_activation)
+        self.pre_norm = cfg.pre_norm
 
     def maybe_add_pos_embed(self, tensor: Tensor, pos_embed: Tensor | None) -> Tensor:
         return tensor if pos_embed is None else tensor + pos_embed
@@ -566,12 +515,12 @@ class _TransformerDecoderLayer(nn.Module):
             (DS, B, C) tensor of decoder output features.
         """
         skip = x
-        if self.normalize_before:
+        if self.pre_norm:
             x = self.norm1(x)
         q = k = self.maybe_add_pos_embed(x, decoder_pos_embed)
         x = self.self_attn(q, k, value=x)[0]  # select just the output, not the attention weights
         x = skip + self.dropout1(x)
-        if self.normalize_before:
+        if self.pre_norm:
             skip = x
             x = self.norm2(x)
         else:
@@ -583,7 +532,7 @@ class _TransformerDecoderLayer(nn.Module):
             value=encoder_out,
         )[0]  # select just the output, not the attention weights
         x = skip + self.dropout2(x)
-        if self.normalize_before:
+        if self.pre_norm:
             skip = x
             x = self.norm3(x)
         else:
@@ -591,7 +540,7 @@ class _TransformerDecoderLayer(nn.Module):
             skip = x
         x = self.linear2(self.dropout(self.activation(self.linear1(x))))
         x = skip + self.dropout3(x)
-        if not self.normalize_before:
+        if not self.pre_norm:
             x = self.norm3(x)
         return x
@@ -1,3 +1,10 @@
+import inspect
+
+from omegaconf import OmegaConf
+
+from lerobot.common.utils import get_safe_torch_device
+
+
 def make_policy(cfg):
     if cfg.policy.name == "tdmpc":
         from lerobot.common.policies.tdmpc.policy import TDMPCPolicy
@@ -19,10 +26,22 @@ def make_policy(cfg):
             **cfg.policy,
         )
     elif cfg.policy.name == "act":
-        from lerobot.common.policies.act.policy import ActionChunkingTransformerPolicy
+        from lerobot.common.policies.act.configuration_act import ActionChunkingTransformerConfig
+        from lerobot.common.policies.act.modeling_act import ActionChunkingTransformerPolicy
 
-        policy = ActionChunkingTransformerPolicy(cfg.policy, cfg.device)
-        policy.to(cfg.device)
+        expected_kwargs = set(inspect.signature(ActionChunkingTransformerConfig).parameters)
+        assert set(cfg.policy).issuperset(
+            expected_kwargs
+        ), f"Hydra config is missing arguments: {set(cfg.policy).difference(expected_kwargs)}"
+        policy_cfg = ActionChunkingTransformerConfig(
+            **{
+                k: v
+                for k, v in OmegaConf.to_container(cfg.policy, resolve=True).items()
+                if k in expected_kwargs
+            }
+        )
+        policy = ActionChunkingTransformerPolicy(policy_cfg)
+        policy.to(get_safe_torch_device(cfg.device))
     else:
         raise ValueError(cfg.policy.name)
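The `make_policy` change above filters the flat Hydra `cfg.policy` node down to exactly the fields of the config dataclass before constructing it. A standalone sketch of the same pattern, using a hypothetical `ToyConfig` so it runs without `lerobot` installed (it does assume `omegaconf` is available):

# Sketch of the kwargs-filtering pattern; ToyConfig stands in for ActionChunkingTransformerConfig.
import inspect
from dataclasses import dataclass

from omegaconf import OmegaConf


@dataclass
class ToyConfig:
    chunk_size: int = 100
    n_action_steps: int = 100


hydra_policy_cfg = OmegaConf.create(
    {"name": "act", "pretrained_model_path": None, "chunk_size": 20, "n_action_steps": 20}
)

expected_kwargs = set(inspect.signature(ToyConfig).parameters)
assert set(hydra_policy_cfg).issuperset(expected_kwargs), "Hydra config is missing arguments"

# Keep only the keys the dataclass knows about; hydra-only keys (e.g. `name`) are dropped.
policy_cfg = ToyConfig(
    **{
        k: v
        for k, v in OmegaConf.to_container(hydra_policy_cfg, resolve=True).items()
        if k in expected_kwargs
    }
)
print(policy_cfg)  # ToyConfig(chunk_size=20, n_action_steps=20)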
@@ -8,61 +8,65 @@ eval_freq: 10000
 save_freq: 100000
 log_freq: 250
 
-horizon: 100
 n_obs_steps: 1
 # when temporal_agg=False, n_action_steps=horizon
-n_action_steps: ${horizon}
 
+# See `configuration_act.py` for more details.
 policy:
   name: act
 
   pretrained_model_path:
 
+  # Environment.
+  # Inherit these from the environment.
+  state_dim: ???
+  action_dim: ???
+
+  # Inputs / output structure.
+  n_obs_steps: ${n_obs_steps}
+  camera_names: [top] # [top, front_close, left_pillar, right_pillar]
+  chunk_size: 100 # chunk_size
+  n_action_steps: 100
+
+  # Vision preprocessing.
+  image_normalization_mean: [0.485, 0.456, 0.406]
+  image_normalization_std: [0.229, 0.224, 0.225]
+
+  # Architecture.
+  # Vision backbone.
+  vision_backbone: resnet18
+  use_pretrained_backbone: true
+  replace_final_stride_with_dilation: false
+  # Transformer layers.
+  pre_norm: false
+  d_model: 512
+  n_heads: 8
+  dim_feedforward: 3200
+  feedforward_activation: relu
+  n_encoder_layers: 4
+  n_decoder_layers: 1
+  # VAE.
+  use_vae: true
+  latent_dim: 32
+  n_vae_encoder_layers: 4
+
+  # Inference.
+  use_temporal_aggregation: false
+
+  # Training and loss computation.
+  dropout: 0.1
+  kl_weight: 10.0
+
+  # ---
+  # TODO(alexander-soare): Remove these from the policy config.
+  batch_size: 8
   lr: 1e-5
   lr_backbone: 1e-5
-  pretrained_backbone: true
   weight_decay: 1e-4
   grad_clip_norm: 10
-  backbone: resnet18
-  horizon: ${horizon} # chunk_size
-  kl_weight: 10
-  d_model: 512
-  dim_feedforward: 3200
-  vae_enc_layers: 4
-  enc_layers: 4
-  dec_layers: 1
-  num_heads: 8
-  #camera_names: [top, front_close, left_pillar, right_pillar]
-  camera_names: [top]
-  dilation: false
-  dropout: 0.1
-  pre_norm: false
-  activation: relu
-  latent_dim: 32
-
-  use_vae: true
-
-  batch_size: 8
-
-  per_alpha: 0.6
-  per_beta: 0.4
-
-  balanced_sampling: false
   utd: 1
-
-  n_obs_steps: ${n_obs_steps}
-  n_action_steps: ${n_action_steps}
-
-  temporal_agg: false
-
-  state_dim: 14
-  action_dim: 14
-
-  image_normalization:
-    mean: [0.485, 0.456, 0.406]
-    std: [0.229, 0.224, 0.225]
 
 delta_timestamps:
-  observation.images.top: [0.0]
-  observation.state: [0.0]
-  action: "[i / ${fps} for i in range(${horizon})]"
+  observation.images.top: "[i / ${fps} for i in range(1 - ${n_obs_steps}, 1)]"
+  observation.state: "[i / ${fps} for i in range(1 - ${n_obs_steps}, 1)]"
+  action: "[i / ${fps} for i in range(${policy.chunk_size})]"
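For reference, the `delta_timestamps` expressions above are evaluated after Hydra interpolation of ${fps}, ${n_obs_steps} and ${policy.chunk_size}; a small sketch of what they expand to, assuming fps=50 as an example value:

# Sketch of the evaluated delta_timestamps lists; fps=50 is an assumed example value.
fps, n_obs_steps, chunk_size = 50, 1, 100

observation_deltas = [i / fps for i in range(1 - n_obs_steps, 1)]
action_deltas = [i / fps for i in range(chunk_size)]

print(observation_deltas)  # [0.0]: only the current frame when n_obs_steps=1
print(action_deltas[:3], "...", action_deltas[-1])  # [0.0, 0.02, 0.04] ... 1.98 (2 s of actions at 50 fps)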
@@ -18,7 +18,7 @@ from lerobot.common.datasets.xarm import XarmDataset
 from lerobot.common.datasets.aloha import AlohaDataset
 from lerobot.common.datasets.pusht import PushtDataset
 
-from lerobot.common.policies.act.policy import ActionChunkingTransformerPolicy
+from lerobot.common.policies.act.modeling_act import ActionChunkingTransformerPolicy
 from lerobot.common.policies.diffusion.policy import DiffusionPolicy
 from lerobot.common.policies.tdmpc.policy import TDMPCPolicy