From e35546f58ec40d3f065762fdcb7f57e455314b28 Mon Sep 17 00:00:00 2001 From: Yoel Date: Mon, 9 Dec 2024 10:21:50 +0100 Subject: [PATCH 002/112] Reward classifier and training (#528) Co-authored-by: Daniel Ritchie Co-authored-by: resolver101757 Co-authored-by: Jannik Grothusen <56967823+J4nn1K@users.noreply.github.com> Co-authored-by: Remi Co-authored-by: Michel Aractingi --- examples/12_train_hilserl_classifier.md | 83 +++++ lerobot/common/datasets/lerobot_dataset.py | 2 +- lerobot/common/logger.py | 5 +- .../classifier/configuration_classifier.py | 36 ++ .../hilserl/classifier/modeling_classifier.py | 134 ++++++++ lerobot/common/robot_devices/control_utils.py | 27 +- .../configs/policy/hilserl_classifier.yaml | 48 +++ lerobot/scripts/control_robot.py | 26 +- lerobot/scripts/train_hilserl_classifier.py | 310 ++++++++++++++++++ tests/test_train_hilserl_classifier.py | 251 ++++++++++++++ 10 files changed, 906 insertions(+), 16 deletions(-) create mode 100644 examples/12_train_hilserl_classifier.md create mode 100644 lerobot/common/policies/hilserl/classifier/configuration_classifier.py create mode 100644 lerobot/common/policies/hilserl/classifier/modeling_classifier.py create mode 100644 lerobot/configs/policy/hilserl_classifier.yaml create mode 100644 lerobot/scripts/train_hilserl_classifier.py create mode 100644 tests/test_train_hilserl_classifier.py diff --git a/examples/12_train_hilserl_classifier.md b/examples/12_train_hilserl_classifier.md new file mode 100644 index 00000000..eeaf0f2b --- /dev/null +++ b/examples/12_train_hilserl_classifier.md @@ -0,0 +1,83 @@ +# Training a HIL-SERL Reward Classifier with LeRobot + +This tutorial provides step-by-step instructions for training a reward classifier using LeRobot. + +--- + +## Training Script Overview + +LeRobot includes a ready-to-use training script located at [`lerobot/scripts/train_hilserl_classifier.py`](../../lerobot/scripts/train_hilserl_classifier.py). Here's an outline of its workflow: + +1. **Configuration Loading** + The script uses Hydra to load a configuration file for subsequent steps. (Details on Hydra follow below.) + +2. **Dataset Initialization** + It loads a `LeRobotDataset` containing images and rewards. To optimize performance, a weighted random sampler is used to balance class sampling. + +3. **Classifier Initialization** + A lightweight classification head is built on top of a frozen, pretrained image encoder from HuggingFace. The classifier outputs either: + - A single probability (binary classification), or + - Logits (multi-class classification). + +4. **Training Loop Execution** + The script performs: + - Forward and backward passes, + - Optimization steps, + - Periodic logging, evaluation, and checkpoint saving. + +--- + +## Configuring with Hydra + +For detailed information about Hydra usage, refer to [`examples/4_train_policy_with_script.md`](../examples/4_train_policy_with_script.md). However, note that training the reward classifier differs slightly and requires a separate configuration file. + +### Config File Setup + +The default `default.yaml` cannot launch the reward classifier training directly. 
Instead, you need a configuration file like [`lerobot/configs/policy/hilserl_classifier.yaml`](../../lerobot/configs/policy/hilserl_classifier.yaml), with the following adjustment: + +Replace the `dataset_repo_id` field with the identifier for your dataset, which contains images and sparse rewards: + +```yaml +# Example: lerobot/configs/policy/reward_classifier.yaml +dataset_repo_id: "my_dataset_repo_id" +## Typical logs and metrics +``` +When you start the training process, you will first see your full configuration being printed in the terminal. You can check it to make sure that you config it correctly and your config is not overrided by other files. The final configuration will also be saved with the checkpoint. + +After that, you will see training log like this one: + +``` +[2024-11-29 18:26:36,999][root][INFO] - +Epoch 5/5 +Training: 82%|██████████████████████████████████████████████████████████████████████████████▋ | 91/111 [00:50<00:09, 2.04it/s, loss=0.2999, acc=69.99%] +``` + +or evaluation log like: + +``` +Validation: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:20<00:00, 1.37it/s] +``` + +### Metrics Tracking with Weights & Biases (WandB) + +If `wandb.enable` is set to `true`, the training and evaluation logs will also be saved in WandB. This allows you to track key metrics in real-time, including: + +- **Training Metrics**: + - `train/accuracy` + - `train/loss` + - `train/dataloading_s` +- **Evaluation Metrics**: + - `eval/accuracy` + - `eval/loss` + - `eval/eval_s` + +#### Additional Features + +You can also log sample predictions during evaluation. Each logged sample will include: + +- The **input image**. +- The **predicted label**. +- The **true label**. +- The **classifier's "confidence" (logits/probability)**. + +These logs can be useful for diagnosing and debugging performance issues. diff --git a/lerobot/common/datasets/lerobot_dataset.py b/lerobot/common/datasets/lerobot_dataset.py index b32cf709..23255805 100644 --- a/lerobot/common/datasets/lerobot_dataset.py +++ b/lerobot/common/datasets/lerobot_dataset.py @@ -291,7 +291,7 @@ class LeRobotDatasetMetadata: obj.root.mkdir(parents=True, exist_ok=False) if robot is not None: - features = get_features_from_robot(robot, use_videos) + features = {**(features or {}), **get_features_from_robot(robot)} robot_type = robot.robot_type if not all(cam.fps == fps for cam in robot.cameras.values()): logging.warning( diff --git a/lerobot/common/logger.py b/lerobot/common/logger.py index 3bd2df89..dec8b465 100644 --- a/lerobot/common/logger.py +++ b/lerobot/common/logger.py @@ -31,6 +31,7 @@ from termcolor import colored from torch.optim import Optimizer from torch.optim.lr_scheduler import LRScheduler +import wandb from lerobot.common.policies.policy_protocol import Policy from lerobot.common.utils.utils import get_global_random_state, set_global_random_state @@ -107,8 +108,6 @@ class Logger: self._wandb = None else: os.environ["WANDB_SILENT"] = "true" - import wandb - wandb_run_id = None if cfg.resume: wandb_run_id = get_wandb_run_id_from_filesystem(self.checkpoints_dir) @@ -232,7 +231,7 @@ class Logger: # TODO(alexander-soare): Add local text log. if self._wandb is not None: for k, v in d.items(): - if not isinstance(v, (int, float, str)): + if not isinstance(v, (int, float, str, wandb.Table)): logging.warning( f'WandB logging of key "{k}" was ignored as its type is not handled by this wrapper.' 
) diff --git a/lerobot/common/policies/hilserl/classifier/configuration_classifier.py b/lerobot/common/policies/hilserl/classifier/configuration_classifier.py new file mode 100644 index 00000000..209ff659 --- /dev/null +++ b/lerobot/common/policies/hilserl/classifier/configuration_classifier.py @@ -0,0 +1,36 @@ +import json +import os +from dataclasses import asdict, dataclass + +import torch + + +@dataclass +class ClassifierConfig: + """Configuration for the Classifier model.""" + + num_classes: int = 2 + hidden_dim: int = 256 + dropout_rate: float = 0.1 + model_name: str = "microsoft/resnet-50" + device: str = "cuda" if torch.cuda.is_available() else "mps" + model_type: str = "cnn" # "transformer" or "cnn" + + def save_pretrained(self, save_dir): + """Save config to json file.""" + os.makedirs(save_dir, exist_ok=True) + + # Convert to dict and save as JSON + config_dict = asdict(self) + with open(os.path.join(save_dir, "config.json"), "w") as f: + json.dump(config_dict, f, indent=2) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path): + """Load config from json file.""" + config_file = os.path.join(pretrained_model_name_or_path, "config.json") + + with open(config_file) as f: + config_dict = json.load(f) + + return cls(**config_dict) diff --git a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py new file mode 100644 index 00000000..dbb434a7 --- /dev/null +++ b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py @@ -0,0 +1,134 @@ +import logging +from typing import Optional + +import torch +from huggingface_hub import PyTorchModelHubMixin +from torch import Tensor, nn +from transformers import AutoImageProcessor, AutoModel + +from .configuration_classifier import ClassifierConfig + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + + +class ClassifierOutput: + """Wrapper for classifier outputs with additional metadata.""" + + def __init__( + self, logits: Tensor, probabilities: Optional[Tensor] = None, hidden_states: Optional[Tensor] = None + ): + self.logits = logits + self.probabilities = probabilities + self.hidden_states = hidden_states + + +class Classifier( + nn.Module, + PyTorchModelHubMixin, + # Add Hub metadata + library_name="lerobot", + repo_url="https://github.com/huggingface/lerobot", + tags=["robotics", "vision-classifier"], +): + """Image classifier built on top of a pre-trained encoder.""" + + # Add name attribute for factory + name = "classifier" + + def __init__(self, config: ClassifierConfig): + super().__init__() + self.config = config + self.processor = AutoImageProcessor.from_pretrained(self.config.model_name, trust_remote_code=True) + encoder = AutoModel.from_pretrained(self.config.model_name, trust_remote_code=True) + # Extract vision model if we're given a multimodal model + if hasattr(encoder, "vision_model"): + logging.info("Multimodal model detected - using vision encoder only") + self.encoder = encoder.vision_model + self.vision_config = encoder.config.vision_config + else: + self.encoder = encoder + self.vision_config = getattr(encoder, "config", None) + + # Model type from config + self.is_cnn = self.config.model_type == "cnn" + + # For CNNs, initialize backbone + if self.is_cnn: + self._setup_cnn_backbone() + + self._freeze_encoder() + self._build_classifier_head() + + def _setup_cnn_backbone(self): + """Set up CNN encoder""" + if 
hasattr(self.encoder, "fc"): + self.feature_dim = self.encoder.fc.in_features + self.encoder = nn.Sequential(*list(self.encoder.children())[:-1]) + elif hasattr(self.encoder.config, "hidden_sizes"): + self.feature_dim = self.encoder.config.hidden_sizes[-1] # Last channel dimension + else: + raise ValueError("Unsupported CNN architecture") + + def _freeze_encoder(self) -> None: + """Freeze the encoder parameters.""" + for param in self.encoder.parameters(): + param.requires_grad = False + + def _build_classifier_head(self) -> None: + """Initialize the classifier head architecture.""" + # Get input dimension based on model type + if self.is_cnn: + input_dim = self.feature_dim + else: # Transformer models + if hasattr(self.encoder.config, "hidden_size"): + input_dim = self.encoder.config.hidden_size + else: + raise ValueError("Unsupported transformer architecture since hidden_size is not found") + + self.classifier_head = nn.Sequential( + nn.Linear(input_dim, self.config.hidden_dim), + nn.Dropout(self.config.dropout_rate), + nn.LayerNorm(self.config.hidden_dim), + nn.ReLU(), + nn.Linear(self.config.hidden_dim, 1 if self.config.num_classes == 2 else self.config.num_classes), + ) + + def _get_encoder_output(self, x: torch.Tensor) -> torch.Tensor: + """Extract the appropriate output from the encoder.""" + # Process images with the processor (handles resizing and normalization) + processed = self.processor( + images=x, # LeRobotDataset already provides proper tensor format + return_tensors="pt", + ) + processed = processed["pixel_values"].to(x.device) + + with torch.no_grad(): + if self.is_cnn: + # The HF ResNet applies pooling internally + outputs = self.encoder(processed) + # Get pooled output directly + features = outputs.pooler_output + + if features.dim() > 2: + features = features.squeeze(-1).squeeze(-1) + return features + else: # Transformer models + outputs = self.encoder(processed) + if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None: + return outputs.pooler_output + return outputs.last_hidden_state[:, 0, :] + + def forward(self, x: torch.Tensor) -> ClassifierOutput: + """Forward pass of the classifier.""" + # For training, we expect input to be a tensor directly from LeRobotDataset + encoder_output = self._get_encoder_output(x) + logits = self.classifier_head(encoder_output) + + if self.config.num_classes == 2: + logits = logits.squeeze(-1) + probabilities = torch.sigmoid(logits) + else: + probabilities = torch.softmax(logits, dim=-1) + + return ClassifierOutput(logits=logits, probabilities=probabilities, hidden_states=encoder_output) diff --git a/lerobot/common/robot_devices/control_utils.py b/lerobot/common/robot_devices/control_utils.py index 8cc0f326..911a265b 100644 --- a/lerobot/common/robot_devices/control_utils.py +++ b/lerobot/common/robot_devices/control_utils.py @@ -120,14 +120,22 @@ def predict_action(observation, policy, device, use_amp): return action -def init_keyboard_listener(): - # Allow to exit early while recording an episode or resetting the environment, - # by tapping the right arrow key '->'. This might require a sudo permission - # to allow your terminal to monitor keyboard events. +def init_keyboard_listener(assign_rewards=False): + """ + Initializes a keyboard listener to enable early termination of an episode + or environment reset by pressing the right arrow key ('->'). This may require + sudo permissions to allow the terminal to monitor keyboard events. 
+ + Args: + assign_rewards (bool): If True, allows annotating the collected trajectory + with a binary reward at the end of the episode to indicate success. + """ events = {} events["exit_early"] = False events["rerecord_episode"] = False events["stop_recording"] = False + if assign_rewards: + events["next.reward"] = 0 if is_headless(): logging.warning( @@ -152,6 +160,13 @@ def init_keyboard_listener(): print("Escape key pressed. Stopping data recording...") events["stop_recording"] = True events["exit_early"] = True + elif assign_rewards and key == keyboard.Key.space: + events["next.reward"] = 1 if events["next.reward"] == 0 else 0 + print( + "Space key pressed. Assigning new reward to the subsequent frames. New reward:", + events["next.reward"], + ) + except Exception as e: print(f"Error handling key press: {e}") @@ -272,6 +287,8 @@ def control_loop( if dataset is not None: frame = {**observation, **action} + if "next.reward" in events: + frame["next.reward"] = events["next.reward"] dataset.add_frame(frame) if display_cameras and not is_headless(): @@ -301,6 +318,8 @@ def reset_environment(robot, events, reset_time_s): timestamp = 0 start_vencod_t = time.perf_counter() + if "next.reward" in events: + events["next.reward"] = 0 # Wait if necessary with tqdm.tqdm(total=reset_time_s, desc="Waiting") as pbar: diff --git a/lerobot/configs/policy/hilserl_classifier.yaml b/lerobot/configs/policy/hilserl_classifier.yaml new file mode 100644 index 00000000..be82bc4e --- /dev/null +++ b/lerobot/configs/policy/hilserl_classifier.yaml @@ -0,0 +1,48 @@ +# @package _global_ + +defaults: + - _self_ + +seed: 13 +dataset_repo_id: "dataset_repo_id" +train_split_proportion: 0.8 + +# Required by logger +env: + name: "classifier" + task: "binary_classification" + + +training: + num_epochs: 5 + batch_size: 16 + learning_rate: 1e-4 + num_workers: 4 + grad_clip_norm: 10 + use_amp: true + log_freq: 1 + eval_freq: 1 # How often to run validation (in epochs) + save_freq: 1 # How often to save checkpoints (in epochs) + save_checkpoint: true + image_key: "observation.images.phone" + label_key: "next.reward" + +eval: + batch_size: 16 + num_samples_to_log: 30 # Number of validation samples to log in the table + +policy: + name: "hilserl/classifier" + model_name: "facebook/convnext-base-224" + model_type: "cnn" + +wandb: + enable: false + project: "classifier-training" + entity: "wandb_entity" + job_name: "classifier_training_0" + disable_artifact: false + +device: "mps" +resume: false +output_dir: "output" diff --git a/lerobot/scripts/control_robot.py b/lerobot/scripts/control_robot.py index 12eaf146..45a6bd66 100644 --- a/lerobot/scripts/control_robot.py +++ b/lerobot/scripts/control_robot.py @@ -191,6 +191,7 @@ def record( single_task: str, pretrained_policy_name_or_path: str | None = None, policy_overrides: List[str] | None = None, + assign_rewards: bool = False, fps: int | None = None, warmup_time_s: int | float = 2, episode_time_s: int | float = 10, @@ -214,6 +215,9 @@ def record( policy = None device = None use_amp = None + extra_features = ( + {"next.reward": {"dtype": "int64", "shape": (1,), "names": None}} if assign_rewards else None + ) if single_task: task = single_task @@ -254,12 +258,12 @@ def record( use_videos=video, image_writer_processes=num_image_writer_processes, image_writer_threads=num_image_writer_threads_per_camera * len(robot.cameras), + features=extra_features, ) if not robot.is_connected: robot.connect() - - listener, events = init_keyboard_listener() + listener, events = 
init_keyboard_listener(assign_rewards=assign_rewards) # Execute a few seconds without recording to: # 1. teleoperate the robot to move it in starting position if no policy provided, @@ -469,12 +473,12 @@ if __name__ == "__main__": default=1, help="Upload dataset to Hugging Face hub.", ) - parser_record.add_argument( - "--tags", - type=str, - nargs="*", - help="Add tags to your dataset on the hub.", - ) + # parser_record.add_argument( + # "--tags", + # type=str, + # nargs="*", + # help="Add tags to your dataset on the hub.", + # ) parser_record.add_argument( "--num-image-writer-processes", type=int, @@ -517,6 +521,12 @@ if __name__ == "__main__": nargs="*", help="Any key=value arguments to override config values (use dots for.nested=overrides)", ) + parser_record.add_argument( + "--assign-rewards", + type=int, + default=0, + help="Enables the assignation of rewards to frames (by default no assignation). When enabled, assign a 0 reward to frames until the space bar is pressed which assign a 1 reward. Press the space bar a second time to assign a 0 reward. The reward assigned is reset to 0 when the episode ends.", + ) parser_replay = subparsers.add_parser("replay", parents=[base_parser]) parser_replay.add_argument( diff --git a/lerobot/scripts/train_hilserl_classifier.py b/lerobot/scripts/train_hilserl_classifier.py new file mode 100644 index 00000000..8dea68c6 --- /dev/null +++ b/lerobot/scripts/train_hilserl_classifier.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
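+
+"""Train a reward classifier on a LeRobotDataset of images and sparse rewards.
+
+Usage sketch (the dataset repo id and override values are placeholders):
+
+```
+python lerobot/scripts/train_hilserl_classifier.py \
+    dataset_repo_id=my_user/my_dataset \
+    training.num_epochs=5 \
+    wandb.enable=false
+```
+"""
+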
+import logging +import time +from contextlib import nullcontext +from pathlib import Path +from pprint import pformat + +import hydra +import torch +import torch.nn as nn +from deepdiff import DeepDiff +from omegaconf import DictConfig, OmegaConf +from termcolor import colored +from torch import optim +from torch.cuda.amp import GradScaler +from torch.utils.data import DataLoader, WeightedRandomSampler, random_split +from tqdm import tqdm + +import wandb +from lerobot.common.datasets.factory import resolve_delta_timestamps +from lerobot.common.datasets.lerobot_dataset import LeRobotDataset +from lerobot.common.logger import Logger +from lerobot.common.policies.factory import _policy_cfg_from_hydra_cfg +from lerobot.common.policies.hilserl.classifier.configuration_classifier import ClassifierConfig +from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier +from lerobot.common.utils.utils import ( + format_big_number, + get_safe_torch_device, + init_hydra_config, + set_global_seed, +) + + +def get_model(cfg, logger): + classifier_config = _policy_cfg_from_hydra_cfg(ClassifierConfig, cfg) + model = Classifier(classifier_config) + if cfg.resume: + model.load_state_dict(Classifier.from_pretrained(str(logger.last_pretrained_model_dir)).state_dict()) + return model + + +def create_balanced_sampler(dataset, cfg): + # Creates a weighted sampler to handle class imbalance + + labels = torch.tensor([item[cfg.training.label_key] for item in dataset]) + _, counts = torch.unique(labels, return_counts=True) + class_weights = 1.0 / counts.float() + sample_weights = class_weights[labels] + + return WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True) + + +def train_epoch(model, train_loader, criterion, optimizer, grad_scaler, device, logger, step, cfg): + # Single epoch training loop with AMP support and progress tracking + model.train() + correct = 0 + total = 0 + + pbar = tqdm(train_loader, desc="Training") + for batch_idx, batch in enumerate(pbar): + start_time = time.perf_counter() + images = batch[cfg.training.image_key].to(device) + labels = batch[cfg.training.label_key].float().to(device) + + # Forward pass with optional AMP + with torch.autocast(device_type=device.type) if cfg.training.use_amp else nullcontext(): + outputs = model(images) + loss = criterion(outputs.logits, labels) + + # Backward pass with gradient scaling if AMP enabled + optimizer.zero_grad() + if cfg.training.use_amp: + grad_scaler.scale(loss).backward() + grad_scaler.step(optimizer) + grad_scaler.update() + else: + loss.backward() + optimizer.step() + + # Track metrics + if model.config.num_classes == 2: + predictions = (torch.sigmoid(outputs.logits) > 0.5).float() + else: + predictions = torch.argmax(outputs.logits, dim=1) + correct += (predictions == labels).sum().item() + total += labels.size(0) + + current_acc = 100 * correct / total + train_info = { + "loss": loss.item(), + "accuracy": current_acc, + "dataloading_s": time.perf_counter() - start_time, + } + + logger.log_dict(train_info, step + batch_idx, mode="train") + pbar.set_postfix({"loss": f"{loss.item():.4f}", "acc": f"{current_acc:.2f}%"}) + + +def validate(model, val_loader, criterion, device, logger, cfg, num_samples_to_log=8): + # Validation loop with metric tracking and sample logging + model.eval() + correct = 0 + total = 0 + batch_start_time = time.perf_counter() + samples = [] + running_loss = 0 + + with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.training.use_amp else 
nullcontext(): + for batch in tqdm(val_loader, desc="Validation"): + images = batch[cfg.training.image_key].to(device) + labels = batch[cfg.training.label_key].float().to(device) + + outputs = model(images) + loss = criterion(outputs.logits, labels) + + # Track metrics + if model.config.num_classes == 2: + predictions = (torch.sigmoid(outputs.logits) > 0.5).float() + else: + predictions = torch.argmax(outputs.logits, dim=1) + correct += (predictions == labels).sum().item() + total += labels.size(0) + running_loss += loss.item() + + # Log sample predictions for visualization + if len(samples) < num_samples_to_log: + for i in range(min(num_samples_to_log - len(samples), len(images))): + if model.config.num_classes == 2: + confidence = round(outputs.probabilities[i].item(), 3) + else: + confidence = [round(prob, 3) for prob in outputs.probabilities[i].tolist()] + samples.append( + { + "image": wandb.Image(images[i].cpu()), + "true_label": labels[i].item(), + "predicted": predictions[i].item(), + "confidence": confidence, + } + ) + + accuracy = 100 * correct / total + avg_loss = running_loss / len(val_loader) + + eval_info = { + "loss": avg_loss, + "accuracy": accuracy, + "eval_s": time.perf_counter() - batch_start_time, + "eval/prediction_samples": wandb.Table( + data=[[s["image"], s["true_label"], s["predicted"], f"{s['confidence']}"] for s in samples], + columns=["Image", "True Label", "Predicted", "Confidence"], + ) + if logger._cfg.wandb.enable + else None, + } + + return accuracy, eval_info + + +@hydra.main(version_base="1.2", config_path="../configs", config_name="classifier") +def train(cfg: DictConfig) -> None: + # Main training pipeline with support for resuming training + logging.info(OmegaConf.to_yaml(cfg)) + + # Initialize training environment + device = get_safe_torch_device(cfg.device, log=True) + set_global_seed(cfg.seed) + + out_dir = Path(cfg.output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + logger = Logger(cfg, out_dir, cfg.wandb.job_name if cfg.wandb.enable else None) + + # Setup dataset and dataloaders + dataset = LeRobotDataset(cfg.dataset_repo_id) + logging.info(f"Dataset size: {len(dataset)}") + + train_size = int(cfg.train_split_proportion * len(dataset)) + val_size = len(dataset) - train_size + train_dataset, val_dataset = random_split(dataset, [train_size, val_size]) + + sampler = create_balanced_sampler(train_dataset, cfg) + train_loader = DataLoader( + train_dataset, + batch_size=cfg.training.batch_size, + num_workers=cfg.training.num_workers, + sampler=sampler, + pin_memory=True, + ) + + val_loader = DataLoader( + val_dataset, + batch_size=cfg.eval.batch_size, + shuffle=False, + num_workers=cfg.training.num_workers, + pin_memory=True, + ) + + # Resume training if requested + step = 0 + best_val_acc = 0 + + if cfg.resume: + if not Logger.get_last_checkpoint_dir(out_dir).exists(): + raise RuntimeError( + "You have set resume=True, but there is no model checkpoint in " + f"{Logger.get_last_checkpoint_dir(out_dir)}" + ) + checkpoint_cfg_path = str(Logger.get_last_pretrained_model_dir(out_dir) / "config.yaml") + logging.info( + colored( + "You have set resume=True, indicating that you wish to resume a run", + color="yellow", + attrs=["bold"], + ) + ) + # Load and validate checkpoint configuration + checkpoint_cfg = init_hydra_config(checkpoint_cfg_path) + # Check for differences between the checkpoint configuration and provided configuration. + # Hack to resolve the delta_timestamps ahead of time in order to properly diff. 
+ resolve_delta_timestamps(cfg) + diff = DeepDiff(OmegaConf.to_container(checkpoint_cfg), OmegaConf.to_container(cfg)) + # Ignore the `resume` and parameters. + if "values_changed" in diff and "root['resume']" in diff["values_changed"]: + del diff["values_changed"]["root['resume']"] + if len(diff) > 0: + logging.warning( + "At least one difference was detected between the checkpoint configuration and " + f"the provided configuration: \n{pformat(diff)}\nNote that the checkpoint configuration " + "takes precedence.", + ) + # Use the checkpoint config instead of the provided config (but keep `resume` parameter). + cfg = checkpoint_cfg + cfg.resume = True + + # Initialize model and training components + model = get_model(cfg=cfg, logger=logger).to(device) + + optimizer = optim.AdamW(model.parameters(), lr=cfg.training.learning_rate) + # Use BCEWithLogitsLoss for binary classification and CrossEntropyLoss for multi-class + criterion = nn.BCEWithLogitsLoss() if model.config.num_classes == 2 else nn.CrossEntropyLoss() + grad_scaler = GradScaler(enabled=cfg.training.use_amp) + + # Log model parameters + num_learnable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + num_total_params = sum(p.numel() for p in model.parameters()) + logging.info(f"Learnable parameters: {format_big_number(num_learnable_params)}") + logging.info(f"Total parameters: {format_big_number(num_total_params)}") + + if cfg.resume: + step = logger.load_last_training_state(optimizer, None) + + # Training loop with validation and checkpointing + for epoch in range(cfg.training.num_epochs): + logging.info(f"\nEpoch {epoch+1}/{cfg.training.num_epochs}") + + train_epoch(model, train_loader, criterion, optimizer, grad_scaler, device, logger, step, cfg) + + # Periodic validation + if cfg.training.eval_freq > 0 and (epoch + 1) % cfg.training.eval_freq == 0: + val_acc, eval_info = validate( + model, + val_loader, + criterion, + device, + logger, + cfg, + ) + logger.log_dict(eval_info, step + len(train_loader), mode="eval") + + # Save best model + if val_acc > best_val_acc: + best_val_acc = val_acc + logger.save_checkpoint( + train_step=step + len(train_loader), + policy=model, + optimizer=optimizer, + scheduler=None, + identifier="best", + ) + + # Periodic checkpointing + if cfg.training.save_checkpoint and (epoch + 1) % cfg.training.save_freq == 0: + logger.save_checkpoint( + train_step=step + len(train_loader), + policy=model, + optimizer=optimizer, + scheduler=None, + identifier=f"{epoch+1:06d}", + ) + + step += len(train_loader) + + logging.info("Training completed") + + +if __name__ == "__main__": + train() diff --git a/tests/test_train_hilserl_classifier.py b/tests/test_train_hilserl_classifier.py new file mode 100644 index 00000000..66d8fbe4 --- /dev/null +++ b/tests/test_train_hilserl_classifier.py @@ -0,0 +1,251 @@ +import os +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +import torch +from hydra import compose, initialize_config_dir +from torch import nn +from torch.utils.data import Dataset + +from lerobot.common.policies.hilserl.classifier.configuration_classifier import ClassifierConfig +from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier +from lerobot.scripts.train_hilserl_classifier import ( + create_balanced_sampler, + train, + train_epoch, + validate, +) + + +class MockDataset(Dataset): + def __init__(self, data): + self.data = data + self.meta = MagicMock() + self.meta.stats = {} + + def __getitem__(self, 
idx): + return self.data[idx] + + def __len__(self): + return len(self.data) + + +def make_dummy_model(): + model_config = ClassifierConfig(num_classes=2, model_name="hf-tiny-model-private/tiny-random-ResNetModel") + model = Classifier(config=model_config) + return model + + +def test_create_balanced_sampler(): + # Mock dataset with imbalanced classes + data = [ + {"label": 0}, + {"label": 0}, + {"label": 1}, + {"label": 0}, + {"label": 1}, + {"label": 1}, + {"label": 1}, + {"label": 1}, + ] + dataset = MockDataset(data) + cfg = MagicMock() + cfg.training.label_key = "label" + + sampler = create_balanced_sampler(dataset, cfg) + + # Get weights from the sampler + weights = sampler.weights.float() + + # Check that samples have appropriate weights + labels = [item["label"] for item in data] + class_counts = torch.tensor([labels.count(0), labels.count(1)], dtype=torch.float32) + class_weights = 1.0 / class_counts + expected_weights = torch.tensor([class_weights[label] for label in labels], dtype=torch.float32) + + # Test that the weights are correct + assert torch.allclose(weights, expected_weights) + + +def test_train_epoch(): + model = make_dummy_model() + # Mock components + model.train = MagicMock() + + train_loader = [ + { + "image": torch.rand(2, 3, 224, 224), + "label": torch.tensor([0.0, 1.0]), + } + ] + + criterion = nn.BCEWithLogitsLoss() + optimizer = MagicMock() + grad_scaler = MagicMock() + device = torch.device("cpu") + logger = MagicMock() + step = 0 + cfg = MagicMock() + cfg.training.image_key = "image" + cfg.training.label_key = "label" + cfg.training.use_amp = False + + # Call the function under test + train_epoch( + model, + train_loader, + criterion, + optimizer, + grad_scaler, + device, + logger, + step, + cfg, + ) + + # Check that model.train() was called + model.train.assert_called_once() + + # Check that optimizer.zero_grad() was called + optimizer.zero_grad.assert_called() + + # Check that logger.log_dict was called + logger.log_dict.assert_called() + + +def test_validate(): + model = make_dummy_model() + + # Mock components + model.eval = MagicMock() + val_loader = [ + { + "image": torch.rand(2, 3, 224, 224), + "label": torch.tensor([0.0, 1.0]), + } + ] + criterion = nn.BCEWithLogitsLoss() + device = torch.device("cpu") + logger = MagicMock() + cfg = MagicMock() + cfg.training.image_key = "image" + cfg.training.label_key = "label" + cfg.training.use_amp = False + + # Call validate + accuracy, eval_info = validate(model, val_loader, criterion, device, logger, cfg) + + # Check that model.eval() was called + model.eval.assert_called_once() + + # Check accuracy/eval_info are calculated and of the correct type + assert isinstance(accuracy, float) + assert isinstance(eval_info, dict) + + +@pytest.mark.parametrize("resume", [True, False]) +@patch("lerobot.scripts.train_hilserl_classifier.init_hydra_config") +@patch("lerobot.scripts.train_hilserl_classifier.Logger.get_last_checkpoint_dir") +@patch("lerobot.scripts.train_hilserl_classifier.Logger.get_last_pretrained_model_dir") +@patch("lerobot.scripts.train_hilserl_classifier.Logger") +@patch("lerobot.scripts.train_hilserl_classifier.LeRobotDataset") +@patch("lerobot.scripts.train_hilserl_classifier.make_policy") +def test_resume_function( + mock_make_policy, + mock_dataset, + mock_logger, + mock_get_last_pretrained_model_dir, + mock_get_last_checkpoint_dir, + mock_init_hydra_config, + resume, +): + # Initialize Hydra + test_file_dir = os.path.dirname(os.path.abspath(__file__)) + config_dir = 
os.path.abspath(os.path.join(test_file_dir, "..", "lerobot", "configs", "policy")) + assert os.path.exists(config_dir), f"Config directory does not exist at {config_dir}" + + with initialize_config_dir(config_dir=config_dir, job_name="test_app", version_base="1.2"): + cfg = compose( + config_name="reward_classifier", + overrides=[ + "device=cpu", + "seed=42", + f"output_dir={tempfile.mkdtemp()}", + "wandb.enable=False", + f"resume={resume}", + "dataset_repo_id=dataset_repo_id", + "train_split_proportion=0.8", + "training.num_workers=0", + "training.batch_size=2", + "training.image_key=image", + "training.label_key=label", + "training.use_amp=False", + "training.num_epochs=1", + "eval.batch_size=2", + ], + ) + + # Mock the init_hydra_config function to return cfg + mock_init_hydra_config.return_value = cfg + + # Mock dataset + dataset = MockDataset([{"image": torch.rand(3, 224, 224), "label": i % 2} for i in range(10)]) + mock_dataset.return_value = dataset + + # Mock checkpoint handling + mock_checkpoint_dir = MagicMock(spec=Path) + mock_checkpoint_dir.exists.return_value = resume # Only exists if resuming + mock_get_last_checkpoint_dir.return_value = mock_checkpoint_dir + mock_get_last_pretrained_model_dir.return_value = Path(tempfile.mkdtemp()) + + # Mock logger + logger = MagicMock() + resumed_step = 1000 + if resume: + logger.load_last_training_state.return_value = resumed_step + else: + logger.load_last_training_state.return_value = 0 + mock_logger.return_value = logger + + # Instantiate the model and set make_policy to return it + model = make_dummy_model() + mock_make_policy.return_value = model + + # Call train + train(cfg) + + # Check that checkpoint handling methods were called + if resume: + mock_get_last_checkpoint_dir.assert_called_once_with(Path(cfg.output_dir)) + mock_get_last_pretrained_model_dir.assert_called_once_with(Path(cfg.output_dir)) + mock_checkpoint_dir.exists.assert_called_once() + logger.load_last_training_state.assert_called_once() + else: + mock_get_last_checkpoint_dir.assert_not_called() + mock_get_last_pretrained_model_dir.assert_not_called() + mock_checkpoint_dir.exists.assert_not_called() + logger.load_last_training_state.assert_not_called() + + # Collect the steps from logger.log_dict calls + train_log_calls = logger.log_dict.call_args_list + + # Extract the steps used in the train logging + steps = [] + for call in train_log_calls: + mode = call.kwargs.get("mode", call.args[2] if len(call.args) > 2 else None) + if mode == "train": + step = call.kwargs.get("step", call.args[1] if len(call.args) > 1 else None) + steps.append(step) + + expected_start_step = resumed_step if resume else 0 + + # Calculate expected_steps + train_size = int(cfg.train_split_proportion * len(dataset)) + batch_size = cfg.training.batch_size + num_batches = (train_size + batch_size - 1) // batch_size + + expected_steps = [expected_start_step + i for i in range(num_batches)] + + assert steps == expected_steps, f"Expected steps {expected_steps}, got {steps}" From 7fcf638c0d350aa40ac6cfed46ed4b285647b7ea Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Mon, 9 Dec 2024 19:17:47 +0100 Subject: [PATCH 003/112] Add human intervention mechanism and eval_robot script to evaluate policy on the robot (#541) Co-authored-by: Yoel --- lerobot/configs/robot/koch.yaml | 4 +- lerobot/configs/robot/so100.yaml | 2 +- lerobot/scripts/eval_on_robot.py | 335 +++++++++++++++++++++++++++++++ 3 files changed, 338 insertions(+), 3 deletions(-) create mode 100644 lerobot/scripts/eval_on_robot.py diff 
--git a/lerobot/configs/robot/koch.yaml b/lerobot/configs/robot/koch.yaml index 40969dc7..334db830 100644 --- a/lerobot/configs/robot/koch.yaml +++ b/lerobot/configs/robot/koch.yaml @@ -10,7 +10,7 @@ max_relative_target: null leader_arms: main: _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus - port: /dev/tty.usbmodem575E0031751 + port: /dev/tty.usbmodem58760430441 motors: # name: (index, model) shoulder_pan: [1, "xl330-m077"] @@ -23,7 +23,7 @@ leader_arms: follower_arms: main: _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus - port: /dev/tty.usbmodem575E0032081 + port: /dev/tty.usbmodem585A0083391 motors: # name: (index, model) shoulder_pan: [1, "xl430-w250"] diff --git a/lerobot/configs/robot/so100.yaml b/lerobot/configs/robot/so100.yaml index ec6f3e3f..0978de64 100644 --- a/lerobot/configs/robot/so100.yaml +++ b/lerobot/configs/robot/so100.yaml @@ -18,7 +18,7 @@ max_relative_target: null leader_arms: main: _target_: lerobot.common.robot_devices.motors.feetech.FeetechMotorsBus - port: /dev/tty.usbmodem585A0077581 + port: /dev/tty.usbmodem58760433331 motors: # name: (index, model) shoulder_pan: [1, "sts3215"] diff --git a/lerobot/scripts/eval_on_robot.py b/lerobot/scripts/eval_on_robot.py new file mode 100644 index 00000000..6a790f0a --- /dev/null +++ b/lerobot/scripts/eval_on_robot.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Evaluate a policy by running rollouts on the real robot and computing metrics. + +Usage examples: evaluate a checkpoint from the LeRobot training script for 10 episodes. + +``` +python lerobot/scripts/eval_on_robot.py \ + -p outputs/train/model/checkpoints/005000/pretrained_model \ + eval.n_episodes=10 +``` + +**NOTE** (michel-aractingi): This script is incomplete and it is being prepared +for running training on the real robot. +""" + +import argparse +import logging +import time +from copy import deepcopy + +import numpy as np +import torch +from tqdm import trange + +from lerobot.common.policies.policy_protocol import Policy +from lerobot.common.robot_devices.control_utils import busy_wait, is_headless +from lerobot.common.robot_devices.robots.factory import Robot, make_robot +from lerobot.common.utils.utils import ( + init_hydra_config, + init_logging, + log_say, +) + + +def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, use_amp: bool = True) -> dict: + """Run a batched policy rollout on the real robot. + + The return dictionary contains: + "robot": A a dictionary of (batch, sequence + 1, *) tensors mapped to observation + keys. NOTE the that this has an extra sequence element relative to the other keys in the + dictionary. This is because an extra observation is included for after the environment is + terminated or truncated. + "action": A (batch, sequence, action_dim) tensor of actions applied based on the observations (not + including the last observations). 
+ "reward": A (batch, sequence) tensor of rewards received for applying the actions. + "success": A (batch, sequence) tensor of success conditions (the only time this can be True is upon + environment termination/truncation). + "done": A (batch, sequence) tensor of **cumulative** done conditions. For any given batch element, + the first True is followed by True's all the way till the end. This can be used for masking + extraneous elements from the sequences above. + + Args: + robot: The robot class that defines the interface with the real robot. + policy: The policy. Must be a PyTorch nn module. + + Returns: + The dictionary described above. + """ + # assert isinstance(policy, nn.Module), "Policy must be a PyTorch nn module." + # device = get_device_from_parameters(policy) + + # define keyboard listener + listener, events = init_keyboard_listener() + + # Reset the policy. TODO (michel-aractingi) add real policy evaluation once the code is ready. + # policy.reset() + + # Get observation from real robot + observation = robot.capture_observation() + + # Calculate reward. TODO (michel-aractingi) + # in HIL-SERL it will be with a reward classifier + reward = calculate_reward(observation) + all_observations = [] + all_actions = [] + all_rewards = [] + all_successes = [] + + start_episode_t = time.perf_counter() + timestamp = 0.0 + while timestamp < control_time_s: + start_loop_t = time.perf_counter() + + all_observations.append(deepcopy(observation)) + # observation = {key: observation[key].to(device, non_blocking=True) for key in observation} + + # Apply the next action. + while events["pause_policy"] and not events["human_intervention_step"]: + busy_wait(0.5) + + if events["human_intervention_step"]: + # take over the robot's actions + observation, action = robot.teleop_step(record_data=True) + action = action["action"] # teleop step returns torch tensors but in a dict + else: + # explore with policy + with torch.inference_mode(): + action = robot.follower_arms["main"].read("Present_Position") + action = torch.from_numpy(action) + robot.send_action(action) + # action = predict_action(observation, policy, device, use_amp) + + observation = robot.capture_observation() + # Calculate reward + # in HIL-SERL it will be with a reward classifier + reward = calculate_reward(observation) + + all_actions.append(action) + all_rewards.append(torch.from_numpy(reward)) + all_successes.append(torch.tensor([False])) + + dt_s = time.perf_counter() - start_loop_t + busy_wait(1 / fps - dt_s) + timestamp = time.perf_counter() - start_episode_t + if events["exit_early"]: + events["exit_early"] = False + events["human_intervention_step"] = False + events["pause_policy"] = False + break + all_observations.append(deepcopy(observation)) + + dones = torch.tensor([False] * len(all_actions)) + dones[-1] = True + # Stack the sequence along the first dimension so that we have (batch, sequence, *) tensors. + ret = { + "action": torch.stack(all_actions, dim=1), + "next.reward": torch.stack(all_rewards, dim=1), + "next.success": torch.stack(all_successes, dim=1), + "done": dones, + } + stacked_observations = {} + for key in all_observations[0]: + stacked_observations[key] = torch.stack([obs[key] for obs in all_observations], dim=1) + ret["observation"] = stacked_observations + + listener.stop() + + return ret + + +def eval_policy( + robot: Robot, + policy: torch.nn.Module, + fps: float, + n_episodes: int, + control_time_s: int = 20, + use_amp: bool = True, +) -> dict: + """ + Args: + env: The batch of environments. 
+ policy: The policy. + n_episodes: The number of episodes to evaluate. + Returns: + Dictionary with metrics and data regarding the rollouts. + """ + # TODO (michel-aractingi) comment this out for testing with a fixed policy + # assert isinstance(policy, Policy) + # policy.eval() + + sum_rewards = [] + max_rewards = [] + successes = [] + rollouts = [] + + start_eval = time.perf_counter() + progbar = trange(n_episodes, desc="Evaluating policy on real robot") + for _batch_idx in progbar: + rollout_data = rollout(robot, policy, fps, control_time_s, use_amp) + + rollouts.append(rollout_data) + sum_rewards.append(sum(rollout_data["next.reward"])) + max_rewards.append(max(rollout_data["next.reward"])) + successes.append(rollout_data["next.success"][-1]) + + info = { + "per_episode": [ + { + "episode_ix": i, + "sum_reward": sum_reward, + "max_reward": max_reward, + "pc_success": success * 100, + } + for i, (sum_reward, max_reward, success) in enumerate( + zip( + sum_rewards[:n_episodes], + max_rewards[:n_episodes], + successes[:n_episodes], + strict=False, + ) + ) + ], + "aggregated": { + "avg_sum_reward": float(np.nanmean(torch.cat(sum_rewards[:n_episodes]))), + "avg_max_reward": float(np.nanmean(torch.cat(max_rewards[:n_episodes]))), + "pc_success": float(np.nanmean(torch.cat(successes[:n_episodes])) * 100), + "eval_s": time.time() - start_eval, + "eval_ep_s": (time.time() - start_eval) / n_episodes, + }, + } + + if robot.is_connected: + robot.disconnect() + + return info + + +def calculate_reward(observation): + """ + Method to calculate reward function in some way. + In HIL-SERL this is done through defining a reward classifier + """ + # reward = reward_classifier(observation) + return np.array([0.0]) + + +def init_keyboard_listener(): + # Allow to exit early while recording an episode or resetting the environment, + # by tapping the right arrow key '->'. This might require a sudo permission + # to allow your terminal to monitor keyboard events. + events = {} + events["exit_early"] = False + events["rerecord_episode"] = False + events["pause_policy"] = False + events["human_intervention_step"] = False + + if is_headless(): + logging.warning( + "Headless environment detected. On-screen cameras display and keyboard inputs will not be available." + ) + listener = None + return listener, events + + # Only import pynput if not in a headless environment + from pynput import keyboard + + def on_press(key): + try: + if key == keyboard.Key.right: + print("Right arrow key pressed. Exiting loop...") + events["exit_early"] = True + elif key == keyboard.Key.left: + print("Left arrow key pressed. Exiting loop and rerecord the last episode...") + events["rerecord_episode"] = True + events["exit_early"] = True + elif key == keyboard.Key.space: + # check if first space press then pause the policy for the user to get ready + # if second space press then the user is ready to start intervention + if not events["pause_policy"]: + print( + "Space key pressed. Human intervention required.\n" + "Place the leader in similar pose to the follower and press space again." + ) + events["pause_policy"] = True + log_say("Human intervention stage. Get ready to take over.", play_sounds=True) + else: + events["human_intervention_step"] = True + print("Space key pressed. 
Human intervention starting.") + log_say("Starting human intervention.", play_sounds=True) + + except Exception as e: + print(f"Error handling key press: {e}") + + listener = keyboard.Listener(on_press=on_press) + listener.start() + + return listener, events + + +if __name__ == "__main__": + init_logging() + + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + "--robot-path", + type=str, + default="lerobot/configs/robot/koch.yaml", + help="Path to robot yaml file used to instantiate the robot using `make_robot` factory function.", + ) + group.add_argument( + "--robot-overrides", + type=str, + nargs="*", + help="Any key=value arguments to override config values (use dots for.nested=overrides)", + ) + group.add_argument( + "-p", + "--pretrained-policy-name-or-path", + help=( + "Either the repo ID of a model hosted on the Hub or a path to a directory containing weights " + "saved using `Policy.save_pretrained`. If not provided, the policy is initialized from scratch " + "(useful for debugging). This argument is mutually exclusive with `--config`." + ), + ) + group.add_argument( + "--config", + help=( + "Path to a yaml config you want to use for initializing a policy from scratch (useful for " + "debugging). This argument is mutually exclusive with `--pretrained-policy-name-or-path` (`-p`)." + ), + ) + parser.add_argument("--revision", help="Optionally provide the Hugging Face Hub revision ID.") + parser.add_argument( + "--out-dir", + help=( + "Where to save the evaluation outputs. If not provided, outputs are saved in " + "outputs/eval/{timestamp}_{env_name}_{policy_name}" + ), + ) + + args = parser.parse_args() + + robot_cfg = init_hydra_config(args.robot_path, args.robot_overrides) + robot = make_robot(robot_cfg) + if not robot.is_connected: + robot.connect() + + eval_policy(robot, None, fps=40, n_episodes=2, control_time_s=100) From 1020bc3108b79f6f9c6d5a6d3e7ea241419dc8fe Mon Sep 17 00:00:00 2001 From: Eugene Mironov Date: Tue, 17 Dec 2024 02:42:53 +0700 Subject: [PATCH 004/112] Fixup --- lerobot/common/logger.py | 2 +- lerobot/common/robot_devices/control_utils.py | 6 +++--- lerobot/scripts/train_hilserl_classifier.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lerobot/common/logger.py b/lerobot/common/logger.py index dec8b465..4015492d 100644 --- a/lerobot/common/logger.py +++ b/lerobot/common/logger.py @@ -25,13 +25,13 @@ from glob import glob from pathlib import Path import torch +import wandb from huggingface_hub.constants import SAFETENSORS_SINGLE_FILE from omegaconf import DictConfig, OmegaConf from termcolor import colored from torch.optim import Optimizer from torch.optim.lr_scheduler import LRScheduler -import wandb from lerobot.common.policies.policy_protocol import Policy from lerobot.common.utils.utils import get_global_random_state, set_global_random_state diff --git a/lerobot/common/robot_devices/control_utils.py b/lerobot/common/robot_devices/control_utils.py index 911a265b..8a6bcfbd 100644 --- a/lerobot/common/robot_devices/control_utils.py +++ b/lerobot/common/robot_devices/control_utils.py @@ -122,12 +122,12 @@ def predict_action(observation, policy, device, use_amp): def init_keyboard_listener(assign_rewards=False): """ - Initializes a keyboard listener to enable early termination of an episode - or environment reset by pressing the right arrow key ('->'). 
This may require + Initializes a keyboard listener to enable early termination of an episode + or environment reset by pressing the right arrow key ('->'). This may require sudo permissions to allow the terminal to monitor keyboard events. Args: - assign_rewards (bool): If True, allows annotating the collected trajectory + assign_rewards (bool): If True, allows annotating the collected trajectory with a binary reward at the end of the episode to indicate success. """ events = {} diff --git a/lerobot/scripts/train_hilserl_classifier.py b/lerobot/scripts/train_hilserl_classifier.py index 8dea68c6..86fa90f2 100644 --- a/lerobot/scripts/train_hilserl_classifier.py +++ b/lerobot/scripts/train_hilserl_classifier.py @@ -22,6 +22,7 @@ from pprint import pformat import hydra import torch import torch.nn as nn +import wandb from deepdiff import DeepDiff from omegaconf import DictConfig, OmegaConf from termcolor import colored @@ -30,7 +31,6 @@ from torch.cuda.amp import GradScaler from torch.utils.data import DataLoader, WeightedRandomSampler, random_split from tqdm import tqdm -import wandb from lerobot.common.datasets.factory import resolve_delta_timestamps from lerobot.common.datasets.lerobot_dataset import LeRobotDataset from lerobot.common.logger import Logger From 668d493bf997b7d08178d4288a7177d71bb808cf Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Wed, 11 Dec 2024 00:22:10 +0100 Subject: [PATCH 010/112] Update lerobot/scripts/train_hilserl_classifier.py Co-authored-by: Yoel --- lerobot/scripts/train_hilserl_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lerobot/scripts/train_hilserl_classifier.py b/lerobot/scripts/train_hilserl_classifier.py index 86fa90f2..78659dc8 100644 --- a/lerobot/scripts/train_hilserl_classifier.py +++ b/lerobot/scripts/train_hilserl_classifier.py @@ -170,7 +170,7 @@ def validate(model, val_loader, criterion, device, logger, cfg, num_samples_to_l return accuracy, eval_info -@hydra.main(version_base="1.2", config_path="../configs", config_name="classifier") +@hydra.main(version_base="1.2", config_path="../configs", config_name="hilserl_classifier") def train(cfg: DictConfig) -> None: # Main training pipeline with support for resuming training logging.info(OmegaConf.to_yaml(cfg)) From ed66c92383da2bb297d76ae488cd178d8642b252 Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Wed, 11 Dec 2024 00:30:33 +0100 Subject: [PATCH 011/112] nit in control_robot.py --- .../policies/hilserl/configuration_hilserl.py | 23 +++++++++++++++ .../policies/hilserl/modeling_hilserl.py | 29 +++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 lerobot/common/policies/hilserl/configuration_hilserl.py create mode 100644 lerobot/common/policies/hilserl/modeling_hilserl.py diff --git a/lerobot/common/policies/hilserl/configuration_hilserl.py b/lerobot/common/policies/hilserl/configuration_hilserl.py new file mode 100644 index 00000000..f1bc850f --- /dev/null +++ b/lerobot/common/policies/hilserl/configuration_hilserl.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass + + +@dataclass +class HILSerlConfig: + pass diff --git a/lerobot/common/policies/hilserl/modeling_hilserl.py b/lerobot/common/policies/hilserl/modeling_hilserl.py new file mode 100644 index 00000000..236ed433 --- /dev/null +++ b/lerobot/common/policies/hilserl/modeling_hilserl.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch.nn as nn +from huggingface_hub import PyTorchModelHubMixin + + +class HILSerlPolicy( + nn.Module, + PyTorchModelHubMixin, + library_name="lerobot", + repo_url="https://github.com/huggingface/lerobot", + tags=["robotics", "hilserl"], +): + pass \ No newline at end of file From c9af8e36a722d95908ffdf173038863a628f17e3 Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Thu, 12 Dec 2024 11:45:30 +0100 Subject: [PATCH 012/112] completed losses --- lerobot/common/policies/sac/modeling_sac.py | 187 ++++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 lerobot/common/policies/sac/modeling_sac.py diff --git a/lerobot/common/policies/sac/modeling_sac.py b/lerobot/common/policies/sac/modeling_sac.py new file mode 100644 index 00000000..fb2e5542 --- /dev/null +++ b/lerobot/common/policies/sac/modeling_sac.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
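+
+# SAC losses computed in `SACPolicy.forward` below (a summary; `alpha` is the learned temperature):
+#   critics:     L_Q     = E[ ( Q_i(s, a) - (r + gamma * min_j Q_target_j(s', a')) )^2 ],  a' ~ pi
+#   actor:       L_pi    = E[ alpha * log pi(a|s) - Q(s, a) ],  a ~ pi
+#   temperature: L_alpha = alpha * ( E[ -log pi(a|s) ] - H_target )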
+ +from collections import deque + +import einops + +import torch +import torch.nn as nn +import torch.nn.functional as F # noqa: N812 +from torch import Tensor + +from huggingface_hub import PyTorchModelHubMixin +from lerobot.common.policies.normalize import Normalize, Unnormalize +from lerobot.common.policies.sac.configuration_sac import SACConfig + +class SACPolicy( + nn.Module, + PyTorchModelHubMixin, + library_name="lerobot", + repo_url="https://github.com/huggingface/lerobot", + tags=["robotics", "RL", "SAC"], +): + + def __init__( + self, config: SACConfig | None = None, dataset_stats: dict[str, dict[str, Tensor]] | None = None + ): + + super().__init__() + + if config is None: + config = SACConfig() + self.config = config + + if config.input_normalization_modes is not None: + self.normalize_inputs = Normalize( + config.input_shapes, config.input_normalization_modes, dataset_stats + ) + else: + self.normalize_inputs = nn.Identity() + self.normalize_targets = Normalize( + config.output_shapes, config.output_normalization_modes, dataset_stats + ) + self.unnormalize_outputs = Unnormalize( + config.output_shapes, config.output_normalization_modes, dataset_stats + ) + + self.critic_ensemble = ... + self.critic_target = ... + self.actor_network = ... + + self.temperature = ... + + def reset(self): + """ + Clear observation and action queues. Should be called on `env.reset()` + queues are populated during rollout of the policy, they contain the n latest observations and actions + """ + + self._queues = { + "observation.state": deque(maxlen=1), + "action": deque(maxlen=1), + } + if self._use_image: + self._queues["observation.image"] = deque(maxlen=1) + if self._use_env_state: + self._queues["observation.environment_state"] = deque(maxlen=1) + + @torch.no_grad() + def select_action(self, batch: dict[str, Tensor]) -> Tensor: + actions, _ = self.actor_network(batch['observations'])### + + def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor | float]: + """Run the batch through the model and compute the loss. + + Returns a dictionary with loss as a tensor, and other information as native floats. + """ + batch = self.normalize_inputs(batch) + # batch shape is (b, 2, ...) where index 1 returns the current observation and + # the next observation for caluculating the right td index. + actions = batch["action"][:, 0] + rewards = batch["next.reward"][:, 0] + observations = {} + next_observations = {} + for k in batch: + if k.startswith("observation."): + observations[k] = batch[k][:, 0] + next_observations[k] = batch[k][:, 1] + + # perform image augmentation + + # reward bias + # from HIL-SERL code base + # add_or_replace={"rewards": batch["rewards"] + self.config["reward_bias"]} in reward_batch + + + # calculate critics loss + # 1- compute actions from policy + action_preds, log_probs = self.actor_network(observations) + # 2- compute q targets + q_targets = self.target_qs(next_observations, action_preds) + + # critics subsample size + min_q = q_targets.min(dim=0) + + # backup entropy + td_target = rewards + self.discount * min_q + + # 3- compute predicted qs + q_preds = self.critic_ensemble(observations, actions) + + # 4- Calculate loss + # Compute state-action value loss (TD loss) for all of the Q functions in the ensemble. + critics_loss = ( + F.mse_loss( + q_preds, + einops.repeat(td_target, "t b -> e t b", e=q_preds.shape[0]), + reduction="none", + ).sum(0) # sum over ensemble + # `q_preds_ensemble` depends on the first observation and the actions. 
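+            # The `~..._is_pad` multiplications below act as a mask: transitions that
+            # come from padding contribute zero to the TD loss, so episode-boundary
+            # padding does not corrupt the critic update.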
+ * ~batch["observation.state_is_pad"][0] + * ~batch["action_is_pad"] + # q_targets depends on the reward and the next observations. + * ~batch["next.reward_is_pad"] + * ~batch["observation.state_is_pad"][1:] + ).sum(0).mean() + + # calculate actors loss + # 1- temperature + temperature = self.temperature() + + # 2- get actions (batch_size, action_dim) and log probs (batch_size,) + actions, log_probs = self.actor_network(observations) \ + + # 3- get q-value predictions + with torch.no_grad(): + q_preds = self.critic_ensemble(observations, actions, return_type="mean") + actor_loss = ( + -(q_preds - temperature * log_probs).mean() + * ~batch["observation.state_is_pad"][0] + * ~batch["action_is_pad"] + ).mean() + + + # calculate temperature loss + # 1- calculate entropy + entropy = -log_probs.mean() + temperature_loss = temperature * (entropy - self.target_entropy).mean() + + loss = critics_loss + actor_loss + temperature_loss + + return { + "critics_loss": critics_loss.item(), + "actor_loss": actor_loss.item(), + "temperature_loss": temperature_loss.item(), + "temperature": temperature.item(), + "entropy": entropy.item(), + "loss": loss, + + } + + def update(self): + self.critic_target.lerp_(self.critic_ensemble, self.config.critic_target_update_weight) + #for target_param, param in zip(self.critic_target.parameters(), self.critic_ensemble.parameters()): + # target_param.data.copy_(target_param.data * (1.0 - self.config.critic_target_update_weight) + param.data * self.critic_target_update_weight) + +class SACObservationEncoder(nn.Module): + """Encode image and/or state vector observations.""" + + def __init__(self, config: SACConfig): + + super().__init__() + self.config = config From def42ff4874bfeda79b8d9746c858789d7fd81fb Mon Sep 17 00:00:00 2001 From: KeWang Date: Tue, 17 Dec 2024 13:26:17 +0000 Subject: [PATCH 013/112] Port SAC WIP (#581) Co-authored-by: KeWang1017 --- .../common/policies/sac/configuration_sac.py | 39 ++ lerobot/common/policies/sac/modeling_sac.py | 508 +++++++++++++++++- 2 files changed, 541 insertions(+), 6 deletions(-) create mode 100644 lerobot/common/policies/sac/configuration_sac.py diff --git a/lerobot/common/policies/sac/configuration_sac.py b/lerobot/common/policies/sac/configuration_sac.py new file mode 100644 index 00000000..441b3566 --- /dev/null +++ b/lerobot/common/policies/sac/configuration_sac.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
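+
+# Note: the attributes below carry no type annotations, so @dataclass does not turn
+# them into __init__ fields; they behave as shared class-level defaults. Illustrative
+# (hypothetical) usage:
+#
+#     cfg = SACConfig()          # takes no arguments
+#     cfg.discount               # -> 0.99
+#     cfg.num_critics = 10       # per-instance override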
+ +from dataclasses import dataclass + + +@dataclass +class SACConfig: + discount = 0.99 + temperature_init = 1.0 + num_critics = 2 + critic_lr = 3e-4 + actor_lr = 3e-4 + critic_network_kwargs = { + "hidden_dims": [256, 256], + "activate_final": True, + } + actor_network_kwargs = { + "hidden_dims": [256, 256], + "activate_final": True, + } + policy_kwargs = { + "tanh_squash_distribution": True, + "std_parameterization": "uniform", + } diff --git a/lerobot/common/policies/sac/modeling_sac.py b/lerobot/common/policies/sac/modeling_sac.py index fb2e5542..9ea9449d 100644 --- a/lerobot/common/policies/sac/modeling_sac.py +++ b/lerobot/common/policies/sac/modeling_sac.py @@ -15,7 +15,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +# TODO: (1) better device management + from collections import deque +from copy import deepcopy +from functools import partial import einops @@ -27,6 +31,10 @@ from torch import Tensor from huggingface_hub import PyTorchModelHubMixin from lerobot.common.policies.normalize import Normalize, Unnormalize from lerobot.common.policies.sac.configuration_sac import SACConfig +import numpy as np +from typing import Callable, Optional, Tuple, Sequence + + class SACPolicy( nn.Module, @@ -58,12 +66,27 @@ class SACPolicy( self.unnormalize_outputs = Unnormalize( config.output_shapes, config.output_normalization_modes, dataset_stats ) + encoder = SACObservationEncoder(config) + # Define networks + critic_nets = [] + for _ in range(config.num_critics): + critic_net = Critic( + encoder=encoder, + network=MLP(**config.critic_network_kwargs) + ) + critic_nets.append(critic_net) - self.critic_ensemble = ... - self.critic_target = ... - self.actor_network = ... + self.critic_ensemble = create_critic_ensemble(critic_nets, config.num_critics) + self.critic_target = deepcopy(self.critic_ensemble) - self.temperature = ... + self.actor_network = Policy( + encoder=encoder, + network=MLP(**config.actor_network_kwargs), + action_dim=config.output_shapes["action"][0], + **config.policy_kwargs + ) + + self.temperature = LagrangeMultiplier(init_value=config.temperature_init) def reset(self): """ @@ -178,10 +201,483 @@ class SACPolicy( #for target_param, param in zip(self.critic_target.parameters(), self.critic_ensemble.parameters()): # target_param.data.copy_(target_param.data * (1.0 - self.config.critic_target_update_weight) + param.data * self.critic_target_update_weight) + +class MLP(nn.Module): + def __init__( + self, + config: SACConfig, + activations: Callable[[torch.Tensor], torch.Tensor] | str = nn.SiLU(), + activate_final: bool = False, + dropout_rate: Optional[float] = None, + ): + super().__init__() + self.activate_final = config.activate_final + layers = [] + + for i, size in enumerate(config.network_hidden_dims): + layers.append(nn.Linear(config.network_hidden_dims[i-1] if i > 0 else config.network_hidden_dims[0], size)) + + if i + 1 < len(config.network_hidden_dims) or activate_final: + if dropout_rate is not None and dropout_rate > 0: + layers.append(nn.Dropout(p=dropout_rate)) + layers.append(nn.LayerNorm(size)) + layers.append(activations if isinstance(activations, nn.Module) else getattr(nn, activations)()) + + self.net = nn.Sequential(*layers) + + def forward(self, x: torch.Tensor, train: bool = False) -> torch.Tensor: + # in training mode or not. 
TODO: find better way to do this + self.train(train) + return self.net(x) + + +class Critic(nn.Module): + def __init__( + self, + encoder: Optional[nn.Module], + network: nn.Module, + init_final: Optional[float] = None, + activate_final: bool = False, + device: str = "cuda" + ): + super().__init__() + self.device = torch.device(device) + self.encoder = encoder + self.network = network + self.init_final = init_final + self.activate_final = activate_final + + # Output layer + if init_final is not None: + if self.activate_final: + self.output_layer = nn.Linear(network.net[-3].out_features, 1) + else: + self.output_layer = nn.Linear(network.net[-2].out_features, 1) + nn.init.uniform_(self.output_layer.weight, -init_final, init_final) + nn.init.uniform_(self.output_layer.bias, -init_final, init_final) + else: + if self.activate_final: + self.output_layer = nn.Linear(network.net[-3].out_features, 1) + else: + self.output_layer = nn.Linear(network.net[-2].out_features, 1) + orthogonal_init()(self.output_layer.weight) + + self.to(self.device) + + def forward( + self, + observations: torch.Tensor, + actions: torch.Tensor, + train: bool = False + ) -> torch.Tensor: + self.train(train) + + observations = observations.to(self.device) + actions = actions.to(self.device) + + if self.encoder is not None: + obs_enc = self.encoder(observations) + else: + obs_enc = observations + + inputs = torch.cat([obs_enc, actions], dim=-1) + x = self.network(inputs) + value = self.output_layer(x) + return value.squeeze(-1) + + def q_value_ensemble( + self, + observations: torch.Tensor, + actions: torch.Tensor, + train: bool = False + ) -> torch.Tensor: + observations = observations.to(self.device) + actions = actions.to(self.device) + + if len(actions.shape) == 3: # [batch_size, num_actions, action_dim] + batch_size, num_actions = actions.shape[:2] + obs_expanded = observations.unsqueeze(1).expand(-1, num_actions, -1) + obs_flat = obs_expanded.reshape(-1, observations.shape[-1]) + actions_flat = actions.reshape(-1, actions.shape[-1]) + q_values = self(obs_flat, actions_flat, train) + return q_values.reshape(batch_size, num_actions) + else: + return self(observations, actions, train) + + +class Policy(nn.Module): + def __init__( + self, + encoder: Optional[nn.Module], + network: nn.Module, + action_dim: int, + std_parameterization: str = "exp", + std_min: float = 1e-5, + std_max: float = 10.0, + tanh_squash_distribution: bool = False, + fixed_std: Optional[torch.Tensor] = None, + init_final: Optional[float] = None, + activate_final: bool = False, + device: str = "cuda" + ): + super().__init__() + self.device = torch.device(device) + self.encoder = encoder + self.network = network + self.action_dim = action_dim + self.std_parameterization = std_parameterization + self.std_min = std_min + self.std_max = std_max + self.tanh_squash_distribution = tanh_squash_distribution + self.fixed_std = fixed_std.to(self.device) if fixed_std is not None else None + self.activate_final = activate_final + + # Mean layer + if self.activate_final: + self.mean_layer = nn.Linear(network.net[-3].out_features, action_dim) + else: + self.mean_layer = nn.Linear(network.net[-2].out_features, action_dim) + if init_final is not None: + nn.init.uniform_(self.mean_layer.weight, -init_final, init_final) + nn.init.uniform_(self.mean_layer.bias, -init_final, init_final) + else: + orthogonal_init()(self.mean_layer.weight) + + # Standard deviation layer or parameter + if fixed_std is None: + if std_parameterization == "uniform": + self.log_stds = 
nn.Parameter(torch.zeros(action_dim, device=self.device)) + else: + if self.activate_final: + self.std_layer = nn.Linear(network.net[-3].out_features, action_dim) + else: + self.std_layer = nn.Linear(network.net[-2].out_features, action_dim) + if init_final is not None: + nn.init.uniform_(self.std_layer.weight, -init_final, init_final) + nn.init.uniform_(self.std_layer.bias, -init_final, init_final) + else: + orthogonal_init()(self.std_layer.weight) + + self.to(self.device) + + def forward( + self, + observations: torch.Tensor, + temperature: float = 1.0, + train: bool = False, + non_squash_distribution: bool = False + ) -> torch.distributions.Distribution: + self.train(train) + + # Encode observations if encoder exists + if self.encoder is not None: + with torch.set_grad_enabled(train): + obs_enc = self.encoder(observations, train=train) + else: + obs_enc = observations + # Get network outputs + outputs = self.network(obs_enc) + means = self.mean_layer(outputs) + + # Compute standard deviations + if self.fixed_std is None: + if self.std_parameterization == "exp": + log_stds = self.std_layer(outputs) + stds = torch.exp(log_stds) + elif self.std_parameterization == "softplus": + stds = torch.nn.functional.softplus(self.std_layer(outputs)) + elif self.std_parameterization == "uniform": + stds = torch.exp(self.log_stds).expand_as(means) + else: + raise ValueError( + f"Invalid std_parameterization: {self.std_parameterization}" + ) + else: + assert self.std_parameterization == "fixed" + stds = self.fixed_std.expand_as(means) + + # Clip standard deviations and scale with temperature + temperature = torch.tensor(temperature, device=self.device) + stds = torch.clamp(stds, self.std_min, self.std_max) * torch.sqrt(temperature) + + # Create distribution + if self.tanh_squash_distribution and not non_squash_distribution: + distribution = TanhMultivariateNormalDiag( + loc=means, + scale_diag=stds, + ) + else: + distribution = torch.distributions.Normal( + loc=means, + scale=stds, + ) + + return distribution + + def get_features(self, observations: torch.Tensor) -> torch.Tensor: + """Get encoded features from observations""" + observations = observations.to(self.device) + if self.encoder is not None: + with torch.no_grad(): + return self.encoder(observations, train=False) + return observations + + class SACObservationEncoder(nn.Module): - """Encode image and/or state vector observations.""" + """Encode image and/or state vector observations. + TODO(ke-wang): The original work allows for (1) stacking multiple history frames and (2) using pretrained resnet encoders. + """ def __init__(self, config: SACConfig): - + """ + Creates encoders for pixel and/or state modalities. 
+ """ super().__init__() self.config = config + + if "observation.image" in config.input_shapes: + self.image_enc_layers = nn.Sequential( + nn.Conv2d( + config.input_shapes["observation.image"][0], config.image_encoder_hidden_dim, 7, stride=2 + ), + nn.ReLU(), + nn.Conv2d(config.image_encoder_hidden_dim, config.image_encoder_hidden_dim, 5, stride=2), + nn.ReLU(), + nn.Conv2d(config.image_encoder_hidden_dim, config.image_encoder_hidden_dim, 3, stride=2), + nn.ReLU(), + nn.Conv2d(config.image_encoder_hidden_dim, config.image_encoder_hidden_dim, 3, stride=2), + nn.ReLU(), + ) + dummy_batch = torch.zeros(1, *config.input_shapes["observation.image"]) + with torch.inference_mode(): + out_shape = self.image_enc_layers(dummy_batch).shape[1:] + self.image_enc_layers.extend( + nn.Sequential( + nn.Flatten(), + nn.Linear(np.prod(out_shape), config.latent_dim), + nn.LayerNorm(config.latent_dim), + nn.Tanh(), + ) + ) + if "observation.state" in config.input_shapes: + self.state_enc_layers = nn.Sequential( + nn.Linear(config.input_shapes["observation.state"][0], config.state_encoder_hidden_dim), + nn.ELU(), + nn.Linear(config.state_encoder_hidden_dim, config.latent_dim), + nn.LayerNorm(config.latent_dim), + nn.Tanh(), + ) + if "observation.environment_state" in config.input_shapes: + self.env_state_enc_layers = nn.Sequential( + nn.Linear( + config.input_shapes["observation.environment_state"][0], config.state_encoder_hidden_dim + ), + nn.ELU(), + nn.Linear(config.state_encoder_hidden_dim, config.latent_dim), + nn.LayerNorm(config.latent_dim), + nn.Tanh(), + ) + + def forward(self, obs_dict: dict[str, Tensor]) -> Tensor: + """Encode the image and/or state vector. + + Each modality is encoded into a feature vector of size (latent_dim,) and then a uniform mean is taken + over all features. + """ + feat = [] + # Concatenate all images along the channel dimension. 
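+        # Each image key is encoded separately by the shared image encoder; the
+        # per-modality features are averaged at the end of this method.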
+ image_keys = [k for k in self.config.input_shapes if k.startswith("observation.image")] + for image_key in image_keys: + feat.append(flatten_forward_unflatten(self.image_enc_layers, obs_dict[image_key])) + if "observation.environment_state" in self.config.input_shapes: + feat.append(self.env_state_enc_layers(obs_dict["observation.environment_state"])) + if "observation.state" in self.config.input_shapes: + feat.append(self.state_enc_layers(obs_dict["observation.state"])) + return torch.stack(feat, dim=0).mean(0) + + +class LagrangeMultiplier(nn.Module): + def __init__( + self, + init_value: float = 1.0, + constraint_shape: Sequence[int] = (), + device: str = "cuda" + ): + super().__init__() + self.device = torch.device(device) + init_value = torch.log(torch.exp(torch.tensor(init_value, device=self.device)) - 1) + + # Initialize the Lagrange multiplier as a parameter + self.lagrange = nn.Parameter( + torch.full(constraint_shape, init_value, dtype=torch.float32, device=self.device) + ) + + self.to(self.device) + + def forward( + self, + lhs: Optional[torch.Tensor] = None, + rhs: Optional[torch.Tensor] = None + ) -> torch.Tensor: + # Get the multiplier value based on parameterization + multiplier = torch.nn.functional.softplus(self.lagrange) + + # Return the raw multiplier if no constraint values provided + if lhs is None: + return multiplier + + # Move inputs to device + lhs = lhs.to(self.device) + if rhs is not None: + rhs = rhs.to(self.device) + + # Use the multiplier to compute the Lagrange penalty + if rhs is None: + rhs = torch.zeros_like(lhs, device=self.device) + + diff = lhs - rhs + + assert diff.shape == multiplier.shape, f"Shape mismatch: {diff.shape} vs {multiplier.shape}" + + return multiplier * diff + + +# The TanhMultivariateNormalDiag is a probability distribution that represents a transformed normal (Gaussian) distribution where: +# 1. The base distribution is a diagonal multivariate normal distribution +# 2. The samples from this normal distribution are transformed through a tanh function, which squashes the values to be between -1 and 1 +# 3. 
Optionally, the values can be further transformed to fit within arbitrary bounds [low, high] using an affine transformation +# This type of distribution is commonly used in reinforcement learning, particularly for continuous action spaces +class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): + def __init__( + self, + loc: torch.Tensor, + scale_diag: torch.Tensor, + low: Optional[torch.Tensor] = None, + high: Optional[torch.Tensor] = None, + ): + # Create base normal distribution + base_distribution = torch.distributions.Normal(loc=loc, scale=scale_diag) + + # Create list of transforms + transforms = [] + + # Add tanh transform + transforms.append(torch.distributions.transforms.TanhTransform()) + + # Add rescaling transform if bounds are provided + if low is not None and high is not None: + transforms.append( + torch.distributions.transforms.AffineTransform( + loc=(high + low) / 2, + scale=(high - low) / 2 + ) + ) + + # Initialize parent class + super().__init__( + base_distribution=base_distribution, + transforms=transforms + ) + + # Store parameters + self.loc = loc + self.scale_diag = scale_diag + self.low = low + self.high = high + + def mode(self) -> torch.Tensor: + """Get the mode of the transformed distribution""" + # The mode of a normal distribution is its mean + mode = self.loc + + # Apply transforms + for transform in self.transforms: + mode = transform(mode) + + return mode + + def rsample(self, sample_shape=torch.Size()) -> torch.Tensor: + """ + Reparameterized sample from the distribution + """ + # Sample from base distribution + x = self.base_dist.rsample(sample_shape) + + # Apply transforms + for transform in self.transforms: + x = transform(x) + + return x + + def log_prob(self, value: torch.Tensor) -> torch.Tensor: + """ + Compute log probability of a value + Includes the log det jacobian for the transforms + """ + # Initialize log prob + log_prob = torch.zeros_like(value[..., 0]) + + # Inverse transforms to get back to normal distribution + q = value + for transform in reversed(self.transforms): + q = transform.inv(q) + log_prob = log_prob - transform.log_abs_det_jacobian(q, transform(q)) + + # Add base distribution log prob + log_prob = log_prob + self.base_dist.log_prob(q).sum(-1) + + return log_prob + + def sample_and_log_prob(self, sample_shape=torch.Size()) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Sample from the distribution and compute log probability + """ + x = self.rsample(sample_shape) + log_prob = self.log_prob(x) + return x, log_prob + + def entropy(self) -> torch.Tensor: + """ + Compute entropy of the distribution + """ + # Start with base distribution entropy + entropy = self.base_dist.entropy().sum(-1) + + # Add log det jacobian for each transform + x = self.rsample() + for transform in self.transforms: + entropy = entropy + transform.log_abs_det_jacobian(x, transform(x)) + x = transform(x) + + return entropy + + +def create_critic_ensemble(critic_class, num_critics: int, device: str = "cuda") -> nn.ModuleList: + """Creates an ensemble of critic networks""" + critics = nn.ModuleList([critic_class() for _ in range(num_critics)]) + return critics.to(device) + + +def orthogonal_init(): + return lambda x: torch.nn.init.orthogonal_(x, gain=1.0) + + +# borrowed from tdmpc +def flatten_forward_unflatten(fn: Callable[[Tensor], Tensor], image_tensor: Tensor) -> Tensor: + """Helper to temporarily flatten extra dims at the start of the image tensor. + + Args: + fn: Callable that the image tensor will be passed to. 
It should accept (B, C, H, W) and return + (B, *), where * is any number of dimensions. + image_tensor: An image tensor of shape (**, C, H, W), where ** is any number of dimensions and + can be more than 1 dimensions, generally different from *. + Returns: + A return value from the callable reshaped to (**, *). + """ + if image_tensor.ndim == 4: + return fn(image_tensor) + start_dims = image_tensor.shape[:-3] + inp = torch.flatten(image_tensor, end_dim=-4) + flat_out = fn(inp) + return torch.reshape(flat_out, (*start_dims, *flat_out.shape[1:])) + From 7e0f20fbf285418a78b8619107371f2f0a6c7fd1 Mon Sep 17 00:00:00 2001 From: KeWang1017 Date: Tue, 17 Dec 2024 15:58:04 +0000 Subject: [PATCH 014/112] Enhance SAC configuration and policy with new parameters and subsampling logic - Added `num_subsample_critics`, `critic_target_update_weight`, and `utd_ratio` to SACConfig. - Implemented target entropy calculation in SACPolicy if not provided. - Introduced subsampling of critics to prevent overfitting during updates. - Updated temperature loss calculation to use the new target entropy. - Added comments for future UTD update implementation. These changes improve the flexibility and performance of the SAC implementation. --- .../common/policies/sac/configuration_sac.py | 3 +++ lerobot/common/policies/sac/modeling_sac.py | 21 +++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/lerobot/common/policies/sac/configuration_sac.py b/lerobot/common/policies/sac/configuration_sac.py index 441b3566..d324462e 100644 --- a/lerobot/common/policies/sac/configuration_sac.py +++ b/lerobot/common/policies/sac/configuration_sac.py @@ -23,8 +23,11 @@ class SACConfig: discount = 0.99 temperature_init = 1.0 num_critics = 2 + num_subsample_critics = None critic_lr = 3e-4 actor_lr = 3e-4 + critic_target_update_weight = 0.005 + utd_ratio = 2 critic_network_kwargs = { "hidden_dims": [256, 256], "activate_final": True, diff --git a/lerobot/common/policies/sac/modeling_sac.py b/lerobot/common/policies/sac/modeling_sac.py index 9ea9449d..7d451b4e 100644 --- a/lerobot/common/policies/sac/modeling_sac.py +++ b/lerobot/common/policies/sac/modeling_sac.py @@ -85,7 +85,8 @@ class SACPolicy( action_dim=config.output_shapes["action"][0], **config.policy_kwargs ) - + if config.target_entropy is None: + config.target_entropy = -np.prod(config.output_shapes["action"][0]) # (-dim(A)) self.temperature = LagrangeMultiplier(init_value=config.temperature_init) def reset(self): @@ -127,7 +128,6 @@ class SACPolicy( # perform image augmentation # reward bias - # from HIL-SERL code base # add_or_replace={"rewards": batch["rewards"] + self.config["reward_bias"]} in reward_batch @@ -136,11 +136,16 @@ class SACPolicy( action_preds, log_probs = self.actor_network(observations) # 2- compute q targets q_targets = self.target_qs(next_observations, action_preds) + # subsample critics to prevent overfitting if use high UTD (update to date) + if self.config.num_subsample_critics is not None: + indices = torch.randperm(self.config.num_critics) + indices = indices[:self.config.num_subsample_critics] + q_targets = q_targets[indices] # critics subsample size min_q = q_targets.min(dim=0) - # backup entropy + # compute td target td_target = rewards + self.discount * min_q # 3- compute predicted qs @@ -182,7 +187,10 @@ class SACPolicy( # calculate temperature loss # 1- calculate entropy entropy = -log_probs.mean() - temperature_loss = temperature * (entropy - self.target_entropy).mean() + temperature_loss = self.temp( + lhs=entropy, 
+ rhs=self.config.target_entropy + ) loss = critics_loss + actor_loss + temperature_loss @@ -198,6 +206,11 @@ class SACPolicy( def update(self): self.critic_target.lerp_(self.critic_ensemble, self.config.critic_target_update_weight) + # TODO: implement UTD update + #for critic_step in range(self.config.utd_ratio - 1): + # only update critic and critic target + # Then update critic, critic target, actor and temperature + #for target_param, param in zip(self.critic_target.parameters(), self.critic_ensemble.parameters()): # target_param.data.copy_(target_param.data * (1.0 - self.config.critic_target_update_weight) + param.data * self.critic_target_update_weight) From 7b68bfb73b61f6fb90cac3d46a724274a0f184c7 Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Tue, 17 Dec 2024 18:03:46 +0100 Subject: [PATCH 015/112] added comments from kewang --- lerobot/common/policies/sac/modeling_sac.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lerobot/common/policies/sac/modeling_sac.py b/lerobot/common/policies/sac/modeling_sac.py index 7d451b4e..de8283de 100644 --- a/lerobot/common/policies/sac/modeling_sac.py +++ b/lerobot/common/policies/sac/modeling_sac.py @@ -128,6 +128,7 @@ class SACPolicy( # perform image augmentation # reward bias + # from HIL-SERL code base # add_or_replace={"rewards": batch["rewards"] + self.config["reward_bias"]} in reward_batch @@ -207,6 +208,7 @@ class SACPolicy( def update(self): self.critic_target.lerp_(self.critic_ensemble, self.config.critic_target_update_weight) # TODO: implement UTD update + # First update only critics for utd_ratio-1 times #for critic_step in range(self.config.utd_ratio - 1): # only update critic and critic target # Then update critic, critic target, actor and temperature From 70b652f791b515ea325692439615d366f3712dce Mon Sep 17 00:00:00 2001 From: Eugene Mironov Date: Mon, 23 Dec 2024 16:43:55 +0700 Subject: [PATCH 016/112] [Port Hil-SERL] Add unit tests for the reward classifier & fix imports & check script (#578) --- .../classifier/configuration_classifier.py | 2 +- .../hilserl/classifier/modeling_classifier.py | 8 + poetry.lock | 153 ++++++++++- pyproject.toml | 3 + tests/conftest.py | 13 + .../check_hiserl_reward_classifier.py | 244 ++++++++++++++++++ .../classifier/test_modelling_classifier.py | 78 ++++++ 7 files changed, 499 insertions(+), 2 deletions(-) create mode 100644 tests/policies/hilserl/classifier/check_hiserl_reward_classifier.py create mode 100644 tests/policies/hilserl/classifier/test_modelling_classifier.py diff --git a/lerobot/common/policies/hilserl/classifier/configuration_classifier.py b/lerobot/common/policies/hilserl/classifier/configuration_classifier.py index 209ff659..553e4262 100644 --- a/lerobot/common/policies/hilserl/classifier/configuration_classifier.py +++ b/lerobot/common/policies/hilserl/classifier/configuration_classifier.py @@ -13,7 +13,7 @@ class ClassifierConfig: hidden_dim: int = 256 dropout_rate: float = 0.1 model_name: str = "microsoft/resnet-50" - device: str = "cuda" if torch.cuda.is_available() else "mps" + device: str = "cpu" model_type: str = "cnn" # "transformer" or "cnn" def save_pretrained(self, save_dir): diff --git a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py index dbb434a7..0b8d66ac 100644 --- a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py +++ b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py @@ -22,6 +22,11 @@ class ClassifierOutput: self.probabilities = 
probabilities self.hidden_states = hidden_states + def __repr__(self): + return (f"ClassifierOutput(logits={self.logits}, " + f"probabilities={self.probabilities}, " + f"hidden_states={self.hidden_states})") + class Classifier( nn.Module, @@ -69,6 +74,8 @@ class Classifier( self.feature_dim = self.encoder.config.hidden_sizes[-1] # Last channel dimension else: raise ValueError("Unsupported CNN architecture") + + self.encoder = self.encoder.to(self.config.device) def _freeze_encoder(self) -> None: """Freeze the encoder parameters.""" @@ -93,6 +100,7 @@ class Classifier( nn.ReLU(), nn.Linear(self.config.hidden_dim, 1 if self.config.num_classes == 2 else self.config.num_classes), ) + self.classifier_head = self.classifier_head.to(self.config.device) def _get_encoder_output(self, x: torch.Tensor) -> torch.Tensor: """Extract the appropriate output from the encoder.""" diff --git a/poetry.lock b/poetry.lock index 8799e67c..919edd18 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3139,6 +3139,27 @@ dev = ["changelist (==0.5)"] lint = ["pre-commit (==3.7.0)"] test = ["pytest (>=7.4)", "pytest-cov (>=4.1)"] +[[package]] +name = "lightning-utilities" +version = "0.11.9" +description = "Lightning toolbox for across the our ecosystem." +optional = true +python-versions = ">=3.8" +files = [ + {file = "lightning_utilities-0.11.9-py3-none-any.whl", hash = "sha256:ac6d4e9e28faf3ff4be997876750fee10dc604753dbc429bf3848a95c5d7e0d2"}, + {file = "lightning_utilities-0.11.9.tar.gz", hash = "sha256:f5052b81344cc2684aa9afd74b7ce8819a8f49a858184ec04548a5a109dfd053"}, +] + +[package.dependencies] +packaging = ">=17.1" +setuptools = "*" +typing-extensions = "*" + +[package.extras] +cli = ["fire"] +docs = ["requests (>=2.0.0)"] +typing = ["mypy (>=1.0.0)", "types-setuptools"] + [[package]] name = "llvmlite" version = "0.43.0" @@ -6798,6 +6819,38 @@ webencodings = ">=0.4" doc = ["sphinx", "sphinx_rtd_theme"] test = ["pytest", "ruff"] +[[package]] +name = "tokenizers" +version = "0.21.0" +description = "" +optional = true +python-versions = ">=3.7" +files = [ + {file = "tokenizers-0.21.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3c4c93eae637e7d2aaae3d376f06085164e1660f89304c0ab2b1d08a406636b2"}, + {file = "tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:f53ea537c925422a2e0e92a24cce96f6bc5046bbef24a1652a5edc8ba975f62e"}, + {file = "tokenizers-0.21.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b177fb54c4702ef611de0c069d9169f0004233890e0c4c5bd5508ae05abf193"}, + {file = "tokenizers-0.21.0-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6b43779a269f4629bebb114e19c3fca0223296ae9fea8bb9a7a6c6fb0657ff8e"}, + {file = "tokenizers-0.21.0-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9aeb255802be90acfd363626753fda0064a8df06031012fe7d52fd9a905eb00e"}, + {file = "tokenizers-0.21.0-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d8b09dbeb7a8d73ee204a70f94fc06ea0f17dcf0844f16102b9f414f0b7463ba"}, + {file = "tokenizers-0.21.0-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:400832c0904f77ce87c40f1a8a27493071282f785724ae62144324f171377273"}, + {file = "tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e84ca973b3a96894d1707e189c14a774b701596d579ffc7e69debfc036a61a04"}, + {file = "tokenizers-0.21.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:eb7202d231b273c34ec67767378cd04c767e967fda12d4a9e36208a34e2f137e"}, + {file = 
"tokenizers-0.21.0-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:089d56db6782a73a27fd8abf3ba21779f5b85d4a9f35e3b493c7bbcbbf0d539b"}, + {file = "tokenizers-0.21.0-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:c87ca3dc48b9b1222d984b6b7490355a6fdb411a2d810f6f05977258400ddb74"}, + {file = "tokenizers-0.21.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:4145505a973116f91bc3ac45988a92e618a6f83eb458f49ea0790df94ee243ff"}, + {file = "tokenizers-0.21.0-cp39-abi3-win32.whl", hash = "sha256:eb1702c2f27d25d9dd5b389cc1f2f51813e99f8ca30d9e25348db6585a97e24a"}, + {file = "tokenizers-0.21.0-cp39-abi3-win_amd64.whl", hash = "sha256:87841da5a25a3a5f70c102de371db120f41873b854ba65e52bccd57df5a3780c"}, + {file = "tokenizers-0.21.0.tar.gz", hash = "sha256:ee0894bf311b75b0c03079f33859ae4b2334d675d4e93f5a4132e1eae2834fe4"}, +] + +[package.dependencies] +huggingface-hub = ">=0.16.4,<1.0" + +[package.extras] +dev = ["tokenizers[testing]"] +docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"] +testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"] + [[package]] name = "tomli" version = "2.0.2" @@ -6863,6 +6916,34 @@ typing-extensions = ">=4.8.0" opt-einsum = ["opt-einsum (>=3.3)"] optree = ["optree (>=0.11.0)"] +[[package]] +name = "torchmetrics" +version = "1.6.0" +description = "PyTorch native Metrics" +optional = true +python-versions = ">=3.9" +files = [ + {file = "torchmetrics-1.6.0-py3-none-any.whl", hash = "sha256:a508cdd87766cedaaf55a419812bf9f493aff8fffc02cc19df5a8e2e7ccb942a"}, + {file = "torchmetrics-1.6.0.tar.gz", hash = "sha256:aebba248708fb90def20cccba6f55bddd134a58de43fb22b0c5ca0f3a89fa984"}, +] + +[package.dependencies] +lightning-utilities = ">=0.8.0" +numpy = ">1.20.0" +packaging = ">17.1" +torch = ">=2.0.0" + +[package.extras] +all = ["SciencePlots (>=2.0.0)", "gammatone (>=1.0.0)", "ipadic (>=1.0.0)", "librosa (>=0.10.0)", "matplotlib (>=3.6.0)", "mecab-python3 (>=1.0.6)", "mypy (==1.13.0)", "nltk (>3.8.1)", "numpy (<2.0)", "onnxruntime (>=1.12.0)", "pesq (>=0.0.4)", "piq (<=0.8.0)", "pycocotools (>2.0.0)", "pystoi (>=0.4.0)", "regex (>=2021.9.24)", "requests (>=2.19.0)", "scipy (>1.0.0)", "sentencepiece (>=0.2.0)", "torch (==2.5.1)", "torch-fidelity (<=0.4.0)", "torchaudio (>=2.0.1)", "torchvision (>=0.15.1)", "tqdm (<4.68.0)", "transformers (>4.4.0)", "transformers (>=4.42.3)", "types-PyYAML", "types-emoji", "types-protobuf", "types-requests", "types-setuptools", "types-six", "types-tabulate"] +audio = ["gammatone (>=1.0.0)", "librosa (>=0.10.0)", "numpy (<2.0)", "onnxruntime (>=1.12.0)", "pesq (>=0.0.4)", "pystoi (>=0.4.0)", "requests (>=2.19.0)", "torchaudio (>=2.0.1)"] +detection = ["pycocotools (>2.0.0)", "torchvision (>=0.15.1)"] +dev = ["PyTDC (==0.4.1)", "SciencePlots (>=2.0.0)", "bert-score (==0.3.13)", "dython (==0.7.6)", "dython (>=0.7.8,<0.8.0)", "fairlearn", "fast-bss-eval (>=0.1.0)", "faster-coco-eval (>=1.6.3)", "gammatone (>=1.0.0)", "huggingface-hub (<0.27)", "ipadic (>=1.0.0)", "jiwer (>=2.3.0)", "kornia (>=0.6.7)", "librosa (>=0.10.0)", "lpips (<=0.1.4)", "matplotlib (>=3.6.0)", "mecab-ko (>=1.0.0,<1.1.0)", "mecab-ko-dic (>=1.0.0)", "mecab-python3 (>=1.0.6)", "mir-eval (>=0.6)", "monai (==1.3.2)", "monai (==1.4.0)", "mypy (==1.13.0)", "netcal (>1.0.0)", "nltk (>3.8.1)", "numpy (<2.0)", "numpy (<2.2.0)", "onnxruntime (>=1.12.0)", "pandas (>1.4.0)", "permetrics (==2.0.0)", "pesq (>=0.0.4)", "piq (<=0.8.0)", "pycocotools (>2.0.0)", "pystoi (>=0.4.0)", "pytorch-msssim (==1.0.0)", "regex (>=2021.9.24)", "requests (>=2.19.0)", 
"rouge-score (>0.1.0)", "sacrebleu (>=2.3.0)", "scikit-image (>=0.19.0)", "scipy (>1.0.0)", "sentencepiece (>=0.2.0)", "sewar (>=0.4.4)", "statsmodels (>0.13.5)", "torch (==2.5.1)", "torch-complex (<0.5.0)", "torch-fidelity (<=0.4.0)", "torchaudio (>=2.0.1)", "torchvision (>=0.15.1)", "tqdm (<4.68.0)", "transformers (>4.4.0)", "transformers (>=4.42.3)", "types-PyYAML", "types-emoji", "types-protobuf", "types-requests", "types-setuptools", "types-six", "types-tabulate"] +image = ["scipy (>1.0.0)", "torch-fidelity (<=0.4.0)", "torchvision (>=0.15.1)"] +multimodal = ["piq (<=0.8.0)", "transformers (>=4.42.3)"] +text = ["ipadic (>=1.0.0)", "mecab-python3 (>=1.0.6)", "nltk (>3.8.1)", "regex (>=2021.9.24)", "sentencepiece (>=0.2.0)", "tqdm (<4.68.0)", "transformers (>4.4.0)"] +typing = ["mypy (==1.13.0)", "torch (==2.5.1)", "types-PyYAML", "types-emoji", "types-protobuf", "types-requests", "types-setuptools", "types-six", "types-tabulate"] +visual = ["SciencePlots (>=2.0.0)", "matplotlib (>=3.6.0)"] + [[package]] name = "torchvision" version = "0.19.1" @@ -6956,6 +7037,75 @@ files = [ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.2)", "pytest-mock", "pytest-mypy-testing"] +[[package]] +name = "transformers" +version = "4.47.0" +description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" +optional = true +python-versions = ">=3.9.0" +files = [ + {file = "transformers-4.47.0-py3-none-any.whl", hash = "sha256:a8e1bafdaae69abdda3cad638fe392e37c86d2ce0ecfcae11d60abb8f949ff4d"}, + {file = "transformers-4.47.0.tar.gz", hash = "sha256:f8ead7a5a4f6937bb507e66508e5e002dc5930f7b6122a9259c37b099d0f3b19"}, +] + +[package.dependencies] +filelock = "*" +huggingface-hub = ">=0.24.0,<1.0" +numpy = ">=1.17" +packaging = ">=20.0" +pyyaml = ">=5.1" +regex = "!=2019.12.17" +requests = "*" +safetensors = ">=0.4.1" +tokenizers = ">=0.21,<0.22" +tqdm = ">=4.27" + +[package.extras] +accelerate = ["accelerate (>=0.26.0)"] +agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch"] +all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch", "torchaudio", "torchvision"] +audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +benchmark = ["optimum-benchmark (>=0.3.0)"] +codecarbon = ["codecarbon (==1.2.0)"] +deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", 
"timeout-decorator"] +dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"] +flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +ftfy = ["ftfy"] +integrations = ["optuna", "ray[tune] (>=2.7.0)", "sigopt"] +ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp 
(>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] +modelcreation = ["cookiecutter (==1.7.3)"] +natten = ["natten (>=0.14.6,<0.15.0)"] +onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"] +onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"] +optuna = ["optuna"] +quality = ["GitPython (<3.1.19)", "datasets (!=2.5.0)", "isort (>=5.5.4)", "libcst", "rich", "ruff (==0.5.1)", "urllib3 (<2.0.0)"] +ray = ["ray[tune] (>=2.7.0)"] +retrieval = ["datasets (!=2.5.0)", "faiss-cpu"] +ruff = ["ruff (==0.5.1)"] +sagemaker = ["sagemaker (>=2.31.0)"] +sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] +serving = ["fastapi", "pydantic", "starlette", "uvicorn"] +sigopt = ["sigopt"] +sklearn = ["scikit-learn"] +speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] +testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] +tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"] +tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +tiktoken = ["blobfile", "tiktoken"] +timm = ["timm (<=1.0.11)"] +tokenizers = ["tokenizers (>=0.21,<0.22)"] +torch = ["accelerate (>=0.26.0)", "torch"] +torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] +torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] +torchhub = ["filelock", "huggingface-hub (>=0.24.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch", "tqdm (>=4.27)"] +video = ["av (==9.2.0)"] +vision = ["Pillow (>=10.0.1,<=15.0)"] + [[package]] name = "transforms3d" version = "0.4.2" @@ -7558,6 +7708,7 @@ dev = ["debugpy", "pre-commit"] dora = ["gym-dora"] dynamixel = ["dynamixel-sdk", "pynput"] feetech = ["feetech-servo-sdk", "pynput"] +hilserl = ["torchmetrics", "transformers"] intelrealsense = ["pyrealsense2"] pusht = ["gym-pusht"] stretch = ["hello-robot-stretch-body", "pynput", "pyrealsense2", "pyrender"] @@ -7569,4 +7720,4 @@ xarm = ["gym-xarm"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "41344f0eb2d06d9a378abcd10df8205aa3926ff0a08ac5ab1a0b1bcae7440fd8" +content-hash = "b9d299916ced6af1d243f961a32b0a4aacbef18e0b95337a5224e8511f5d6dda" diff --git a/pyproject.toml b/pyproject.toml index 59c2de8b..738903bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,8 @@ pyrender = {git = "https://github.com/mmatl/pyrender.git", markers = "sys_platfo hello-robot-stretch-body = {version = ">=0.7.27", markers = "sys_platform == 'linux'", optional = true} pyserial = {version = ">=3.5", optional = true} jsonlines = ">=4.0.0" +transformers = {version = "^4.47.0", optional = true} +torchmetrics 
= {version = "^1.6.0", optional = true} [tool.poetry.extras] @@ -86,6 +88,7 @@ dynamixel = ["dynamixel-sdk", "pynput"] feetech = ["feetech-servo-sdk", "pynput"] intelrealsense = ["pyrealsense2"] stretch = ["hello-robot-stretch-body", "pyrender", "pyrealsense2", "pynput"] +hilserl = ["transformers", "torchmetrics"] [tool.ruff] line-length = 110 diff --git a/tests/conftest.py b/tests/conftest.py index 2075c2aa..adf050aa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,9 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import random import traceback import pytest +import torch from serial import SerialException from lerobot import available_cameras, available_motors, available_robots @@ -124,3 +126,14 @@ def patch_builtins_input(monkeypatch): print(text) monkeypatch.setattr("builtins.input", print_text) + + +def pytest_addoption(parser): + parser.addoption("--seed", action="store", default="42", help="Set random seed for reproducibility") + + +@pytest.fixture(autouse=True) +def set_random_seed(request): + seed = int(request.config.getoption("--seed")) + random.seed(seed) # Python random + torch.manual_seed(seed) # PyTorch diff --git a/tests/policies/hilserl/classifier/check_hiserl_reward_classifier.py b/tests/policies/hilserl/classifier/check_hiserl_reward_classifier.py new file mode 100644 index 00000000..55e6e381 --- /dev/null +++ b/tests/policies/hilserl/classifier/check_hiserl_reward_classifier.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
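+
+# Sanity-check script (separate from the pytest unit tests): it trains the reward
+# classifier on CIFAR-10 to verify that optimization converges end to end, once as
+# a 10-class problem (resnet-18 backbone) and once as a binary one-vs-rest problem
+# (resnet-50 backbone), and reports accuracy, precision, recall, F1 and AUROC via
+# torchmetrics.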
+ +import logging + +import numpy as np +import torch +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss +from torch.optim import Adam +from torch.utils.data import DataLoader +from torchmetrics import AUROC, Accuracy, F1Score, Precision, Recall +from torchvision.datasets import CIFAR10 +from torchvision.transforms import ToTensor + +from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier, ClassifierConfig + +BATCH_SIZE = 1000 +LR = 0.1 +EPOCH_NUM = 2 + +if torch.cuda.is_available(): + DEVICE = torch.device("cuda") +elif torch.backends.mps.is_available(): + DEVICE = torch.device("mps") +else: + DEVICE = torch.device("cpu") + + +def train_evaluate_multiclass_classifier(): + logging.info( + f"Start multiclass classifier train eval with {DEVICE} device, batch size {BATCH_SIZE}, learning rate {LR}" + ) + multiclass_config = ClassifierConfig(model_name="microsoft/resnet-18", device=DEVICE, num_classes=10) + multiclass_classifier = Classifier(multiclass_config) + + trainset = CIFAR10(root="data", train=True, download=True, transform=ToTensor()) + testset = CIFAR10(root="data", train=False, download=True, transform=ToTensor()) + + trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True) + testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False) + + multiclass_num_classes = 10 + epoch = 1 + + criterion = CrossEntropyLoss() + optimizer = Adam(multiclass_classifier.parameters(), lr=LR) + + multiclass_classifier.train() + + logging.info("Start multiclass classifier training") + + # Training loop + while epoch < EPOCH_NUM: # loop over the dataset multiple times + for i, data in enumerate(trainloader): + inputs, labels = data + inputs, labels = inputs.to(DEVICE), labels.to(DEVICE) + + # Zero the parameter gradients + optimizer.zero_grad() + + # Forward pass + outputs = multiclass_classifier(inputs) + + loss = criterion(outputs.logits, labels) + loss.backward() + optimizer.step() + + if i % 10 == 0: # print every 10 mini-batches + logging.info(f"[Epoch {epoch}, Batch {i}] loss: {loss.item():.3f}") + + epoch += 1 + + print("Multiclass classifier training finished") + + multiclass_classifier.eval() + + test_loss = 0.0 + test_labels = [] + test_pridections = [] + test_probs = [] + + with torch.no_grad(): + for data in testloader: + images, labels = data + images, labels = images.to(DEVICE), labels.to(DEVICE) + outputs = multiclass_classifier(images) + loss = criterion(outputs.logits, labels) + test_loss += loss.item() * BATCH_SIZE + + _, predicted = torch.max(outputs.logits, 1) + test_labels.extend(labels.cpu()) + test_pridections.extend(predicted.cpu()) + test_probs.extend(outputs.probabilities.cpu()) + + test_loss = test_loss / len(testset) + + logging.info(f"Multiclass classifier test loss {test_loss:.3f}") + + test_labels = torch.stack(test_labels) + test_predictions = torch.stack(test_pridections) + test_probs = torch.stack(test_probs) + + accuracy = Accuracy(task="multiclass", num_classes=multiclass_num_classes) + precision = Precision(task="multiclass", average="weighted", num_classes=multiclass_num_classes) + recall = Recall(task="multiclass", average="weighted", num_classes=multiclass_num_classes) + f1 = F1Score(task="multiclass", average="weighted", num_classes=multiclass_num_classes) + auroc = AUROC(task="multiclass", num_classes=multiclass_num_classes, average="weighted") + + # Calculate metrics + acc = accuracy(test_predictions, test_labels) + prec = precision(test_predictions, test_labels) + rec = recall(test_predictions, 
test_labels) + f1_score = f1(test_predictions, test_labels) + auroc_score = auroc(test_probs, test_labels) + + logging.info(f"Accuracy: {acc:.2f}") + logging.info(f"Precision: {prec:.2f}") + logging.info(f"Recall: {rec:.2f}") + logging.info(f"F1 Score: {f1_score:.2f}") + logging.info(f"AUROC Score: {auroc_score:.2f}") + + +def train_evaluate_binary_classifier(): + logging.info( + f"Start binary classifier train eval with {DEVICE} device, batch size {BATCH_SIZE}, learning rate {LR}" + ) + + target_binary_class = 3 + + def one_vs_rest(dataset, target_class): + new_targets = [] + for _, label in dataset: + new_label = float(1.0) if label == target_class else float(0.0) + new_targets.append(new_label) + + dataset.targets = new_targets # Replace the original labels with the binary ones + return dataset + + binary_train_dataset = CIFAR10(root="data", train=True, download=True, transform=ToTensor()) + binary_test_dataset = CIFAR10(root="data", train=False, download=True, transform=ToTensor()) + + # Apply one-vs-rest labeling + binary_train_dataset = one_vs_rest(binary_train_dataset, target_binary_class) + binary_test_dataset = one_vs_rest(binary_test_dataset, target_binary_class) + + binary_trainloader = DataLoader(binary_train_dataset, batch_size=BATCH_SIZE, shuffle=True) + binary_testloader = DataLoader(binary_test_dataset, batch_size=BATCH_SIZE, shuffle=False) + + binary_epoch = 1 + + binary_config = ClassifierConfig(model_name="microsoft/resnet-50", device=DEVICE) + binary_classifier = Classifier(binary_config) + + class_counts = np.bincount(binary_train_dataset.targets) + n = len(binary_train_dataset) + w0 = n / (2.0 * class_counts[0]) + w1 = n / (2.0 * class_counts[1]) + + binary_criterion = BCEWithLogitsLoss(pos_weight=torch.tensor(w1 / w0)) + binary_optimizer = Adam(binary_classifier.parameters(), lr=LR) + + binary_classifier.train() + + logging.info("Start binary classifier training") + + # Training loop + while binary_epoch < EPOCH_NUM: # loop over the dataset multiple times + for i, data in enumerate(binary_trainloader): + inputs, labels = data + inputs, labels = inputs.to(DEVICE), labels.to(torch.float32).to(DEVICE) + + # Zero the parameter gradients + binary_optimizer.zero_grad() + + # Forward pass + outputs = binary_classifier(inputs) + loss = binary_criterion(outputs.logits, labels) + loss.backward() + binary_optimizer.step() + + if i % 10 == 0: # print every 10 mini-batches + print(f"[Epoch {binary_epoch}, Batch {i}] loss: {loss.item():.3f}") + binary_epoch += 1 + + logging.info("Binary classifier training finished") + logging.info("Start binary classifier evaluation") + + binary_classifier.eval() + + test_loss = 0.0 + test_labels = [] + test_pridections = [] + test_probs = [] + + with torch.no_grad(): + for data in binary_testloader: + images, labels = data + images, labels = images.to(DEVICE), labels.to(torch.float32).to(DEVICE) + outputs = binary_classifier(images) + loss = binary_criterion(outputs.logits, labels) + test_loss += loss.item() * BATCH_SIZE + + test_labels.extend(labels.cpu()) + test_pridections.extend(outputs.logits.cpu()) + test_probs.extend(outputs.probabilities.cpu()) + + test_loss = test_loss / len(binary_test_dataset) + + logging.info(f"Binary classifier test loss {test_loss:.3f}") + + test_labels = torch.stack(test_labels) + test_predictions = torch.stack(test_pridections) + test_probs = torch.stack(test_probs) + + # Calculate metrics + acc = Accuracy(task="binary")(test_predictions, test_labels) + prec = Precision(task="binary", 
average="weighted")(test_predictions, test_labels) + rec = Recall(task="binary", average="weighted")(test_predictions, test_labels) + f1_score = F1Score(task="binary", average="weighted")(test_predictions, test_labels) + auroc_score = AUROC(task="binary", average="weighted")(test_probs, test_labels) + + logging.info(f"Accuracy: {acc:.2f}") + logging.info(f"Precision: {prec:.2f}") + logging.info(f"Recall: {rec:.2f}") + logging.info(f"F1 Score: {f1_score:.2f}") + logging.info(f"AUROC Score: {auroc_score:.2f}") + + +if __name__ == "__main__": + train_evaluate_multiclass_classifier() + train_evaluate_binary_classifier() diff --git a/tests/policies/hilserl/classifier/test_modelling_classifier.py b/tests/policies/hilserl/classifier/test_modelling_classifier.py new file mode 100644 index 00000000..014165eb --- /dev/null +++ b/tests/policies/hilserl/classifier/test_modelling_classifier.py @@ -0,0 +1,78 @@ +import torch + +from lerobot.common.policies.hilserl.classifier.modeling_classifier import ( + Classifier, + ClassifierConfig, + ClassifierOutput, +) +from tests.utils import require_package + + +def test_classifier_output(): + output = ClassifierOutput( + logits=torch.tensor([1, 2, 3]), probabilities=torch.tensor([0.1, 0.2, 0.3]), hidden_states=None + ) + + assert ( + f"{output}" + == "ClassifierOutput(logits=tensor([1, 2, 3]), probabilities=tensor([0.1000, 0.2000, 0.3000]), hidden_states=None)" + ) + + +@require_package("transformers") +def test_binary_classifier_with_default_params(): + config = ClassifierConfig() + classifier = Classifier(config) + + batch_size = 10 + + input = torch.rand(batch_size, 3, 224, 224) + output = classifier(input) + + assert output is not None + assert output.logits.shape == torch.Size([batch_size]) + assert not torch.isnan(output.logits).any(), "Tensor contains NaN values" + assert output.probabilities.shape == torch.Size([batch_size]) + assert not torch.isnan(output.probabilities).any(), "Tensor contains NaN values" + assert output.hidden_states.shape == torch.Size([batch_size, 2048]) + assert not torch.isnan(output.hidden_states).any(), "Tensor contains NaN values" + + +@require_package("transformers") +def test_multiclass_classifier(): + num_classes = 5 + config = ClassifierConfig(num_classes=num_classes) + classifier = Classifier(config) + + batch_size = 10 + + input = torch.rand(batch_size, 3, 224, 224) + output = classifier(input) + + assert output is not None + assert output.logits.shape == torch.Size([batch_size, num_classes]) + assert not torch.isnan(output.logits).any(), "Tensor contains NaN values" + assert output.probabilities.shape == torch.Size([batch_size, num_classes]) + assert not torch.isnan(output.probabilities).any(), "Tensor contains NaN values" + assert output.hidden_states.shape == torch.Size([batch_size, 2048]) + assert not torch.isnan(output.hidden_states).any(), "Tensor contains NaN values" + + +@require_package("transformers") +def test_default_device(): + config = ClassifierConfig() + assert config.device == "cpu" + + classifier = Classifier(config) + for p in classifier.parameters(): + assert p.device == torch.device("cpu") + + +@require_package("transformers") +def test_explicit_device_setup(): + config = ClassifierConfig(device="meta") + assert config.device == "meta" + + classifier = Classifier(config) + for p in classifier.parameters(): + assert p.device == torch.device("meta") From b53d6e0ff254d17aa8e4e1639cfc6aea899e3df6 Mon Sep 17 00:00:00 2001 From: Eugene Mironov Date: Mon, 23 Dec 2024 16:44:29 +0700 Subject: [PATCH 017/112] 
[HIL-SERL PORT] Fix linter issues (#588) --- lerobot/common/policies/sac/modeling_sac.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/lerobot/common/policies/sac/modeling_sac.py b/lerobot/common/policies/sac/modeling_sac.py index de8283de..c5e3f209 100644 --- a/lerobot/common/policies/sac/modeling_sac.py +++ b/lerobot/common/policies/sac/modeling_sac.py @@ -19,21 +19,18 @@ from collections import deque from copy import deepcopy -from functools import partial +from typing import Callable, Optional, Sequence, Tuple import einops - +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F # noqa: N812 +from huggingface_hub import PyTorchModelHubMixin from torch import Tensor -from huggingface_hub import PyTorchModelHubMixin from lerobot.common.policies.normalize import Normalize, Unnormalize from lerobot.common.policies.sac.configuration_sac import SACConfig -import numpy as np -from typing import Callable, Optional, Tuple, Sequence - class SACPolicy( @@ -290,10 +287,7 @@ class Critic(nn.Module): observations = observations.to(self.device) actions = actions.to(self.device) - if self.encoder is not None: - obs_enc = self.encoder(observations) - else: - obs_enc = observations + obs_enc = observations if self.encoder is None else self.encoder(observations) inputs = torch.cat([obs_enc, actions], dim=-1) x = self.network(inputs) @@ -563,6 +557,8 @@ class LagrangeMultiplier(nn.Module): # 3. Optionally, the values can be further transformed to fit within arbitrary bounds [low, high] using an affine transformation # This type of distribution is commonly used in reinforcement learning, particularly for continuous action spaces class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): + DEFAULT_SAMPLE_SHAPE = torch.Size() + def __init__( self, loc: torch.Tensor, @@ -611,7 +607,7 @@ class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): return mode - def rsample(self, sample_shape=torch.Size()) -> torch.Tensor: + def rsample(self, sample_shape=DEFAULT_SAMPLE_SHAPE) -> torch.Tensor: """ Reparameterized sample from the distribution """ @@ -643,7 +639,7 @@ class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): return log_prob - def sample_and_log_prob(self, sample_shape=torch.Size()) -> Tuple[torch.Tensor, torch.Tensor]: + def sample_and_log_prob(self, sample_shape=DEFAULT_SAMPLE_SHAPE) -> Tuple[torch.Tensor, torch.Tensor]: """ Sample from the distribution and compute log probability """ From 08ec971086488277fc8745bc5c11a445e46c51ea Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Mon, 23 Dec 2024 14:12:03 +0100 Subject: [PATCH 018/112] added optimizer and sac to factory.py --- lerobot/common/policies/factory.py | 6 ++++++ lerobot/common/policies/sac/configuration_sac.py | 1 + lerobot/scripts/train.py | 9 +++++++++ 3 files changed, 16 insertions(+) diff --git a/lerobot/common/policies/factory.py b/lerobot/common/policies/factory.py index 5cb2fd52..7f550d90 100644 --- a/lerobot/common/policies/factory.py +++ b/lerobot/common/policies/factory.py @@ -66,6 +66,12 @@ def get_policy_and_config_classes(name: str) -> tuple[Policy, object]: from lerobot.common.policies.vqbet.modeling_vqbet import VQBeTPolicy return VQBeTPolicy, VQBeTConfig + elif name == "sac": + from lerobot.common.policies.sac.configuration_sac import SACConfig + from lerobot.common.policies.sac.modeling_sac import SACPolicy + + return SACPolicy, SACConfig + else: raise NotImplementedError(f"Policy with 
name {name} is not implemented.") diff --git a/lerobot/common/policies/sac/configuration_sac.py b/lerobot/common/policies/sac/configuration_sac.py index d324462e..6db198e8 100644 --- a/lerobot/common/policies/sac/configuration_sac.py +++ b/lerobot/common/policies/sac/configuration_sac.py @@ -26,6 +26,7 @@ class SACConfig: num_subsample_critics = None critic_lr = 3e-4 actor_lr = 3e-4 + temperature_lr = 3e-4 critic_target_update_weight = 0.005 utd_ratio = 2 critic_network_kwargs = { diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 9a0b7e4c..346c3acd 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -93,6 +93,15 @@ def make_optimizer_and_scheduler(cfg, policy): elif policy.name == "tdmpc": optimizer = torch.optim.Adam(policy.parameters(), cfg.training.lr) lr_scheduler = None + + elif policy.name == "sac": + optimizer = torch.optim.Adam([ + {'params': policy.actor.parameters(), 'lr': policy.config.actor_lr}, + {'params': policy.critic_ensemble.parameters(), 'lr': policy.config.critic_lr}, + {'params': policy.temperature.parameters(), 'lr': policy.config.temperature_lr}, + ]) + lr_scheduler = None + elif cfg.policy.name == "vqbet": from lerobot.common.policies.vqbet.modeling_vqbet import VQBeTOptimizer, VQBeTScheduler From dc54d357ca9106d72b0d70b064e2740f10b8fc53 Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Sun, 29 Dec 2024 12:51:21 +0000 Subject: [PATCH 019/112] Added normalization schemes and style checks --- lerobot/common/logger.py | 2 +- .../classifier/configuration_classifier.py | 2 - .../hilserl/classifier/modeling_classifier.py | 10 +- .../policies/hilserl/configuration_hilserl.py | 2 +- .../policies/hilserl/modeling_hilserl.py | 4 +- .../common/policies/sac/configuration_sac.py | 42 +++- lerobot/common/policies/sac/modeling_sac.py | 220 ++++++++---------- lerobot/scripts/eval_on_robot.py | 8 +- lerobot/scripts/train.py | 14 +- lerobot/scripts/train_hilserl_classifier.py | 2 +- 10 files changed, 150 insertions(+), 156 deletions(-) diff --git a/lerobot/common/logger.py b/lerobot/common/logger.py index 4015492d..dec8b465 100644 --- a/lerobot/common/logger.py +++ b/lerobot/common/logger.py @@ -25,13 +25,13 @@ from glob import glob from pathlib import Path import torch -import wandb from huggingface_hub.constants import SAFETENSORS_SINGLE_FILE from omegaconf import DictConfig, OmegaConf from termcolor import colored from torch.optim import Optimizer from torch.optim.lr_scheduler import LRScheduler +import wandb from lerobot.common.policies.policy_protocol import Policy from lerobot.common.utils.utils import get_global_random_state, set_global_random_state diff --git a/lerobot/common/policies/hilserl/classifier/configuration_classifier.py b/lerobot/common/policies/hilserl/classifier/configuration_classifier.py index 553e4262..f0b9352f 100644 --- a/lerobot/common/policies/hilserl/classifier/configuration_classifier.py +++ b/lerobot/common/policies/hilserl/classifier/configuration_classifier.py @@ -2,8 +2,6 @@ import json import os from dataclasses import asdict, dataclass -import torch - @dataclass class ClassifierConfig: diff --git a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py index 0b8d66ac..28b05744 100644 --- a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py +++ b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py @@ -23,9 +23,11 @@ class ClassifierOutput: self.hidden_states = hidden_states def __repr__(self): - 
return (f"ClassifierOutput(logits={self.logits}, " - f"probabilities={self.probabilities}, " - f"hidden_states={self.hidden_states})") + return ( + f"ClassifierOutput(logits={self.logits}, " + f"probabilities={self.probabilities}, " + f"hidden_states={self.hidden_states})" + ) class Classifier( @@ -74,7 +76,7 @@ class Classifier( self.feature_dim = self.encoder.config.hidden_sizes[-1] # Last channel dimension else: raise ValueError("Unsupported CNN architecture") - + self.encoder = self.encoder.to(self.config.device) def _freeze_encoder(self) -> None: diff --git a/lerobot/common/policies/hilserl/configuration_hilserl.py b/lerobot/common/policies/hilserl/configuration_hilserl.py index f1bc850f..80d2f578 100644 --- a/lerobot/common/policies/hilserl/configuration_hilserl.py +++ b/lerobot/common/policies/hilserl/configuration_hilserl.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2024 The HuggingFace Inc. team. +# Copyright 2024 The HuggingFace Inc. team. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/lerobot/common/policies/hilserl/modeling_hilserl.py b/lerobot/common/policies/hilserl/modeling_hilserl.py index 236ed433..679eb010 100644 --- a/lerobot/common/policies/hilserl/modeling_hilserl.py +++ b/lerobot/common/policies/hilserl/modeling_hilserl.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2024 The HuggingFace Inc. team. +# Copyright 2024 The HuggingFace Inc. team. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -26,4 +26,4 @@ class HILSerlPolicy( repo_url="https://github.com/huggingface/lerobot", tags=["robotics", "hilserl"], ): - pass \ No newline at end of file + pass diff --git a/lerobot/common/policies/sac/configuration_sac.py b/lerobot/common/policies/sac/configuration_sac.py index 6db198e8..f4a2bc4c 100644 --- a/lerobot/common/policies/sac/configuration_sac.py +++ b/lerobot/common/policies/sac/configuration_sac.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2024 The HuggingFace Inc. team. +# Copyright 2024 The HuggingFace Inc. team. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from dataclasses import dataclass +from dataclasses import dataclass, field @dataclass @@ -30,14 +30,36 @@ class SACConfig: critic_target_update_weight = 0.005 utd_ratio = 2 critic_network_kwargs = { - "hidden_dims": [256, 256], - "activate_final": True, - } + "hidden_dims": [256, 256], + "activate_final": True, + } actor_network_kwargs = { - "hidden_dims": [256, 256], - "activate_final": True, - } + "hidden_dims": [256, 256], + "activate_final": True, + } policy_kwargs = { - "tanh_squash_distribution": True, - "std_parameterization": "uniform", + "tanh_squash_distribution": True, + "std_parameterization": "uniform", + } + + input_shapes: dict[str, list[int]] = field( + default_factory=lambda: { + "observation.image": [3, 84, 84], + "observation.state": [4], } + ) + output_shapes: dict[str, list[int]] = field( + default_factory=lambda: { + "action": [4], + } + ) + + state_encoder_hidden_dim: int = 256 + latent_dim: int = 256 + network_hidden_dims: int = 256 + + # Normalization / Unnormalization + input_normalization_modes: dict[str, str] | None = None + output_normalization_modes: dict[str, str] = field( + default_factory=lambda: {"action": "min_max"}, + ) diff --git a/lerobot/common/policies/sac/modeling_sac.py b/lerobot/common/policies/sac/modeling_sac.py index c5e3f209..51258fac 100644 --- a/lerobot/common/policies/sac/modeling_sac.py +++ b/lerobot/common/policies/sac/modeling_sac.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2024 The HuggingFace Inc. team. +# Copyright 2024 The HuggingFace Inc. team. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -40,11 +40,9 @@ class SACPolicy( repo_url="https://github.com/huggingface/lerobot", tags=["robotics", "RL", "SAC"], ): - def __init__( self, config: SACConfig | None = None, dataset_stats: dict[str, dict[str, Tensor]] | None = None ): - super().__init__() if config is None: @@ -67,12 +65,9 @@ class SACPolicy( # Define networks critic_nets = [] for _ in range(config.num_critics): - critic_net = Critic( - encoder=encoder, - network=MLP(**config.critic_network_kwargs) - ) + critic_net = Critic(encoder=encoder, network=MLP(**config.critic_network_kwargs)) critic_nets.append(critic_net) - + self.critic_ensemble = create_critic_ensemble(critic_nets, config.num_critics) self.critic_target = deepcopy(self.critic_ensemble) @@ -80,11 +75,11 @@ class SACPolicy( encoder=encoder, network=MLP(**config.actor_network_kwargs), action_dim=config.output_shapes["action"][0], - **config.policy_kwargs + **config.policy_kwargs, ) if config.target_entropy is None: - config.target_entropy = -np.prod(config.output_shapes["action"][0]) # (-dim(A)) - self.temperature = LagrangeMultiplier(init_value=config.temperature_init) + config.target_entropy = -np.prod(config.output_shapes["action"][0]) # (-dim(A)) + self.temperature = LagrangeMultiplier(init_value=config.temperature_init) def reset(self): """ @@ -100,10 +95,10 @@ class SACPolicy( self._queues["observation.image"] = deque(maxlen=1) if self._use_env_state: self._queues["observation.environment_state"] = deque(maxlen=1) - + @torch.no_grad() def select_action(self, batch: dict[str, Tensor]) -> Tensor: - actions, _ = self.actor_network(batch['observations'])### + actions, _ = self.actor_network(batch["observations"]) ### def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor | float]: """Run the batch through the model and compute the loss. 
@@ -111,8 +106,8 @@ class SACPolicy( Returns a dictionary with loss as a tensor, and other information as native floats. """ batch = self.normalize_inputs(batch) - # batch shape is (b, 2, ...) where index 1 returns the current observation and - # the next observation for caluculating the right td index. + # batch shape is (b, 2, ...) where index 1 returns the current observation and + # the next observation for caluculating the right td index. actions = batch["action"][:, 0] rewards = batch["next.reward"][:, 0] observations = {} @@ -121,13 +116,12 @@ class SACPolicy( if k.startswith("observation."): observations[k] = batch[k][:, 0] next_observations[k] = batch[k][:, 1] - + # perform image augmentation # reward bias - # from HIL-SERL code base + # from HIL-SERL code base # add_or_replace={"rewards": batch["rewards"] + self.config["reward_bias"]} in reward_batch - # calculate critics loss # 1- compute actions from policy @@ -137,7 +131,7 @@ class SACPolicy( # subsample critics to prevent overfitting if use high UTD (update to date) if self.config.num_subsample_critics is not None: indices = torch.randperm(self.config.num_critics) - indices = indices[:self.config.num_subsample_critics] + indices = indices[: self.config.num_subsample_critics] q_targets = q_targets[indices] # critics subsample size @@ -151,8 +145,9 @@ class SACPolicy( # 4- Calculate loss # Compute state-action value loss (TD loss) for all of the Q functions in the ensemble. - critics_loss = ( - F.mse_loss( + critics_loss = ( + ( + F.mse_loss( q_preds, einops.repeat(td_target, "t b -> e t b", e=q_preds.shape[0]), reduction="none", @@ -163,15 +158,17 @@ class SACPolicy( # q_targets depends on the reward and the next observations. * ~batch["next.reward_is_pad"] * ~batch["observation.state_is_pad"][1:] - ).sum(0).mean() - + ) + .sum(0) + .mean() + ) + # calculate actors loss # 1- temperature temperature = self.temperature() # 2- get actions (batch_size, action_dim) and log probs (batch_size,) - actions, log_probs = self.actor_network(observations) \ - + actions, log_probs = self.actor_network(observations) # 3- get q-value predictions with torch.no_grad(): q_preds = self.critic_ensemble(observations, actions, return_type="mean") @@ -181,36 +178,31 @@ class SACPolicy( * ~batch["action_is_pad"] ).mean() - # calculate temperature loss # 1- calculate entropy entropy = -log_probs.mean() - temperature_loss = self.temp( - lhs=entropy, - rhs=self.config.target_entropy - ) + temperature_loss = self.temp(lhs=entropy, rhs=self.config.target_entropy) loss = critics_loss + actor_loss + temperature_loss return { - "critics_loss": critics_loss.item(), - "actor_loss": actor_loss.item(), - "temperature_loss": temperature_loss.item(), - "temperature": temperature.item(), - "entropy": entropy.item(), - "loss": loss, + "critics_loss": critics_loss.item(), + "actor_loss": actor_loss.item(), + "temperature_loss": temperature_loss.item(), + "temperature": temperature.item(), + "entropy": entropy.item(), + "loss": loss, + } - } - def update(self): self.critic_target.lerp_(self.critic_ensemble, self.config.critic_target_update_weight) # TODO: implement UTD update # First update only critics for utd_ratio-1 times - #for critic_step in range(self.config.utd_ratio - 1): - # only update critic and critic target + # for critic_step in range(self.config.utd_ratio - 1): + # only update critic and critic target # Then update critic, critic target, actor and temperature - #for target_param, param in zip(self.critic_target.parameters(), 
self.critic_ensemble.parameters()): + # for target_param, param in zip(self.critic_target.parameters(), self.critic_ensemble.parameters()): # target_param.data.copy_(target_param.data * (1.0 - self.config.critic_target_update_weight) + param.data * self.critic_target_update_weight) @@ -225,24 +217,28 @@ class MLP(nn.Module): super().__init__() self.activate_final = config.activate_final layers = [] - + for i, size in enumerate(config.network_hidden_dims): - layers.append(nn.Linear(config.network_hidden_dims[i-1] if i > 0 else config.network_hidden_dims[0], size)) - + layers.append( + nn.Linear(config.network_hidden_dims[i - 1] if i > 0 else config.network_hidden_dims[0], size) + ) + if i + 1 < len(config.network_hidden_dims) or activate_final: if dropout_rate is not None and dropout_rate > 0: layers.append(nn.Dropout(p=dropout_rate)) layers.append(nn.LayerNorm(size)) - layers.append(activations if isinstance(activations, nn.Module) else getattr(nn, activations)()) - + layers.append( + activations if isinstance(activations, nn.Module) else getattr(nn, activations)() + ) + self.net = nn.Sequential(*layers) def forward(self, x: torch.Tensor, train: bool = False) -> torch.Tensor: # in training mode or not. TODO: find better way to do this - self.train(train) + self.train(train) return self.net(x) - - + + class Critic(nn.Module): def __init__( self, @@ -250,7 +246,7 @@ class Critic(nn.Module): network: nn.Module, init_final: Optional[float] = None, activate_final: bool = False, - device: str = "cuda" + device: str = "cuda", ): super().__init__() self.device = torch.device(device) @@ -258,7 +254,7 @@ class Critic(nn.Module): self.network = network self.init_final = init_final self.activate_final = activate_final - + # Output layer if init_final is not None: if self.activate_final: @@ -273,36 +269,28 @@ class Critic(nn.Module): else: self.output_layer = nn.Linear(network.net[-2].out_features, 1) orthogonal_init()(self.output_layer.weight) - + self.to(self.device) - def forward( - self, - observations: torch.Tensor, - actions: torch.Tensor, - train: bool = False - ) -> torch.Tensor: + def forward(self, observations: torch.Tensor, actions: torch.Tensor, train: bool = False) -> torch.Tensor: self.train(train) - + observations = observations.to(self.device) actions = actions.to(self.device) - + obs_enc = observations if self.encoder is None else self.encoder(observations) - + inputs = torch.cat([obs_enc, actions], dim=-1) x = self.network(inputs) value = self.output_layer(x) return value.squeeze(-1) - + def q_value_ensemble( - self, - observations: torch.Tensor, - actions: torch.Tensor, - train: bool = False + self, observations: torch.Tensor, actions: torch.Tensor, train: bool = False ) -> torch.Tensor: observations = observations.to(self.device) actions = actions.to(self.device) - + if len(actions.shape) == 3: # [batch_size, num_actions, action_dim] batch_size, num_actions = actions.shape[:2] obs_expanded = observations.unsqueeze(1).expand(-1, num_actions, -1) @@ -327,7 +315,7 @@ class Policy(nn.Module): fixed_std: Optional[torch.Tensor] = None, init_final: Optional[float] = None, activate_final: bool = False, - device: str = "cuda" + device: str = "cuda", ): super().__init__() self.device = torch.device(device) @@ -340,7 +328,7 @@ class Policy(nn.Module): self.tanh_squash_distribution = tanh_squash_distribution self.fixed_std = fixed_std.to(self.device) if fixed_std is not None else None self.activate_final = activate_final - + # Mean layer if self.activate_final: self.mean_layer = 
nn.Linear(network.net[-3].out_features, action_dim) @@ -351,7 +339,7 @@ class Policy(nn.Module): nn.init.uniform_(self.mean_layer.bias, -init_final, init_final) else: orthogonal_init()(self.mean_layer.weight) - + # Standard deviation layer or parameter if fixed_std is None: if std_parameterization == "uniform": @@ -366,18 +354,18 @@ class Policy(nn.Module): nn.init.uniform_(self.std_layer.bias, -init_final, init_final) else: orthogonal_init()(self.std_layer.weight) - + self.to(self.device) def forward( - self, + self, observations: torch.Tensor, temperature: float = 1.0, train: bool = False, - non_squash_distribution: bool = False + non_squash_distribution: bool = False, ) -> torch.distributions.Distribution: self.train(train) - + # Encode observations if encoder exists if self.encoder is not None: with torch.set_grad_enabled(train): @@ -387,7 +375,7 @@ class Policy(nn.Module): # Get network outputs outputs = self.network(obs_enc) means = self.mean_layer(outputs) - + # Compute standard deviations if self.fixed_std is None: if self.std_parameterization == "exp": @@ -398,9 +386,7 @@ class Policy(nn.Module): elif self.std_parameterization == "uniform": stds = torch.exp(self.log_stds).expand_as(means) else: - raise ValueError( - f"Invalid std_parameterization: {self.std_parameterization}" - ) + raise ValueError(f"Invalid std_parameterization: {self.std_parameterization}") else: assert self.std_parameterization == "fixed" stds = self.fixed_std.expand_as(means) @@ -422,7 +408,7 @@ class Policy(nn.Module): ) return distribution - + def get_features(self, observations: torch.Tensor) -> torch.Tensor: """Get encoded features from observations""" observations = observations.to(self.device) @@ -503,56 +489,47 @@ class SACObservationEncoder(nn.Module): if "observation.state" in self.config.input_shapes: feat.append(self.state_enc_layers(obs_dict["observation.state"])) return torch.stack(feat, dim=0).mean(0) - + class LagrangeMultiplier(nn.Module): - def __init__( - self, - init_value: float = 1.0, - constraint_shape: Sequence[int] = (), - device: str = "cuda" - ): + def __init__(self, init_value: float = 1.0, constraint_shape: Sequence[int] = (), device: str = "cuda"): super().__init__() self.device = torch.device(device) init_value = torch.log(torch.exp(torch.tensor(init_value, device=self.device)) - 1) - + # Initialize the Lagrange multiplier as a parameter self.lagrange = nn.Parameter( torch.full(constraint_shape, init_value, dtype=torch.float32, device=self.device) ) - + self.to(self.device) - def forward( - self, - lhs: Optional[torch.Tensor] = None, - rhs: Optional[torch.Tensor] = None - ) -> torch.Tensor: - # Get the multiplier value based on parameterization + def forward(self, lhs: Optional[torch.Tensor] = None, rhs: Optional[torch.Tensor] = None) -> torch.Tensor: + # Get the multiplier value based on parameterization multiplier = torch.nn.functional.softplus(self.lagrange) - + # Return the raw multiplier if no constraint values provided if lhs is None: return multiplier - + # Move inputs to device lhs = lhs.to(self.device) if rhs is not None: rhs = rhs.to(self.device) - + # Use the multiplier to compute the Lagrange penalty if rhs is None: rhs = torch.zeros_like(lhs, device=self.device) - + diff = lhs - rhs - + assert diff.shape == multiplier.shape, f"Shape mismatch: {diff.shape} vs {multiplier.shape}" - + return multiplier * diff # The TanhMultivariateNormalDiag is a probability distribution that represents a transformed normal (Gaussian) distribution where: -# 1. 
The base distribution is a diagonal multivariate normal distribution +# 1. The base distribution is a diagonal multivariate normal distribution # 2. The samples from this normal distribution are transformed through a tanh function, which squashes the values to be between -1 and 1 # 3. Optionally, the values can be further transformed to fit within arbitrary bounds [low, high] using an affine transformation # This type of distribution is commonly used in reinforcement learning, particularly for continuous action spaces @@ -568,28 +545,22 @@ class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): ): # Create base normal distribution base_distribution = torch.distributions.Normal(loc=loc, scale=scale_diag) - + # Create list of transforms transforms = [] - + # Add tanh transform transforms.append(torch.distributions.transforms.TanhTransform()) - + # Add rescaling transform if bounds are provided if low is not None and high is not None: transforms.append( - torch.distributions.transforms.AffineTransform( - loc=(high + low) / 2, - scale=(high - low) / 2 - ) + torch.distributions.transforms.AffineTransform(loc=(high + low) / 2, scale=(high - low) / 2) ) - + # Initialize parent class - super().__init__( - base_distribution=base_distribution, - transforms=transforms - ) - + super().__init__(base_distribution=base_distribution, transforms=transforms) + # Store parameters self.loc = loc self.scale_diag = scale_diag @@ -600,11 +571,11 @@ class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): """Get the mode of the transformed distribution""" # The mode of a normal distribution is its mean mode = self.loc - + # Apply transforms for transform in self.transforms: mode = transform(mode) - + return mode def rsample(self, sample_shape=DEFAULT_SAMPLE_SHAPE) -> torch.Tensor: @@ -613,11 +584,11 @@ class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): """ # Sample from base distribution x = self.base_dist.rsample(sample_shape) - + # Apply transforms for transform in self.transforms: x = transform(x) - + return x def log_prob(self, value: torch.Tensor) -> torch.Tensor: @@ -627,16 +598,16 @@ class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): """ # Initialize log prob log_prob = torch.zeros_like(value[..., 0]) - + # Inverse transforms to get back to normal distribution q = value for transform in reversed(self.transforms): q = transform.inv(q) log_prob = log_prob - transform.log_abs_det_jacobian(q, transform(q)) - + # Add base distribution log prob log_prob = log_prob + self.base_dist.log_prob(q).sum(-1) - + return log_prob def sample_and_log_prob(self, sample_shape=DEFAULT_SAMPLE_SHAPE) -> Tuple[torch.Tensor, torch.Tensor]: @@ -653,13 +624,13 @@ class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): """ # Start with base distribution entropy entropy = self.base_dist.entropy().sum(-1) - + # Add log det jacobian for each transform x = self.rsample() for transform in self.transforms: entropy = entropy + transform.log_abs_det_jacobian(x, transform(x)) x = transform(x) - + return entropy @@ -680,7 +651,7 @@ def flatten_forward_unflatten(fn: Callable[[Tensor], Tensor], image_tensor: Tens Args: fn: Callable that the image tensor will be passed to. It should accept (B, C, H, W) and return (B, *), where * is any number of dimensions. 
- image_tensor: An image tensor of shape (**, C, H, W), where ** is any number of dimensions and + image_tensor: An image tensor of shape (**, C, H, W), where ** is any number of dimensions and can be more than 1 dimensions, generally different from *. Returns: A return value from the callable reshaped to (**, *). @@ -691,4 +662,3 @@ def flatten_forward_unflatten(fn: Callable[[Tensor], Tensor], image_tensor: Tens inp = torch.flatten(image_tensor, end_dim=-4) flat_out = fn(inp) return torch.reshape(flat_out, (*start_dims, *flat_out.shape[1:])) - diff --git a/lerobot/scripts/eval_on_robot.py b/lerobot/scripts/eval_on_robot.py index 6a790f0a..92daa860 100644 --- a/lerobot/scripts/eval_on_robot.py +++ b/lerobot/scripts/eval_on_robot.py @@ -24,7 +24,7 @@ python lerobot/scripts/eval_on_robot.py \ ``` **NOTE** (michel-aractingi): This script is incomplete and it is being prepared -for running training on the real robot. +for running training on the real robot. """ import argparse @@ -47,7 +47,7 @@ from lerobot.common.utils.utils import ( def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, use_amp: bool = True) -> dict: - """Run a batched policy rollout on the real robot. + """Run a batched policy rollout on the real robot. The return dictionary contains: "robot": A a dictionary of (batch, sequence + 1, *) tensors mapped to observation @@ -64,7 +64,7 @@ def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, extraneous elements from the sequences above. Args: - robot: The robot class that defines the interface with the real robot. + robot: The robot class that defines the interface with the real robot. policy: The policy. Must be a PyTorch nn module. Returns: @@ -77,7 +77,7 @@ def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, listener, events = init_keyboard_listener() # Reset the policy. TODO (michel-aractingi) add real policy evaluation once the code is ready. 
- # policy.reset() + # policy.reset() # Get observation from real robot observation = robot.capture_observation() diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 346c3acd..fbe7927d 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -95,12 +95,14 @@ def make_optimizer_and_scheduler(cfg, policy): lr_scheduler = None elif policy.name == "sac": - optimizer = torch.optim.Adam([ - {'params': policy.actor.parameters(), 'lr': policy.config.actor_lr}, - {'params': policy.critic_ensemble.parameters(), 'lr': policy.config.critic_lr}, - {'params': policy.temperature.parameters(), 'lr': policy.config.temperature_lr}, - ]) - lr_scheduler = None + optimizer = torch.optim.Adam( + [ + {"params": policy.actor.parameters(), "lr": policy.config.actor_lr}, + {"params": policy.critic_ensemble.parameters(), "lr": policy.config.critic_lr}, + {"params": policy.temperature.parameters(), "lr": policy.config.temperature_lr}, + ] + ) + lr_scheduler = None elif cfg.policy.name == "vqbet": from lerobot.common.policies.vqbet.modeling_vqbet import VQBeTOptimizer, VQBeTScheduler diff --git a/lerobot/scripts/train_hilserl_classifier.py b/lerobot/scripts/train_hilserl_classifier.py index 78659dc8..ea8336a9 100644 --- a/lerobot/scripts/train_hilserl_classifier.py +++ b/lerobot/scripts/train_hilserl_classifier.py @@ -22,7 +22,6 @@ from pprint import pformat import hydra import torch import torch.nn as nn -import wandb from deepdiff import DeepDiff from omegaconf import DictConfig, OmegaConf from termcolor import colored @@ -31,6 +30,7 @@ from torch.cuda.amp import GradScaler from torch.utils.data import DataLoader, WeightedRandomSampler, random_split from tqdm import tqdm +import wandb from lerobot.common.datasets.factory import resolve_delta_timestamps from lerobot.common.datasets.lerobot_dataset import LeRobotDataset from lerobot.common.logger import Logger From 18a45989861a3c737dfd92d612271040fd897c19 Mon Sep 17 00:00:00 2001 From: KeWang1017 Date: Thu, 26 Dec 2024 23:38:46 +0000 Subject: [PATCH 020/112] trying to get sac running --- .../common/policies/sac/configuration_sac.py | 21 +++++ lerobot/common/policies/sac/modeling_sac.py | 79 ++++++++-------- .../configs/policy/sac_pusht_keypoints.yaml | 89 +++++++++++++++++++ 3 files changed, 149 insertions(+), 40 deletions(-) create mode 100644 lerobot/configs/policy/sac_pusht_keypoints.yaml diff --git a/lerobot/common/policies/sac/configuration_sac.py b/lerobot/common/policies/sac/configuration_sac.py index f4a2bc4c..6df94761 100644 --- a/lerobot/common/policies/sac/configuration_sac.py +++ b/lerobot/common/policies/sac/configuration_sac.py @@ -20,6 +20,24 @@ from dataclasses import dataclass, field @dataclass class SACConfig: + input_shapes: dict[str, list[int]] = field( + default_factory=lambda: { + "observation.image": [3, 84, 84], + "observation.state": [4], + } + ) + output_shapes: dict[str, list[int]] = field( + default_factory=lambda: { + "action": [4], + } + ) + + # Normalization / Unnormalization + input_normalization_modes: dict[str, str] | None = None + output_normalization_modes: dict[str, str] = field( + default_factory=lambda: {"action": "min_max"}, + ) + discount = 0.99 temperature_init = 1.0 num_critics = 2 @@ -29,6 +47,9 @@ class SACConfig: temperature_lr = 3e-4 critic_target_update_weight = 0.005 utd_ratio = 2 + state_encoder_hidden_dim = 256 + latent_dim = 50 + target_entropy = None critic_network_kwargs = { "hidden_dims": [256, 256], "activate_final": True, diff --git a/lerobot/common/policies/sac/modeling_sac.py 
b/lerobot/common/policies/sac/modeling_sac.py index 51258fac..87170d20 100644 --- a/lerobot/common/policies/sac/modeling_sac.py +++ b/lerobot/common/policies/sac/modeling_sac.py @@ -40,6 +40,8 @@ class SACPolicy( repo_url="https://github.com/huggingface/lerobot", tags=["robotics", "RL", "SAC"], ): + name = "sac" + def __init__( self, config: SACConfig | None = None, dataset_stats: dict[str, dict[str, Tensor]] | None = None ): @@ -71,7 +73,7 @@ class SACPolicy( self.critic_ensemble = create_critic_ensemble(critic_nets, config.num_critics) self.critic_target = deepcopy(self.critic_ensemble) - self.actor_network = Policy( + self.actor = Policy( encoder=encoder, network=MLP(**config.actor_network_kwargs), action_dim=config.output_shapes["action"][0], @@ -91,14 +93,14 @@ class SACPolicy( "observation.state": deque(maxlen=1), "action": deque(maxlen=1), } - if self._use_image: + if "observation.image" in self.config.input_shapes: self._queues["observation.image"] = deque(maxlen=1) - if self._use_env_state: + if "observation.environment_state" in self.config.input_shapes: self._queues["observation.environment_state"] = deque(maxlen=1) @torch.no_grad() def select_action(self, batch: dict[str, Tensor]) -> Tensor: - actions, _ = self.actor_network(batch["observations"]) ### + actions, _ = self.actor(batch['observations']) def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor | float]: """Run the batch through the model and compute the loss. @@ -119,19 +121,18 @@ class SACPolicy( # perform image augmentation - # reward bias - # from HIL-SERL code base + # reward bias from HIL-SERL code base # add_or_replace={"rewards": batch["rewards"] + self.config["reward_bias"]} in reward_batch # calculate critics loss # 1- compute actions from policy - action_preds, log_probs = self.actor_network(observations) + action_preds, log_probs = self.actor(observations) # 2- compute q targets q_targets = self.target_qs(next_observations, action_preds) # subsample critics to prevent overfitting if use high UTD (update to date) if self.config.num_subsample_critics is not None: indices = torch.randperm(self.config.num_critics) - indices = indices[: self.config.num_subsample_critics] + indices = indices[:self.config.num_subsample_critics] q_targets = q_targets[indices] # critics subsample size @@ -168,7 +169,8 @@ class SACPolicy( temperature = self.temperature() # 2- get actions (batch_size, action_dim) and log probs (batch_size,) - actions, log_probs = self.actor_network(observations) + actions, log_probs = self.actor(observations) \ + # 3- get q-value predictions with torch.no_grad(): q_preds = self.critic_ensemble(observations, actions, return_type="mean") @@ -209,21 +211,19 @@ class SACPolicy( class MLP(nn.Module): def __init__( self, - config: SACConfig, + hidden_dims: list[int], activations: Callable[[torch.Tensor], torch.Tensor] | str = nn.SiLU(), activate_final: bool = False, dropout_rate: Optional[float] = None, ): super().__init__() - self.activate_final = config.activate_final + self.activate_final = activate_final layers = [] - - for i, size in enumerate(config.network_hidden_dims): - layers.append( - nn.Linear(config.network_hidden_dims[i - 1] if i > 0 else config.network_hidden_dims[0], size) - ) - - if i + 1 < len(config.network_hidden_dims) or activate_final: + + for i, size in enumerate(hidden_dims): + layers.append(nn.Linear(hidden_dims[i-1] if i > 0 else hidden_dims[0], size)) + + if i + 1 < len(hidden_dims) or activate_final: if dropout_rate is not None and dropout_rate > 0: 
layers.append(nn.Dropout(p=dropout_rate)) layers.append(nn.LayerNorm(size)) @@ -254,20 +254,20 @@ class Critic(nn.Module): self.network = network self.init_final = init_final self.activate_final = activate_final - + + # Find the last Linear layer's output dimension + for layer in reversed(network.net): + if isinstance(layer, nn.Linear): + out_features = layer.out_features + break + # Output layer if init_final is not None: - if self.activate_final: - self.output_layer = nn.Linear(network.net[-3].out_features, 1) - else: - self.output_layer = nn.Linear(network.net[-2].out_features, 1) + self.output_layer = nn.Linear(out_features, 1) nn.init.uniform_(self.output_layer.weight, -init_final, init_final) nn.init.uniform_(self.output_layer.bias, -init_final, init_final) else: - if self.activate_final: - self.output_layer = nn.Linear(network.net[-3].out_features, 1) - else: - self.output_layer = nn.Linear(network.net[-2].out_features, 1) + self.output_layer = nn.Linear(out_features, 1) orthogonal_init()(self.output_layer.weight) self.to(self.device) @@ -328,12 +328,15 @@ class Policy(nn.Module): self.tanh_squash_distribution = tanh_squash_distribution self.fixed_std = fixed_std.to(self.device) if fixed_std is not None else None self.activate_final = activate_final - + + # Find the last Linear layer's output dimension + for layer in reversed(network.net): + if isinstance(layer, nn.Linear): + out_features = layer.out_features + break + # Mean layer - if self.activate_final: - self.mean_layer = nn.Linear(network.net[-3].out_features, action_dim) - else: - self.mean_layer = nn.Linear(network.net[-2].out_features, action_dim) + self.mean_layer = nn.Linear(out_features, action_dim) if init_final is not None: nn.init.uniform_(self.mean_layer.weight, -init_final, init_final) nn.init.uniform_(self.mean_layer.bias, -init_final, init_final) @@ -345,10 +348,7 @@ class Policy(nn.Module): if std_parameterization == "uniform": self.log_stds = nn.Parameter(torch.zeros(action_dim, device=self.device)) else: - if self.activate_final: - self.std_layer = nn.Linear(network.net[-3].out_features, action_dim) - else: - self.std_layer = nn.Linear(network.net[-2].out_features, action_dim) + self.std_layer = nn.Linear(out_features, action_dim) if init_final is not None: nn.init.uniform_(self.std_layer.weight, -init_final, init_final) nn.init.uniform_(self.std_layer.bias, -init_final, init_final) @@ -571,7 +571,6 @@ class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): """Get the mode of the transformed distribution""" # The mode of a normal distribution is its mean mode = self.loc - # Apply transforms for transform in self.transforms: mode = transform(mode) @@ -634,10 +633,10 @@ class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): return entropy -def create_critic_ensemble(critic_class, num_critics: int, device: str = "cuda") -> nn.ModuleList: +def create_critic_ensemble(critics: list[nn.Module], num_critics: int, device: str = "cuda") -> nn.ModuleList: """Creates an ensemble of critic networks""" - critics = nn.ModuleList([critic_class() for _ in range(num_critics)]) - return critics.to(device) + assert len(critics) == num_critics, f"Expected {num_critics} critics, got {len(critics)}" + return nn.ModuleList(critics).to(device) def orthogonal_init(): diff --git a/lerobot/configs/policy/sac_pusht_keypoints.yaml b/lerobot/configs/policy/sac_pusht_keypoints.yaml new file mode 100644 index 00000000..19af60d4 --- /dev/null +++ b/lerobot/configs/policy/sac_pusht_keypoints.yaml @@ 
-0,0 +1,89 @@ +# @package _global_ + +# Train with: +# +# python lerobot/scripts/train.py \ +# env=pusht \ +# +dataset=lerobot/pusht_keypoints + +seed: 1 +dataset_repo_id: lerobot/pusht_keypoints + +training: + offline_steps: 0 + + # Offline training dataloader + num_workers: 4 + + batch_size: 128 + grad_clip_norm: 10.0 + lr: 3e-4 + + eval_freq: 10000 + log_freq: 500 + save_freq: 50000 + + online_steps: 1000000 + online_rollout_n_episodes: 10 + online_rollout_batch_size: 10 + online_steps_between_rollouts: 1000 + online_sampling_ratio: 1.0 + online_env_seed: 10000 + online_buffer_capacity: 40000 + online_buffer_seed_size: 0 + do_online_rollout_async: false + + delta_timestamps: + observation.environment_state: "[i / ${fps} for i in range(${policy.horizon} + 1)]" + observation.state: "[i / ${fps} for i in range(${policy.horizon} + 1)]" + action: "[i / ${fps} for i in range(${policy.horizon})]" + next.reward: "[i / ${fps} for i in range(${policy.horizon})]" + +policy: + name: sac + + pretrained_model_path: + + # Input / output structure. + n_action_repeats: 1 + horizon: 5 + n_action_steps: 5 + + input_shapes: + # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env? + observation.environment_state: [16] + observation.state: ["${env.state_dim}"] + output_shapes: + action: ["${env.action_dim}"] + + # Normalization / Unnormalization + input_normalization_modes: + observation.environment_state: min_max + observation.state: min_max + output_normalization_modes: + action: min_max + + # Architecture / modeling. + # Neural networks. + # image_encoder_hidden_dim: 32 + discount: 0.99 + temperature_init: 1.0 + num_critics: 2 + num_subsample_critics: None + critic_lr: 3e-4 + actor_lr: 3e-4 + temperature_lr: 3e-4 + critic_target_update_weight: 0.005 + utd_ratio: 2 + + + # # Loss coefficients. + # reward_coeff: 0.5 + # expectile_weight: 0.9 + # value_coeff: 0.1 + # consistency_coeff: 20.0 + # advantage_scaling: 3.0 + # pi_coeff: 0.5 + # temporal_decay_coeff: 0.5 + # # Target model. + # target_model_momentum: 0.995 From ca74a13d616bdb9df6a8c2b3544837d73c2641a4 Mon Sep 17 00:00:00 2001 From: KeWang1017 Date: Sat, 28 Dec 2024 18:07:15 +0000 Subject: [PATCH 021/112] Refactor SACPolicy for improved action sampling and standard deviation handling - Updated action selection to use distribution sampling and log probabilities for better stochastic behavior. - Enhanced standard deviation clamping to prevent extreme values, ensuring stability in policy outputs. - Cleaned up code by removing unnecessary comments and improving readability. These changes aim to refine the SAC implementation, enhancing its robustness and performance during training and inference. 
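The change described above amounts to bounding the policy's standard deviation before the action distribution is built, so sampled actions neither collapse to a point nor blow up early in training. As a rough illustration, independent of the LeRobot classes, a clamped Gaussian head might look like the sketch below; the `exp`/`softplus` branches and the 0.05/2.0 bounds are illustrative assumptions for the example, not the repository's defaults.

```python
# Minimal sketch of a clamped std-dev head for a Gaussian policy.
# The parameterization names and bounds below are illustrative only.
import math

import torch
import torch.nn as nn


class GaussianHead(nn.Module):
    def __init__(self, in_features: int, action_dim: int,
                 std_parameterization: str = "softplus",
                 std_min: float = 0.05, std_max: float = 2.0):
        super().__init__()
        self.mean_layer = nn.Linear(in_features, action_dim)
        self.std_layer = nn.Linear(in_features, action_dim)
        self.std_parameterization = std_parameterization
        self.std_min = std_min
        self.std_max = std_max

    def forward(self, features: torch.Tensor) -> torch.distributions.Normal:
        means = self.mean_layer(features)
        if self.std_parameterization == "exp":
            # Clamp in log space so exp() can neither overflow nor vanish.
            log_stds = self.std_layer(features)
            log_stds = torch.clamp(log_stds, math.log(self.std_min), math.log(self.std_max))
            stds = torch.exp(log_stds)
        else:  # "softplus"
            # Softplus keeps stds positive; the clamp bounds the result.
            stds = torch.nn.functional.softplus(self.std_layer(features))
            stds = torch.clamp(stds, self.std_min, self.std_max)
        return torch.distributions.Normal(means, stds)


if __name__ == "__main__":
    head = GaussianHead(in_features=256, action_dim=4)
    dist = head(torch.randn(8, 256))
    actions = dist.rsample()                # reparameterized sample, keeps gradients
    log_probs = dist.log_prob(actions).sum(-1)
    print(actions.shape, log_probs.shape)   # torch.Size([8, 4]) torch.Size([8])
```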
--- lerobot/common/policies/sac/modeling_sac.py | 75 ++++++++++++++------- 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/lerobot/common/policies/sac/modeling_sac.py b/lerobot/common/policies/sac/modeling_sac.py index 87170d20..821cb93f 100644 --- a/lerobot/common/policies/sac/modeling_sac.py +++ b/lerobot/common/policies/sac/modeling_sac.py @@ -19,6 +19,7 @@ from collections import deque from copy import deepcopy +import math from typing import Callable, Optional, Sequence, Tuple import einops @@ -100,7 +101,12 @@ class SACPolicy( @torch.no_grad() def select_action(self, batch: dict[str, Tensor]) -> Tensor: - actions, _ = self.actor(batch['observations']) + """Select action for inference/evaluation""" + distribution = self.actor(batch) + # Sample from the distribution and return just the actions + actions = distribution.mode() # or distribution.sample() for stochastic actions + actions = self.unnormalize_outputs({"action": actions})["action"] + return actions def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor | float]: """Run the batch through the model and compute the loss. @@ -126,7 +132,10 @@ class SACPolicy( # calculate critics loss # 1- compute actions from policy - action_preds, log_probs = self.actor(observations) + distribution = self.actor(observations) + action_preds = distribution.sample() + log_probs = distribution.log_prob(action_preds) + action_preds = torch.clamp(action_preds, -1, +1) # 2- compute q targets q_targets = self.target_qs(next_observations, action_preds) # subsample critics to prevent overfitting if use high UTD (update to date) @@ -146,31 +155,46 @@ class SACPolicy( # 4- Calculate loss # Compute state-action value loss (TD loss) for all of the Q functions in the ensemble. - critics_loss = ( - ( - F.mse_loss( - q_preds, - einops.repeat(td_target, "t b -> e t b", e=q_preds.shape[0]), - reduction="none", - ).sum(0) # sum over ensemble - # `q_preds_ensemble` depends on the first observation and the actions. - * ~batch["observation.state_is_pad"][0] - * ~batch["action_is_pad"] - # q_targets depends on the reward and the next observations. - * ~batch["next.reward_is_pad"] - * ~batch["observation.state_is_pad"][1:] - ) - .sum(0) - .mean() - ) + #critics_loss = ( + # ( + # F.mse_loss( + # q_preds, + # einops.repeat(td_target, "t b -> e t b", e=q_preds.shape[0]), + # reduction="none", + # ).sum(0) # sum over ensemble + # # `q_preds_ensemble` depends on the first observation and the actions. + # * ~batch["observation.state_is_pad"][0] + # * ~batch["action_is_pad"] + # # q_targets depends on the reward and the next observations. + # * ~batch["next.reward_is_pad"] + # * ~batch["observation.state_is_pad"][1:] + # ) + # .sum(0) + # .mean() + #) + # 4- Calculate loss + # Compute state-action value loss (TD loss) for all of the Q functions in the ensemble. 
+ critics_loss = F.mse_loss( + q_preds, # shape: [num_critics, batch_size] + einops.repeat(td_target, "b -> e b", e=q_preds.shape[0]), # expand td_target to match q_preds shape + reduction="none" + ).sum(0).mean() + # breakpoint() # calculate actors loss # 1- temperature temperature = self.temperature() # 2- get actions (batch_size, action_dim) and log probs (batch_size,) +<<<<<<< HEAD actions, log_probs = self.actor(observations) \ +======= + distribution = self.actor(observations) + actions = distribution.sample() + log_probs = distribution.log_prob(actions) + actions = torch.clamp(actions, -1, +1) +>>>>>>> d3c62b92 (Refactor SACPolicy for improved action sampling and standard deviation handling) # 3- get q-value predictions with torch.no_grad(): q_preds = self.critic_ensemble(observations, actions, return_type="mean") @@ -309,8 +333,8 @@ class Policy(nn.Module): network: nn.Module, action_dim: int, std_parameterization: str = "exp", - std_min: float = 1e-5, - std_max: float = 10.0, + std_min: float = 0.05, + std_max: float = 2.0, tanh_squash_distribution: bool = False, fixed_std: Optional[torch.Tensor] = None, init_final: Optional[float] = None, @@ -372,6 +396,7 @@ class Policy(nn.Module): obs_enc = self.encoder(observations, train=train) else: obs_enc = observations + # Get network outputs outputs = self.network(obs_enc) means = self.mean_layer(outputs) @@ -380,18 +405,22 @@ class Policy(nn.Module): if self.fixed_std is None: if self.std_parameterization == "exp": log_stds = self.std_layer(outputs) + # Clamp log_stds to prevent too large or small values + log_stds = torch.clamp(log_stds, math.log(self.std_min), math.log(self.std_max)) stds = torch.exp(log_stds) elif self.std_parameterization == "softplus": stds = torch.nn.functional.softplus(self.std_layer(outputs)) + stds = torch.clamp(stds, self.std_min, self.std_max) elif self.std_parameterization == "uniform": - stds = torch.exp(self.log_stds).expand_as(means) + log_stds = torch.clamp(self.log_stds, math.log(self.std_min), math.log(self.std_max)) + stds = torch.exp(log_stds).expand_as(means) else: raise ValueError(f"Invalid std_parameterization: {self.std_parameterization}") else: assert self.std_parameterization == "fixed" stds = self.fixed_std.expand_as(means) - # Clip standard deviations and scale with temperature + # Scale with temperature temperature = torch.tensor(temperature, device=self.device) stds = torch.clamp(stds, self.std_min, self.std_max) * torch.sqrt(temperature) From 22fbc9ea4a8b7d168f8227b463f9270b897fed56 Mon Sep 17 00:00:00 2001 From: KeWang1017 Date: Sat, 28 Dec 2024 22:11:34 +0000 Subject: [PATCH 022/112] Refine SAC configuration and policy for enhanced performance - Updated standard deviation parameterization in SACConfig to 'softplus' with defined min and max values for improved stability. - Modified action sampling in SACPolicy to use reparameterized sampling, ensuring better gradient flow and log probability calculations. - Cleaned up log probability calculations in TanhMultivariateNormalDiag for clarity and efficiency. - Increased evaluation frequency in YAML configuration to 50000 for more efficient training cycles. These changes aim to enhance the robustness and performance of the SAC implementation during training and inference. 
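The log-probability cleanup referenced above follows from the change of variables for a tanh squash: if a = tanh(u) with u drawn from N(mu, sigma), then log p(a) = log N(u; mu, sigma) - sum_i log(1 - tanh(u_i)^2). A minimal sketch of that correction is shown below; the 1e-6 epsilon is only a numerical guard added for the example and is not taken from the repository.

```python
# Sketch of the tanh change-of-variables correction used by squashed Gaussian policies.
# Assumption: actions live in [-1, 1]; the epsilon below exists only for numerical safety.
import torch


def sample_squashed_gaussian(means: torch.Tensor, stds: torch.Tensor):
    """Return tanh-squashed actions and their log-probabilities."""
    base = torch.distributions.Normal(means, stds)
    u = base.rsample()                      # pre-squash sample, reparameterized
    actions = torch.tanh(u)                 # squash into [-1, 1]
    # log p(a) = log N(u) - sum_i log(1 - tanh(u_i)^2)
    log_probs = base.log_prob(u) - torch.log(1.0 - actions.pow(2) + 1e-6)
    return actions, log_probs.sum(dim=-1)


if __name__ == "__main__":
    means = torch.zeros(8, 4)
    stds = torch.full((8, 4), 0.5)
    actions, log_probs = sample_squashed_gaussian(means, stds)
    assert actions.abs().max() <= 1.0
    print(actions.shape, log_probs.shape)   # torch.Size([8, 4]) torch.Size([8])
```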
--- .../common/policies/sac/configuration_sac.py | 12 ++-- lerobot/common/policies/sac/modeling_sac.py | 56 +++++++++---------- .../configs/policy/sac_pusht_keypoints.yaml | 2 +- 3 files changed, 31 insertions(+), 39 deletions(-) diff --git a/lerobot/common/policies/sac/configuration_sac.py b/lerobot/common/policies/sac/configuration_sac.py index 6df94761..7a4bd364 100644 --- a/lerobot/common/policies/sac/configuration_sac.py +++ b/lerobot/common/policies/sac/configuration_sac.py @@ -59,14 +59,10 @@ class SACConfig: "activate_final": True, } policy_kwargs = { - "tanh_squash_distribution": True, - "std_parameterization": "uniform", - } - - input_shapes: dict[str, list[int]] = field( - default_factory=lambda: { - "observation.image": [3, 84, 84], - "observation.state": [4], + "tanh_squash_distribution": True, + "std_parameterization": "softplus", + "std_min": 0.005, + "std_max": 5.0, } ) output_shapes: dict[str, list[int]] = field( diff --git a/lerobot/common/policies/sac/modeling_sac.py b/lerobot/common/policies/sac/modeling_sac.py index 821cb93f..806cb767 100644 --- a/lerobot/common/policies/sac/modeling_sac.py +++ b/lerobot/common/policies/sac/modeling_sac.py @@ -134,7 +134,6 @@ class SACPolicy( # 1- compute actions from policy distribution = self.actor(observations) action_preds = distribution.sample() - log_probs = distribution.log_prob(action_preds) action_preds = torch.clamp(action_preds, -1, +1) # 2- compute q targets q_targets = self.target_qs(next_observations, action_preds) @@ -186,15 +185,11 @@ class SACPolicy( temperature = self.temperature() # 2- get actions (batch_size, action_dim) and log probs (batch_size,) -<<<<<<< HEAD - actions, log_probs = self.actor(observations) \ - -======= distribution = self.actor(observations) - actions = distribution.sample() - log_probs = distribution.log_prob(actions) + actions = distribution.rsample() + log_probs = distribution.log_prob(actions).sum(-1) + # breakpoint() actions = torch.clamp(actions, -1, +1) ->>>>>>> d3c62b92 (Refactor SACPolicy for improved action sampling and standard deviation handling) # 3- get q-value predictions with torch.no_grad(): q_preds = self.critic_ensemble(observations, actions, return_type="mean") @@ -610,7 +605,7 @@ class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): """ Reparameterized sample from the distribution """ - # Sample from base distribution + # Sample from base distributionrsample x = self.base_dist.rsample(sample_shape) # Apply transforms @@ -625,17 +620,18 @@ class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): Includes the log det jacobian for the transforms """ # Initialize log prob - log_prob = torch.zeros_like(value[..., 0]) - + log_prob = torch.zeros_like(value) + # Inverse transforms to get back to normal distribution q = value for transform in reversed(self.transforms): - q = transform.inv(q) - log_prob = log_prob - transform.log_abs_det_jacobian(q, transform(q)) - + q_prev = transform.inv(q) # Get the pre-transform value + log_prob = log_prob - transform.log_abs_det_jacobian(q_prev, q) # Sum over action dimensions + q = q_prev + # Add base distribution log prob - log_prob = log_prob + self.base_dist.log_prob(q).sum(-1) - + log_prob = log_prob + self.base_dist.log_prob(q) # Sum over action dimensions + return log_prob def sample_and_log_prob(self, sample_shape=DEFAULT_SAMPLE_SHAPE) -> Tuple[torch.Tensor, torch.Tensor]: @@ -646,20 +642,20 @@ class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): log_prob = 
self.log_prob(x) return x, log_prob - def entropy(self) -> torch.Tensor: - """ - Compute entropy of the distribution - """ - # Start with base distribution entropy - entropy = self.base_dist.entropy().sum(-1) - - # Add log det jacobian for each transform - x = self.rsample() - for transform in self.transforms: - entropy = entropy + transform.log_abs_det_jacobian(x, transform(x)) - x = transform(x) - - return entropy + # def entropy(self) -> torch.Tensor: + # """ + # Compute entropy of the distribution + # """ + # # Start with base distribution entropy + # entropy = self.base_dist.entropy().sum(-1) + + # # Add log det jacobian for each transform + # x = self.rsample() + # for transform in self.transforms: + # entropy = entropy + transform.log_abs_det_jacobian(x, transform(x)) + # x = transform(x) + + # return entropy def create_critic_ensemble(critics: list[nn.Module], num_critics: int, device: str = "cuda") -> nn.ModuleList: diff --git a/lerobot/configs/policy/sac_pusht_keypoints.yaml b/lerobot/configs/policy/sac_pusht_keypoints.yaml index 19af60d4..6d8971a2 100644 --- a/lerobot/configs/policy/sac_pusht_keypoints.yaml +++ b/lerobot/configs/policy/sac_pusht_keypoints.yaml @@ -19,7 +19,7 @@ training: grad_clip_norm: 10.0 lr: 3e-4 - eval_freq: 10000 + eval_freq: 50000 log_freq: 500 save_freq: 50000 From 5b4adc00bb3da018cf10cbde6e120fd5e890c179 Mon Sep 17 00:00:00 2001 From: KeWang1017 Date: Sun, 29 Dec 2024 12:30:39 +0000 Subject: [PATCH 023/112] Refactor SAC configuration and policy for improved action sampling and stability - Updated SACConfig to replace standard deviation parameterization with log_std_min and log_std_max for better control over action distributions. - Modified SACPolicy to streamline action selection and log probability calculations, enhancing stochastic behavior. - Removed deprecated TanhMultivariateNormalDiag class to simplify the codebase and improve maintainability. These changes aim to enhance the robustness and performance of the SAC implementation during training and inference. 
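With the distribution class removed, the actor is expected to hand back `(actions, log_probs)` directly, which is all the SAC objectives need. A hedged sketch of how that pair feeds the usual actor and temperature losses follows; `toy_actor`, `toy_critic`, `log_alpha`, and `target_entropy` are placeholders invented for the example, not the repository's API.

```python
# Sketch of SAC actor/temperature losses given an actor that returns (actions, log_probs).
# The actor/critic callables and target_entropy below are illustrative placeholders.
import torch


def sac_actor_and_temperature_losses(actor, critic, log_alpha, observations, target_entropy):
    actions, log_probs = actor(observations)           # (B, action_dim), (B,)
    alpha = log_alpha.exp().detach()                    # temperature, detached for the actor update

    # Actor loss: maximize Q while keeping entropy high (minimize alpha*log_pi - Q).
    q_values = critic(observations, actions)            # (B,)
    actor_loss = (alpha * log_probs - q_values).mean()

    # Temperature loss: drive the policy entropy toward target_entropy.
    temperature_loss = -(log_alpha.exp() * (log_probs.detach() + target_entropy)).mean()
    return actor_loss, temperature_loss


if __name__ == "__main__":
    def toy_actor(obs):
        dist = torch.distributions.Normal(torch.zeros(obs.shape[0], 2), torch.ones(obs.shape[0], 2))
        u = dist.rsample()
        a = torch.tanh(u)
        logp = (dist.log_prob(u) - torch.log(1 - a.pow(2) + 1e-6)).sum(-1)
        return a, logp

    def toy_critic(obs, act):
        return -(act ** 2).sum(-1)                       # stand-in Q function for the example

    log_alpha = torch.zeros((), requires_grad=True)
    obs = torch.randn(8, 4)
    a_loss, t_loss = sac_actor_and_temperature_losses(
        toy_actor, toy_critic, log_alpha, obs, target_entropy=-2.0
    )
    print(float(a_loss), float(t_loss))
```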
--- .../common/policies/sac/configuration_sac.py | 27 +- lerobot/common/policies/sac/modeling_sac.py | 233 +++--------------- 2 files changed, 43 insertions(+), 217 deletions(-) diff --git a/lerobot/common/policies/sac/configuration_sac.py b/lerobot/common/policies/sac/configuration_sac.py index 7a4bd364..52c564a6 100644 --- a/lerobot/common/policies/sac/configuration_sac.py +++ b/lerobot/common/policies/sac/configuration_sac.py @@ -53,30 +53,13 @@ class SACConfig: critic_network_kwargs = { "hidden_dims": [256, 256], "activate_final": True, - } + } actor_network_kwargs = { "hidden_dims": [256, 256], "activate_final": True, - } + } policy_kwargs = { - "tanh_squash_distribution": True, - "std_parameterization": "softplus", - "std_min": 0.005, - "std_max": 5.0, + "use_tanh_squash": True, + "log_std_min": -5, + "log_std_max": 2, } - ) - output_shapes: dict[str, list[int]] = field( - default_factory=lambda: { - "action": [4], - } - ) - - state_encoder_hidden_dim: int = 256 - latent_dim: int = 256 - network_hidden_dims: int = 256 - - # Normalization / Unnormalization - input_normalization_modes: dict[str, str] | None = None - output_normalization_modes: dict[str, str] = field( - default_factory=lambda: {"action": "min_max"}, - ) diff --git a/lerobot/common/policies/sac/modeling_sac.py b/lerobot/common/policies/sac/modeling_sac.py index 806cb767..1e7fd92b 100644 --- a/lerobot/common/policies/sac/modeling_sac.py +++ b/lerobot/common/policies/sac/modeling_sac.py @@ -102,9 +102,7 @@ class SACPolicy( @torch.no_grad() def select_action(self, batch: dict[str, Tensor]) -> Tensor: """Select action for inference/evaluation""" - distribution = self.actor(batch) - # Sample from the distribution and return just the actions - actions = distribution.mode() # or distribution.sample() for stochastic actions + actions, _ = self.actor(batch) actions = self.unnormalize_outputs({"action": actions})["action"] return actions @@ -129,12 +127,11 @@ class SACPolicy( # reward bias from HIL-SERL code base # add_or_replace={"rewards": batch["rewards"] + self.config["reward_bias"]} in reward_batch - + # calculate critics loss # 1- compute actions from policy - distribution = self.actor(observations) - action_preds = distribution.sample() - action_preds = torch.clamp(action_preds, -1, +1) + action_preds, log_probs = self.actor(next_observations) + # 2- compute q targets q_targets = self.target_qs(next_observations, action_preds) # subsample critics to prevent overfitting if use high UTD (update to date) @@ -147,7 +144,7 @@ class SACPolicy( min_q = q_targets.min(dim=0) # compute td target - td_target = rewards + self.discount * min_q + td_target = rewards + self.config.discount * min_q #+ self.config.discount * self.temperature() * log_probs # add entropy term # 3- compute predicted qs q_preds = self.critic_ensemble(observations, actions) @@ -178,18 +175,12 @@ class SACPolicy( einops.repeat(td_target, "b -> e b", e=q_preds.shape[0]), # expand td_target to match q_preds shape reduction="none" ).sum(0).mean() - # breakpoint() # calculate actors loss # 1- temperature temperature = self.temperature() - # 2- get actions (batch_size, action_dim) and log probs (batch_size,) - distribution = self.actor(observations) - actions = distribution.rsample() - log_probs = distribution.log_prob(actions).sum(-1) - # breakpoint() - actions = torch.clamp(actions, -1, +1) + actions, log_probs = self.actor(observations) # 3- get q-value predictions with torch.no_grad(): q_preds = self.critic_ensemble(observations, actions, return_type="mean") @@ 
-264,15 +255,13 @@ class Critic(nn.Module): encoder: Optional[nn.Module], network: nn.Module, init_final: Optional[float] = None, - activate_final: bool = False, - device: str = "cuda", + device: str = "cuda" ): super().__init__() self.device = torch.device(device) self.encoder = encoder self.network = network self.init_final = init_final - self.activate_final = activate_final # Find the last Linear layer's output dimension for layer in reversed(network.net): @@ -304,22 +293,6 @@ class Critic(nn.Module): value = self.output_layer(x) return value.squeeze(-1) - def q_value_ensemble( - self, observations: torch.Tensor, actions: torch.Tensor, train: bool = False - ) -> torch.Tensor: - observations = observations.to(self.device) - actions = actions.to(self.device) - - if len(actions.shape) == 3: # [batch_size, num_actions, action_dim] - batch_size, num_actions = actions.shape[:2] - obs_expanded = observations.unsqueeze(1).expand(-1, num_actions, -1) - obs_flat = obs_expanded.reshape(-1, observations.shape[-1]) - actions_flat = actions.reshape(-1, actions.shape[-1]) - q_values = self(obs_flat, actions_flat, train) - return q_values.reshape(batch_size, num_actions) - else: - return self(observations, actions, train) - class Policy(nn.Module): def __init__( @@ -327,26 +300,22 @@ class Policy(nn.Module): encoder: Optional[nn.Module], network: nn.Module, action_dim: int, - std_parameterization: str = "exp", - std_min: float = 0.05, - std_max: float = 2.0, - tanh_squash_distribution: bool = False, + log_std_min: float = -5, + log_std_max: float = 2, fixed_std: Optional[torch.Tensor] = None, init_final: Optional[float] = None, - activate_final: bool = False, - device: str = "cuda", + use_tanh_squash: bool = False, + device: str = "cuda" ): super().__init__() self.device = torch.device(device) self.encoder = encoder self.network = network self.action_dim = action_dim - self.std_parameterization = std_parameterization - self.std_min = std_min - self.std_max = std_max - self.tanh_squash_distribution = tanh_squash_distribution + self.log_std_min = log_std_min + self.log_std_max = log_std_max self.fixed_std = fixed_std.to(self.device) if fixed_std is not None else None - self.activate_final = activate_final + self.use_tanh_squash = use_tanh_squash # Find the last Linear layer's output dimension for layer in reversed(network.net): @@ -364,27 +333,20 @@ class Policy(nn.Module): # Standard deviation layer or parameter if fixed_std is None: - if std_parameterization == "uniform": - self.log_stds = nn.Parameter(torch.zeros(action_dim, device=self.device)) + self.std_layer = nn.Linear(out_features, action_dim) + if init_final is not None: + nn.init.uniform_(self.std_layer.weight, -init_final, init_final) + nn.init.uniform_(self.std_layer.bias, -init_final, init_final) else: - self.std_layer = nn.Linear(out_features, action_dim) - if init_final is not None: - nn.init.uniform_(self.std_layer.weight, -init_final, init_final) - nn.init.uniform_(self.std_layer.bias, -init_final, init_final) - else: - orthogonal_init()(self.std_layer.weight) - + orthogonal_init()(self.std_layer.weight) + self.to(self.device) def forward( self, observations: torch.Tensor, - temperature: float = 1.0, - train: bool = False, - non_squash_distribution: bool = False, - ) -> torch.distributions.Distribution: - self.train(train) - + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Encode observations if encoder exists if self.encoder is not None: with torch.set_grad_enabled(train): @@ -398,41 +360,24 @@ class Policy(nn.Module): # Compute standard 
deviations if self.fixed_std is None: - if self.std_parameterization == "exp": - log_stds = self.std_layer(outputs) - # Clamp log_stds to prevent too large or small values - log_stds = torch.clamp(log_stds, math.log(self.std_min), math.log(self.std_max)) - stds = torch.exp(log_stds) - elif self.std_parameterization == "softplus": - stds = torch.nn.functional.softplus(self.std_layer(outputs)) - stds = torch.clamp(stds, self.std_min, self.std_max) - elif self.std_parameterization == "uniform": - log_stds = torch.clamp(self.log_stds, math.log(self.std_min), math.log(self.std_max)) - stds = torch.exp(log_stds).expand_as(means) - else: - raise ValueError(f"Invalid std_parameterization: {self.std_parameterization}") + log_std = self.std_layer(outputs) + if self.use_tanh_squash: + log_std = torch.tanh(log_std) + log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) else: - assert self.std_parameterization == "fixed" stds = self.fixed_std.expand_as(means) - # Scale with temperature - temperature = torch.tensor(temperature, device=self.device) - stds = torch.clamp(stds, self.std_min, self.std_max) * torch.sqrt(temperature) - - # Create distribution - if self.tanh_squash_distribution and not non_squash_distribution: - distribution = TanhMultivariateNormalDiag( - loc=means, - scale_diag=stds, - ) - else: - distribution = torch.distributions.Normal( - loc=means, - scale=stds, - ) - - return distribution + # uses tahn activation function to squash the action to be in the range of [-1, 1] + normal = torch.distributions.Normal(means, stds) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + log_probs = normal.log_prob(x_t) + if self.use_tanh_squash: + actions = torch.tanh(x_t) + log_probs -= torch.log((1 - actions.pow(2)) + 1e-6) + log_probs = log_probs.sum(-1) # sum over action dim + return actions, log_probs + def get_features(self, observations: torch.Tensor) -> torch.Tensor: """Get encoded features from observations""" observations = observations.to(self.device) @@ -552,110 +497,8 @@ class LagrangeMultiplier(nn.Module): return multiplier * diff -# The TanhMultivariateNormalDiag is a probability distribution that represents a transformed normal (Gaussian) distribution where: -# 1. The base distribution is a diagonal multivariate normal distribution -# 2. The samples from this normal distribution are transformed through a tanh function, which squashes the values to be between -1 and 1 -# 3. 
Optionally, the values can be further transformed to fit within arbitrary bounds [low, high] using an affine transformation -# This type of distribution is commonly used in reinforcement learning, particularly for continuous action spaces -class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution): - DEFAULT_SAMPLE_SHAPE = torch.Size() - - def __init__( - self, - loc: torch.Tensor, - scale_diag: torch.Tensor, - low: Optional[torch.Tensor] = None, - high: Optional[torch.Tensor] = None, - ): - # Create base normal distribution - base_distribution = torch.distributions.Normal(loc=loc, scale=scale_diag) - - # Create list of transforms - transforms = [] - - # Add tanh transform - transforms.append(torch.distributions.transforms.TanhTransform()) - - # Add rescaling transform if bounds are provided - if low is not None and high is not None: - transforms.append( - torch.distributions.transforms.AffineTransform(loc=(high + low) / 2, scale=(high - low) / 2) - ) - - # Initialize parent class - super().__init__(base_distribution=base_distribution, transforms=transforms) - - # Store parameters - self.loc = loc - self.scale_diag = scale_diag - self.low = low - self.high = high - - def mode(self) -> torch.Tensor: - """Get the mode of the transformed distribution""" - # The mode of a normal distribution is its mean - mode = self.loc - # Apply transforms - for transform in self.transforms: - mode = transform(mode) - - return mode - - def rsample(self, sample_shape=DEFAULT_SAMPLE_SHAPE) -> torch.Tensor: - """ - Reparameterized sample from the distribution - """ - # Sample from base distributionrsample - x = self.base_dist.rsample(sample_shape) - - # Apply transforms - for transform in self.transforms: - x = transform(x) - - return x - - def log_prob(self, value: torch.Tensor) -> torch.Tensor: - """ - Compute log probability of a value - Includes the log det jacobian for the transforms - """ - # Initialize log prob - log_prob = torch.zeros_like(value) - - # Inverse transforms to get back to normal distribution - q = value - for transform in reversed(self.transforms): - q_prev = transform.inv(q) # Get the pre-transform value - log_prob = log_prob - transform.log_abs_det_jacobian(q_prev, q) # Sum over action dimensions - q = q_prev - - # Add base distribution log prob - log_prob = log_prob + self.base_dist.log_prob(q) # Sum over action dimensions - - return log_prob - - def sample_and_log_prob(self, sample_shape=DEFAULT_SAMPLE_SHAPE) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Sample from the distribution and compute log probability - """ - x = self.rsample(sample_shape) - log_prob = self.log_prob(x) - return x, log_prob - - # def entropy(self) -> torch.Tensor: - # """ - # Compute entropy of the distribution - # """ - # # Start with base distribution entropy - # entropy = self.base_dist.entropy().sum(-1) - - # # Add log det jacobian for each transform - # x = self.rsample() - # for transform in self.transforms: - # entropy = entropy + transform.log_abs_det_jacobian(x, transform(x)) - # x = transform(x) - - # return entropy +def orthogonal_init(): + return lambda x: torch.nn.init.orthogonal_(x, gain=1.0) def create_critic_ensemble(critics: list[nn.Module], num_critics: int, device: str = "cuda") -> nn.ModuleList: From bae3b02928c7de4d3243eb3fead4c67f236ee167 Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Sun, 29 Dec 2024 14:35:21 +0000 Subject: [PATCH 024/112] style fixes --- .../common/policies/sac/configuration_sac.py | 6 +- lerobot/common/policies/sac/modeling_sac.py | 70 
+++++++++---------- 2 files changed, 37 insertions(+), 39 deletions(-) diff --git a/lerobot/common/policies/sac/configuration_sac.py b/lerobot/common/policies/sac/configuration_sac.py index 52c564a6..a324294c 100644 --- a/lerobot/common/policies/sac/configuration_sac.py +++ b/lerobot/common/policies/sac/configuration_sac.py @@ -53,13 +53,13 @@ class SACConfig: critic_network_kwargs = { "hidden_dims": [256, 256], "activate_final": True, - } + } actor_network_kwargs = { "hidden_dims": [256, 256], "activate_final": True, - } + } policy_kwargs = { "use_tanh_squash": True, "log_std_min": -5, "log_std_max": 2, - } + } diff --git a/lerobot/common/policies/sac/modeling_sac.py b/lerobot/common/policies/sac/modeling_sac.py index 1e7fd92b..9df2c859 100644 --- a/lerobot/common/policies/sac/modeling_sac.py +++ b/lerobot/common/policies/sac/modeling_sac.py @@ -19,7 +19,6 @@ from collections import deque from copy import deepcopy -import math from typing import Callable, Optional, Sequence, Tuple import einops @@ -125,9 +124,9 @@ class SACPolicy( # perform image augmentation - # reward bias from HIL-SERL code base + # reward bias from HIL-SERL code base # add_or_replace={"rewards": batch["rewards"] + self.config["reward_bias"]} in reward_batch - + # calculate critics loss # 1- compute actions from policy action_preds, log_probs = self.actor(next_observations) @@ -137,21 +136,23 @@ class SACPolicy( # subsample critics to prevent overfitting if use high UTD (update to date) if self.config.num_subsample_critics is not None: indices = torch.randperm(self.config.num_critics) - indices = indices[:self.config.num_subsample_critics] + indices = indices[: self.config.num_subsample_critics] q_targets = q_targets[indices] # critics subsample size min_q = q_targets.min(dim=0) # compute td target - td_target = rewards + self.config.discount * min_q #+ self.config.discount * self.temperature() * log_probs # add entropy term + td_target = ( + rewards + self.config.discount * min_q + ) # + self.config.discount * self.temperature() * log_probs # add entropy term # 3- compute predicted qs q_preds = self.critic_ensemble(observations, actions) # 4- Calculate loss # Compute state-action value loss (TD loss) for all of the Q functions in the ensemble. - #critics_loss = ( + # critics_loss = ( # ( # F.mse_loss( # q_preds, @@ -167,14 +168,20 @@ class SACPolicy( # ) # .sum(0) # .mean() - #) + # ) # 4- Calculate loss # Compute state-action value loss (TD loss) for all of the Q functions in the ensemble. 
- critics_loss = F.mse_loss( - q_preds, # shape: [num_critics, batch_size] - einops.repeat(td_target, "b -> e b", e=q_preds.shape[0]), # expand td_target to match q_preds shape - reduction="none" - ).sum(0).mean() + critics_loss = ( + F.mse_loss( + q_preds, # shape: [num_critics, batch_size] + einops.repeat( + td_target, "b -> e b", e=q_preds.shape[0] + ), # expand td_target to match q_preds shape + reduction="none", + ) + .sum(0) + .mean() + ) # calculate actors loss # 1- temperature @@ -229,10 +236,10 @@ class MLP(nn.Module): super().__init__() self.activate_final = activate_final layers = [] - + for i, size in enumerate(hidden_dims): - layers.append(nn.Linear(hidden_dims[i-1] if i > 0 else hidden_dims[0], size)) - + layers.append(nn.Linear(hidden_dims[i - 1] if i > 0 else hidden_dims[0], size)) + if i + 1 < len(hidden_dims) or activate_final: if dropout_rate is not None and dropout_rate > 0: layers.append(nn.Dropout(p=dropout_rate)) @@ -255,20 +262,20 @@ class Critic(nn.Module): encoder: Optional[nn.Module], network: nn.Module, init_final: Optional[float] = None, - device: str = "cuda" + device: str = "cuda", ): super().__init__() self.device = torch.device(device) self.encoder = encoder self.network = network self.init_final = init_final - + # Find the last Linear layer's output dimension for layer in reversed(network.net): if isinstance(layer, nn.Linear): out_features = layer.out_features break - + # Output layer if init_final is not None: self.output_layer = nn.Linear(out_features, 1) @@ -305,7 +312,7 @@ class Policy(nn.Module): fixed_std: Optional[torch.Tensor] = None, init_final: Optional[float] = None, use_tanh_squash: bool = False, - device: str = "cuda" + device: str = "cuda", ): super().__init__() self.device = torch.device(device) @@ -316,13 +323,13 @@ class Policy(nn.Module): self.log_std_max = log_std_max self.fixed_std = fixed_std.to(self.device) if fixed_std is not None else None self.use_tanh_squash = use_tanh_squash - + # Find the last Linear layer's output dimension for layer in reversed(network.net): if isinstance(layer, nn.Linear): out_features = layer.out_features break - + # Mean layer self.mean_layer = nn.Linear(out_features, action_dim) if init_final is not None: @@ -339,21 +346,16 @@ class Policy(nn.Module): nn.init.uniform_(self.std_layer.bias, -init_final, init_final) else: orthogonal_init()(self.std_layer.weight) - + self.to(self.device) def forward( self, observations: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor]: - # Encode observations if encoder exists - if self.encoder is not None: - with torch.set_grad_enabled(train): - obs_enc = self.encoder(observations, train=train) - else: - obs_enc = observations - + obs_enc = observations if self.encoder is not None else self.encoder(observations) + # Get network outputs outputs = self.network(obs_enc) means = self.mean_layer(outputs) @@ -369,15 +371,15 @@ class Policy(nn.Module): # uses tahn activation function to squash the action to be in the range of [-1, 1] normal = torch.distributions.Normal(means, stds) - x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) log_probs = normal.log_prob(x_t) if self.use_tanh_squash: actions = torch.tanh(x_t) log_probs -= torch.log((1 - actions.pow(2)) + 1e-6) - log_probs = log_probs.sum(-1) # sum over action dim + log_probs = log_probs.sum(-1) # sum over action dim return actions, log_probs - + def get_features(self, observations: torch.Tensor) -> torch.Tensor: """Get 
encoded features from observations""" observations = observations.to(self.device) @@ -507,10 +509,6 @@ def create_critic_ensemble(critics: list[nn.Module], num_critics: int, device: s return nn.ModuleList(critics).to(device) -def orthogonal_init(): - return lambda x: torch.nn.init.orthogonal_(x, gain=1.0) - - # borrowed from tdmpc def flatten_forward_unflatten(fn: Callable[[Tensor], Tensor], image_tensor: Tensor) -> Tensor: """Helper to temporarily flatten extra dims at the start of the image tensor. From ee306e2f9b5bdfb5abebcb0228334536f260817d Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Sun, 29 Dec 2024 23:59:39 +0000 Subject: [PATCH 025/112] split encoder for critic and actor --- .../common/policies/sac/configuration_sac.py | 2 +- lerobot/common/policies/sac/modeling_sac.py | 306 ++++++++++-------- 2 files changed, 177 insertions(+), 131 deletions(-) diff --git a/lerobot/common/policies/sac/configuration_sac.py b/lerobot/common/policies/sac/configuration_sac.py index a324294c..5f676933 100644 --- a/lerobot/common/policies/sac/configuration_sac.py +++ b/lerobot/common/policies/sac/configuration_sac.py @@ -48,7 +48,7 @@ class SACConfig: critic_target_update_weight = 0.005 utd_ratio = 2 state_encoder_hidden_dim = 256 - latent_dim = 50 + latent_dim = 128 target_entropy = None critic_network_kwargs = { "hidden_dims": [256, 256], diff --git a/lerobot/common/policies/sac/modeling_sac.py b/lerobot/common/policies/sac/modeling_sac.py index 9df2c859..bd77408e 100644 --- a/lerobot/common/policies/sac/modeling_sac.py +++ b/lerobot/common/policies/sac/modeling_sac.py @@ -63,25 +63,35 @@ class SACPolicy( self.unnormalize_outputs = Unnormalize( config.output_shapes, config.output_normalization_modes, dataset_stats ) - encoder = SACObservationEncoder(config) + encoder_critic = SACObservationEncoder(config) + encoder_actor = SACObservationEncoder(config) # Define networks critic_nets = [] for _ in range(config.num_critics): - critic_net = Critic(encoder=encoder, network=MLP(**config.critic_network_kwargs)) + critic_net = Critic( + encoder=encoder_critic, + network=MLP( + input_dim=encoder_critic.output_dim + config.output_shapes["action"][0], + **config.critic_network_kwargs + ) + ) critic_nets.append(critic_net) self.critic_ensemble = create_critic_ensemble(critic_nets, config.num_critics) self.critic_target = deepcopy(self.critic_ensemble) self.actor = Policy( - encoder=encoder, - network=MLP(**config.actor_network_kwargs), + encoder=encoder_actor, + network=MLP( + input_dim=encoder_actor.output_dim, + **config.actor_network_kwargs + ), action_dim=config.output_shapes["action"][0], - **config.policy_kwargs, + **config.policy_kwargs ) if config.target_entropy is None: - config.target_entropy = -np.prod(config.output_shapes["action"][0]) # (-dim(A)) - self.temperature = LagrangeMultiplier(init_value=config.temperature_init) + config.target_entropy = -np.prod(config.output_shapes["action"][0]) # (-dim(A)) + self.temperature = LagrangeMultiplier(init_value=config.temperature_init) def reset(self): """ @@ -104,15 +114,31 @@ class SACPolicy( actions, _ = self.actor(batch) actions = self.unnormalize_outputs({"action": actions})["action"] return actions + + def critic_forward(self, observations: dict[str, Tensor], actions: Tensor, use_target: bool = False) -> Tensor: + """Forward pass through a critic network ensemble + + Args: + observations: Dictionary of observations + actions: Action tensor + use_target: If True, use target critics, otherwise use ensemble critics + + Returns: + Tensor of 
Q-values from all critics + """ + critics = self.critic_target if use_target else self.critic_ensemble + q_values = torch.stack([critic(observations, actions) for critic in critics]) + return q_values + def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor | float]: """Run the batch through the model and compute the loss. - + Returns a dictionary with loss as a tensor, and other information as native floats. """ batch = self.normalize_inputs(batch) - # batch shape is (b, 2, ...) where index 1 returns the current observation and - # the next observation for caluculating the right td index. + # batch shape is (b, 2, ...) where index 1 returns the current observation and + # the next observation for calculating the right td index. actions = batch["action"][:, 0] rewards = batch["next.reward"][:, 0] observations = {} @@ -121,113 +147,109 @@ class SACPolicy( if k.startswith("observation."): observations[k] = batch[k][:, 0] next_observations[k] = batch[k][:, 1] - + # perform image augmentation - # reward bias from HIL-SERL code base + # reward bias from HIL-SERL code base # add_or_replace={"rewards": batch["rewards"] + self.config["reward_bias"]} in reward_batch - + # calculate critics loss # 1- compute actions from policy action_preds, log_probs = self.actor(next_observations) # 2- compute q targets - q_targets = self.target_qs(next_observations, action_preds) + q_targets = self.critic_forward(next_observations, action_preds, use_target=True) + # subsample critics to prevent overfitting if use high UTD (update to date) if self.config.num_subsample_critics is not None: indices = torch.randperm(self.config.num_critics) - indices = indices[: self.config.num_subsample_critics] + indices = indices[:self.config.num_subsample_critics] q_targets = q_targets[indices] # critics subsample size - min_q = q_targets.min(dim=0) + min_q, _ = q_targets.min(dim=0) # Get values from min operation # compute td target - td_target = ( - rewards + self.config.discount * min_q - ) # + self.config.discount * self.temperature() * log_probs # add entropy term + td_target = rewards + self.config.discount * min_q #+ self.config.discount * self.temperature() * log_probs # add entropy term # 3- compute predicted qs - q_preds = self.critic_ensemble(observations, actions) + q_preds = self.critic_forward(observations, actions, use_target=False) # 4- Calculate loss # Compute state-action value loss (TD loss) for all of the Q functions in the ensemble. - # critics_loss = ( - # ( - # F.mse_loss( - # q_preds, - # einops.repeat(td_target, "t b -> e t b", e=q_preds.shape[0]), - # reduction="none", - # ).sum(0) # sum over ensemble - # # `q_preds_ensemble` depends on the first observation and the actions. - # * ~batch["observation.state_is_pad"][0] - # * ~batch["action_is_pad"] - # # q_targets depends on the reward and the next observations. - # * ~batch["next.reward_is_pad"] - # * ~batch["observation.state_is_pad"][1:] - # ) - # .sum(0) - # .mean() - # ) - # 4- Calculate loss - # Compute state-action value loss (TD loss) for all of the Q functions in the ensemble. 
- critics_loss = ( - F.mse_loss( - q_preds, # shape: [num_critics, batch_size] - einops.repeat( - td_target, "b -> e b", e=q_preds.shape[0] - ), # expand td_target to match q_preds shape - reduction="none", - ) - .sum(0) - .mean() - ) + critics_loss = F.mse_loss( + q_preds, # shape: [num_critics, batch_size] + einops.repeat(td_target, "b -> e b", e=q_preds.shape[0]), # expand td_target to match q_preds shape + reduction="none" + ).sum(0).mean() + # critics_loss = ( + # F.mse_loss( + # q_preds, + # einops.repeat(td_target, "b -> e b", e=q_preds.shape[0]), + # reduction="none", + # ).sum(0) # sum over ensemble + # # `q_preds_ensemble` depends on the first observation and the actions. + # * ~batch["observation.state_is_pad"][0] + # * ~batch["action_is_pad"] + # # q_targets depends on the reward and the next observations. + # * ~batch["next.reward_is_pad"] + # * ~batch["observation.state_is_pad"][1:] + # ).sum(0).mean() + # calculate actors loss # 1- temperature temperature = self.temperature() # 2- get actions (batch_size, action_dim) and log probs (batch_size,) actions, log_probs = self.actor(observations) # 3- get q-value predictions - with torch.no_grad(): - q_preds = self.critic_ensemble(observations, actions, return_type="mean") + with torch.inference_mode(): + q_preds = self.critic_forward(observations, actions, use_target=False) actor_loss = ( -(q_preds - temperature * log_probs).mean() - * ~batch["observation.state_is_pad"][0] - * ~batch["action_is_pad"] + # * ~batch["observation.state_is_pad"][0] + # * ~batch["action_is_pad"] ).mean() + # calculate temperature loss # 1- calculate entropy entropy = -log_probs.mean() - temperature_loss = self.temp(lhs=entropy, rhs=self.config.target_entropy) + temperature_loss = self.temperature( + lhs=entropy, + rhs=self.config.target_entropy + ) loss = critics_loss + actor_loss + temperature_loss return { - "critics_loss": critics_loss.item(), - "actor_loss": actor_loss.item(), - "temperature_loss": temperature_loss.item(), - "temperature": temperature.item(), - "entropy": entropy.item(), - "loss": loss, - } - + "critics_loss": critics_loss.item(), + "actor_loss": actor_loss.item(), + "temperature_loss": temperature_loss.item(), + "temperature": temperature.item(), + "entropy": entropy.item(), + "loss": loss, + } + def update(self): - self.critic_target.lerp_(self.critic_ensemble, self.config.critic_target_update_weight) # TODO: implement UTD update # First update only critics for utd_ratio-1 times - # for critic_step in range(self.config.utd_ratio - 1): - # only update critic and critic target + #for critic_step in range(self.config.utd_ratio - 1): + # only update critic and critic target # Then update critic, critic target, actor and temperature - - # for target_param, param in zip(self.critic_target.parameters(), self.critic_ensemble.parameters()): - # target_param.data.copy_(target_param.data * (1.0 - self.config.critic_target_update_weight) + param.data * self.critic_target_update_weight) - - + """Update target networks with exponential moving average""" + with torch.no_grad(): + for target_critic, critic in zip(self.critic_target, self.critic_ensemble, strict=False): + for target_param, param in zip(target_critic.parameters(), critic.parameters(), strict=False): + target_param.data.copy_( + target_param.data * self.config.critic_target_update_weight + + param.data * (1.0 - self.config.critic_target_update_weight) + ) + class MLP(nn.Module): def __init__( self, + input_dim: int, hidden_dims: list[int], activations: Callable[[torch.Tensor], 
torch.Tensor] | str = nn.SiLU(), activate_final: bool = False, @@ -236,46 +258,52 @@ class MLP(nn.Module): super().__init__() self.activate_final = activate_final layers = [] - - for i, size in enumerate(hidden_dims): - layers.append(nn.Linear(hidden_dims[i - 1] if i > 0 else hidden_dims[0], size)) - + + # First layer uses input_dim + layers.append(nn.Linear(input_dim, hidden_dims[0])) + + # Add activation after first layer + if dropout_rate is not None and dropout_rate > 0: + layers.append(nn.Dropout(p=dropout_rate)) + layers.append(nn.LayerNorm(hidden_dims[0])) + layers.append(activations if isinstance(activations, nn.Module) else getattr(nn, activations)()) + + # Rest of the layers + for i in range(1, len(hidden_dims)): + layers.append(nn.Linear(hidden_dims[i-1], hidden_dims[i])) + if i + 1 < len(hidden_dims) or activate_final: if dropout_rate is not None and dropout_rate > 0: layers.append(nn.Dropout(p=dropout_rate)) - layers.append(nn.LayerNorm(size)) - layers.append( - activations if isinstance(activations, nn.Module) else getattr(nn, activations)() - ) - + layers.append(nn.LayerNorm(hidden_dims[i])) + layers.append(activations if isinstance(activations, nn.Module) else getattr(nn, activations)()) + self.net = nn.Sequential(*layers) - def forward(self, x: torch.Tensor, train: bool = False) -> torch.Tensor: - # in training mode or not. TODO: find better way to do this - self.train(train) + def forward(self, x: torch.Tensor) -> torch.Tensor: return self.net(x) - - + + class Critic(nn.Module): def __init__( self, encoder: Optional[nn.Module], network: nn.Module, init_final: Optional[float] = None, - device: str = "cuda", + device: str = "cuda" ): super().__init__() self.device = torch.device(device) self.encoder = encoder self.network = network self.init_final = init_final - + # Find the last Linear layer's output dimension for layer in reversed(network.net): if isinstance(layer, nn.Linear): out_features = layer.out_features break - + # Output layer if init_final is not None: self.output_layer = nn.Linear(out_features, 1) @@ -284,17 +312,22 @@ class Critic(nn.Module): else: self.output_layer = nn.Linear(out_features, 1) orthogonal_init()(self.output_layer.weight) - + self.to(self.device) - def forward(self, observations: torch.Tensor, actions: torch.Tensor, train: bool = False) -> torch.Tensor: - self.train(train) - - observations = observations.to(self.device) + def forward( + self, + observations: dict[str, torch.Tensor], + actions: torch.Tensor, + ) -> torch.Tensor: + # Move each tensor in observations to device + observations = { + k: v.to(self.device) for k, v in observations.items() + } actions = actions.to(self.device) - + obs_enc = observations if self.encoder is None else self.encoder(observations) - + inputs = torch.cat([obs_enc, actions], dim=-1) x = self.network(inputs) value = self.output_layer(x) @@ -312,7 +345,7 @@ class Policy(nn.Module): fixed_std: Optional[torch.Tensor] = None, init_final: Optional[float] = None, use_tanh_squash: bool = False, - device: str = "cuda", + device: str = "cuda" ): super().__init__() self.device = torch.device(device) @@ -323,13 +356,13 @@ class Policy(nn.Module): self.log_std_max = log_std_max self.fixed_std = fixed_std.to(self.device) if fixed_std is not None else None self.use_tanh_squash = use_tanh_squash - + # Find the last Linear layer's output dimension for layer in reversed(network.net): if isinstance(layer, nn.Linear): out_features = layer.out_features break - + # Mean layer self.mean_layer = nn.Linear(out_features, action_dim) if 
init_final is not None: @@ -337,7 +370,7 @@ class Policy(nn.Module): nn.init.uniform_(self.mean_layer.bias, -init_final, init_final) else: orthogonal_init()(self.mean_layer.weight) - + # Standard deviation layer or parameter if fixed_std is None: self.std_layer = nn.Linear(out_features, action_dim) @@ -346,20 +379,21 @@ class Policy(nn.Module): nn.init.uniform_(self.std_layer.bias, -init_final, init_final) else: orthogonal_init()(self.std_layer.weight) - + self.to(self.device) def forward( - self, + self, observations: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor]: + # Encode observations if encoder exists - obs_enc = observations if self.encoder is not None else self.encoder(observations) + obs_enc = observations if self.encoder is None else self.encoder(observations) # Get network outputs outputs = self.network(obs_enc) means = self.mean_layer(outputs) - + # Compute standard deviations if self.fixed_std is None: log_std = self.std_layer(outputs) @@ -367,25 +401,25 @@ class Policy(nn.Module): log_std = torch.tanh(log_std) log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) else: - stds = self.fixed_std.expand_as(means) - + log_std = self.fixed_std.expand_as(means) + # uses tahn activation function to squash the action to be in the range of [-1, 1] - normal = torch.distributions.Normal(means, stds) - x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + normal = torch.distributions.Normal(means, torch.exp(log_std)) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) log_probs = normal.log_prob(x_t) if self.use_tanh_squash: actions = torch.tanh(x_t) log_probs -= torch.log((1 - actions.pow(2)) + 1e-6) - log_probs = log_probs.sum(-1) # sum over action dim + log_probs = log_probs.sum(-1) # sum over action dim return actions, log_probs - + def get_features(self, observations: torch.Tensor) -> torch.Tensor: """Get encoded features from observations""" observations = observations.to(self.device) if self.encoder is not None: - with torch.no_grad(): - return self.encoder(observations, train=False) + with torch.inference_mode(): + return self.encoder(observations) return observations @@ -459,43 +493,56 @@ class SACObservationEncoder(nn.Module): feat.append(self.env_state_enc_layers(obs_dict["observation.environment_state"])) if "observation.state" in self.config.input_shapes: feat.append(self.state_enc_layers(obs_dict["observation.state"])) + # TODO(ke-wang): currently average over all features, concatenate all features maybe a better way return torch.stack(feat, dim=0).mean(0) + + @property + def output_dim(self) -> int: + """Returns the dimension of the encoder output""" + return self.config.latent_dim class LagrangeMultiplier(nn.Module): - def __init__(self, init_value: float = 1.0, constraint_shape: Sequence[int] = (), device: str = "cuda"): + def __init__( + self, + init_value: float = 1.0, + constraint_shape: Sequence[int] = (), + device: str = "cuda" + ): super().__init__() self.device = torch.device(device) init_value = torch.log(torch.exp(torch.tensor(init_value, device=self.device)) - 1) - + # Initialize the Lagrange multiplier as a parameter self.lagrange = nn.Parameter( torch.full(constraint_shape, init_value, dtype=torch.float32, device=self.device) ) - + self.to(self.device) - def forward(self, lhs: Optional[torch.Tensor] = None, rhs: Optional[torch.Tensor] = None) -> torch.Tensor: - # Get the multiplier value based on parameterization + def forward( + self, + lhs: Optional[torch.Tensor | float | int] = None, + 
rhs: Optional[torch.Tensor | float | int] = None + ) -> torch.Tensor: + # Get the multiplier value based on parameterization multiplier = torch.nn.functional.softplus(self.lagrange) - + # Return the raw multiplier if no constraint values provided if lhs is None: return multiplier - - # Move inputs to device - lhs = lhs.to(self.device) + + # Convert inputs to tensors and move to device + lhs = torch.tensor(lhs, device=self.device) if not isinstance(lhs, torch.Tensor) else lhs.to(self.device) if rhs is not None: - rhs = rhs.to(self.device) - - # Use the multiplier to compute the Lagrange penalty - if rhs is None: + rhs = torch.tensor(rhs, device=self.device) if not isinstance(rhs, torch.Tensor) else rhs.to(self.device) + else: rhs = torch.zeros_like(lhs, device=self.device) - + diff = lhs - rhs - + assert diff.shape == multiplier.shape, f"Shape mismatch: {diff.shape} vs {multiplier.shape}" - + return multiplier * diff @@ -508,7 +555,6 @@ def create_critic_ensemble(critics: list[nn.Module], num_critics: int, device: s assert len(critics) == num_critics, f"Expected {num_critics} critics, got {len(critics)}" return nn.ModuleList(critics).to(device) - # borrowed from tdmpc def flatten_forward_unflatten(fn: Callable[[Tensor], Tensor], image_tensor: Tensor) -> Tensor: """Helper to temporarily flatten extra dims at the start of the image tensor. @@ -516,7 +562,7 @@ def flatten_forward_unflatten(fn: Callable[[Tensor], Tensor], image_tensor: Tens Args: fn: Callable that the image tensor will be passed to. It should accept (B, C, H, W) and return (B, *), where * is any number of dimensions. - image_tensor: An image tensor of shape (**, C, H, W), where ** is any number of dimensions and + image_tensor: An image tensor of shape (**, C, H, W), where ** is any number of dimensions and can be more than 1 dimensions, generally different from *. Returns: A return value from the callable reshaped to (**, *). 
@@ -526,4 +572,4 @@ def flatten_forward_unflatten(fn: Callable[[Tensor], Tensor], image_tensor: Tens start_dims = image_tensor.shape[:-3] inp = torch.flatten(image_tensor, end_dim=-4) flat_out = fn(inp) - return torch.reshape(flat_out, (*start_dims, *flat_out.shape[1:])) + return torch.reshape(flat_out, (*start_dims, *flat_out.shape[1:])) \ No newline at end of file From 35de91ef2bed8d25ef6aa40e6ff8514a39666436 Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Mon, 30 Dec 2024 13:47:28 +0000 Subject: [PATCH 026/112] added temporary fix for missing task_index key in online environment --- lerobot/common/policies/sac/configuration_sac.py | 1 + lerobot/scripts/train.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/lerobot/common/policies/sac/configuration_sac.py b/lerobot/common/policies/sac/configuration_sac.py index 5f676933..4ae6e5d4 100644 --- a/lerobot/common/policies/sac/configuration_sac.py +++ b/lerobot/common/policies/sac/configuration_sac.py @@ -50,6 +50,7 @@ class SACConfig: state_encoder_hidden_dim = 256 latent_dim = 128 target_entropy = None + backup_entropy = True critic_network_kwargs = { "hidden_dims": [256, 256], "activate_final": True, diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index fbe7927d..a4eb3528 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -322,6 +322,11 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No logging.info("make_dataset") offline_dataset = make_dataset(cfg) + # TODO (michel-aractingi): temporary fix to avoid datasets with task_index key that doesn't exist in online environment + # i.e., pusht + if "task_index" in offline_dataset.hf_dataset[0]: + offline_dataset.hf_dataset = offline_dataset.hf_dataset.remove_columns(["task_index"]) + if isinstance(offline_dataset, MultiLeRobotDataset): logging.info( "Multiple datasets were provided. 
Applied the following index mapping to the provided datasets: " From c5bca1cf0f1055f898d58c46432b80d70b615cda Mon Sep 17 00:00:00 2001 From: Eugene Mironov Date: Mon, 6 Jan 2025 17:34:00 +0700 Subject: [PATCH 027/112] [Port HIL_SERL] Final fixes for the Reward Classifier (#598) --- .../hilserl/classifier/modeling_classifier.py | 3 ++- lerobot/common/policies/sac/modeling_sac.py | 1 - lerobot/common/robot_devices/control_utils.py | 8 +++++-- .../configs/policy/hilserl_classifier.yaml | 1 - lerobot/scripts/control_robot.py | 2 +- lerobot/scripts/control_sim_robot.py | 23 ++++++++++++++++++- lerobot/scripts/train_hilserl_classifier.py | 17 ++++++++++---- poetry.lock | 2 +- pyproject.toml | 4 ++-- .../classifier/test_modelling_classifier.py | 9 +++++++- tests/test_train_hilserl_classifier.py | 8 +++---- 11 files changed, 59 insertions(+), 19 deletions(-) diff --git a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py index 28b05744..d7bd42cd 100644 --- a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py +++ b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py @@ -4,7 +4,6 @@ from typing import Optional import torch from huggingface_hub import PyTorchModelHubMixin from torch import Tensor, nn -from transformers import AutoImageProcessor, AutoModel from .configuration_classifier import ClassifierConfig @@ -44,6 +43,8 @@ class Classifier( name = "classifier" def __init__(self, config: ClassifierConfig): + from transformers import AutoImageProcessor, AutoModel + super().__init__() self.config = config self.processor = AutoImageProcessor.from_pretrained(self.config.model_name, trust_remote_code=True) diff --git a/lerobot/common/policies/sac/modeling_sac.py b/lerobot/common/policies/sac/modeling_sac.py index bd77408e..62725ce1 100644 --- a/lerobot/common/policies/sac/modeling_sac.py +++ b/lerobot/common/policies/sac/modeling_sac.py @@ -333,7 +333,6 @@ class Critic(nn.Module): value = self.output_layer(x) return value.squeeze(-1) - class Policy(nn.Module): def __init__( self, diff --git a/lerobot/common/robot_devices/control_utils.py b/lerobot/common/robot_devices/control_utils.py index 8a6bcfbd..ad6f5632 100644 --- a/lerobot/common/robot_devices/control_utils.py +++ b/lerobot/common/robot_devices/control_utils.py @@ -362,12 +362,16 @@ def sanity_check_dataset_name(repo_id, policy): def sanity_check_dataset_robot_compatibility( - dataset: LeRobotDataset, robot: Robot, fps: int, use_videos: bool + dataset: LeRobotDataset, robot: Robot, fps: int, use_videos: bool, extra_features: dict = None ) -> None: + features_from_robot = get_features_from_robot(robot, use_videos) + if extra_features is not None: + features_from_robot.update(extra_features) + fields = [ ("robot_type", dataset.meta.robot_type, robot.robot_type), ("fps", dataset.fps, fps), - ("features", dataset.features, get_features_from_robot(robot, use_videos)), + ("features", dataset.features, features_from_robot), ] mismatches = [] diff --git a/lerobot/configs/policy/hilserl_classifier.yaml b/lerobot/configs/policy/hilserl_classifier.yaml index be82bc4e..498c9983 100644 --- a/lerobot/configs/policy/hilserl_classifier.yaml +++ b/lerobot/configs/policy/hilserl_classifier.yaml @@ -39,7 +39,6 @@ policy: wandb: enable: false project: "classifier-training" - entity: "wandb_entity" job_name: "classifier_training_0" disable_artifact: false diff --git a/lerobot/scripts/control_robot.py b/lerobot/scripts/control_robot.py index 
45a6bd66..f45e6b48 100644 --- a/lerobot/scripts/control_robot.py +++ b/lerobot/scripts/control_robot.py @@ -246,7 +246,7 @@ def record( num_processes=num_image_writer_processes, num_threads=num_image_writer_threads_per_camera * len(robot.cameras), ) - sanity_check_dataset_robot_compatibility(dataset, robot, fps, video) + sanity_check_dataset_robot_compatibility(dataset, robot, fps, video, extra_features) else: # Create empty dataset or load existing saved episodes sanity_check_dataset_name(repo_id, policy) diff --git a/lerobot/scripts/control_sim_robot.py b/lerobot/scripts/control_sim_robot.py index 4fffa8c7..67bdfb85 100644 --- a/lerobot/scripts/control_sim_robot.py +++ b/lerobot/scripts/control_sim_robot.py @@ -183,8 +183,14 @@ def record( resume: bool = False, local_files_only: bool = False, run_compute_stats: bool = True, + assign_rewards: bool = False, ) -> LeRobotDataset: # Load pretrained policy + + extra_features = ( + {"next.reward": {"dtype": "int64", "shape": (1,), "names": None}} if assign_rewards else None + ) + policy = None if pretrained_policy_name_or_path is not None: policy, policy_fps, device, use_amp = init_policy(pretrained_policy_name_or_path, policy_overrides) @@ -197,7 +203,7 @@ def record( raise ValueError("Either policy or process_action_fn has to be set to enable control in sim.") # initialize listener before sim env - listener, events = init_keyboard_listener() + listener, events = init_keyboard_listener(assign_rewards=assign_rewards) # create sim env env = env() @@ -237,6 +243,7 @@ def record( } features["action"] = {"dtype": "float32", "shape": env.action_space.shape, "names": None} + features = {**features, **extra_features} # Create empty dataset or load existing saved episodes sanity_check_dataset_name(repo_id, policy) @@ -288,6 +295,13 @@ def record( "timestamp": env_timestamp, } + # Overwrite environment reward with manually assigned reward + if assign_rewards: + frame["next.reward"] = events["next.reward"] + + # Should success always be false to match what we do in control_utils? + frame["next.success"] = False + for key in image_keys: if not key.startswith("observation.image"): frame["observation.image." + key] = observation[key] @@ -472,6 +486,13 @@ if __name__ == "__main__": default=0, help="Resume recording on an existing dataset.", ) + parser_record.add_argument( + "--assign-rewards", + type=int, + default=0, + help="Enables the assignation of rewards to frames (by default no assignation). When enabled, assign a 0 reward to frames until the space bar is pressed which assign a 1 reward. Press the space bar a second time to assign a 0 reward. 
The reward assigned is reset to 0 when the episode ends.", + ) + parser_replay = subparsers.add_parser("replay", parents=[base_parser]) parser_replay.add_argument( "--fps", type=none_or_int, default=None, help="Frames per second (set to None to disable)" diff --git a/lerobot/scripts/train_hilserl_classifier.py b/lerobot/scripts/train_hilserl_classifier.py index ea8336a9..22ff2957 100644 --- a/lerobot/scripts/train_hilserl_classifier.py +++ b/lerobot/scripts/train_hilserl_classifier.py @@ -45,7 +45,7 @@ from lerobot.common.utils.utils import ( ) -def get_model(cfg, logger): +def get_model(cfg, logger): # noqa I001 classifier_config = _policy_cfg_from_hydra_cfg(ClassifierConfig, cfg) model = Classifier(classifier_config) if cfg.resume: @@ -64,6 +64,12 @@ def create_balanced_sampler(dataset, cfg): return WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True) +def support_amp(device: torch.device, cfg: DictConfig) -> bool: + # Check if the device supports AMP + # Here is an example of the issue that says that MPS doesn't support AMP properply + return cfg.training.use_amp and device.type in ("cuda", "cpu") + + def train_epoch(model, train_loader, criterion, optimizer, grad_scaler, device, logger, step, cfg): # Single epoch training loop with AMP support and progress tracking model.train() @@ -77,7 +83,7 @@ def train_epoch(model, train_loader, criterion, optimizer, grad_scaler, device, labels = batch[cfg.training.label_key].float().to(device) # Forward pass with optional AMP - with torch.autocast(device_type=device.type) if cfg.training.use_amp else nullcontext(): + with torch.autocast(device_type=device.type) if support_amp(device, cfg) else nullcontext(): outputs = model(images) loss = criterion(outputs.logits, labels) @@ -119,7 +125,10 @@ def validate(model, val_loader, criterion, device, logger, cfg, num_samples_to_l samples = [] running_loss = 0 - with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.training.use_amp else nullcontext(): + with ( + torch.no_grad(), + torch.autocast(device_type=device.type) if support_amp(device, cfg) else nullcontext(), + ): for batch in tqdm(val_loader, desc="Validation"): images = batch[cfg.training.image_key].to(device) labels = batch[cfg.training.label_key].float().to(device) @@ -170,7 +179,7 @@ def validate(model, val_loader, criterion, device, logger, cfg, num_samples_to_l return accuracy, eval_info -@hydra.main(version_base="1.2", config_path="../configs", config_name="hilserl_classifier") +@hydra.main(version_base="1.2", config_path="../configs/policy", config_name="hilserl_classifier") def train(cfg: DictConfig) -> None: # Main training pipeline with support for resuming training logging.info(OmegaConf.to_yaml(cfg)) diff --git a/poetry.lock b/poetry.lock index 919edd18..81462fe8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -7720,4 +7720,4 @@ xarm = ["gym-xarm"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "b9d299916ced6af1d243f961a32b0a4aacbef18e0b95337a5224e8511f5d6dda" +content-hash = "44c74163e398e8ff16973957f69a47bb09b789e92ac4d8fb3ab268defab96427" diff --git a/pyproject.toml b/pyproject.toml index 738903bd..05ab921a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,8 +71,8 @@ pyrender = {git = "https://github.com/mmatl/pyrender.git", markers = "sys_platfo hello-robot-stretch-body = {version = ">=0.7.27", markers = "sys_platform == 'linux'", optional = true} pyserial = {version = ">=3.5", optional = true} jsonlines = ">=4.0.0" -transformers = 
{version = "^4.47.0", optional = true} -torchmetrics = {version = "^1.6.0", optional = true} +transformers = {version = ">=4.47.0", optional = true} +torchmetrics = {version = ">=1.6.0", optional = true} [tool.poetry.extras] diff --git a/tests/policies/hilserl/classifier/test_modelling_classifier.py b/tests/policies/hilserl/classifier/test_modelling_classifier.py index 014165eb..a3db4211 100644 --- a/tests/policies/hilserl/classifier/test_modelling_classifier.py +++ b/tests/policies/hilserl/classifier/test_modelling_classifier.py @@ -1,7 +1,6 @@ import torch from lerobot.common.policies.hilserl.classifier.modeling_classifier import ( - Classifier, ClassifierConfig, ClassifierOutput, ) @@ -21,6 +20,8 @@ def test_classifier_output(): @require_package("transformers") def test_binary_classifier_with_default_params(): + from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier + config = ClassifierConfig() classifier = Classifier(config) @@ -40,6 +41,8 @@ def test_binary_classifier_with_default_params(): @require_package("transformers") def test_multiclass_classifier(): + from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier + num_classes = 5 config = ClassifierConfig(num_classes=num_classes) classifier = Classifier(config) @@ -60,6 +63,8 @@ def test_multiclass_classifier(): @require_package("transformers") def test_default_device(): + from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier + config = ClassifierConfig() assert config.device == "cpu" @@ -70,6 +75,8 @@ def test_default_device(): @require_package("transformers") def test_explicit_device_setup(): + from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier + config = ClassifierConfig(device="meta") assert config.device == "meta" diff --git a/tests/test_train_hilserl_classifier.py b/tests/test_train_hilserl_classifier.py index 66d8fbe4..c1d854ac 100644 --- a/tests/test_train_hilserl_classifier.py +++ b/tests/test_train_hilserl_classifier.py @@ -151,9 +151,9 @@ def test_validate(): @patch("lerobot.scripts.train_hilserl_classifier.Logger.get_last_pretrained_model_dir") @patch("lerobot.scripts.train_hilserl_classifier.Logger") @patch("lerobot.scripts.train_hilserl_classifier.LeRobotDataset") -@patch("lerobot.scripts.train_hilserl_classifier.make_policy") +@patch("lerobot.scripts.train_hilserl_classifier.get_model") def test_resume_function( - mock_make_policy, + mock_get_model, mock_dataset, mock_logger, mock_get_last_pretrained_model_dir, @@ -168,7 +168,7 @@ def test_resume_function( with initialize_config_dir(config_dir=config_dir, job_name="test_app", version_base="1.2"): cfg = compose( - config_name="reward_classifier", + config_name="hilserl_classifier", overrides=[ "device=cpu", "seed=42", @@ -211,7 +211,7 @@ def test_resume_function( # Instantiate the model and set make_policy to return it model = make_dummy_model() - mock_make_policy.return_value = model + mock_get_model.return_value = model # Call train train(cfg) From 3bb5ed5e91a2b78d9a8f5883171ce45da3c496ff Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Mon, 13 Jan 2025 13:57:49 +0100 Subject: [PATCH 028/112] Extend reward classifier for multiple camera views (#626) --- lerobot/common/logger.py | 2 +- .../classifier/configuration_classifier.py | 1 + .../hilserl/classifier/modeling_classifier.py | 16 ++- lerobot/common/robot_devices/control_utils.py | 9 ++ .../configs/policy/hilserl_classifier.yaml | 9 +- lerobot/scripts/control_robot.py | 13 ++ 
lerobot/scripts/eval_on_robot.py | 123 +++++++++++++----- lerobot/scripts/train_hilserl_classifier.py | 7 +- tests/test_train_hilserl_classifier.py | 61 ++++++++- 9 files changed, 192 insertions(+), 49 deletions(-) diff --git a/lerobot/common/logger.py b/lerobot/common/logger.py index dec8b465..4015492d 100644 --- a/lerobot/common/logger.py +++ b/lerobot/common/logger.py @@ -25,13 +25,13 @@ from glob import glob from pathlib import Path import torch +import wandb from huggingface_hub.constants import SAFETENSORS_SINGLE_FILE from omegaconf import DictConfig, OmegaConf from termcolor import colored from torch.optim import Optimizer from torch.optim.lr_scheduler import LRScheduler -import wandb from lerobot.common.policies.policy_protocol import Policy from lerobot.common.utils.utils import get_global_random_state, set_global_random_state diff --git a/lerobot/common/policies/hilserl/classifier/configuration_classifier.py b/lerobot/common/policies/hilserl/classifier/configuration_classifier.py index f0b9352f..de3742ec 100644 --- a/lerobot/common/policies/hilserl/classifier/configuration_classifier.py +++ b/lerobot/common/policies/hilserl/classifier/configuration_classifier.py @@ -13,6 +13,7 @@ class ClassifierConfig: model_name: str = "microsoft/resnet-50" device: str = "cpu" model_type: str = "cnn" # "transformer" or "cnn" + num_cameras: int = 2 def save_pretrained(self, save_dir): """Save config to json file.""" diff --git a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py index d7bd42cd..4a022335 100644 --- a/lerobot/common/policies/hilserl/classifier/modeling_classifier.py +++ b/lerobot/common/policies/hilserl/classifier/modeling_classifier.py @@ -97,7 +97,7 @@ class Classifier( raise ValueError("Unsupported transformer architecture since hidden_size is not found") self.classifier_head = nn.Sequential( - nn.Linear(input_dim, self.config.hidden_dim), + nn.Linear(input_dim * self.config.num_cameras, self.config.hidden_dim), nn.Dropout(self.config.dropout_rate), nn.LayerNorm(self.config.hidden_dim), nn.ReLU(), @@ -130,11 +130,11 @@ class Classifier( return outputs.pooler_output return outputs.last_hidden_state[:, 0, :] - def forward(self, x: torch.Tensor) -> ClassifierOutput: + def forward(self, xs: torch.Tensor) -> ClassifierOutput: """Forward pass of the classifier.""" # For training, we expect input to be a tensor directly from LeRobotDataset - encoder_output = self._get_encoder_output(x) - logits = self.classifier_head(encoder_output) + encoder_outputs = torch.hstack([self._get_encoder_output(x) for x in xs]) + logits = self.classifier_head(encoder_outputs) if self.config.num_classes == 2: logits = logits.squeeze(-1) @@ -142,4 +142,10 @@ class Classifier( else: probabilities = torch.softmax(logits, dim=-1) - return ClassifierOutput(logits=logits, probabilities=probabilities, hidden_states=encoder_output) + return ClassifierOutput(logits=logits, probabilities=probabilities, hidden_states=encoder_outputs) + + def predict_reward(self, x): + if self.config.num_classes == 2: + return (self.forward(x).probabilities > 0.5).float() + else: + return torch.argmax(self.forward(x).probabilities, dim=1) diff --git a/lerobot/common/robot_devices/control_utils.py b/lerobot/common/robot_devices/control_utils.py index ad6f5632..10cb9f5c 100644 --- a/lerobot/common/robot_devices/control_utils.py +++ b/lerobot/common/robot_devices/control_utils.py @@ -11,6 +11,7 @@ from copy import copy from functools import cache import 
cv2 +import numpy as np import torch import tqdm from deepdiff import DeepDiff @@ -332,6 +333,14 @@ def reset_environment(robot, events, reset_time_s): break +def reset_follower_position(robot: Robot, target_position): + current_position = robot.follower_arms["main"].read("Present_Position") + trajectory = torch.from_numpy(np.linspace(current_position, target_position, 30)) # NOTE: 30 is just an arbitrary number + for pose in trajectory: + robot.send_action(pose) + busy_wait(0.015) + + def stop_recording(robot, listener, display_cameras): robot.disconnect() diff --git a/lerobot/configs/policy/hilserl_classifier.yaml b/lerobot/configs/policy/hilserl_classifier.yaml index 498c9983..f8137b69 100644 --- a/lerobot/configs/policy/hilserl_classifier.yaml +++ b/lerobot/configs/policy/hilserl_classifier.yaml @@ -4,7 +4,7 @@ defaults: - _self_ seed: 13 -dataset_repo_id: "dataset_repo_id" +dataset_repo_id: aractingi/pick_place_lego_cube_1 train_split_proportion: 0.8 # Required by logger @@ -24,7 +24,7 @@ training: eval_freq: 1 # How often to run validation (in epochs) save_freq: 1 # How often to save checkpoints (in epochs) save_checkpoint: true - image_key: "observation.images.phone" + image_keys: ["observation.images.top", "observation.images.wrist"] label_key: "next.reward" eval: @@ -32,9 +32,10 @@ eval: num_samples_to_log: 30 # Number of validation samples to log in the table policy: - name: "hilserl/classifier" + name: "hilserl/classifier/pick_place_lego_cube_1" model_name: "facebook/convnext-base-224" model_type: "cnn" + num_cameras: 2 # Has to be len(training.image_keys) wandb: enable: false @@ -44,4 +45,4 @@ wandb: device: "mps" resume: false -output_dir: "output" +output_dir: "outputs/classifier" diff --git a/lerobot/scripts/control_robot.py b/lerobot/scripts/control_robot.py index f45e6b48..8187e8a3 100644 --- a/lerobot/scripts/control_robot.py +++ b/lerobot/scripts/control_robot.py @@ -109,6 +109,7 @@ from lerobot.common.robot_devices.control_utils import ( log_control_info, record_episode, reset_environment, + reset_follower_position, sanity_check_dataset_name, sanity_check_dataset_robot_compatibility, stop_recording, @@ -205,6 +206,7 @@ def record( num_image_writer_threads_per_camera: int = 4, display_cameras: bool = True, play_sounds: bool = True, + reset_follower: bool = False, resume: bool = False, # TODO(rcadene, aliberts): remove local_files_only when refactor with dataset as argument local_files_only: bool = False, @@ -265,6 +267,9 @@ def record( robot.connect() listener, events = init_keyboard_listener(assign_rewards=assign_rewards) + if reset_follower: + initial_position = robot.follower_arms["main"].read("Present_Position") + # Execute a few seconds without recording to: # 1. teleoperate the robot to move it in starting position if no policy provided, # 2. give times to the robot devices to connect and start synchronizing, @@ -307,6 +312,8 @@ def record( (dataset.num_episodes < num_episodes - 1) or events["rerecord_episode"] ): log_say("Reset the environment", play_sounds) + if reset_follower: + reset_follower_position(robot, initial_position) reset_environment(robot, events, reset_time_s) if events["rerecord_episode"]: @@ -527,6 +534,12 @@ if __name__ == "__main__": default=0, help="Enables the assignation of rewards to frames (by default no assignation). When enabled, assign a 0 reward to frames until the space bar is pressed which assign a 1 reward. Press the space bar a second time to assign a 0 reward. 
The reward assigned is reset to 0 when the episode ends.", ) + parser_record.add_argument( + "--reset-follower", + type=int, + default=0, + help="Resets the follower to the initial position while resetting the environment; this avoids having the follower start at an awkward position in the next episode", + ) parser_replay = subparsers.add_parser("replay", parents=[base_parser]) parser_replay.add_argument( diff --git a/lerobot/scripts/eval_on_robot.py b/lerobot/scripts/eval_on_robot.py index 92daa860..842c1a28 100644 --- a/lerobot/scripts/eval_on_robot.py +++ b/lerobot/scripts/eval_on_robot.py @@ -23,6 +23,15 @@ python lerobot/scripts/eval_on_robot.py \ eval.n_episodes=10 ``` +Test reward classifier with teleoperation (you need to press space to take over) +``` +python lerobot/scripts/eval_on_robot.py \ + --robot-path lerobot/configs/robot/so100.yaml \ + --reward-classifier-pretrained-path outputs/classifier/checkpoints/best/pretrained_model \ + --reward-classifier-config-file lerobot/configs/policy/hilserl_classifier.yaml \ + --display-cameras 1 +``` + **NOTE** (michel-aractingi): This script is incomplete and it is being prepared for running training on the real robot. """ @@ -30,14 +39,14 @@ for running training on the real robot. import argparse import logging import time -from copy import deepcopy +import cv2 import numpy as np import torch from tqdm import trange from lerobot.common.policies.policy_protocol import Policy -from lerobot.common.robot_devices.control_utils import busy_wait, is_headless +from lerobot.common.robot_devices.control_utils import busy_wait, is_headless, reset_follower_position from lerobot.common.robot_devices.robots.factory import Robot, make_robot from lerobot.common.utils.utils import ( init_hydra_config, @@ -46,7 +55,33 @@ from lerobot.common.utils.utils import ( ) -def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, use_amp: bool = True) -> dict: +def get_classifier(pretrained_path, config_path): + if pretrained_path is None or config_path is None: + return + + from lerobot.common.policies.factory import _policy_cfg_from_hydra_cfg + from lerobot.common.policies.hilserl.classifier.configuration_classifier import ClassifierConfig + from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier + + cfg = init_hydra_config(config_path) + + classifier_config = _policy_cfg_from_hydra_cfg(ClassifierConfig, cfg) + classifier_config.num_cameras = len(cfg.training.image_keys) # TODO automate these paths + model = Classifier(classifier_config) + model.load_state_dict(Classifier.from_pretrained(pretrained_path).state_dict()) + model = model.to("mps") + return model + + +def rollout( + robot: Robot, + policy: Policy, + reward_classifier, + fps: int, + control_time_s: float = 20, + use_amp: bool = True, + display_cameras: bool = False, +) -> dict: """Run a batched policy rollout on the real robot. The return dictionary contains: @@ -70,6 +105,7 @@ def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, Returns: The dictionary described above. """ + # TODO (michel-aractingi): Infer the device from policy parameters when policy is added # assert isinstance(policy, nn.Module), "Policy must be a PyTorch nn module." # device = get_device_from_parameters(policy) @@ -79,25 +115,21 @@ def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, # Reset the policy. TODO (michel-aractingi) add real policy evaluation once the code is ready. 
# policy.reset() - # Get observation from real robot + # NOTE: sorting to make sure the key sequence is the same during training and testing. observation = robot.capture_observation() + image_keys = [key for key in observation if "image" in key] + image_keys.sort() - # Calculate reward. TODO (michel-aractingi) - # in HIL-SERL it will be with a reward classifier - reward = calculate_reward(observation) - all_observations = [] all_actions = [] all_rewards = [] all_successes = [] start_episode_t = time.perf_counter() + init_pos = robot.follower_arms["main"].read("Present_Position") timestamp = 0.0 while timestamp < control_time_s: start_loop_t = time.perf_counter() - all_observations.append(deepcopy(observation)) - # observation = {key: observation[key].to(device, non_blocking=True) for key in observation} - # Apply the next action. while events["pause_policy"] and not events["human_intervention_step"]: busy_wait(0.5) @@ -109,18 +141,26 @@ def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, else: # explore with policy with torch.inference_mode(): + # TODO (michel-aractingi) replace this part with policy (predict_action) action = robot.follower_arms["main"].read("Present_Position") action = torch.from_numpy(action) robot.send_action(action) # action = predict_action(observation, policy, device, use_amp) observation = robot.capture_observation() - # Calculate reward - # in HIL-SERL it will be with a reward classifier - reward = calculate_reward(observation) + images = [] + for key in image_keys: + if display_cameras: + cv2.imshow(key, cv2.cvtColor(observation[key].numpy(), cv2.COLOR_RGB2BGR)) + cv2.waitKey(1) + images.append(observation[key].to("mps")) + + reward = reward_classifier.predict_reward(images) if reward_classifier is not None else 0.0 + all_rewards.append(reward) + + # print("REWARD : ", reward) all_actions.append(action) - all_rewards.append(torch.from_numpy(reward)) all_successes.append(torch.tensor([False])) dt_s = time.perf_counter() - start_loop_t @@ -131,7 +171,8 @@ def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, events["human_intervention_step"] = False events["pause_policy"] = False break - all_observations.append(deepcopy(observation)) + + reset_follower_position(robot, target_position=init_pos) dones = torch.tensor([False] * len(all_actions)) dones[-1] = True @@ -142,10 +183,6 @@ def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, "next.success": torch.stack(all_successes, dim=1), "done": dones, } - stacked_observations = {} - for key in all_observations[0]: - stacked_observations[key] = torch.stack([obs[key] for obs in all_observations], dim=1) - ret["observation"] = stacked_observations listener.stop() @@ -159,6 +196,9 @@ def eval_policy( n_episodes: int, control_time_s: int = 20, use_amp: bool = True, + display_cameras: bool = False, + reward_classifier_pretrained_path: str | None = None, + reward_classifier_config_file: str | None = None, ) -> dict: """ Args: @@ -179,8 +219,12 @@ def eval_policy( start_eval = time.perf_counter() progbar = trange(n_episodes, desc="Evaluating policy on real robot") - for _batch_idx in progbar: - rollout_data = rollout(robot, policy, fps, control_time_s, use_amp) + reward_classifier = get_classifier(reward_classifier_pretrained_path, reward_classifier_config_file) + + for _ in progbar: + rollout_data = rollout( + robot, policy, reward_classifier, fps, control_time_s, use_amp, display_cameras + ) rollouts.append(rollout_data) 
sum_rewards.append(sum(rollout_data["next.reward"])) @@ -219,15 +263,6 @@ def eval_policy( return info -def calculate_reward(observation): - """ - Method to calculate reward function in some way. - In HIL-SERL this is done through defining a reward classifier - """ - # reward = reward_classifier(observation) - return np.array([0.0]) - - def init_keyboard_listener(): # Allow to exit early while recording an episode or resetting the environment, # by tapping the right arrow key '->'. This might require a sudo permission @@ -324,6 +359,21 @@ if __name__ == "__main__": "outputs/eval/{timestamp}_{env_name}_{policy_name}" ), ) + parser.add_argument( + "--display-cameras", help=("Whether to display the camera feed while the rollout is happening") + ) + parser.add_argument( + "--reward-classifier-pretrained-path", + type=str, + default=None, + help="Path to the pretrained classifier weights.", + ) + parser.add_argument( + "--reward-classifier-config-file", + type=str, + default=None, + help="Path to a yaml config file that is necessary to build the reward classifier model.", + ) args = parser.parse_args() @@ -332,4 +382,13 @@ if __name__ == "__main__": if not robot.is_connected: robot.connect() - eval_policy(robot, None, fps=40, n_episodes=2, control_time_s=100) + eval_policy( + robot, + None, + fps=40, + n_episodes=2, + control_time_s=100, + display_cameras=args.display_cameras, + reward_classifier_config_file=args.reward_classifier_config_file, + reward_classifier_pretrained_path=args.reward_classifier_pretrained_path, + ) diff --git a/lerobot/scripts/train_hilserl_classifier.py b/lerobot/scripts/train_hilserl_classifier.py index 22ff2957..458e3ff1 100644 --- a/lerobot/scripts/train_hilserl_classifier.py +++ b/lerobot/scripts/train_hilserl_classifier.py @@ -22,6 +22,7 @@ from pprint import pformat import hydra import torch import torch.nn as nn +import wandb from deepdiff import DeepDiff from omegaconf import DictConfig, OmegaConf from termcolor import colored @@ -30,7 +31,6 @@ from torch.cuda.amp import GradScaler from torch.utils.data import DataLoader, WeightedRandomSampler, random_split from tqdm import tqdm -import wandb from lerobot.common.datasets.factory import resolve_delta_timestamps from lerobot.common.datasets.lerobot_dataset import LeRobotDataset from lerobot.common.logger import Logger @@ -79,7 +79,7 @@ def train_epoch(model, train_loader, criterion, optimizer, grad_scaler, device, pbar = tqdm(train_loader, desc="Training") for batch_idx, batch in enumerate(pbar): start_time = time.perf_counter() - images = batch[cfg.training.image_key].to(device) + images = [batch[img_key].to(device) for img_key in cfg.training.image_keys] labels = batch[cfg.training.label_key].float().to(device) # Forward pass with optional AMP @@ -130,7 +130,7 @@ def validate(model, val_loader, criterion, device, logger, cfg, num_samples_to_l torch.autocast(device_type=device.type) if support_amp(device, cfg) else nullcontext(), ): for batch in tqdm(val_loader, desc="Validation"): - images = batch[cfg.training.image_key].to(device) + images = [batch[img_key].to(device) for img_key in cfg.training.image_keys] labels = batch[cfg.training.label_key].float().to(device) outputs = model(images) @@ -163,6 +163,7 @@ def validate(model, val_loader, criterion, device, logger, cfg, num_samples_to_l accuracy = 100 * correct / total avg_loss = running_loss / len(val_loader) + print(f"Average validation loss {avg_loss}, and accuracy {accuracy}") eval_info = { "loss": avg_loss, diff --git 
a/tests/test_train_hilserl_classifier.py b/tests/test_train_hilserl_classifier.py index c1d854ac..8c1ad453 100644 --- a/tests/test_train_hilserl_classifier.py +++ b/tests/test_train_hilserl_classifier.py @@ -33,7 +33,9 @@ class MockDataset(Dataset): def make_dummy_model(): - model_config = ClassifierConfig(num_classes=2, model_name="hf-tiny-model-private/tiny-random-ResNetModel") + model_config = ClassifierConfig( + num_classes=2, model_name="hf-tiny-model-private/tiny-random-ResNetModel", num_cameras=1 + ) model = Classifier(config=model_config) return model @@ -88,7 +90,7 @@ def test_train_epoch(): logger = MagicMock() step = 0 cfg = MagicMock() - cfg.training.image_key = "image" + cfg.training.image_keys = ["image"] cfg.training.label_key = "label" cfg.training.use_amp = False @@ -130,7 +132,7 @@ def test_validate(): device = torch.device("cpu") logger = MagicMock() cfg = MagicMock() - cfg.training.image_key = "image" + cfg.training.image_keys = ["image"] cfg.training.label_key = "label" cfg.training.use_amp = False @@ -145,6 +147,57 @@ def test_validate(): assert isinstance(eval_info, dict) +def test_train_epoch_multiple_cameras(): + model_config = ClassifierConfig( + num_classes=2, model_name="hf-tiny-model-private/tiny-random-ResNetModel", num_cameras=2 + ) + model = Classifier(config=model_config) + + # Mock components + model.train = MagicMock() + + train_loader = [ + { + "image_1": torch.rand(2, 3, 224, 224), + "image_2": torch.rand(2, 3, 224, 224), + "label": torch.tensor([0.0, 1.0]), + } + ] + + criterion = nn.BCEWithLogitsLoss() + optimizer = MagicMock() + grad_scaler = MagicMock() + device = torch.device("cpu") + logger = MagicMock() + step = 0 + cfg = MagicMock() + cfg.training.image_keys = ["image_1", "image_2"] + cfg.training.label_key = "label" + cfg.training.use_amp = False + + # Call the function under test + train_epoch( + model, + train_loader, + criterion, + optimizer, + grad_scaler, + device, + logger, + step, + cfg, + ) + + # Check that model.train() was called + model.train.assert_called_once() + + # Check that optimizer.zero_grad() was called + optimizer.zero_grad.assert_called() + + # Check that logger.log_dict was called + logger.log_dict.assert_called() + + @pytest.mark.parametrize("resume", [True, False]) @patch("lerobot.scripts.train_hilserl_classifier.init_hydra_config") @patch("lerobot.scripts.train_hilserl_classifier.Logger.get_last_checkpoint_dir") @@ -179,7 +232,7 @@ def test_resume_function( "train_split_proportion=0.8", "training.num_workers=0", "training.batch_size=2", - "training.image_key=image", + "training.image_keys=[image]", "training.label_key=label", "training.use_amp=False", "training.num_epochs=1", From 0a4e9e25d0d6008cb364aaceb80c759236af376c Mon Sep 17 00:00:00 2001 From: Mishig Date: Fri, 20 Dec 2024 16:26:23 +0100 Subject: [PATCH 029/112] [vizualizer] for LeRobodDataset V2 (#576) --- lerobot/common/datasets/utils.py | 57 +++ lerobot/scripts/visualize_dataset_html.py | 327 ++++++++++++++---- .../templates/visualize_dataset_homepage.html | 68 ++++ .../templates/visualize_dataset_template.html | 80 +++-- tests/test_visualize_dataset_html.py | 30 -- 5 files changed, 428 insertions(+), 134 deletions(-) create mode 100644 lerobot/templates/visualize_dataset_homepage.html delete mode 100644 tests/test_visualize_dataset_html.py diff --git a/lerobot/common/datasets/utils.py b/lerobot/common/datasets/utils.py index af5b03cc..1490adda 100644 --- a/lerobot/common/datasets/utils.py +++ b/lerobot/common/datasets/utils.py @@ -17,9 +17,11 @@ import 
importlib.resources import json import logging import textwrap +from collections.abc import Iterator from itertools import accumulate from pathlib import Path from pprint import pformat +from types import SimpleNamespace from typing import Any import datasets @@ -502,3 +504,58 @@ def create_lerobot_dataset_card( template_path=str(card_template_path), **kwargs, ) + + +class IterableNamespace(SimpleNamespace): + """ + A namespace object that supports both dictionary-like iteration and dot notation access. + Automatically converts nested dictionaries into IterableNamespaces. + + This class extends SimpleNamespace to provide: + - Dictionary-style iteration over keys + - Access to items via both dot notation (obj.key) and brackets (obj["key"]) + - Dictionary-like methods: items(), keys(), values() + - Recursive conversion of nested dictionaries + + Args: + dictionary: Optional dictionary to initialize the namespace + **kwargs: Additional keyword arguments passed to SimpleNamespace + + Examples: + >>> data = {"name": "Alice", "details": {"age": 25}} + >>> ns = IterableNamespace(data) + >>> ns.name + 'Alice' + >>> ns.details.age + 25 + >>> list(ns.keys()) + ['name', 'details'] + >>> for key, value in ns.items(): + ... print(f"{key}: {value}") + name: Alice + details: IterableNamespace(age=25) + """ + + def __init__(self, dictionary: dict[str, Any] = None, **kwargs): + super().__init__(**kwargs) + if dictionary is not None: + for key, value in dictionary.items(): + if isinstance(value, dict): + setattr(self, key, IterableNamespace(value)) + else: + setattr(self, key, value) + + def __iter__(self) -> Iterator[str]: + return iter(vars(self)) + + def __getitem__(self, key: str) -> Any: + return vars(self)[key] + + def items(self): + return vars(self).items() + + def values(self): + return vars(self).values() + + def keys(self): + return vars(self).keys() diff --git a/lerobot/scripts/visualize_dataset_html.py b/lerobot/scripts/visualize_dataset_html.py index 2c81fbfc..ec6eca22 100644 --- a/lerobot/scripts/visualize_dataset_html.py +++ b/lerobot/scripts/visualize_dataset_html.py @@ -53,20 +53,29 @@ python lerobot/scripts/visualize_dataset_html.py \ """ import argparse +import csv +import json import logging +import re import shutil +import tempfile +from io import StringIO from pathlib import Path -import tqdm -from flask import Flask, redirect, render_template, url_for +import numpy as np +import pandas as pd +import requests +from flask import Flask, redirect, render_template, request, url_for +from lerobot import available_datasets from lerobot.common.datasets.lerobot_dataset import LeRobotDataset +from lerobot.common.datasets.utils import IterableNamespace from lerobot.common.utils.utils import init_logging def run_server( - dataset: LeRobotDataset, - episodes: list[int], + dataset: LeRobotDataset | IterableNamespace | None, + episodes: list[int] | None, host: str, port: str, static_folder: Path, @@ -76,10 +85,50 @@ def run_server( app.config["SEND_FILE_MAX_AGE_DEFAULT"] = 0 # specifying not to cache @app.route("/") - def index(): - # home page redirects to the first episode page - [dataset_namespace, dataset_name] = dataset.repo_id.split("/") - first_episode_id = episodes[0] + def hommepage(dataset=dataset): + if dataset: + dataset_namespace, dataset_name = dataset.repo_id.split("/") + return redirect( + url_for( + "show_episode", + dataset_namespace=dataset_namespace, + dataset_name=dataset_name, + episode_id=0, + ) + ) + + dataset_param, episode_param = None, None + all_params = request.args + 
if "dataset" in all_params: + dataset_param = all_params["dataset"] + if "episode" in all_params: + episode_param = int(all_params["episode"]) + + if dataset_param: + dataset_namespace, dataset_name = dataset_param.split("/") + return redirect( + url_for( + "show_episode", + dataset_namespace=dataset_namespace, + dataset_name=dataset_name, + episode_id=episode_param if episode_param is not None else 0, + ) + ) + + featured_datasets = [ + "lerobot/aloha_static_cups_open", + "lerobot/columbia_cairlab_pusht_real", + "lerobot/taco_play", + ] + return render_template( + "visualize_dataset_homepage.html", + featured_datasets=featured_datasets, + lerobot_datasets=available_datasets, + ) + + @app.route("//") + def show_first_episode(dataset_namespace, dataset_name): + first_episode_id = 0 return redirect( url_for( "show_episode", @@ -90,30 +139,85 @@ def run_server( ) @app.route("///episode_") - def show_episode(dataset_namespace, dataset_name, episode_id): + def show_episode(dataset_namespace, dataset_name, episode_id, dataset=dataset, episodes=episodes): + repo_id = f"{dataset_namespace}/{dataset_name}" + try: + if dataset is None: + dataset = get_dataset_info(repo_id) + except FileNotFoundError: + return ( + "Make sure to convert your LeRobotDataset to v2 & above. See how to convert your dataset at https://github.com/huggingface/lerobot/pull/461", + 400, + ) + dataset_version = ( + dataset.meta._version if isinstance(dataset, LeRobotDataset) else dataset.codebase_version + ) + match = re.search(r"v(\d+)\.", dataset_version) + if match: + major_version = int(match.group(1)) + if major_version < 2: + return "Make sure to convert your LeRobotDataset to v2 & above." + + episode_data_csv_str, columns = get_episode_data(dataset, episode_id) dataset_info = { - "repo_id": dataset.repo_id, - "num_samples": dataset.num_frames, - "num_episodes": dataset.num_episodes, + "repo_id": f"{dataset_namespace}/{dataset_name}", + "num_samples": dataset.num_frames + if isinstance(dataset, LeRobotDataset) + else dataset.total_frames, + "num_episodes": dataset.num_episodes + if isinstance(dataset, LeRobotDataset) + else dataset.total_episodes, "fps": dataset.fps, } - video_paths = [dataset.meta.get_video_file_path(episode_id, key) for key in dataset.meta.video_keys] - tasks = dataset.meta.episodes[episode_id]["tasks"] - videos_info = [ - {"url": url_for("static", filename=video_path), "filename": video_path.name} - for video_path in video_paths - ] + if isinstance(dataset, LeRobotDataset): + video_paths = [ + dataset.meta.get_video_file_path(episode_id, key) for key in dataset.meta.video_keys + ] + videos_info = [ + {"url": url_for("static", filename=video_path), "filename": video_path.parent.name} + for video_path in video_paths + ] + tasks = dataset.meta.episodes[0]["tasks"] + else: + video_keys = [key for key, ft in dataset.features.items() if ft["dtype"] == "video"] + videos_info = [ + { + "url": f"https://huggingface.co/datasets/{repo_id}/resolve/main/" + + dataset.video_path.format( + episode_chunk=int(episode_id) // dataset.chunks_size, + video_key=video_key, + episode_index=episode_id, + ), + "filename": video_key, + } + for video_key in video_keys + ] + + response = requests.get( + f"https://huggingface.co/datasets/{repo_id}/resolve/main/meta/episodes.jsonl" + ) + response.raise_for_status() + # Split into lines and parse each line as JSON + tasks_jsonl = [json.loads(line) for line in response.text.splitlines() if line.strip()] + + filtered_tasks_jsonl = [row for row in tasks_jsonl if row["episode_index"] == 
episode_id] + tasks = filtered_tasks_jsonl[0]["tasks"] + videos_info[0]["language_instruction"] = tasks - ep_csv_url = url_for("static", filename=get_ep_csv_fname(episode_id)) + if episodes is None: + episodes = list( + range(dataset.num_episodes if isinstance(dataset, LeRobotDataset) else dataset.total_episodes) + ) + return render_template( "visualize_dataset_template.html", episode_id=episode_id, episodes=episodes, dataset_info=dataset_info, videos_info=videos_info, - ep_csv_url=ep_csv_url, - has_policy=False, + episode_data_csv_str=episode_data_csv_str, + columns=columns, ) app.run(host=host, port=port) @@ -124,46 +228,84 @@ def get_ep_csv_fname(episode_id: int): return ep_csv_fname -def write_episode_data_csv(output_dir, file_name, episode_index, dataset): - """Write a csv file containg timeseries data of an episode (e.g. state and action). +def get_episode_data(dataset: LeRobotDataset | IterableNamespace, episode_index): + """Get a csv str containing timeseries data of an episode (e.g. state and action). This file will be loaded by Dygraph javascript to plot data in real time.""" - from_idx = dataset.episode_data_index["from"][episode_index] - to_idx = dataset.episode_data_index["to"][episode_index] - + columns = [] has_state = "observation.state" in dataset.features has_action = "action" in dataset.features # init header of csv with state and action names header = ["timestamp"] if has_state: - dim_state = dataset.meta.shapes["observation.state"][0] + dim_state = ( + dataset.meta.shapes["observation.state"][0] + if isinstance(dataset, LeRobotDataset) + else dataset.features["observation.state"].shape[0] + ) header += [f"state_{i}" for i in range(dim_state)] + column_names = dataset.features["observation.state"]["names"] + while not isinstance(column_names, list): + column_names = list(column_names.values())[0] + columns.append({"key": "state", "value": column_names}) if has_action: - dim_action = dataset.meta.shapes["action"][0] + dim_action = ( + dataset.meta.shapes["action"][0] + if isinstance(dataset, LeRobotDataset) + else dataset.features.action.shape[0] + ) header += [f"action_{i}" for i in range(dim_action)] + column_names = dataset.features["action"]["names"] + while not isinstance(column_names, list): + column_names = list(column_names.values())[0] + columns.append({"key": "action", "value": column_names}) - columns = ["timestamp"] - if has_state: - columns += ["observation.state"] - if has_action: - columns += ["action"] - - rows = [] - data = dataset.hf_dataset.select_columns(columns) - for i in range(from_idx, to_idx): - row = [data[i]["timestamp"].item()] + if isinstance(dataset, LeRobotDataset): + from_idx = dataset.episode_data_index["from"][episode_index] + to_idx = dataset.episode_data_index["to"][episode_index] + selected_columns = ["timestamp"] if has_state: - row += data[i]["observation.state"].tolist() + selected_columns += ["observation.state"] if has_action: - row += data[i]["action"].tolist() - rows.append(row) + selected_columns += ["action"] + data = ( + dataset.hf_dataset.select(range(from_idx, to_idx)) + .select_columns(selected_columns) + .with_format("numpy") + ) + rows = np.hstack( + (np.expand_dims(data["timestamp"], axis=1), *[data[col] for col in selected_columns[1:]]) + ).tolist() + else: + repo_id = dataset.repo_id + selected_columns = ["timestamp"] + if "observation.state" in dataset.features: + selected_columns.append("observation.state") + if "action" in dataset.features: + selected_columns.append("action") - output_dir.mkdir(parents=True, 
exist_ok=True) - with open(output_dir / file_name, "w") as f: - f.write(",".join(header) + "\n") - for row in rows: - row_str = [str(col) for col in row] - f.write(",".join(row_str) + "\n") + url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/" + dataset.data_path.format( + episode_chunk=int(episode_index) // dataset.chunks_size, episode_index=episode_index + ) + df = pd.read_parquet(url) + data = df[selected_columns] # Select specific columns + rows = np.hstack( + ( + np.expand_dims(data["timestamp"], axis=1), + *[np.vstack(data[col]) for col in selected_columns[1:]], + ) + ).tolist() + + # Convert data to CSV string + csv_buffer = StringIO() + csv_writer = csv.writer(csv_buffer) + # Write header + csv_writer.writerow(header) + # Write data rows + csv_writer.writerows(rows) + csv_string = csv_buffer.getvalue() + + return csv_string, columns def get_episode_video_paths(dataset: LeRobotDataset, ep_index: int) -> list[str]: @@ -175,9 +317,31 @@ def get_episode_video_paths(dataset: LeRobotDataset, ep_index: int) -> list[str] ] +def get_episode_language_instruction(dataset: LeRobotDataset, ep_index: int) -> list[str]: + # check if the dataset has language instructions + if "language_instruction" not in dataset.features: + return None + + # get first frame index + first_frame_idx = dataset.episode_data_index["from"][ep_index].item() + + language_instruction = dataset.hf_dataset[first_frame_idx]["language_instruction"] + # TODO (michel-aractingi) hack to get the sentence, some strings in openx are badly stored + # with the tf.tensor appearing in the string + return language_instruction.removeprefix("tf.Tensor(b'").removesuffix("', shape=(), dtype=string)") + + +def get_dataset_info(repo_id: str) -> IterableNamespace: + response = requests.get(f"https://huggingface.co/datasets/{repo_id}/resolve/main/meta/info.json") + response.raise_for_status() # Raises an HTTPError for bad responses + dataset_info = response.json() + dataset_info["repo_id"] = repo_id + return IterableNamespace(dataset_info) + + def visualize_dataset_html( - dataset: LeRobotDataset, - episodes: list[int] = None, + dataset: LeRobotDataset | None, + episodes: list[int] | None = None, output_dir: Path | None = None, serve: bool = True, host: str = "127.0.0.1", @@ -186,11 +350,11 @@ def visualize_dataset_html( ) -> Path | None: init_logging() - if len(dataset.meta.image_keys) > 0: - raise NotImplementedError(f"Image keys ({dataset.meta.image_keys=}) are currently not supported.") + template_dir = Path(__file__).resolve().parent.parent / "templates" if output_dir is None: - output_dir = f"outputs/visualize_dataset_html/{dataset.repo_id}" + # Create a temporary directory that will be automatically cleaned up + output_dir = tempfile.mkdtemp(prefix="lerobot_visualize_dataset_") output_dir = Path(output_dir) if output_dir.exists(): @@ -201,28 +365,33 @@ def visualize_dataset_html( output_dir.mkdir(parents=True, exist_ok=True) - # Create a simlink from the dataset video folder containg mp4 files to the output directory - # so that the http server can get access to the mp4 files. 
static_dir = output_dir / "static" static_dir.mkdir(parents=True, exist_ok=True) - ln_videos_dir = static_dir / "videos" - if not ln_videos_dir.exists(): - ln_videos_dir.symlink_to((dataset.root / "videos").resolve()) - template_dir = Path(__file__).resolve().parent.parent / "templates" + if dataset is None: + if serve: + run_server( + dataset=None, + episodes=None, + host=host, + port=port, + static_folder=static_dir, + template_folder=template_dir, + ) + else: + image_keys = dataset.meta.image_keys if isinstance(dataset, LeRobotDataset) else [] + if len(image_keys) > 0: + raise NotImplementedError(f"Image keys ({image_keys=}) are currently not supported.") - if episodes is None: - episodes = list(range(dataset.num_episodes)) + # Create a simlink from the dataset video folder containg mp4 files to the output directory + # so that the http server can get access to the mp4 files. + if isinstance(dataset, LeRobotDataset): + ln_videos_dir = static_dir / "videos" + if not ln_videos_dir.exists(): + ln_videos_dir.symlink_to((dataset.root / "videos").resolve()) - logging.info("Writing CSV files") - for episode_index in tqdm.tqdm(episodes): - # write states and actions in a csv (it can be slow for big datasets) - ep_csv_fname = get_ep_csv_fname(episode_index) - # TODO(rcadene): speedup script by loading directly from dataset, pyarrow, parquet, safetensors? - write_episode_data_csv(static_dir, ep_csv_fname, episode_index, dataset) - - if serve: - run_server(dataset, episodes, host, port, static_dir, template_dir) + if serve: + run_server(dataset, episodes, host, port, static_dir, template_dir) def main(): @@ -231,7 +400,7 @@ def main(): parser.add_argument( "--repo-id", type=str, - required=True, + default=None, help="Name of hugging face repositery containing a LeRobotDataset dataset (e.g. `lerobot/pusht` for https://huggingface.co/datasets/lerobot/pusht).", ) parser.add_argument( @@ -246,6 +415,12 @@ def main(): default=None, help="Root directory for a dataset stored locally (e.g. `--root data`). By default, the dataset will be loaded from hugging face cache folder, or downloaded from the hub if available.", ) + parser.add_argument( + "--load-from-hf-hub", + type=int, + default=0, + help="Load videos and parquet files from HF Hub rather than local system.", + ) parser.add_argument( "--episodes", type=int, @@ -287,11 +462,19 @@ def main(): args = parser.parse_args() kwargs = vars(args) repo_id = kwargs.pop("repo_id") + load_from_hf_hub = kwargs.pop("load_from_hf_hub") root = kwargs.pop("root") local_files_only = kwargs.pop("local_files_only") - dataset = LeRobotDataset(repo_id, root=root, local_files_only=local_files_only) - visualize_dataset_html(dataset, **kwargs) + dataset = None + if repo_id: + dataset = ( + LeRobotDataset(repo_id, root=root, local_files_only=local_files_only) + if not load_from_hf_hub + else get_dataset_info(repo_id) + ) + + visualize_dataset_html(dataset, **vars(args)) if __name__ == "__main__": diff --git a/lerobot/templates/visualize_dataset_homepage.html b/lerobot/templates/visualize_dataset_homepage.html new file mode 100644 index 00000000..adff07be --- /dev/null +++ b/lerobot/templates/visualize_dataset_homepage.html @@ -0,0 +1,68 @@ + + + + + + Interactive Video Background Page + + + + +
+ +
+
+
+
+

LeRobot Dataset Visualizer

+ + create & train your own robots + +

+
+

Example Datasets:

+
    + {% for dataset in featured_datasets %} +
  • {{ dataset }}
  • + {% endfor %} +
+
+
+
+ + +
+ +
+ More example datasets +
    + {% for dataset in lerobot_datasets %} +
  • {{ dataset }}
  • + {% endfor %} +
+
+
+ + \ No newline at end of file diff --git a/lerobot/templates/visualize_dataset_template.html b/lerobot/templates/visualize_dataset_template.html index 0fa1e713..12d6e991 100644 --- a/lerobot/templates/visualize_dataset_template.html +++ b/lerobot/templates/visualize_dataset_template.html @@ -31,11 +31,16 @@ }">
-

{{ dataset_info.repo_id }}

+ + +

{{ dataset_info.repo_id }}

+
  • - Number of samples/frames: {{ dataset_info.num_frames }} + Number of samples/frames: {{ dataset_info.num_samples }}
  • Number of episodes: {{ dataset_info.num_episodes }} @@ -93,10 +98,10 @@
-
+
{% for video_info in videos_info %} -
-

{{ video_info.filename }}

+
+

{{ video_info.filename }}