From 538455a965ce25a6f7b81d8b5c8cee46b90a7df9 Mon Sep 17 00:00:00 2001
From: Hirokazu Ishida <38597814+HiroIshida@users.noreply.github.com>
Date: Wed, 30 Oct 2024 19:00:05 +0900
Subject: [PATCH 1/2] feat: enable to use multiple rgb encoders per camera in diffusion policy (#484)

Co-authored-by: Alexander Soare
---
 .../diffusion/configuration_diffusion.py     |  2 +
 .../policies/diffusion/modeling_diffusion.py | 43 ++++++++++++++-----
 2 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/lerobot/common/policies/diffusion/configuration_diffusion.py b/lerobot/common/policies/diffusion/configuration_diffusion.py
index bd3692ac..531f49e4 100644
--- a/lerobot/common/policies/diffusion/configuration_diffusion.py
+++ b/lerobot/common/policies/diffusion/configuration_diffusion.py
@@ -67,6 +67,7 @@ class DiffusionConfig:
         use_group_norm: Whether to replace batch normalization with group normalization in the backbone.
             The group sizes are set to be about 16 (to be precise, feature_dim // 16).
         spatial_softmax_num_keypoints: Number of keypoints for SpatialSoftmax.
+        use_separate_rgb_encoder_per_camera: Whether to use a separate RGB encoder for each camera view.
         down_dims: Feature dimension for each stage of temporal downsampling in the diffusion modeling Unet.
             You may provide a variable number of dimensions, therefore also controlling the degree of
             downsampling.
@@ -130,6 +131,7 @@
     pretrained_backbone_weights: str | None = None
     use_group_norm: bool = True
     spatial_softmax_num_keypoints: int = 32
+    use_separate_rgb_encoder_per_camera: bool = False
     # Unet.
     down_dims: tuple[int, ...] = (512, 1024, 2048)
     kernel_size: int = 5
diff --git a/lerobot/common/policies/diffusion/modeling_diffusion.py b/lerobot/common/policies/diffusion/modeling_diffusion.py
index 308a8be3..9ba56260 100644
--- a/lerobot/common/policies/diffusion/modeling_diffusion.py
+++ b/lerobot/common/policies/diffusion/modeling_diffusion.py
@@ -182,8 +182,13 @@ class DiffusionModel(nn.Module):
         self._use_env_state = False
         if num_images > 0:
             self._use_images = True
-            self.rgb_encoder = DiffusionRgbEncoder(config)
-            global_cond_dim += self.rgb_encoder.feature_dim * num_images
+            if self.config.use_separate_rgb_encoder_per_camera:
+                encoders = [DiffusionRgbEncoder(config) for _ in range(num_images)]
+                self.rgb_encoder = nn.ModuleList(encoders)
+                global_cond_dim += encoders[0].feature_dim * num_images
+            else:
+                self.rgb_encoder = DiffusionRgbEncoder(config)
+                global_cond_dim += self.rgb_encoder.feature_dim * num_images
         if "observation.environment_state" in config.input_shapes:
             self._use_env_state = True
             global_cond_dim += config.input_shapes["observation.environment_state"][0]
@@ -239,16 +244,32 @@
         """Encode image features and concatenate them all together along with the state vector."""
         batch_size, n_obs_steps = batch["observation.state"].shape[:2]
         global_cond_feats = [batch["observation.state"]]
-        # Extract image feature (first combine batch, sequence, and camera index dims).
+        # Extract image features.
         if self._use_images:
-            img_features = self.rgb_encoder(
-                einops.rearrange(batch["observation.images"], "b s n ... -> (b s n) ...")
-            )
-            # Separate batch dim and sequence dim back out. The camera index dim gets absorbed into the
-            # feature dim (effectively concatenating the camera features).
-            img_features = einops.rearrange(
-                img_features, "(b s n) ... -> b s (n ...)", b=batch_size, s=n_obs_steps
-            )
+            if self.config.use_separate_rgb_encoder_per_camera:
+                # Combine batch and sequence dims while rearranging to make the camera index dimension first.
+                images_per_camera = einops.rearrange(batch["observation.images"], "b s n ... -> n (b s) ...")
+                img_features_list = torch.cat(
+                    [
+                        encoder(images)
+                        for encoder, images in zip(self.rgb_encoder, images_per_camera, strict=True)
+                    ]
+                )
+                # Separate batch and sequence dims back out. The camera index dim gets absorbed into the
+                # feature dim (effectively concatenating the camera features).
+                img_features = einops.rearrange(
+                    img_features_list, "(n b s) ... -> b s (n ...)", b=batch_size, s=n_obs_steps
+                )
+            else:
+                # Combine batch, sequence, and "which camera" dims before passing to shared encoder.
+                img_features = self.rgb_encoder(
+                    einops.rearrange(batch["observation.images"], "b s n ... -> (b s n) ...")
+                )
+                # Separate batch dim and sequence dim back out. The camera index dim gets absorbed into the
+                # feature dim (effectively concatenating the camera features).
+                img_features = einops.rearrange(
+                    img_features, "(b s n) ... -> b s (n ...)", b=batch_size, s=n_obs_steps
+                )
             global_cond_feats.append(img_features)
 
         if self._use_env_state:

From e0df56de621b6f7ee501719ee0b1e4af00a98635 Mon Sep 17 00:00:00 2001
From: Arsen Ohanyan
Date: Thu, 31 Oct 2024 08:41:49 -0700
Subject: [PATCH 2/2] Fix config file (#495)

---
 examples/10_use_so100.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/10_use_so100.md b/examples/10_use_so100.md
index 405e80ec..32d7f22d 100644
--- a/examples/10_use_so100.md
+++ b/examples/10_use_so100.md
@@ -135,7 +135,7 @@ You will need to move the follower arm to these positions sequentially:
 
 Make sure both arms are connected and run this script to launch manual calibration:
 ```bash
 python lerobot/scripts/control_robot.py calibrate \
-  --robot-path lerobot/configs/robot/moss.yaml \
+  --robot-path lerobot/configs/robot/so100.yaml \
   --robot-overrides '~cameras' --arms main_follower
 ```
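As a sanity check on the shape bookkeeping in the separate-encoder path added by PATCH 1/2, here is a standalone sketch (not part of either patch). `ToyEncoder` and every concrete dimension below are made-up stand-ins for `DiffusionRgbEncoder` and the real image shapes; the point is only that rearranging to camera-first, encoding each camera with its own module, concatenating, and rearranging back yields the same `(batch, n_obs_steps, num_cameras * feature_dim)` layout as the shared-encoder branch.

```python
import einops
import torch
from torch import nn

b, s, n = 2, 3, 4      # batch size, observation steps, cameras (illustrative)
c, h, w = 3, 32, 32    # image shape (illustrative)
feature_dim = 64       # per-camera feature size (illustrative)


class ToyEncoder(nn.Module):
    """Stand-in for DiffusionRgbEncoder: maps (B, C, H, W) -> (B, feature_dim)."""

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Flatten(), nn.Linear(c * h * w, feature_dim))

    def forward(self, x):
        return self.net(x)


rgb_encoders = nn.ModuleList(ToyEncoder() for _ in range(n))
images = torch.randn(b, s, n, c, h, w)  # plays the role of batch["observation.images"]

# Camera index first, batch and sequence flattened together: (n, b*s, c, h, w).
images_per_camera = einops.rearrange(images, "b s n ... -> n (b s) ...")
# One encoder per camera; concatenating along dim 0 gives (n*b*s, feature_dim).
img_features = torch.cat(
    [encoder(imgs) for encoder, imgs in zip(rgb_encoders, images_per_camera, strict=True)]
)
# Fold the camera dim back into the feature dim, as in the patch.
img_features = einops.rearrange(img_features, "(n b s) ... -> b s (n ...)", b=b, s=s)
assert img_features.shape == (b, s, n * feature_dim)
```

Putting the camera dimension first lets each encoder consume one contiguous `(b*s, c, h, w)` slice, and the final rearrange absorbs the camera dimension into the feature dimension in the same order as the shared-encoder branch, so the conditioning dimension fed to the Unet is identical in both code paths.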