From 538455a965ce25a6f7b81d8b5c8cee46b90a7df9 Mon Sep 17 00:00:00 2001
From: Hirokazu Ishida <38597814+HiroIshida@users.noreply.github.com>
Date: Wed, 30 Oct 2024 19:00:05 +0900
Subject: [PATCH 1/2] feat: enable to use multiple rgb encoders per camera in diffusion policy (#484)

Co-authored-by: Alexander Soare
---
 .../diffusion/configuration_diffusion.py     |  2 +
 .../policies/diffusion/modeling_diffusion.py | 43 ++++++++++++++-----
 2 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/lerobot/common/policies/diffusion/configuration_diffusion.py b/lerobot/common/policies/diffusion/configuration_diffusion.py
index bd3692ac..531f49e4 100644
--- a/lerobot/common/policies/diffusion/configuration_diffusion.py
+++ b/lerobot/common/policies/diffusion/configuration_diffusion.py
@@ -67,6 +67,7 @@ class DiffusionConfig:
         use_group_norm: Whether to replace batch normalization with group normalization in the backbone.
             The group sizes are set to be about 16 (to be precise, feature_dim // 16).
         spatial_softmax_num_keypoints: Number of keypoints for SpatialSoftmax.
+        use_separate_rgb_encoder_per_camera: Whether to use a separate RGB encoder for each camera view.
         down_dims: Feature dimension for each stage of temporal downsampling in the diffusion modeling Unet.
             You may provide a variable number of dimensions, therefore also controlling the degree of
             downsampling.
@@ -130,6 +131,7 @@
     pretrained_backbone_weights: str | None = None
     use_group_norm: bool = True
     spatial_softmax_num_keypoints: int = 32
+    use_separate_rgb_encoder_per_camera: bool = False
     # Unet.
     down_dims: tuple[int, ...] = (512, 1024, 2048)
     kernel_size: int = 5
diff --git a/lerobot/common/policies/diffusion/modeling_diffusion.py b/lerobot/common/policies/diffusion/modeling_diffusion.py
index 308a8be3..9ba56260 100644
--- a/lerobot/common/policies/diffusion/modeling_diffusion.py
+++ b/lerobot/common/policies/diffusion/modeling_diffusion.py
@@ -182,8 +182,13 @@ class DiffusionModel(nn.Module):
         self._use_env_state = False
         if num_images > 0:
             self._use_images = True
-            self.rgb_encoder = DiffusionRgbEncoder(config)
-            global_cond_dim += self.rgb_encoder.feature_dim * num_images
+            if self.config.use_separate_rgb_encoder_per_camera:
+                encoders = [DiffusionRgbEncoder(config) for _ in range(num_images)]
+                self.rgb_encoder = nn.ModuleList(encoders)
+                global_cond_dim += encoders[0].feature_dim * num_images
+            else:
+                self.rgb_encoder = DiffusionRgbEncoder(config)
+                global_cond_dim += self.rgb_encoder.feature_dim * num_images
         if "observation.environment_state" in config.input_shapes:
             self._use_env_state = True
             global_cond_dim += config.input_shapes["observation.environment_state"][0]
@@ -239,16 +244,32 @@
         """Encode image features and concatenate them all together along with the state vector."""
         batch_size, n_obs_steps = batch["observation.state"].shape[:2]
         global_cond_feats = [batch["observation.state"]]
-        # Extract image feature (first combine batch, sequence, and camera index dims).
+        # Extract image features.
         if self._use_images:
-            img_features = self.rgb_encoder(
-                einops.rearrange(batch["observation.images"], "b s n ... -> (b s n) ...")
-            )
-            # Separate batch dim and sequence dim back out. The camera index dim gets absorbed into the
-            # feature dim (effectively concatenating the camera features).
-            img_features = einops.rearrange(
-                img_features, "(b s n) ... -> b s (n ...)", b=batch_size, s=n_obs_steps
-            )
+            if self.config.use_separate_rgb_encoder_per_camera:
+                # Combine batch and sequence dims while rearranging to make the camera index dimension first.
+                images_per_camera = einops.rearrange(batch["observation.images"], "b s n ... -> n (b s) ...")
+                img_features_list = torch.cat(
+                    [
+                        encoder(images)
+                        for encoder, images in zip(self.rgb_encoder, images_per_camera, strict=True)
+                    ]
+                )
+                # Separate batch and sequence dims back out. The camera index dim gets absorbed into the
+                # feature dim (effectively concatenating the camera features).
+                img_features = einops.rearrange(
+                    img_features_list, "(n b s) ... -> b s (n ...)", b=batch_size, s=n_obs_steps
+                )
+            else:
+                # Combine batch, sequence, and "which camera" dims before passing to shared encoder.
+                img_features = self.rgb_encoder(
+                    einops.rearrange(batch["observation.images"], "b s n ... -> (b s n) ...")
+                )
+                # Separate batch dim and sequence dim back out. The camera index dim gets absorbed into the
+                # feature dim (effectively concatenating the camera features).
+                img_features = einops.rearrange(
+                    img_features, "(b s n) ... -> b s (n ...)", b=batch_size, s=n_obs_steps
+                )
             global_cond_feats.append(img_features)
 
         if self._use_env_state:

From e0df56de621b6f7ee501719ee0b1e4af00a98635 Mon Sep 17 00:00:00 2001
From: Arsen Ohanyan
Date: Thu, 31 Oct 2024 08:41:49 -0700
Subject: [PATCH 2/2] Fix config file (#495)

---
 examples/10_use_so100.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/10_use_so100.md b/examples/10_use_so100.md
index 405e80ec..32d7f22d 100644
--- a/examples/10_use_so100.md
+++ b/examples/10_use_so100.md
@@ -135,7 +135,7 @@ You will need to move the follower arm to these positions sequentially:
 
 Make sure both arms are connected and run this script to launch manual calibration:
 ```bash
 python lerobot/scripts/control_robot.py calibrate \
-  --robot-path lerobot/configs/robot/moss.yaml \
+  --robot-path lerobot/configs/robot/so100.yaml \
   --robot-overrides '~cameras' --arms main_follower
 ```
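As a sanity check on the shape bookkeeping in the separate-encoder path added by PATCH 1/2, here is a standalone sketch (not part of either patch). `ToyEncoder` and every concrete dimension below are made-up stand-ins for `DiffusionRgbEncoder` and the real image shapes; the point is only that rearranging to camera-first, encoding each camera with its own module, concatenating, and rearranging back yields the same `(batch, n_obs_steps, num_cameras * feature_dim)` layout as the shared-encoder branch.

```python
import einops
import torch
from torch import nn

b, s, n = 2, 3, 4      # batch size, observation steps, cameras (illustrative)
c, h, w = 3, 32, 32    # image shape (illustrative)
feature_dim = 64       # per-camera feature size (illustrative)


class ToyEncoder(nn.Module):
    """Stand-in for DiffusionRgbEncoder: maps (B, C, H, W) -> (B, feature_dim)."""

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Flatten(), nn.Linear(c * h * w, feature_dim))

    def forward(self, x):
        return self.net(x)


rgb_encoders = nn.ModuleList(ToyEncoder() for _ in range(n))
images = torch.randn(b, s, n, c, h, w)  # plays the role of batch["observation.images"]

# Camera index first, batch and sequence flattened together: (n, b*s, c, h, w).
images_per_camera = einops.rearrange(images, "b s n ... -> n (b s) ...")
# One encoder per camera; concatenating along dim 0 gives (n*b*s, feature_dim).
img_features = torch.cat(
    [encoder(imgs) for encoder, imgs in zip(rgb_encoders, images_per_camera, strict=True)]
)
# Fold the camera dim back into the feature dim, as in the patch.
img_features = einops.rearrange(img_features, "(n b s) ... -> b s (n ...)", b=b, s=s)
assert img_features.shape == (b, s, n * feature_dim)
```

Putting the camera dimension first lets each encoder consume one contiguous `(b*s, c, h, w)` slice, and the final rearrange absorbs the camera dimension into the feature dimension in the same order as the shared-encoder branch, so the conditioning dimension fed to the Unet is identical in both code paths.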