add comments, add primary_code_loss_weight

2024-05-24 18:06:12 -04:00 · 2024-05-24 18:06:12 -04:00 · 547d3c3ea1
parent 71ec76fe2a
commit 547d3c3ea1
3 changed files with 126 additions and 79 deletions
--- a/lerobot/common/policies/vqbet/configuration_vqbet.py
+++ b/lerobot/common/policies/vqbet/configuration_vqbet.py
@ -55,7 +55,8 @@ class VQBeTConfig:
        dropout: Dropout rate for GPT
        mlp_hidden_dim: Size of hidden dimensions of offset header / bin prediction headers parts of VQ-BeT
        offset_loss_weight:  A constant that is multiplied to the offset loss
-        secondary_code_loss_weight: A constant that is multiplied to the secondary loss
+        primary_code_loss_weight: A constant that is multiplied to the primary code prediction loss
        secondary_code_loss_weight: A constant that is multiplied to the secondary code prediction loss
        bet_softmax_temperature: Sampling temperature of code for rollout with VQ-BeT
    """
@ -110,6 +111,7 @@ class VQBeTConfig:
    dropout: float = 0.1
    mlp_hidden_dim: int = 1024
    offset_loss_weight: float = 10000.
    primary_code_loss_weight: float = 5.0
    secondary_code_loss_weight: float = 0.5
    bet_softmax_temperature: float = 0.1
--- a/lerobot/common/policies/vqbet/modeling_vqbet.py
+++ b/lerobot/common/policies/vqbet/modeling_vqbet.py
@ -1,20 +1,16 @@
 import os
 from pathlib import Path
 from collections import deque
-from typing import Callable, List, Optional
+from typing import Callable, List
 from functools import partial
 from itertools import zip_longest
 from random import randrange
 import math
 from math import ceil
 from dataclasses import dataclass
 import warnings
 import einops
 from einops import rearrange, repeat, reduce, pack, unpack
 import numpy as np
 import torch
-import torch.nn.functional as F  # noqa: N812
+import torch.nn.functional as F
 import torchvision
 from torch import Tensor, nn, einsum
 import torch.distributed as distributed
@ -123,7 +119,7 @@ class VQBeTPolicy(nn.Module, PyTorchModelHubMixin):
        if not self.check_discretized():
            loss, n_different_codes, n_different_combinations = self.vqbet.discretize(self.config.discretize_step, batch['action'])
            return {"loss": loss, "n_different_codes": n_different_codes, "n_different_combinations": n_different_combinations}
-
+        # if Residual VQ is already trained, VQ-BeT trains its GPT and bin ped header / offset header parts.
        _, loss = self.vqbet(batch, rollout=False)
        return loss
@ -159,6 +155,8 @@ class VQBeTModel(nn.Module):
        Phase 2.
          timestep {t-n+1}   timestep {t-n+2}                timestep {t}
            ┌─────┴─────┐     ┌─────┴─────┐                 ┌─────┴─────┐
        o_{t-n+1}         o_{t-n+2}           ...         o_{t}
            │                 │                             │ 
@ -191,10 +189,10 @@ class VQBeTModel(nn.Module):
        self.rgb_encoder = VQBeTRgbEncoder(config)
        # This action query token is used as a prompt for querying action chunks. Please refer to "A_Q" in the image above.
        self._action_token = nn.Parameter(torch.randn(1, 1, self.config.gpt_input_dim))
-        # action token and EOS token
+        # To input state and observation features into GPT layers, we first project the features to fit the shape of input size of GPT.
        self._action_token = nn.Parameter(torch.randn(1, 1, self.config.gpt_input_dim)) # Batch, Timestep, Data type, GPT input dim
        self.state_projector = MLP(
                config.output_shapes["action"][0], 
                hidden_channels=[self.config.gpt_input_dim]
@ -203,7 +201,10 @@ class VQBeTModel(nn.Module):
                self.rgb_encoder.feature_dim, 
                hidden_channels=[self.config.gpt_input_dim]
            )
        # GPT part of VQ-BeT
        self.policy = GPT(config)
        # bin prediction header / offset prediction header part of VQ-BeT
        self.action_head = VQBeTHead(config)
    def discretize(self, discretize_step, actions):
@ -220,7 +221,7 @@ class VQBeTModel(nn.Module):
        # Separate batch and sequence dims.
        img_features = einops.rearrange(img_features, "(b n) ... -> b n ...", b=batch_size)
-
+        # image observation feature, state feature, and action query token are grouped together with the same timestpe to form a group, which is listed in order to be entered into GPT sequentially.
        observation_feature = torch.cat([
                torch.unsqueeze(self.obs_projector(img_features), dim=2), 
                torch.unsqueeze(self.state_projector(batch["observation.state"]), dim=2), 
@ -230,24 +231,28 @@ class VQBeTModel(nn.Module):
            raise NotImplementedError
        len_additional_action_token = self.config.n_action_pred_token-1
        action_token = self._action_token.repeat(batch_size, len_additional_action_token, 1)
-        
+
        # add additional action query tokens for predicting future action chunks
        observation_feature = torch.cat([observation_feature, action_token], dim=1)
-        # get action features
+        # get action features (pass through GPT)
        features = self.policy(observation_feature)
        historical_act_pred_index = np.arange(0, n_obs_steps) * (self.config.gpt_num_obs_mode+1) + self.config.gpt_num_obs_mode
        # only extract the output tokens at the position of action query
        features = torch.cat([
            features[:, historical_act_pred_index],
            features[:, -len_additional_action_token:]
        ], dim=1)
-        # action head
+        # pass through action head
        pred_action = self.action_head(
            features,
        )
-
+        # if rollout, VQ-BeT don't calculate loss
        if rollout:
            return pred_action["predicted_action"][:, n_obs_steps-1, :].reshape(batch_size, self.config.n_action_pred_chunk, -1)
        # else, it calculate overall loss (bin prediction loss, and offset loss)
        else:
            action = batch["action"]
            n, total_w, act_dim = action.shape
@ -270,7 +275,21 @@ class VQBeTModel(nn.Module):
 class VQBeTHead(nn.Module):
    def __init__(self, config: VQBeTConfig):
        """
-        TODO: add explanation for each value.
+        VQBeTHead takes output of GPT layers, and pass the feature through bin prediction head (`self.map_to_cbet_preds_bin`), and offset prediction head (`self.map_to_cbet_preds_offset`)
        self.map_to_cbet_preds_bin: outputs probability of each code (for each layer).
            The input dimension of `self.map_to_cbet_preds_bin` is same with the output of GPT, 
            and the output dimension of `self.map_to_cbet_preds_bin` is `self.config.vqvae_groups * self.config.vqvae_n_embed`, where 
                `self.config.vqvae_groups` is number of RVQ layers, and
                `self.config.vqvae_n_embed` is codebook size of RVQ.
        self.map_to_cbet_preds_offset: output the predicted offsets for all the codes in all the layers. 
            The input dimension of ` self.map_to_cbet_preds_offset` is same with the output of GPT, 
            and the output dimension of ` self.map_to_cbet_preds_offset` is `self.config.vqvae_groups * self.config.vqvae_n_embed * config.n_action_pred_chunk * config.output_shapes["action"][0]`, where
                `self.config.vqvae_groups` is number of RVQ layers,
                `self.config.vqvae_n_embed` is codebook size of RVQ,
                `config.n_action_pred_chunk is action chunk size of each token, and
                `config.output_shapes["action"][0]` is the dimension of action 
        """
        super().__init__()
@ -301,6 +320,7 @@ class VQBeTHead(nn.Module):
            self.vqvae_model.device = get_device_from_parameters(self)
        loss, n_different_codes, n_different_combinations = pretrain_vqvae(self.vqvae_model, discretize_step, actions)
        # if we updated RVQ more than `discretize_step` steps,
        if self.vqvae_model.discretized:
            print("Finished discretizing action data!")
            self.vqvae_model.eval()
@ -309,7 +329,10 @@ class VQBeTHead(nn.Module):
        return loss, n_different_codes, n_different_combinations
    def forward(self, x, **kwargs):
        # N is the batch size, and T is number of action query tokens, which are process through same GPT
        N, T, _ = x.shape
        # we calculate N and T side parallely. Thus, the dimensions would be 
        # (batch size * number of action query tokens, action chunk size, action dimension)
        x = einops.rearrange(x, "N T WA -> (N T) WA")
        cbet_logits = self.map_to_cbet_preds_bin(x)
@ -334,23 +357,27 @@ class VQBeTHead(nn.Module):
            sampled_centers,
        )
        # Use advanced indexing to sample the values
-        sampled_offsets = cbet_offsets[indices]  # NT, G, W, A(?) or NT, G, A
+        sampled_offsets = cbet_offsets[indices]
-
+        # Extract the only offsets corresponding to the sampled codes.
        sampled_offsets = sampled_offsets.sum(dim=1)
        # Get the centroids of each layer to pass it through RVQ decoder
        centers = self.vqvae_model.draw_code_forward(sampled_centers).view(
            NT, -1, self.config.vqvae_embedding_dim
        )
        return_decoder_input = einops.rearrange(
            centers.clone().detach(), "NT 1 D -> NT D"
        )
        # pass the centroids through decoder to get actions.
        decoded_action = (
            self.vqvae_model.get_action_from_latent(return_decoder_input)
            .clone()
            .detach()
-        )  # NT, A
+        )
        # reshaped extracted offset to match with decoded centroids
        sampled_offsets = einops.rearrange(
            sampled_offsets, "NT (W A) -> NT W A", W=self.config.n_action_pred_chunk
        )
        # add offset and decoded centroids
        predicted_action = decoded_action + sampled_offsets
        predicted_action = einops.rearrange(
            predicted_action,
@ -368,7 +395,16 @@ class VQBeTHead(nn.Module):
        }
    def loss_fn(self, pred, target, **kwargs):
-        # Rename the inputs for clarity.
+        """
        for given ground truth action values (target), and prediction (pred) this function calculates the overall loss.
        predicted_action: predicted action chunk (offset + decoded centroids)
        sampled_centers: sampled centroids (code of RVQ)
        decoded_action: decoded action, which is produced by passing sampled_centers through RVQ decoder
        NT: batch size * T
        T: number of action query tokens, which are process through same GPT
        cbet_logits: probability of all codes in each layer
        """
        action_seq = target
        predicted_action = pred["predicted_action"]
        sampled_centers = pred["sampled_centers"]
@ -383,7 +419,7 @@ class VQBeTHead(nn.Module):
        action_seq = einops.rearrange(action_seq, "N T W A -> (N T) W A")
        # Figure out the loss for the actions.
-        # First, we need to find the closest cluster center for each action.
+        # First, we need to find the closest cluster center for each ground truth action.
        state_vq, action_bins = self.vqvae_model.get_code(
            action_seq
        )  # action_bins: NT, G
@ -392,18 +428,21 @@ class VQBeTHead(nn.Module):
        if action_seq.ndim == 2:
            action_seq = action_seq.unsqueeze(0)
        # offset loss is L1 distance between the predicted action and ground truth action
        offset_loss = torch.nn.L1Loss()(action_seq, predicted_action)
-
+        # calculate primary code prediction loss
        cbet_loss1 = self._criterion(  # F.cross_entropy
            cbet_logits[:, 0, :],
            action_bins[:, 0],
        )
        # calculate secondary code prediction loss
        cbet_loss2 = self._criterion(  # F.cross_entropy
            cbet_logits[:, 1, :],
            action_bins[:, 1],
        )
-        cbet_loss = cbet_loss1 * 5 + cbet_loss2 * self.config.secondary_code_loss_weight
+        # add all the prediction loss
        cbet_loss = cbet_loss1 * self.config.primary_code_loss_weight + cbet_loss2 * self.config.secondary_code_loss_weight
        equal_primary_code_rate = torch.sum(
            (action_bins[:, 0] == sampled_centers[:, 0]).int()
@ -416,21 +455,9 @@ class VQBeTHead(nn.Module):
            einops.rearrange(action_seq, "(N T) W A -> N T W A", T=T),
            einops.rearrange(predicted_action, "(N T) W A -> N T W A", T=T),
        )
-        vq_action_error = (
+        vq_action_error = (abs(einops.rearrange(action_seq, "(N T) W A -> N T W A", T=T) - einops.rearrange(decoded_action, "(N T) W A -> N T W A", T=T))).mean()
-            abs(
+        offset_action_error = (abs(einops.rearrange(action_seq, "(N T) W A -> N T W A", T=T) - einops.rearrange(predicted_action, "(N T) W A -> N T W A", T=T))).mean()
-                einops.rearrange(action_seq, "(N T) W A -> N T W A", T=T) - einops.rearrange(decoded_action, "(N T) W A -> N T W A", T=T)
+        action_error_max = (abs(einops.rearrange(action_seq, "(N T) W A -> N T W A", T=T) - einops.rearrange(predicted_action, "(N T) W A -> N T W A", T=T))).max()
            )
        ).mean()
        offset_action_error = (
            abs(
                einops.rearrange(action_seq, "(N T) W A -> N T W A", T=T) - einops.rearrange(predicted_action, "(N T) W A -> N T W A", T=T)
            )
        ).mean()
        action_error_max = (
            abs(
                einops.rearrange(action_seq, "(N T) W A -> N T W A", T=T) - einops.rearrange(predicted_action, "(N T) W A -> N T W A", T=T)
            )
        ).max()
        loss = cbet_loss + self.config.offset_loss_weight * offset_loss
@ -506,35 +533,30 @@ class VQBeTOptimizer:
    def step(self):
        self.optimizing_step +=1
        # pretraining VQ-VAE (phase 1)
        if self.optimizing_step < self.discretize_step:
            # pretraining VQ-VAE
            self.vqvae_optimizer.step()
        # training BeT (phase 2)
        else:
-            # training BeT
+            self.encoder_optimizer.step()
-            if self.optimizing_step < 0.6 * self.offline_steps:
+            self.bet_optimizer1.step()
-                self.encoder_optimizer.step()
+            self.bet_optimizer2.step()
-                self.bet_optimizer1.step()
+            self.bet_optimizer3.step()
                self.bet_optimizer2.step()
                self.bet_optimizer3.step()
            else:
                self.bet_optimizer3.step()
    def zero_grad(self):
        # pretraining VQ-VAE (phase 1)
        if self.optimizing_step < self.discretize_step:
            # pretraining VQ-VAE
            self.vqvae_optimizer.zero_grad()
        # training BeT (phase 2)
        else:
-            # training BeT
+            self.encoder_optimizer.zero_grad()
-            if self.optimizing_step < 0.6 * self.offline_steps:
+            self.bet_optimizer1.zero_grad()
-                self.encoder_optimizer.zero_grad()
+            self.bet_optimizer2.zero_grad()
-                self.bet_optimizer1.zero_grad()
+            self.bet_optimizer3.zero_grad()
                self.bet_optimizer2.zero_grad()
                self.bet_optimizer3.zero_grad()
            else:
                self.bet_optimizer3.zero_grad()
 class VQBeTScheduler:
    def __init__(self, optimizer, cfg):
        # VQ-BeT use scheduler only for rgb encoder. Since we took rgb encoder part from diffusion policy, we also follow the same scheduler from it.
        from diffusers.optimization import get_scheduler
        self.discretize_step = cfg.training.discretize_step
        self.optimizing_step = 0
@ -663,6 +685,11 @@ class VqVae(nn.Module):
    def __init__(
        self, config: VQBeTConfig,
    ):
        """
        VQ-VAE is composed of three parts: encoder, vq_layer, and decoder.
        Encoder and decoder are MLPs consisting of an input, output layer, and hidden layer, respectively.
        The vq_layer uses residual VQs.
        """
        super(VqVae, self).__init__()
        self.config = config
@ -676,24 +703,14 @@ class VqVae(nn.Module):
            codebook_size=config.vqvae_n_embed,
        )
-        if self.config.n_action_pred_chunk == 1:
+        self.encoder = MLP(
-            self.encoder = MLP(
+            in_channels=self.config.output_shapes["action"][0] * self.config.n_action_pred_chunk,
-                in_channels=self.config.output_shapes["action"][0],
+            hidden_channels=[config.vqvae_enc_hidden_dim, config.vqvae_enc_hidden_dim, config.vqvae_embedding_dim],
-                hidden_channels=[config.vqvae_enc_hidden_dim, config.vqvae_enc_hidden_dim, config.vqvae_embedding_dim],
+        )
-            )
+        self.decoder = MLP(
-            self.decoder = MLP(
+            in_channels=config.vqvae_embedding_dim,
-                in_channels=config.vqvae_embedding_dim,
+            hidden_channels=[config.vqvae_enc_hidden_dim, config.vqvae_enc_hidden_dim, self.config.output_shapes["action"][0] * self.config.n_action_pred_chunk],
-                hidden_channels=[config.vqvae_enc_hidden_dim, config.vqvae_enc_hidden_dim, self.config.output_shapes["action"][0]],
+        )
            )
        else:
            self.encoder = MLP(
                in_channels=self.config.output_shapes["action"][0] * self.config.n_action_pred_chunk,
                hidden_channels=[config.vqvae_enc_hidden_dim, config.vqvae_enc_hidden_dim, config.vqvae_embedding_dim],
            )
            self.decoder = MLP(
                in_channels=config.vqvae_embedding_dim,
                hidden_channels=[config.vqvae_enc_hidden_dim, config.vqvae_enc_hidden_dim, self.config.output_shapes["action"][0] * self.config.n_action_pred_chunk],
            )
        self.train()
@ -704,6 +721,11 @@ class VqVae(nn.Module):
        self.decoder.eval()
    def train(self, mode=True):
        """
        This function forces the RVQ to no longer update when action discretization is complete.
        Since VQs are partly updated via the EMA method, simply passing data through them can cause unintended modifications.
        Therefore, we use function overriding to prevent RVQs from being updated during the training of VQ-BeT after discretization completes.
        """
        if mode:
            if self.discretized:
                pass
@ -736,7 +758,7 @@ class VqVae(nn.Module):
        if not torch.is_tensor(state):
            state = torch.FloatTensor(state.copy())
        if self.config.n_action_pred_chunk == 1:
-            state = state.squeeze(-2)  # state.squeeze(-1)
+            state = state.squeeze(-2)
        else:
            state = einops.rearrange(state, "N T A -> N (T A)")
        return state
@ -764,7 +786,6 @@ class VqVae(nn.Module):
                        torch.swapaxes(recon_state_ae, -2, -1),
                    )
            else:
                # econ_from_code = self.draw_code_forward(vq_code)
                return state_vq, vq_code
    def vqvae_forward(self, state):
@ -815,7 +836,7 @@ def pretrain_vqvae(vqvae_model, discretize_step, actions):
    loss, metric = vqvae_model.vqvae_forward(
        actions
-    )  # N T D
+    )
    n_different_codes = len(torch.unique(metric[2]))
    n_different_combinations = len(torch.unique(metric[2], dim=0))
    vqvae_model.optimized_steps += 1
@ -837,6 +858,29 @@ def round_up_multiple(num, mult):
 class ResidualVQ(nn.Module):
    """
    MIT License
    Copyright (c) 2020 Phil Wang
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
    The above copyright notice and this permission notice shall be included in all
    copies or substantial portions of the Software.
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE.
    """
    """Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf"""
    def __init__(
--- a/lerobot/configs/policy/vqbet.yaml
+++ b/lerobot/configs/policy/vqbet.yaml
@ -1,6 +1,6 @@
 # @package _global_
-# Defaults for training for the PushT dataset as per https://github.com/real-stanford/diffusion_policy.
+# Defaults for training for the PushT dataset.
 seed: 100000
 dataset_repo_id: lerobot/pusht
@ -100,5 +100,6 @@ policy:
  dropout: 0.1
  mlp_hidden_dim: 1024
  offset_loss_weight: 10000.
  primary_code_loss_weight: 5.0
  secondary_code_loss_weight: 0.5
  bet_softmax_temperature: 0.1