diff --git a/lerobot/common/policies/vqbet/configuration_vqbet.py b/lerobot/common/policies/vqbet/configuration_vqbet.py
index 48c9be5a..5b2e4b36 100644
--- a/lerobot/common/policies/vqbet/configuration_vqbet.py
+++ b/lerobot/common/policies/vqbet/configuration_vqbet.py
@@ -3,7 +3,7 @@ from dataclasses import dataclass, field
 
 @dataclass
 class VQBeTConfig:
-    """Configuration class for DiffusionPolicy.
+    """Configuration class for VQ-BeT.
 
     Defaults are configured for training with PushT providing proprioceptive and single camera observations.
@@ -13,9 +13,8 @@ class VQBeTConfig:
     Args:
         n_obs_steps: Number of environment steps worth of observations to pass to the policy (takes the
             current step and additional steps going back).
-        horizon: Diffusion model action prediction size as detailed in `DiffusionPolicy.select_action`.
-        n_action_steps: The number of action steps to run in the environment for one invocation of the policy.
-            See `DiffusionPolicy.select_action` for more details.
+        n_action_pred_token: TODO(jayLEE0301)
+        n_action_pred_chunk: TODO(jayLEE0301)
         input_shapes: A dictionary defining the shapes of the input data for the policy. The key represents
             the input data name, and the value is a list indicating the dimensions of the corresponding
             data. For example, "observation.image" refers to an input from
@@ -41,32 +40,21 @@ class VQBeTConfig:
         use_group_norm: Whether to replace batch normalization with group normalization in the backbone.
             The group sizes are set to be about 16 (to be precise, feature_dim // 16).
         spatial_softmax_num_keypoints: Number of keypoints for SpatialSoftmax.
-        down_dims: Feature dimension for each stage of temporal downsampling in the diffusion modeling Unet.
-            You may provide a variable number of dimensions, therefore also controlling the degree of
-            downsampling.
-        kernel_size: The convolutional kernel size of the diffusion modeling Unet.
-        n_groups: Number of groups used in the group norm of the Unet's convolutional blocks.
-        diffusion_step_embed_dim: The Unet is conditioned on the diffusion timestep via a small non-linear
-            network. This is the output dimension of that network, i.e., the embedding dimension.
-        use_film_scale_modulation: FiLM (https://arxiv.org/abs/1709.07871) is used for the Unet conditioning.
-            Bias modulation is used be default, while this parameter indicates whether to also use scale
-            modulation.
-        num_train_timesteps: Number of diffusion steps for the forward diffusion schedule.
-        beta_schedule: Name of the diffusion beta schedule as per DDPMScheduler from Hugging Face diffusers.
-        beta_start: Beta value for the first forward-diffusion step.
-        beta_end: Beta value for the last forward-diffusion step.
-        prediction_type: The type of prediction that the diffusion modeling Unet makes. Choose from "epsilon"
-            or "sample". These have equivalent outcomes from a latent variable modeling perspective, but
-            "epsilon" has been shown to work better in many deep neural network settings.
-        clip_sample: Whether to clip the sample to [-`clip_sample_range`, +`clip_sample_range`] for each
-            denoising step at inference time. WARNING: you will need to make sure your action-space is
-            normalized to fit within this range.
-        clip_sample_range: The magnitude of the clipping range as described above.
-        num_inference_steps: Number of reverse diffusion steps to use at inference time (steps are evenly
-            spaced). If not provided, this defaults to be the same as `num_train_timesteps`.
-        do_mask_loss_for_padding: Whether to mask the loss when there are copy-padded actions. See
-            `LeRobotDataset` and `load_previous_and_future_frames` for mor information. Note, this defaults
-            to False as the original Diffusion Policy implementation does the same.
+        discretize_step: TODO(jayLEE0301)
+        vqvae_groups: TODO(jayLEE0301)
+        vqvae_n_embed: TODO(jayLEE0301)
+        vqvae_embedding_dim: TODO(jayLEE0301)
+        vqvae_enc_hidden_dim: TODO(jayLEE0301)
+        gpt_block_size: TODO(jayLEE0301)
+        gpt_input_dim: TODO(jayLEE0301)
+        gpt_output_dim: TODO(jayLEE0301)
+        gpt_n_layer: TODO(jayLEE0301)
+        gpt_n_head: TODO(jayLEE0301)
+        gpt_hidden_dim: TODO(jayLEE0301)
+        dropout: TODO(jayLEE0301)
+        mlp_hidden_dim: TODO(jayLEE0301)
+        offset_loss_weight: TODO(jayLEE0301)
+        secondary_code_loss_weight: TODO(jayLEE0301)
     """
 
     # Inputs / output structure.
@@ -115,11 +103,11 @@ class VQBeTConfig:
     gpt_output_dim: int = 512
     gpt_n_layer: int = 8
     gpt_n_head: int = 8
-    gpt_n_embed: int = 512
+    gpt_hidden_dim: int = 512
     dropout: float = 0.1
     mlp_hidden_dim: int = 1024
     offset_loss_weight: float = 10000.
-    secondary_code_multiplier: float = 0.5
+    secondary_code_loss_weight: float = 0.5
 
     def __post_init__(self):
         """Input validation (not exhaustive)."""
diff --git a/lerobot/common/policies/vqbet/modeling_vqbet.py b/lerobot/common/policies/vqbet/modeling_vqbet.py
index 8713cb4e..54ff2569 100644
--- a/lerobot/common/policies/vqbet/modeling_vqbet.py
+++ b/lerobot/common/policies/vqbet/modeling_vqbet.py
@@ -173,7 +173,7 @@ class VQBeTModel(nn.Module):
             ], dim=-2).view(batch_size, -1, self.config.gpt_input_dim)
         if img_features.shape[1] != n_obs_steps:
             raise NotImplementedError
-        # eos_token = self._eos_token.repeat(batch_size, 1, 1) # TODO remove EOS token
+        # eos_token = self._eos_token.repeat(batch_size, 1, 1) # TODO(jayLEE0301) remove EOS token
         len_additional_action_token = self.config.n_action_pred_token-1
         action_token = self._action_token.repeat(batch_size, len_additional_action_token, 1)
 
@@ -183,7 +183,7 @@ class VQBeTModel(nn.Module):
 
         # get action features
         features = self._policy(global_cond)
-        historical_act_pred_index = np.arange(0, n_obs_steps) * 3 + 2 # TODO make it compatible with other values
+        historical_act_pred_index = np.arange(0, n_obs_steps) * 3 + 2 # TODO(jayLEE0301) make it compatible with other values
         features = torch.cat([
             features[:, historical_act_pred_index],
             features[:, -len_additional_action_token:]
@@ -225,7 +225,7 @@ class VQBeTHead(nn.Module):
         self.output_size = config.output_shapes["action"][0]
         self.hidden_size = config.mlp_hidden_dim
         self.offset_loss_weight = config.offset_loss_weight
-        self.secondary_code_multiplier = config.secondary_code_multiplier
+        self.secondary_code_loss_weight = config.secondary_code_loss_weight
 
         self.vqvae_groups = config.vqvae_groups
         self.vqvae_n_embed = config.vqvae_n_embed  # C(number of code integers)
@@ -358,7 +358,7 @@ class VQBeTHead(nn.Module):
                 cbet_logits[:, 1, :],
                 action_bins[:, 1],
             )
-            cbet_loss = cbet_loss1 * 5 + cbet_loss2 * self.secondary_code_multiplier
+            cbet_loss = cbet_loss1 * 5 + cbet_loss2 * self.secondary_code_loss_weight
 
             equal_primary_code_rate = torch.sum(
                 (action_bins[:, 0] == sampled_centers[:, 0]).int()
@@ -2058,11 +2058,11 @@ class MLP(torch.nn.Sequential):
 class CausalSelfAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
-        assert config.gpt_n_embed % config.gpt_n_head == 0
+        assert config.gpt_hidden_dim % config.gpt_n_head == 0
         # key, query, value projections for all heads, but in a batch
-        self.c_attn = nn.Linear(config.gpt_n_embed, 3 * config.gpt_n_embed)
+        self.c_attn = nn.Linear(config.gpt_hidden_dim, 3 * config.gpt_hidden_dim)
         # output projection
-        self.c_proj = nn.Linear(config.gpt_n_embed, config.gpt_n_embed)
+        self.c_proj = nn.Linear(config.gpt_hidden_dim, config.gpt_hidden_dim)
         # regularization
         self.attn_dropout = nn.Dropout(config.dropout)
         self.resid_dropout = nn.Dropout(config.dropout)
@@ -2074,17 +2074,17 @@ class CausalSelfAttention(nn.Module):
             ),
         )
         self.gpt_n_head = config.gpt_n_head
-        self.gpt_n_embed = config.gpt_n_embed
+        self.gpt_hidden_dim = config.gpt_hidden_dim
 
     def forward(self, x):
         (
             B,
             T,
             C,
-        ) = x.size()  # batch size, sequence length, embedding dimensionality (gpt_n_embed)
+        ) = x.size()  # batch size, sequence length, embedding dimensionality (gpt_hidden_dim)
 
         # calculate query, key, values for all heads in batch and move head forward to be the batch dim
-        q, k, v = self.c_attn(x).split(self.gpt_n_embed, dim=2)
+        q, k, v = self.c_attn(x).split(self.gpt_hidden_dim, dim=2)
         k = k.view(B, T, self.gpt_n_head, C // self.gpt_n_head).transpose(
             1, 2
         )  # (B, nh, T, hs)
@@ -2114,13 +2114,13 @@ class CausalSelfAttention(nn.Module):
 
 class Block(nn.Module):
     def __init__(self, config):
         super().__init__()
-        self.ln_1 = nn.LayerNorm(config.gpt_n_embed)
+        self.ln_1 = nn.LayerNorm(config.gpt_hidden_dim)
         self.attn = CausalSelfAttention(config)
-        self.ln_2 = nn.LayerNorm(config.gpt_n_embed)
+        self.ln_2 = nn.LayerNorm(config.gpt_hidden_dim)
         self.mlp = nn.Sequential(
-            nn.Linear(config.gpt_n_embed, 4 * config.gpt_n_embed),
+            nn.Linear(config.gpt_hidden_dim, 4 * config.gpt_hidden_dim),
             nn.GELU(),
-            nn.Linear(4 * config.gpt_n_embed, config.gpt_n_embed),
+            nn.Linear(4 * config.gpt_hidden_dim, config.gpt_hidden_dim),
             nn.Dropout(config.dropout)
         )
@@ -2178,14 +2178,14 @@ class GPT(nn.Module):
 
         self.transformer = nn.ModuleDict(
             dict(
-                wte=nn.Linear(config.gpt_input_dim, config.gpt_n_embed),
-                wpe=nn.Embedding(config.gpt_block_size, config.gpt_n_embed),
+                wte=nn.Linear(config.gpt_input_dim, config.gpt_hidden_dim),
+                wpe=nn.Embedding(config.gpt_block_size, config.gpt_hidden_dim),
                 drop=nn.Dropout(config.dropout),
                 h=nn.ModuleList([Block(config) for _ in range(config.gpt_n_layer)]),
-                ln_f=nn.LayerNorm(config.gpt_n_embed),
+                ln_f=nn.LayerNorm(config.gpt_hidden_dim),
             )
         )
-        self.lm_head = nn.Linear(config.gpt_n_embed, config.gpt_output_dim, bias=False)
+        self.lm_head = nn.Linear(config.gpt_hidden_dim, config.gpt_output_dim, bias=False)
         # init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper
         self.apply(self._init_weights)
         for pn, p in self.named_parameters():
@@ -2211,10 +2211,10 @@ class GPT(nn.Module):
         # forward the GPT model itself
         tok_emb = self.transformer.wte(
             input
-        )  # token embeddings of shape (b, t, gpt_n_embed)
+        )  # token embeddings of shape (b, t, gpt_hidden_dim)
         pos_emb = self.transformer.wpe(
             pos
-        )  # position embeddings of shape (1, t, gpt_n_embed)
+        )  # position embeddings of shape (1, t, gpt_hidden_dim)
         x = self.transformer.drop(tok_emb + pos_emb)
         for block in self.transformer.h:
             x = block(x)
diff --git a/lerobot/configs/policy/vqbet.yaml b/lerobot/configs/policy/vqbet.yaml
index dfd0d8d7..2e6eb038 100644
--- a/lerobot/configs/policy/vqbet.yaml
+++ b/lerobot/configs/policy/vqbet.yaml
@@ -1,8 +1,6 @@
 # @package _global_
 
 # Defaults for training for the PushT dataset as per https://github.com/real-stanford/diffusion_policy.
-# Note: We do not track EMA model weights as we discovered it does not improve the results. See
See -# https://github.com/huggingface/lerobot/pull/134 for more details. seed: 100000 dataset_repo_id: lerobot/pusht @@ -97,8 +95,8 @@ policy: gpt_output_dim: 512 gpt_n_layer: 8 gpt_n_head: 8 - gpt_n_embed: 512 + gpt_hidden_dim: 512 dropout: 0.1 mlp_hidden_dim: 1024 offset_loss_weight: 10000. - secondary_code_multiplier: 0.5 \ No newline at end of file + secondary_code_loss_weight: 0.5 \ No newline at end of file